In [6]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [7]:
DATA_DIR = os.path.join(os.pardir, '175Project')

COL_NAMES = ['character', 'browsing_page_url', 'word_url', 'word', 'definition', 'sentence']

def load_urban_dataset():
    file_paths = []
    for root, dirs, files in os.walk(os.path.join(DATA_DIR, 'Urban')):
        for f in files:
            if f.endswith('.csv') and f.startswith('urban_data'):
                file_paths.append(os.path.join(root, f))
    df_urban = pd.concat([pd.read_csv(f, names=COL_NAMES) for f in file_paths])

    df_nulls = df_urban[(df_urban.isnull().any(axis=1)) | (df_urban.isna().any(axis=1))]
    df_urban = df_urban.drop(df_nulls.index)

    return df_urban

In [8]:
urban_dictionary = load_urban_dataset()
print(f"Shape of urban dictionary dataset: {urban_dictionary.shape}")
ud_sample = urban_dictionary[['word', 'definition', 'sentence']].sample(1)
for i in ud_sample.values:
    print("Word: ", i[0])
    print("Meaning: ", i[1])
    print("Sentence: ", i[2])

Shape of urban dictionary dataset: (2175494, 6)
Word:  Blow glass
Meaning:  Yacht  exscursion in the a.m watching/ snapping the water spray out the back of the  yacht  as you leave the  harbor .
Sentence:  Snapchat a  clip  of the moment with a  caption . “Do you  blow glass ?”


In [18]:
urban_data = urban_dictionary[['word', 'definition', 'sentence']]
train_u, test_u = train_test_split(urban_data, test_size=0.2, random_state=42, shuffle=True)
#example of what the data looks like
row = train_u.iloc[0]
print(row)
print()
print("The full item")
print()
print(row.values)

word                                                     Adeogo
definition    A beautiful tall black girl who's future job i...
sentence      Adeogo has been  playing the violin   for 7   ...
Name: 17480, dtype: object

The full item

['Adeogo'
 "A beautiful tall black girl who's future job is a fashion model for high and popular brands, like Gucci, Dior,  MCM , Louis Vuitton,  Balenciaga  and more. Her horoscope sign is cancer. She could  play the violin  really well. She could also be annoying sometimes, but she's smart and always loves to watch movies and is always kind and loves to spend time with family and friends."
 'Adeogo has been  playing the violin   for 7   years  since she was 6.']


In [20]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))
corpus.print_summary_stats()

No configuration file found at C:\Users\brock/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
Downloading movie-corpus to C:\Users\brock\.convokit\saved-corpora\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [21]:
corpus.print_summary_stats()

Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [26]:
conversations_raw = []

# Iterate over all conversation IDs
for convo_id in corpus.get_conversation_ids():
    convo = corpus.get_conversation(convo_id)
    # Extract the textual content of each utterance in the conversation
    convo_text = [utt.text for utt in convo.iter_utterances()]
    conversations_raw.append(convo_text)

print(f"Total conversations: {len(conversations_raw)}")
print("Example conversation:", conversations_raw[0])

Total conversations: 83097
Example conversation: ['They do not!', 'They do to!']


In [27]:
pairs = []
for convo in conversations_raw:
    for i in range(len(convo)-1):
        pairs.append((convo[i], convo[i+1]))

print(f"Total pairs: {len(pairs)}")
print("Sample pair:", pairs[0])

Total pairs: 221616
Sample pair: ('They do not!', 'They do to!')


In [28]:
train_convos, test_convos = train_test_split(conversations_raw, test_size=0.2, random_state=42)

Train conversations: 66477, Test conversations: 16620


In [29]:
all_texts = [utt.text for utt in corpus.iter_utterances()]

print(f"Total utterances: {len(all_texts)}")
print("Example utterances:", all_texts[:5])

Total utterances: 304713
Example utterances: ['They do not!', 'They do to!', 'I hope so.', 'She okay?', "Let's go."]


In [30]:
train_texts, test_texts = train_test_split(all_texts, test_size=0.2, random_state=42)