In [1]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [2]:
DATA_DIR = os.path.join(os.pardir, '175Project')

COL_NAMES = ['character', 'browsing_page_url', 'word_url', 'word', 'definition', 'sentence']

def load_urban_dataset():
    file_paths = []
    for root, dirs, files in os.walk(os.path.join(DATA_DIR, 'Urban')):
        for f in files:
            if f.endswith('.csv') and f.startswith('urban_data'):
                file_paths.append(os.path.join(root, f))
    df_urban = pd.concat([pd.read_csv(f, names=COL_NAMES) for f in file_paths])

    df_nulls = df_urban[(df_urban.isnull().any(axis=1)) | (df_urban.isna().any(axis=1))]
    df_urban = df_urban.drop(df_nulls.index)

    return df_urban

In [3]:
urban_dictionary = load_urban_dataset()
print(f"Shape of urban dictionary dataset: {urban_dictionary.shape}")
ud_sample = urban_dictionary[['word', 'definition', 'sentence']].sample(1)
for i in ud_sample.values:
    print("Word: ", i[0])
    print("Meaning: ", i[1])
    print("Sentence: ", i[2])

Shape of urban dictionary dataset: (2175494, 6)
Word:  ALWAYS4TUESDAYS
Meaning:  Anything  that is  yang   in your life
Sentence:  Damn homie ! That  Big Mac  was  always4tuesdays !!


In [4]:
urban_data = urban_dictionary[['word', 'definition', 'sentence']]
train_u, test_u = train_test_split(urban_data, test_size=0.2, random_state=42, shuffle=True)
#example of what the data looks like
row = train_u.iloc[0]
print(row)
print()
print("The full item")
print()
print(row.values)

word                                                   rawrivan
definition                                 Awesomely cool   guy
sentence      Did you   see  rawrivan? I  want to  be just l...
Name: 17480, dtype: str

The full item

<ArrowStringArray>
[                                              'rawrivan',
                                   'Awesomely cool   guy',
 'Did you   see  rawrivan? I  want to  be just like him.']
Length: 3, dtype: str


In [5]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))
corpus.print_summary_stats()

Downloading movie-corpus to /Users/hidemk/.convokit/saved-corpora/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [6]:
corpus.print_summary_stats()

Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [7]:
conversations_raw = []

# Iterate over all conversation IDs
for convo_id in corpus.get_conversation_ids():
    convo = corpus.get_conversation(convo_id)
    # Extract the textual content of each utterance in the conversation
    convo_text = [utt.text for utt in convo.iter_utterances()]
    conversations_raw.append(convo_text)

print(f"Total conversations: {len(conversations_raw)}")
print("Example conversation:", conversations_raw[0])

Total conversations: 83097
Example conversation: ['They do not!', 'They do to!']


In [8]:
pairs = []
for convo in conversations_raw:
    for i in range(len(convo)-1):
        pairs.append((convo[i], convo[i+1]))

print(f"Total pairs: {len(pairs)}")
print("Sample pair:", pairs[0])

Total pairs: 221616
Sample pair: ('They do not!', 'They do to!')


In [9]:
train_convos, test_convos = train_test_split(conversations_raw, test_size=0.2, random_state=42)

In [10]:
all_texts = [utt.text for utt in corpus.iter_utterances()]

print(f"Total utterances: {len(all_texts)}")
print("Example utterances:", all_texts[:5])

Total utterances: 304713
Example utterances: ['They do not!', 'They do to!', 'I hope so.', 'She okay?', "Let's go."]


In [11]:
train_texts, test_texts = train_test_split(all_texts, test_size=0.2, random_state=42)

In [12]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(train_convos, show_progress_bar=True)
print(f"Shape of embeddings: {embeddings.shape}")

  from .autonotebook import tqdm as notebook_tqdm
W0212 19:23:18.224000 77207 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
Batches: 100%|██████████| 2078/2078 [01:32<00:00, 22.44it/s]


Shape of embeddings: (66477, 384)


In [13]:
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[ 1.0000,  0.1206,  0.0736,  ...,  0.1332,  0.0051,  0.0891],
        [ 0.1206,  1.0000,  0.1462,  ..., -0.0054, -0.0103,  0.0212],
        [ 0.0736,  0.1462,  1.0000,  ...,  0.1846,  0.0885,  0.2024],
        ...,
        [ 0.1332, -0.0054,  0.1846,  ...,  1.0000,  0.1002,  0.1454],
        [ 0.0051, -0.0103,  0.0885,  ...,  0.1002,  1.0000,  0.1599],
        [ 0.0891,  0.0212,  0.2024,  ...,  0.1454,  0.1599,  1.0000]])


In [22]:
import faiss

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

def retrieve(query, k=2):
    distances, indices = index.search(embeddings, k)
    return [train_convos[i] for i in indices[0]]

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [19]:
def generate_response(query):
    retrieved_docs = retrieve(query)
    context = "\n".join(retrieved_docs)

    prompt = f"""
    You are an assistant trying to utilize slang.
    Use only the context below to generate slang usage.

    Context:
    {context}

    Question:
    {query}

    Answer:
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, temperature=0.1,max_length=50, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [23]:
answer = generate_response("Generate a usage for a slang word that you came across in the training data.")

: 