In [1]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [2]:
DATA_DIR = os.path.join(os.pardir, '175Project')

COL_NAMES = ['character', 'browsing_page_url', 'word_url', 'word', 'definition', 'sentence']

def load_urban_dataset():
    file_paths = []
    for root, dirs, files in os.walk(os.path.join(DATA_DIR, 'Urban')):
        for f in files:
            if f.endswith('.csv') and f.startswith('urban_data'):
                file_paths.append(os.path.join(root, f))
    df_urban = pd.concat([pd.read_csv(f, names=COL_NAMES) for f in file_paths])

    df_nulls = df_urban[(df_urban.isnull().any(axis=1)) | (df_urban.isna().any(axis=1))]
    df_urban = df_urban.drop(df_nulls.index)

    return df_urban

In [3]:
urban_dictionary = load_urban_dataset()
print(f"Shape of urban dictionary dataset: {urban_dictionary.shape}")
ud_sample = urban_dictionary[['word', 'definition', 'sentence']].sample(1)
for i in ud_sample.values:
    print("Word: ", i[0])
    print("Meaning: ", i[1])
    print("Sentence: ", i[2])

Shape of urban dictionary dataset: (2175494, 6)
Word:  Date definitions
 At least they are  not sex  definitionsng with  name definitions  and they are also  plagued  all over the dictionary
 October 18 :u either slap someone ass or kiss some and ask to get love or all And if you go vvote then they are almost everything is  name definitions  and the thing that I am talking about


In [4]:
urban_data = urban_dictionary[['word', 'definition', 'sentence']]
train_u, test_u = train_test_split(urban_data, test_size=0.2, random_state=42, shuffle=True)
#example of what the data looks like
row = train_u.iloc[0]
print(row)
print()
print("The full item")
print()
print(row.values)

word                                                     Adeogo
definition    A beautiful tall black girl who's future job i...
sentence      Adeogo has been  playing the violin   for 7   ...
Name: 17480, dtype: object

The full item

['Adeogo'
 "A beautiful tall black girl who's future job is a fashion model for high and popular brands, like Gucci, Dior,  MCM , Louis Vuitton,  Balenciaga  and more. Her horoscope sign is cancer. She could  play the violin  really well. She could also be annoying sometimes, but she's smart and always loves to watch movies and is always kind and loves to spend time with family and friends."
 'Adeogo has been  playing the violin   for 7   years  since she was 6.']


In [5]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))
corpus.print_summary_stats()

Downloading movie-corpus to C:\Users\brock\.convokit\saved-corpora\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [6]:
corpus.print_summary_stats()

Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [7]:
conversations_raw = []

# Iterate over all conversation IDs
for convo_id in corpus.get_conversation_ids():
    convo = corpus.get_conversation(convo_id)
    # Extract the textual content of each utterance in the conversation
    convo_text = [utt.text for utt in convo.iter_utterances()]
    conversations_raw.append(convo_text)

print(f"Total conversations: {len(conversations_raw)}")
print("Example conversation:", conversations_raw[0])

Total conversations: 83097
Example conversation: ['They do not!', 'They do to!']


In [8]:
pairs = []
for convo in conversations_raw:
    for i in range(len(convo)-1):
        pairs.append((convo[i], convo[i+1]))

print(f"Total pairs: {len(pairs)}")
print("Sample pair:", pairs[0])

Total pairs: 221616
Sample pair: ('They do not!', 'They do to!')


In [9]:
train_convos, test_convos = train_test_split(conversations_raw, test_size=0.2, random_state=42)

In [10]:
all_texts = [utt.text for utt in corpus.iter_utterances()]

print(f"Total utterances: {len(all_texts)}")
print("Example utterances:", all_texts[:5])

Total utterances: 304713
Example utterances: ['They do not!', 'They do to!', 'I hope so.', 'She okay?', "Let's go."]


In [11]:
train_texts, test_texts = train_test_split(all_texts, test_size=0.2, random_state=42)

In [12]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(train_convos, show_progress_bar=True)
print(f"Shape of embeddings: {embeddings.shape}")

Skipping import of cpp extensions due to incompatible torch version 2.10.0+cpu for torchao version 0.15.0             Please see https://github.com/pytorch/ao/issues/2919 for more info
W0212 22:14:59.678000 16444 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.





modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2078 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [13]:
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[ 1.0000,  0.1206,  0.0736,  ...,  0.1332,  0.0051,  0.0891],
        [ 0.1206,  1.0000,  0.1462,  ..., -0.0054, -0.0103,  0.0212],
        [ 0.0736,  0.1462,  1.0000,  ...,  0.1846,  0.0885,  0.2024],
        ...,
        [ 0.1332, -0.0054,  0.1846,  ...,  1.0000,  0.1002,  0.1454],
        [ 0.0051, -0.0103,  0.0885,  ...,  0.1002,  1.0000,  0.1599],
        [ 0.0891,  0.0212,  0.2024,  ...,  0.1454,  0.1599,  1.0000]])


In [22]:
import faiss

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

def retrieve(query, k=2):
    distances, indices = index.search(embeddings, k)
    return [train_convos[i] for i in indices[0]]

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [19]:
def generate_response(query):
    retrieved_docs = retrieve(query)
    context = "\n".join(retrieved_docs)

    prompt = f"""
    You are an assistant trying to utilize slang.
    Use only the context below to generate slang usage.

    Context:
    {context}

    Question:
    {query}

    Answer:
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, temperature=0.1,max_length=50, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [13]:
#answer = generate_response("Generate a usage for a slang word that you came across in the training data.")

In [None]:
#I stopped the embeddings on my computer becuase it was throttling too much
#Heres the start of the validation stuff

Semantic Fluency - POS Tagging, N-gram analysis, Frequency Distribution

In [86]:
import nltk
from nltk import bigrams
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.util import ngrams
from nltk.probability import FreqDist, ConditionalFreqDist, LaplaceProbDist

In [87]:
# Run once if not already downloaded
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

In [88]:
#example usage, to be edited once we can generate expressions
text = "That was a piece of cake!"

tokens = word_tokenize(text)
tagged = pos_tag(tokens)
#POS sequences tell you whether the sentence is syntactically plausible.
#"The cat sat on the mat" → POS tags: DT NN VBD IN DT NN → typical sequence.
print(tagged)

[('That', 'DT'), ('was', 'VBD'), ('a', 'DT'), ('piece', 'NN'), ('of', 'IN'), ('cake', 'NN'), ('!', '.')]


In [89]:
#bigram frequencies give you insight into word co-occurrence:
#Frequent bigrams (the cat, sat on) → likely fluent.
bigrams = list(ngrams(tokens, 2))
print(bigrams)

[('That', 'was'), ('was', 'a'), ('a', 'piece'), ('piece', 'of'), ('of', 'cake'), ('cake', '!')]


In [90]:
bigram_freq = FreqDist(bigrams)
print(bigram_freq.most_common(5))

[(('That', 'was'), 1), (('was', 'a'), 1), (('a', 'piece'), 1), (('piece', 'of'), 1), (('of', 'cake'), 1)]


In [80]:
#lets work with the movie conversations for sentence structure
tokenized_texts = [word_tokenize(text.lower()) for text in all_texts]


In [93]:
#calculates what we would call the average bigram frequency or coverage score
all_tokens = [token for sublist in tokenized_texts for token in sublist]

# Corpus bigrams
corpus_bigrams = list(ngrams(all_tokens, 2))
corpus_bigram_freq = FreqDist(corpus_bigrams)

expression = "That was a piece of cake!"
expr_tokens = word_tokenize(expression.lower())
expr_bigrams = list(ngrams(expr_tokens, 2))

present_count = sum(1 for bg in expr_bigrams if bg in corpus_bigram_freq)
proportion_present = present_count / len(expr_bigrams)

print("Proportion of bigrams present in corpus:", proportion_present)

Proportion of bigrams present in corpus: 1.0


In [96]:
expression = "de org bah banana blitz monkey ball Hello there"
expr_tokens = word_tokenize(expression.lower())
expr_bigrams = list(ngrams(expr_tokens, 2))

present_count = sum(1 for bg in expr_bigrams if bg in corpus_bigram_freq)
proportion_present = present_count / len(expr_bigrams)

print("Proportion of bigrams present in corpus:", proportion_present)

Proportion of bigrams present in corpus: 0.125
