# Word Embeddings

Created by Owen Fava

In [None]:
import nltk
import pandas as pd
from gensim import utils
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# word_tokenize relies on the Punkt tokenizer data. NLTK 3.9 and later load it
# from the "punkt_tab" package, while older releases use the pickled "punkt"
# package, so fetch both to keep the notebook working across NLTK versions.
nltk.download("punkt")
nltk.download("punkt_tab")

In [None]:
# Read the sampled CSV into a DataFrame, then show its first five rows
# and its total number of rows.
dataset = pd.read_csv("data/GraphData_sampled_new.csv")
print(dataset.head())
print(dataset.shape[0])

In [None]:
def combine_data_columns(
    data: pd.DataFrame, columns: list[str], new_column_name: str
) -> pd.DataFrame:
    """Stack several columns of ``data`` into one single-column DataFrame.

    The selected columns are concatenated end to end (all values of the first
    column, then all values of the second, and so on), so the result has
    ``len(data) * len(columns)`` rows. Missing values are kept as-is.

    Args:
        data: Source DataFrame containing every column named in ``columns``.
        columns: Names of the columns to stack, in the order they should appear.
        new_column_name: Name of the single column in the returned DataFrame.

    Returns:
        A new DataFrame with one column, ``new_column_name``, and a fresh
        0..n-1 index.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``data``.
        ValueError: If ``columns`` is empty (pandas has nothing to concatenate).
    """
    # ignore_index=True discards the original row labels, which would
    # otherwise repeat once per stacked column.
    stacked_values = pd.concat(
        [data[column] for column in columns], ignore_index=True
    )

    return pd.DataFrame({new_column_name: stacked_values})

# Stack the "title" and "selfText" columns into a single text column,
# then preview the first five rows and the resulting row count.
data = combine_data_columns(
    data=dataset,
    columns=["title", "selfText"],
    new_column_name="combined_title_selftext",
)
print(data.head())
print(data.shape[0])

In [None]:
def keep_alphabetical_words(data):
    """Return ``data`` with only its purely alphabetic words kept.

    The text is split on whitespace and every word containing a non-letter
    character (digits, punctuation, symbols) is dropped entirely; the
    remaining words are re-joined with single spaces.

    Args:
        data: A cell value from the text column. Expected to be a string, but
            may be NaN/None or another non-text value.

    Returns:
        The filtered text, or an empty string when ``data`` is not a string
        or contains no purely alphabetic words.
    """
    # Missing cells (NaN/None) and any other non-text value (e.g. a number)
    # have no words to keep; returning '' avoids calling .split() on them.
    if not isinstance(data, str):
        return ''

    return ' '.join(word for word in data.split() if word.isalpha())

# Clean every entry so that only its purely alphabetic words remain.
cleaned_text = data["combined_title_selftext"].apply(keep_alphabetical_words)
data["combined_title_selftext"] = cleaned_text

# Drop the rows whose text is now empty and renumber the remaining rows from 0.
has_text = data["combined_title_selftext"].str.len() > 0
data = data[has_text].reset_index(drop=True)

print(data.head())
print(data.shape[0])

In [None]:
def generate_ngrams(sentences, n):
    """Collect the word n-grams of every string in ``sentences``.

    Each string is lower-cased and tokenized with NLTK's ``word_tokenize``;
    its n-grams (tuples of ``n`` consecutive tokens) are appended to one flat
    list. N-grams never span two entries, and non-string entries are skipped.

    Args:
        sentences: Iterable of texts; items that are not ``str`` are ignored.
        n: Number of tokens per n-gram.

    Returns:
        A list of token tuples, in the order the texts were given.
    """
    collected = []
    for text in sentences:
        # Skip anything that is not text (e.g. NaN cells).
        if not isinstance(text, str):
            continue
        words = word_tokenize(text.lower())
        collected.extend(ngrams(words, n))
    return collected

# Build bigrams (n=2) from the cleaned text column and show the first ten.
n_grams_data = generate_ngrams(data["combined_title_selftext"], 2)
print("First 10 n-grams:", n_grams_data[:10])

Word2Vec reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

In [None]:
# Tokenize the text in the first column of `data` with gensim's
# simple_preprocess, producing one token list per row.
model_data = [utils.simple_preprocess(text) for text in data.iloc[:, 0]]

print(data.head(10))
print(model_data[:10])

# Train 100-dimensional embeddings with a context window of 2 words,
# ignoring words that occur fewer than 3 times, then save the model to disk.
model = Word2Vec(
    sentences=model_data,
    vector_size=100,
    window=2,
    min_count=3,
)
model.save("word2vec_embeddings.bin")

In [None]:
# Accessing the vocabulary
vocabulary = model.wv.key_to_index
print(f"Words in Vocabulary: {len(vocabulary)}")

# Print each word within the vocabulary
# print("\nVocabulary:")
# for word in vocabulary:
#     print(word)

# Look up the vector and the most similar words for a specific word.
# Words that occurred fewer than min_count times during training are not in
# the vocabulary, and indexing model.wv with such a word raises KeyError,
# so check membership first.
word_to_find = "anxiety"
if word_to_find in vocabulary:
    word_vector = model.wv[word_to_find]
    print(f"\nVector for {word_to_find}: ", word_vector)

    similar_words = model.wv.most_similar(word_to_find)
    print("Similar words: ", similar_words)
else:
    print(f"\n'{word_to_find}' is not in the vocabulary.")