# Word Embeddings

Created by Owen Fava

In [1]:
import nltk
import pandas as pd
from gensim import utils
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Make sure NLTK's "punkt" tokenizer data is available locally; word_tokenize
# (imported above) relies on it. The call is a no-op when it is already installed.
nltk.download("punkt")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adamd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the graph dataset from CSV, then show its first five rows and row count.
dataset = pd.read_csv("./data/GraphData_test.csv")
print(dataset.head())
print(dataset.shape[0])

  Subreddit     Word1 Dependency     Word2  \
0   Anxiety   knowing       dobj       day   
1       BPD   boosted       dobj  finances   
2   Anxiety    taking       dobj     carbs   
3   Anxiety  universe      nsubj     death   
4       BPD     heard      nsubj      help   

                                            MHlabels  \
0  {'SYMPTOMS': ['panic attack', 'anxiety'], 'SLE...   
1  {'ANXIETY DISORDERS': ['anxiety'], 'DEPRESSIVE...   
2  {'SYMPTOMS': ['anxiety'], 'ANXIETY DISORDERS':...   
3                                                 {}   
4                            {'SYMPTOMS': ['anger']}   

                                     title  \
0       worried sleeping holiday away home   
1                               lost ether   
2     thought diet crucial dealing anxiety   
3  stop thinking existentialism death time   
4                         grief motherload   

                                            selfText  
0  15 typing night holiday 'staycation' hours awa...  

In [3]:
def combine_data_columns(
    data: pd.DataFrame, columns: list[str], new_column_name: str
) -> pd.DataFrame:
    """Stack several columns of ``data`` end-to-end into a single-column frame.

    The values of each column listed in ``columns`` are concatenated in the
    order given (all of the first column, then all of the second, ...), so the
    result has ``len(data) * len(columns)`` rows. Missing values are kept.

    Args:
        data: Source frame to read the columns from.
        columns: Names of the columns to stack, in output order.
        new_column_name: Name of the only column in the returned frame.

    Returns:
        A new DataFrame with one column, ``new_column_name``, and a fresh
        0..n-1 index.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``data``.
        ValueError: If ``columns`` is empty (nothing to concatenate).
    """
    # ignore_index=True discards the per-column row labels so the stacked
    # values get one continuous index instead of repeated labels.
    stacked = pd.concat([data[column] for column in columns], ignore_index=True)

    return pd.DataFrame({new_column_name: stacked})

# Stack the post titles and post bodies into one text column, then preview it.
data = combine_data_columns(
    data=dataset,
    columns=["title", "selfText"],
    new_column_name="combined_title_selftext",
)
print(data.head())
print(data.shape[0])

                   combined_title_selftext
0       worried sleeping holiday away home
1                               lost ether
2     thought diet crucial dealing anxiety
3  stop thinking existentialism death time
4                         grief motherload
1318


In [4]:
def keep_alphabetical_words(data):
    """Return ``data`` with every non-alphabetic token removed.

    The text is split on whitespace and only tokens made up entirely of
    letters (``str.isalpha``) are kept, re-joined with single spaces. Tokens
    containing digits or punctuation are dropped whole.

    Args:
        data: A single cell value, normally a string. Anything that is not a
            string (NaN, None, numbers, ...) is treated as having no text.

    Returns:
        The cleaned text, or '' when nothing alphabetic remains.
    """
    # Non-string cells (missing values, numeric cells from the CSV) cannot be
    # split and contain no alphabetic words, so they clean to the empty string.
    if not isinstance(data, str):
        return ''

    return ' '.join(word for word in data.split() if word.isalpha())

# Strip every non-alphabetic token from each text entry.
data["combined_title_selftext"] = data["combined_title_selftext"].apply(
    keep_alphabetical_words
)

# Discard entries left empty by the cleaning step and renumber the rows.
data = data.loc[data["combined_title_selftext"].str.len().gt(0)].reset_index(drop=True)

print(data.head())
print(data.shape[0])

                   combined_title_selftext
0       worried sleeping holiday away home
1                               lost ether
2     thought diet crucial dealing anxiety
3  stop thinking existentialism death time
4                         grief motherload
1308


In [5]:
def generate_ngrams(sentences, n):
    """Collect the n-grams of every string in ``sentences`` into one flat list.

    Each string is lower-cased and tokenised with ``word_tokenize`` before its
    n-grams are taken; entries that are not strings are skipped. N-grams never
    span two sentences.

    Args:
        sentences: Iterable of sentences (non-string items are ignored).
        n: Size of each n-gram.

    Returns:
        A list of token tuples, in sentence order.
    """
    return [
        gram
        for text in sentences
        if isinstance(text, str)
        for gram in ngrams(word_tokenize(text.lower()), n)
    ]

# Build bigrams (n=2) over the cleaned text and preview the first ten.
n_grams_data = generate_ngrams(data["combined_title_selftext"], 2)
print("First 10 n-grams:", n_grams_data[:10])

First 10 n-grams: [('worried', 'sleeping'), ('sleeping', 'holiday'), ('holiday', 'away'), ('away', 'home'), ('lost', 'ether'), ('thought', 'diet'), ('diet', 'crucial'), ('crucial', 'dealing'), ('dealing', 'anxiety'), ('stop', 'thinking')]


Reference for Word2Vec (gensim tutorial): https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

In [6]:
# Tokenise each cleaned document into a list of lower-case tokens; Word2Vec
# expects its training corpus as an iterable of token lists.
# Iterating the column directly avoids the per-row Series that iterrows() builds.
model_data = [
    utils.simple_preprocess(text) for text in data["combined_title_selftext"]
]

print(data.head(10))
print(model_data[:10])

# window=2: context is at most two tokens either side of the target word.
# min_count=3: words seen fewer than three times are left out of the vocabulary.
model = Word2Vec(sentences=model_data, window=2, min_count=3)

# NOTE: Word2Vec.save writes gensim's own model format (reload it with
# Word2Vec.load), not the original word2vec C binary format, despite the
# ".bin" extension.
model.save("./data/word2vec_embeddings.bin")

                             combined_title_selftext
0                 worried sleeping holiday away home
1                                         lost ether
2               thought diet crucial dealing anxiety
3            stop thinking existentialism death time
4                                   grief motherload
5  friend coworker quitting ecstatic terrible person
6                       feel frustrated idealization
7            family ruined life know fix point think
8                            stop frowning gibberish
9  thoughts accepting reality living modern socie...
[['worried', 'sleeping', 'holiday', 'away', 'home'], ['lost', 'ether'], ['thought', 'diet', 'crucial', 'dealing', 'anxiety'], ['stop', 'thinking', 'existentialism', 'death', 'time'], ['grief', 'motherload'], ['friend', 'coworker', 'quitting', 'ecstatic', 'terrible', 'person'], ['feel', 'frustrated', 'idealization'], ['family', 'ruined', 'life', 'know', 'fix', 'point', 'think'], ['stop', 'frowning', 'gibberish'], [



In [8]:
# Accessing the vocabulary.
# gensim >= 4.0 removed KeyedVectors.vocab (accessing it raises AttributeError);
# the word -> index mapping now lives in key_to_index. Fall back to .vocab so
# the cell still runs on gensim 3.x.
try:
    vocabulary = model.wv.key_to_index
except AttributeError:
    vocabulary = model.wv.vocab
print(f"Words in Vocabulary: {len(vocabulary)}")

# Print each word within the vocabulary
# print("\nVocabulary:")
# for word in vocabulary:
#     print(word)

# Get the word vector for a specific word
word_to_find = "anxiety"
word_vector = model.wv[word_to_find]
print(f"\nVector for {word_to_find}: ", word_vector)

# Nearest neighbours of the word in the embedding space, as returned by
# most_similar with its default settings.
similar_words = model.wv.most_similar(word_to_find)
print("Similar words: ", similar_words)

Words in Vocabulary: 4108

Vector for anxiety:  [ 0.91743124 -0.15346275  0.5612611   0.12110069 -0.36633033 -0.027039
  0.766288   -0.01436418  0.5586183  -0.30250457 -0.5072509  -0.10233949
 -0.05358497 -0.19796601  0.11952449 -0.05286039 -0.81796783  0.06031595
  0.993565    0.32394123  0.01161638 -0.43665498 -0.19606447 -0.5011399
 -0.28716308 -0.33631784  0.30174196 -0.48437116  0.00257769 -0.13986856
  0.13831638  0.5599948   0.02161547 -0.65895987  0.29117948 -0.09961222
  0.52416235  0.47608513  0.13760327  0.10166731  0.04950099 -0.2869896
  0.04365897 -0.41008204 -0.55863047  0.34194645  0.09776545 -0.40645927
  0.37375703 -0.41520548 -0.2868897   0.6229352   0.18483339 -0.68089205
  0.180556   -0.37897637 -0.4665601   0.16920866 -0.41236973 -0.32825047
 -0.37534845 -0.26796955  0.19134629  0.41265154  0.06596199 -0.13089637
 -0.33334473 -0.06814519  0.25660405 -0.17864305  0.2386284  -0.13914566
 -0.09507644 -0.37319258  0.28276768 -0.24656092 -0.10530248 -0.2839538
 -0.4912