Objective: train a skip-gram model that converts words into vectors. Once trained, we can give it a word and it returns the most similar words based on their vector representations.

In [36]:
#importing libraries
import pandas as pd
import re

In [85]:
#function to clean text
def preprocess_text(text):
    """Normalise raw text and split it into words.

    The text is lower-cased, every character that is not a letter a-z or
    whitespace is deleted, and the remainder is split on whitespace.

    Args:
        text: The string to clean.

    Returns:
        A list of the remaining lower-case words (empty if nothing is left).
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return letters_only.split()

In [102]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Dense
import numpy as np

# --- 1. Configuration and corpus loading
# File locations and hyper-parameters used by the cells below.
params = dict(
    input_file='mental_health_data.csv',
    output_file='embeddings.txt',
    window_size=1,
    embedding_dim=100,
    epochs=50,
)

# Read the CSV and merge every non-null `selftext` entry into one long string.
df = pd.read_csv(params['input_file'])
posts = df['selftext'].dropna().astype(str)
text = ' '.join(posts)

# --- 2. Tokenisation
# Build the vocabulary from the merged corpus, then encode it as one flat
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word2idx = tokenizer.word_index
# +1 because Keras Tokenizer ids start at 1 (id 0 is reserved).
vocab_size = len(word2idx) + 1
(sequences,) = tokenizer.texts_to_sequences([text])


In [103]:
import random

# Seed the sampling so the generated (target, context) pairs and their labels
# are the same on every run. The `seed` argument controls skipgrams' shuffle;
# the negative samples are presumably drawn from Python's global `random`
# module, so that is seeded too — TODO confirm against the installed Keras.
SKIPGRAM_SEED = 42
random.seed(SKIPGRAM_SEED)

# Positive pairs come from words within `window_size` of each other; skipgrams
# also adds randomly sampled negative pairs (label 0) by default.
pairs, labels = skipgrams(
    sequences,
    vocabulary_size=vocab_size,
    window_size=params['window_size'],
    seed=SKIPGRAM_SEED,
)

# Fail with a clear message instead of a cryptic unpacking error from zip().
if not pairs:
    raise ValueError("skipgrams produced no pairs; the corpus is empty or too short")

# Split the list of [target, context] pairs into two parallel tuples.
target_words, context_words = zip(*pairs)

In [104]:
# Each training example is a pair of integer word ids: a target and a context.
input_target = Input(shape=(1,))
input_context = Input(shape=(1,))

# One embedding table is shared by the target and the context input.
# The former `input_length=1` argument was dropped: it is deprecated in Keras 3
# (it only triggers a warning) and the sequence length is already fixed by the
# Input shapes above.
embedding = Embedding(vocab_size, params['embedding_dim'], name='embedding_layer')

target_embed = embedding(input_target)
context_embed = embedding(input_context)

# Dot product of the two embedding vectors, flattened to one score per pair.
dot_product = Dot(axes=-1)([target_embed, context_embed])
dot_product = Reshape((1,))(dot_product)

# A single sigmoid unit maps the score to the probability that the pair is a
# real (label 1) rather than a negative-sampled (label 0) pair.
output = Dense(1, activation='sigmoid')(dot_product)

model = Model([input_target, input_context], output)
model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()




In [105]:
# Train on the (target, context) id pairs against their 0/1 labels.
model.fit(
    x=[np.asarray(target_words), np.asarray(context_words)],
    y=np.asarray(labels),
    epochs=params['epochs'],
    verbose=1,
)


Epoch 1/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4ms/step - loss: 0.6329
Epoch 2/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 5ms/step - loss: 0.3638
Epoch 3/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 5ms/step - loss: 0.2012
Epoch 4/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 5ms/step - loss: 0.1489
Epoch 5/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 5ms/step - loss: 0.1274
Epoch 6/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 5ms/step - loss: 0.1186
Epoch 7/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 5ms/step - loss: 0.1130
Epoch 8/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 5ms/step - loss: 0.1058
Epoch 9/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - loss: 0.1045
Epoch 10/50
[1m4797/4797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x1a3d95f0ed0>

In [106]:
# Pull the trained embedding matrix (one row per word id) out of the model.
weights = model.get_layer('embedding_layer').get_weights()[0]

# Write one line per vocabulary word: the word followed by its vector
# components, all separated by single spaces.
with open(params['output_file'], 'w', encoding='utf-8') as f:
    for word, i in word2idx.items():
        components = [str(value) for value in weights[i]]
        f.write(word + ' ' + ' '.join(components) + '\n')


In [109]:
# First run this block once
import numpy as np

# Load embeddings
def load_embeddings(file):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line must hold a word followed by its vector components,
    e.g. ``anxiety 0.12 -0.53 ...``. Blank or whitespace-only lines are
    skipped instead of crashing the loader.

    Args:
        file: Path of the embeddings file (read as UTF-8).

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.

    Raises:
        ValueError: If a line contains a component that is not a number; the
            message names the file and line number.
    """
    embs = {}
    with open(file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word, components = parts[0], parts[1:]
            try:
                embs[word] = np.array(components, dtype='float32')
            except ValueError as exc:
                raise ValueError(
                    f"{file}:{line_no}: could not parse vector for '{word}'"
                ) from exc
    return embs

# Read the saved vectors back into a module-level lookup table.
embeddings = load_embeddings(file='embeddings.txt')

# Cosine similarity
def get_similar(word, top_n=5, embs=None):
    """Return the words whose vectors are most cosine-similar to `word`.

    Args:
        word: The query word.
        top_n: Maximum number of neighbours to return.
        embs: Optional dict mapping words to 1-D vectors. Defaults to the
            module-level ``embeddings`` table.

    Returns:
        A list of ``(other_word, similarity)`` tuples sorted from most to
        least similar, or the string ``"'<word>' not found in vocab"`` when
        the query word is missing. Words with an all-zero vector have no
        defined cosine similarity and are left out; an all-zero query vector
        yields an empty list.
    """
    if embs is None:
        embs = embeddings
    if word not in embs:
        return f"'{word}' not found in vocab"

    w_vec = embs[word]
    # The query norm is the same for every candidate, so compute it once.
    w_norm = np.linalg.norm(w_vec)
    if w_norm == 0:
        return []

    sims = {}
    for other, vec in embs.items():
        if other == word:
            continue
        norm = np.linalg.norm(vec)
        if norm == 0:
            # Skipping avoids a division by zero that would put NaN in the
            # scores and make the sort order meaningless.
            continue
        sims[other] = np.dot(w_vec, vec) / (w_norm * norm)
    return sorted(sims.items(), key=lambda x: x[1], reverse=True)[:top_n]

In [110]:
# Five nearest neighbours of "anxiety" by cosine similarity.
get_similar("anxiety", top_n=5)

[('done', 0.37784165),
 ('do', 0.3407945),
 ('friends', 0.34033915),
 ('alcohol', 0.3311222),
 ('faze', 0.31826755)]

In [112]:
# Five nearest neighbours of "mental" by cosine similarity.
get_similar("mental", top_n=5)

[('beer', 0.4320064),
 ('age', 0.42814207),
 ('monthly', 0.42456794),
 ('isolation', 0.4041774),
 ('girlfriend', 0.39877856)]

In [117]:
# Five nearest neighbours of "live" by cosine similarity.
get_similar("live", top_n=5)

[('noticed', 0.45851675),
 ('blood', 0.45535794),
 ('call', 0.4541829),
 ('friend', 0.44422624),
 ('isolated', 0.43215582)]

In [118]:
# Five nearest neighbours of "death" by cosine similarity.
get_similar("death", top_n=5)

[('nicotine', 0.47712353),
 ('bit', 0.42128196),
 ('feet', 0.41915596),
 ('lot', 0.38061348),
 ('sound', 0.38026083)]