## this notebook will accomplish the following:
- generate word embeddings from hate-speech-data-cleaned.csv
- see words similar in context
- save word embeddings in separate file for later use in training a sentiment classifier

## import libraries

In [609]:
import gensim
from gensim.models import Word2Vec
from nltk.lm import Vocabulary

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from utilities.data_preprocessors import read_preprocess, series_to_1D_array
from utilities.data_visualizers import view_words, view_word_frequency

import json
import ast

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## loading data
recall that the comments have designated labels 2, 1, and 0, representing derogatory, non-derogatory, and offensive comments respectively

In [610]:
# Read the cleaned dataset; the first CSV column is used as the row index.
raw_comments = pd.read_csv('./data/hate-speech-data-cleaned.csv', index_col=0)

# read_preprocess is a project helper (utilities.data_preprocessors). In the
# frame displayed below, each 'comment' entry is a list of tokens.
df = read_preprocess(raw_comments)
df

Unnamed: 0,comment,label
0,"[woman, complain, cleaning, house, man, always...",1
1,"[boy, dat, coldtyga, dwn, bad, cuffin, dat, ho...",0
2,"[dawg, ever, fuck, bitch, start, cry, confused...",0
3,"[look, like, tranny]",0
4,"[shit, hear, might, true, might, faker, bitch,...",0
...,...,...
65775,"[from, the, midnight, sun, where, the, hot, sp...",1
65776,"[do, not, say, am, not, your, type]",1
65777,"[and, therefor, never, send, to, know, for, wh...",1
65778,"[and, cannot, stand, anoth, day]",1


In [611]:
# Features: the 'comment' column, one tokenized comment per row.
X = df['comment']

# Targets: the 'label' column as a 2-D column vector of shape (m, 1),
# where m is the number of comments.
labels = df['label']
Y = labels.to_numpy().reshape(labels.shape[0], -1)

In [612]:
# Confirm the Python type held by a single entry of the comment column;
# Word2Vec is fed X directly, so each entry should be a list of tokens.
type(X.iloc[0])

list

In [613]:
# Show the first comment's tokens. .iloc selects by position, so this works
# regardless of the index labels (X[0] would look up the label 0 instead).
X.iloc[0]

['woman', 'complain', 'cleaning', 'house', 'man', 'always', 'take', 'trash']

## model architecture and initialization
- here `window` is the maximum distance between a target word and the context words around it, `min_count` is the minimum number of times a word must occur in the corpus to be kept in the vocabulary (words that occur fewer times are ignored), `vector_size` is the number of features each resulting word embedding will have, and `workers` is the number of threads used to train the model

In [626]:
# Hyperparameters for the embedding model, gathered in one place.
w2v_params = dict(
    vector_size=300,  # number of features per word embedding
    window=5,         # context window size
    min_count=2,      # drop words that occur fewer than 2 times in the corpus
    workers=4,        # training threads
)

# No corpus is passed to the constructor, so the vocabulary is built
# explicitly here and training happens in a separate step.
model = Word2Vec(**w2v_params)
model.build_vocab(X, progress_per=1000)

# corpus_count is reused as total_examples when training.
print(model.corpus_count)

#### train model

In [627]:
# Train on the tokenized comments for 50 passes over the corpus.
# total_examples is the number of comments counted by build_vocab.
# The returned tuple of word counts is displayed as the cell output.
model.train(X, total_examples=model.corpus_count, epochs=50)

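`model.train` returns two numbers. The first (39226987) is the effective number of words the model was actually trained on, i.e. after ignoring words that are not in the vocabulary (for example those dropped by `min_count`). The second (44743900) is the total number of raw words fed to the model, summed over all 50 epochs.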

## save words and their respective vectors to dictionary then .json file

In [628]:
# model.wv holds the trained word vectors.
keyed_vectors = model.wv

# vocab maps each word to its integer index; vectors is the embedding array.
vocab = keyed_vectors.key_to_index
vectors = keyed_vectors.vectors
vocab

In [629]:
# Dimensions of the embedding array; expected to be
# (number of vocabulary words, 300) given vector_size=300.
vectors.shape

In [630]:
# Build a plain {word: embedding} dict that json can serialize.
# Each word is paired with the row at its own index from key_to_index, rather
# than assuming the dict's iteration order matches the row order of `vectors`.
# .tolist() converts each numpy row into a list of Python floats.
word_vec = {word: vectors[index].tolist() for word, index in vocab.items()}

In [631]:
# Number of words in the mapping; should equal the vocabulary size.
len(word_vec)

In [632]:
# Persist the {word: vector} mapping as JSON so the embeddings can be loaded
# later (e.g. when training the classifier) without retraining Word2Vec.
with open('./data/word_vec.json', 'w') as json_file:
    json.dump(word_vec, json_file)

## exploratory data analysis for the generated embeddings

In [633]:
# Look up the learned embedding for a single vocabulary word (here 'fuck').
# The result is one vector with vector_size (300) features.
model.wv['fuck']

In [634]:
# Show the 10 vocabulary words whose embeddings are most similar
# (by cosine similarity) to the given word, in this case 'fuck'.
model.wv.most_similar(positive=['fuck'], topn=10)

In [635]:
# Passing the word as `negative` flips its contribution, so this returns the
# 10 words whose embeddings are closest to the negated vector of 'fuck',
# i.e. the words least similar to it in embedding space.
# NOTE: these are not necessarily antonyms, only the most dissimilar contexts.
model.wv.most_similar(negative=['fuck'], topn=10)

#### the cosine similarity of a word with itself is always 1, and for words that are used in almost the same contexts it approaches 1

In [636]:
# Cosine similarity between two words' embeddings, a measure of how closely
# related the words are. Comparing a word with itself is a sanity check:
# the result should be 1 (up to floating-point rounding).
model.wv.similarity('fuck', 'fuck')

In [637]:
# Similarity between two different words from the vocabulary.
model.wv.similarity('fuck', 'faggot')

In [638]:
# Similarity between a word and one of its inflected forms.
model.wv.similarity('fuck', 'fucking')

In [639]:
# Preview only a handful of entries, truncated to their first 5 features.
# Displaying the whole dict would print every vocabulary word with all 300
# floats and bloat the saved notebook.
{word: vector[:5] for word, vector in list(word_vec.items())[:5]}

In [640]:
# view_words is a project helper from utilities.data_visualizers.
# NOTE(review): presumably visualizes the embeddings of the first 500 words;
# confirm what word_range controls against the helper's definition.
view_words(word_vec, word_range=500)