## this notebook will accomplish the ff:
- generate word embeddings from hate-speech-data-cleaned.csv
- see words similar in context
- save word embeddings in separate file for later use in training a sentiment classifier

## import libraries

In [50]:
import gensim
from gensim.models import Word2Vec

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from utilities.data_preprocessors import read_preprocess

import json

nltk.download('stopwords')
nltk.download('wordnet')

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mig\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [51]:
df = pd.read_csv('./data/hate-speech-data-cleaned.csv', index_col=0)
df

Unnamed: 0,comment,label
0,"['woman', 'complain', 'cleaning', 'house', 'ma...",1
1,"['boy', 'dat', 'coldtyga', 'dwn', 'bad', 'cuff...",0
2,"['dawg', 'ever', 'fuck', 'bitch', 'start', 'cr...",0
3,"['look', 'like', 'tranny']",0
4,"['shit', 'hear', 'might', 'true', 'might', 'fa...",0
...,...,...
65775,"['from', 'the', 'midnight', 'sun', 'where', 't...",1
65776,"['do', 'not', 'say', 'am', 'not', 'your', 'type']",1
65777,"['and', 'therefor', 'never', 'send', 'to', 'kn...",1
65778,"['and', 'cannot', 'stand', 'anoth', 'day']",1


In [52]:
X, Y = read_preprocess(df)
X.dtype

dtype('O')

In [53]:
type(X.iloc[0])

list

In [54]:
X.iloc[0]

['woman', 'complain', 'cleaning', 'house', 'man', 'always', 'take', 'trash']

## model architecture and initialization
- here the window or window size is the amount of words to use as context and target are indicated, as well as the min_count which indicates if a word length lower than its value is still to be considered part of the window, vector_size is the number of features each resulting word embedding would have, and workers which represent the number of threads to use in training the model

In [55]:
model = Word2Vec(window=5, min_count=2, vector_size=300, workers=4)
model.build_vocab(X, progress_per=1000)
print(model.corpus_count)

65780


#### train model

In [56]:
model.train(X, total_examples=model.corpus_count, epochs=50)

(39226462, 44743900)

The first number (39226987) indicates the total number of processed words during training, while the second number (44743900) represents the total number of words in the corpus that the model has seen so far.

## save words and their respective vectors to dictionary then .json file

In [57]:
vocab, vectors = model.wv.key_to_index, model.wv.vectors
vocab

{'nigger': 0,
 'faggot': 1,
 'bitch': 2,
 'tranny': 3,
 'like': 4,
 'people': 5,
 'would': 6,
 'get': 7,
 'word': 8,
 'say': 9,
 'one': 10,
 'fuck': 11,
 'hoe': 12,
 'black': 13,
 'shit': 14,
 'know': 15,
 'think': 16,
 'fucking': 17,
 'call': 18,
 'time': 19,
 'guy': 20,
 'make': 21,
 'white': 22,
 'got': 23,
 'gay': 24,
 'u': 25,
 'want': 26,
 'even': 27,
 'go': 28,
 'called': 29,
 'someone': 30,
 'really': 31,
 'thing': 32,
 'said': 33,
 'pussy': 34,
 'as': 35,
 'nigga': 36,
 'use': 37,
 'look': 38,
 'right': 39,
 'trannies': 40,
 'woman': 41,
 'see': 42,
 'saying': 43,
 'racist': 44,
 'man': 45,
 'way': 46,
 'cannot': 47,
 'good': 48,
 'calling': 49,
 'mean': 50,
 'still': 51,
 'never': 52,
 'also': 53,
 'lol': 54,
 'person': 55,
 'hate': 56,
 'much': 57,
 'need': 58,
 'could': 59,
 'going': 60,
 'used': 61,
 'love': 62,
 'back': 63,
 'year': 64,
 'day': 65,
 'girl': 66,
 'something': 67,
 'well': 68,
 'op': 69,
 'friend': 70,
 'bad': 71,
 'every': 72,
 'let': 73,
 'actually': 74,


In [58]:
vectors.shape

(21624, 300)

In [62]:
word_vec = dict(zip(vocab.keys(), vectors.tolist()))

In [63]:
len(word_vec)

21624

In [64]:
with open('./data/word_vec.json', 'w') as out_file:
    json.dump(word_vec, out_file)