This notebook augments existing pre-trained word embeddings available online (e.g. GloVe, Word2Vec), which are general-purpose word embeddings, with the generated hate_speech_dataset. The aim is to leverage these existing embeddings to generate new word embeddings for the words in the hate_speech_dataset that may not exist in the vocabulary of the pre-trained embeddings themselves.

# Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import gensim
from mittens import GloVe, Mittens

from utilities.data_loaders import glove2dict
from utilities.data_preprocessors import read_preprocess, series_to_1D_array

%load_ext autoreload
%autoreload 2

# Load the data
* after loading the data, also look at the number of unique words in the dataframe
* see all the words that occur, without the constraint that the words have to be unique
* see the unique words themselves

In [2]:
# Read the cleaned hate-speech CSV (its first column becomes the index) and
# hand the frame straight to the project's read_preprocess helper.
df_1 = read_preprocess(
    pd.read_csv('./data/hate-speech-data-cleaned.csv', index_col=0)
)

In [3]:
# every word of every comment, flattened into one Series (duplicates kept)
all_words = pd.Series(series_to_1D_array(df_1['comment']))

# the distinct words, as an array
all_unique_words = all_words.unique()

# how many times each distinct word occurs
all_unique_words_counts = all_words.value_counts()

In [4]:
# number of entries in all_words (no de-duplication has been applied)
len(all_words)

894878

In [5]:
# number of distinct values in all_words, i.e. the corpus vocabulary size
len(all_unique_words)

47921

In [11]:
# display the array of distinct words produced by all_words.unique()
all_unique_words

array(['woman', 'complain', 'cleaning', ..., 'pronounc', 'mademoisell',
       'maupin'], dtype=object)

In [6]:
# display the per-word occurrence counts produced by all_words.value_counts()
all_unique_words_counts

nigger         16186
faggot         14812
bitch          12246
tranny         11850
like           11657
               ...  
pty                1
vocorp             1
sometimesin        1
shon               1
maupin             1
Length: 47921, dtype: int64

# Getting important variables
* get the length of the list in the dataframe with the greatest number of words (i.e. the longest sequence); this will be used later for generating the embeddings
* reassign the df, but this time join the lists of words in the comment column into single strings; this will be used later for generating the indexed representations of the sequences of words

In [7]:
# size of the largest entry in the 'comment' column, measured before the
# entries are joined into single strings
max_len_1 = max(len(comment) for comment in df_1['comment'])

In [8]:
# Build df_2 as a NEW frame whose 'comment' column holds each comment joined
# into one space-separated string.
#
# `df_2 = df_1` would only alias the same object, so assigning to
# df_2['comment'] would also overwrite df_1['comment']. Re-running this cell
# would then join the *characters* of the already-joined strings, and
# re-running the max_len_1 cell would measure string length instead.
# DataFrame.assign returns a new frame and leaves df_1 untouched, which keeps
# this cell idempotent.
df_2 = df_1.assign(
    comment=df_1['comment'].apply(lambda comment: " ".join(comment))
)
df_2

Unnamed: 0,comment,label
0,woman complain cleaning house man always take ...,1
1,boy dat coldtyga dwn bad cuffin dat hoe st place,0
2,dawg ever fuck bitch start cry confused shit,0
3,look like tranny,0
4,shit hear might true might faker bitch told ya,0
...,...,...
65775,from the midnight sun where the hot spring blow,1
65776,do not say am not your type,1
65777,and therefor never send to know for whom the b...,1
65778,and cannot stand anoth day,1


In [9]:
# sanity check: the comment at index label 0 should now be one joined string
df_2.loc[0, 'comment']

'woman complain cleaning house man always take trash'

In [10]:
# train_sents, test_sents, train_labels, test_labels = train_test_split(df['comment'], df['label'], test_size=0.3, random_state=0)

# the joined comment strings, one per row of df_2
sents = df_2['comment']

# NOTE(review): presumably a fixed sequence-length limit; where the value 50
# comes from is not shown in this notebook — confirm before relying on it
max_len_2 = 50

# three different size figures (despite the shared `num_words_` prefix)
num_words_1 = len(df_2)              # number of rows (comments) in df_2
num_words_2 = len(all_words)         # all word occurrences, duplicates included
num_words_3 = len(all_unique_words)  # distinct words

# Get all words not occurring in the pre-trained word embeddings
* in this important phase we get all words that do not occur in our dictionary of words and their already existing embeddings
* we also generate an important matrix called the co-occurrence matrix, in order to train our word embedding model, using the existing weights/embeddings of GloVe's dictionary, on the unseen words in our hate speech dataset

In [12]:
# mapping of word -> pre-trained embedding, loaded by the project's helper
pre_glove = glove2dict('./embeddings/glove.42B.300d.txt')

# get all the words in our current corpus that are not
# in our dictionary of words and their respective embeddings
oov = [token for token in all_unique_words if token not in pre_glove]

# These are the out-of-vocabulary words. De-duplicate while KEEPING
# first-seen order: list(set(...)) yields an arbitrary order that changes
# from one kernel run to the next, and this list's order is what fixes the
# column order of the co-occurrence matrix built from it.
oov_vocab = list(dict.fromkeys(oov))
print(f'list of words not in glove: {oov_vocab}')

list of words not in glove: ['ampwhen', 'wastedenergy', 'meaningyou', 'turntttt', 'jewmallow', 'awarefrequent', 'scientocrat', 'halfassedly', 'cerfew', 'pittsburghduck', 'gttacticool', 'labbeled', 'griiind', 'unrelateable', 'gmula', 'fatbands', 'poopyheads', 'rkia', 'womentherefore', 'nasoic', 'otherfacing', 'uskangz', 'sawcsms', 'ungrat', 'assretive', 'waitis', 'niggeraroo', 'tranniesladyboys', 'lavatori', 'teabaggerswho', 'sonthese', 'nonpoliticallycorrect', 'nigmenog', 'ciswife', 'cheesydaba', 'oboogyma', 'offerup', 'isiswe', 'eeehhhhh', 'tourfor', 'textingsending', 'flikeimafuckingniggerinmyhouseanthony', 'monthsk', 'gohawks', 'fuxker', 'mardawg', 'lolhoes', 'reddass', 'neveraskablackperson', 'shitlib', 'drugaddicted', 'gottttta', 'confusingamusing', 'anymoreproud', 'valuesnot', 'notnigger', 'pussyit', 'easteners', 'kennies', 'hitlerstalins', 'racysim', 'groupreddit', 'infoyankee', 'rtraaaaaa', 'refsshut', 'libshits', 'anoyin', 'hatesub', 'boutcha', 'queenzflip', 'killfuckdie', 'th

In [13]:
# how many corpus words have no entry in the pre-trained embeddings
print(len(oov_vocab))

11790


#### building co-occurrence matrix

In [None]:
# Count, for every comment, how often each out-of-vocabulary word appears.
# The vocabulary is fixed to oov_vocab, so the columns of the resulting
# matrix follow the order of oov_vocab.
cv = CountVectorizer(ngram_range=(1, 1), vocabulary=oov_vocab)

# sparse matrix of token counts: one row per comment, one column per word
X = cv.fit_transform(df_2['comment'])

# Word-by-word co-occurrence: entry (i, j) is the sum over all comments of
# count(word i) * count(word j). `@` is used rather than `*` so that this is
# a matrix product whether X is a scipy sparse *matrix* or a sparse *array*
# (for sparse arrays `*` means element-wise multiplication).
Xc = X.T @ X

# the diagonal pairs each word with itself, which is not a co-occurrence,
# so zero it out
Xc.setdiag(0)

# finally convert Xc to a dense array once setdiag has been called;
# this will be our co-occurrence matrix to be fed to Mittens
# NOTE(review): with the 11790 OOV words reported above this is an
# 11790 x 11790 dense array — check available memory before running
co_occ_matrix = Xc.toarray()

#### training the Mittens model with the new words

In [None]:
# NOTE(review): this whole cell is disabled (commented out). Before re-enabling:
#   * `pickle` is used below but is not among the imports in the import cell
#   * n=50 below disagrees with the comment (300) and with the 300d GloVe
#     file loaded into pre_glove
#   * oov_vocab contains only words that are absent from pre_glove, so
#     presumably no vocab entry can be initialised from initial_embedding_dict
#     — verify against the Mittens documentation
#
# # since our GloVe word embeddings are basically 300 in length our n
# # arg here should be also 300 we also set max_iter or our number of
# # epochs to train our word embedding model to maybe 1000 to 5000 epochs
# mittens_model = Mittens(n=50, max_iter=1000)

# # this will return only the words not existing in our pre-trained word embeddings
# # but the good thing is we can reshape and save this file to resemble that of our
# # pretrained word embeddings file
# new_embeddings = mittens_model.fit(
#     co_occ_matrix,
#     vocab=oov_vocab,
#     initial_embedding_dict=pre_glove
# )

# newglove = dict(zip(oov_vocab, new_embeddings))
# f = open("repo_glove.pkl","wb")
# pickle.dump(newglove, f)
# f.close()
# f.close()