In [None]:
import numpy as np
import pickle

from mittens import GloVe, Mittens

from utilities.data_loaders import load_co_occ_matrix, view_and_load_data, glove2dict
from utilities.data_preprocessors import rejoin_data
%load_ext autoreload
%autoreload 2

In [None]:
def train_new_embeddings(pre_glove, oov_vocab, co_occ_matrix, dim=300, epochs=1000,
                         save_path="./embeddings/hate_speech_glove.txt"):
    """
    Fit a Mittens model on a co-occurrence matrix and pickle the learned
    word -> vector dictionary to disk.

    Parameters
    ----------
    pre_glove : dict
        Pre-trained embeddings, forwarded to ``Mittens.fit`` as
        ``initial_embedding_dict``.
    oov_vocab : list
        Words to learn vectors for, forwarded to ``Mittens.fit`` as ``vocab``.
        Presumably its order must match the rows/columns of
        ``co_occ_matrix`` -- confirm against how the matrix was built.
    co_occ_matrix : array-like
        Co-occurrence matrix handed to ``Mittens.fit`` as the training data.
    dim : int, default 300
        Embedding size (``n`` of ``Mittens``). The notebook's GloVe vectors
        are 300-dimensional, hence the default.
    epochs : int, default 1000
        Number of training iterations (``max_iter`` of ``Mittens``).
    save_path : str, default "./embeddings/hate_speech_glove.txt"
        Where the resulting dictionary is pickled. NOTE: despite the ``.txt``
        extension of the default, the file is a binary pickle, not a
        GloVe-style text file; the default is kept so existing readers of
        that path keep working.

    Returns
    -------
    None
        The result is only written to ``save_path``.
    """
    # local import: only needed to make sure the output directory exists
    import os

    # instantiate the Mittens class
    mittens_model = Mittens(n=dim, max_iter=epochs)

    # the fitted matrix is paired row-by-row with `oov_vocab` below, so only
    # the words listed in `oov_vocab` end up in the saved dictionary
    new_embeddings = mittens_model.fit(
        co_occ_matrix,
        vocab=oov_vocab,
        initial_embedding_dict=pre_glove
    )

    post_glove = dict(zip(oov_vocab, new_embeddings))

    # create the target directory when it is missing so a long training run
    # is not lost to a FileNotFoundError at the very last step
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # the context manager closes the file; no explicit close() is needed
    with open(save_path, "wb") as file:
        pickle.dump(post_glove, file)

# Load pre-trained GloVe and build the OOV vocabulary

In [None]:
# load the cleaned dataset
data_path = './data/hate-speech-data-cleaned.csv'
df_1, all_words, all_unique_words, all_unique_words_counts = view_and_load_data(data_path)

# rejoin the tokens of each row back into sentences; taken together
# these sentences form the document (presumably -- see rejoin_data)
df_2 = rejoin_data(df_1)

# Load the pre-trained GloVe dictionary (word -> embedding). It is used to
# find the words of the hate speech dataset that have no pre-trained vector,
# and later as the initial embeddings when training on the co-occurrence
# matrix.
pre_glove = glove2dict('./embeddings/glove.42B.300d.txt')

# get all the words in our current corpus that are not
# in our dictionary of words and their respective embeddings
oov = [token for token in all_unique_words if token not in pre_glove]

# De-duplicate while keeping first-seen order. list(set(...)) would give a
# different order in every kernel session (string hashing is randomised), so
# index i of the vocabulary could not be tied to row i of a co-occurrence
# matrix loaded from disk.
# NOTE(review): the resulting order must still match the order used when the
# saved co-occurrence matrix was built -- confirm against that code.
oov_vocab = list(dict.fromkeys(oov))

# NOTE(review): every word in oov_vocab is by construction absent from
# pre_glove, so when oov_vocab is used as the Mittens vocab none of its words
# has a pre-trained vector to start from -- confirm this is intended.
print(f'list of words not in glove: \n{oov_vocab}\n')
print(f'length of OOV words: {len(oov_vocab)}\n')

# Load co-occurrence matrix

In [None]:
# load the saved co-occurrence matrix
co_occ_path = "./embeddings/hate_co_occ_matrix.txt"
co_occ_matrix = load_co_occ_matrix(co_occ_path)
print(f'the co-occurence matrix: \n{co_occ_matrix}\n')
print(f'shape of the co-occurence matrix: {co_occ_matrix.shape}\n')

# fail fast, before the long training run: the matrix is trained with
# oov_vocab as its vocabulary, so it has to be square with one row and
# one column per vocabulary word
assert co_occ_matrix.shape == (len(oov_vocab), len(oov_vocab)), (
    f"co-occurrence matrix has shape {co_occ_matrix.shape}, "
    f"expected ({len(oov_vocab)}, {len(oov_vocab)}) to match oov_vocab"
)

# Train Mittens

In [None]:
# run the training with the function's default embedding size and epoch count
train_new_embeddings(
    pre_glove=pre_glove,
    oov_vocab=oov_vocab,
    co_occ_matrix=co_occ_matrix,
)