# Experiment 3
## Comparing Text Featurization between Word2Vec and Doc2Vec

In [1]:
import keras
import numpy as np
import pandas as pd
import pickle
import sklearn
import tensorflow as tf
import utils
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Dropout, Conv1D, GlobalMaxPooling1D, Embedding
from sklearn.metrics import classification_report

In [13]:
# File paths

# Data Directory
DATA_DIR = "data"

# Balanced datasets
BALANCED_TRAIN_DATASET = "data/balanced_dataset.pickle"
BALANCED_TEST_DATASET = "data/balanced_test_dataset.pickle"

# Preprocessed balanced data
PREPROCESSED_BAL_TRAIN_DATASET = "data/preprocessed_train.pickle"
PREPROCESSED_BAL_TEST_DATASET = "data/preprocessed_test.pickle"

# Word2Vec model
WORD2VEC_MODEL = "models/word2vec_model"

# Doc2Vec model
DOC2VEC_MODEL = "models/doc2vec_model"

In [4]:
# Function to save data as a .pickle file
# Params: 
    # List or Dataframe - @data: Data to be saved as pickle
    # Str - @folder: folder name
    # Str - file name
# Output: Pickle file in directory/repo 
def save_pickle(data, folder, file_name):
    with open("{0}/{1}.pickle".format(folder, file_name), 'wb') as f:
        pickle.dump(data, f)
    print(f"Saved data is stored in \'{folder}\' in the form of {file_name}.pickle")
    #pickle.dump(data, open("data/{0}.pickle".format(file_name),"wb"))

# Function to load pickle file
# Params:
    # Str - @file_path: File path of pickle file
# Output:
    # Saved object in original file type (list/dataframe)
def load_pickle(file_path):
    return pickle.load(open(file_path, "rb"))

In [5]:
# Load datasets

# Get preprocessed train dataset
bal_train_dataset = load_pickle(PREPROCESSED_BAL_TRAIN_DATASET)

# Get preprocessed test dataset
bal_test_dataset = load_pickle(PREPROCESSED_BAL_TEST_DATASET)

# Get train_y
bal_train_y = pd.read_pickle(BALANCED_TRAIN_DATASET)
bal_train_y = bal_train_y.drop(columns="comment_text")

# Get test_y
bal_test_y = pd.read_pickle(BALANCED_TEST_DATASET)
bal_test_y = bal_test_y.drop(columns="comment_text")

In [6]:
# Experiment-specific imports
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Word2Vec

In [7]:
# Word2Vec constants
W2V_SIZE = 100          # default is 100
W2V_WINDOW_SIZE = 5     # default is 5
W2V_MIN_COUNT = 5       # default is 5
W2V_SG = 0              # default is 0
W2V_EPOCHS = 10

In [8]:
# Function to create and save Word2Vec model
# Params: 
    # @sentences, @vector_size, @window, @min_count and @sg are gensim Word2Vec model params
    # List      - @sentences:   tokens that have been fully pre-processed
    # Int       - @size:        dimensionality of word vectors (typically between 100-300)
    # Int       - @window_size: max distance between current and predicted word in a sentence
    # Int       - @min_count:   ignores all words with total frequency lower than this
    # Binary    - @sg:          training algorithm, 0 - CBOW, 1 - skip-gram 
    # Str       - @file_name:   model name
# Output: Model file in directory/repo 
def word2vec_create_model(sentences, size, window, min_count, sg, file_name):
    model = Word2Vec(sentences=sentences, size=size, window=window, min_count=min_count, sg=sg)
    model.save("{0}.model".format(file_name))

# Function to load Word2Vec model
# Params: Str - file name
# Returns: Model - word2vec model
def word2vec_load_model(file_name):
    return Word2Vec.load("{0}.model".format(file_name))

The following cell creates the Word2Vec model that is trained once, then it can be saved and loaded without the need for running the model creation and training it again.

In [15]:
# File path to save the model
SAVE_WORD2VEC_MODEL = "models/word2vec_model.model"

# Create Word2Vec CBOW Model
word2vec_create_model(bal_train_dataset, W2V_SIZE, W2V_WINDOW_SIZE, W2V_MIN_COUNT, W2V_SG, WORD2VEC_MODEL)

# Initialize model
word2vec_model = word2vec_load_model(WORD2VEC_MODEL)

# Train model
word2vec_model.train(bal_train_dataset, total_examples=word2vec_model.corpus_count, epochs = 10)

# Save trained model
word2vec_model.save(SAVE_WORD2VEC_MODEL)

In [16]:
# Load trained model
word2vec_model = word2vec_load_model(WORD2VEC_MODEL)

In [17]:
# Print vocab to test between vocab and dataset
vocab = list(word2vec_model.wv.vocab)
print(vocab[:100])
print(bal_train_dataset[:2])

['cocksucker', 'piss', 'around', 'work', 'gay', 'white', 'two', 'way', 'erase', 'comment', 'ww', 'holocaust', 'jew', 'head', 'go', 'meeting', 'doubt', 'word', 'bible', 'homosexuality', 'sin', 'make', 'forehead', 'mass', 'pal', 'first', 'last', 'warn', 'fuck', 'wont', 'appreciate', 'nazi', 'would', 'write', 'page', 'dont', 'wish', 'talk', 'anymore', 'dark', 'side', 'stupid', 'peace', 'shit', 'stop', 'delete', 'stuff', 'asshole', 'die', 'fall', 'hole', 'hell', 'hi', 'back', 'undo', 'edits', 'pair', 'weiner', 'think', 'fagget', 'get', 'burn', 'hate', 'sorry', 'cant', 'sex', 'im', 'run', 'reply', 'loser', 'un', 'defines', 'vietnam', 'part', 'southeast', 'asia', 'far', 'know', 'use', 'french', 'country', 'anyway', 'culture', 'always', 'influence', 'sea', 'han', 'chinese', 'proper', 'fringe', 'indigenous', 'tribe', 'admit', 'vietnamese', 'bunch', 'wannabe', 'crap', 'people', 'east', 'asian']
[['cocksucker', 'piss', 'around', 'work'], ['gay', 'antisemmitian', 'archangel', 'white', 'tiger', 'm

In [19]:
# Printing vectors and vocab 
w1 = bal_train_dataset[0][0] # 1st word of 1st document
w2 = bal_train_dataset[0][1] # 2nd word of 1st document
print(f"Shape of w1 \'{w1}\': {word2vec_model.wv.get_vector(w1).shape}")
print(f"Shape of w2 \'{w2}\': {word2vec_model.wv.get_vector(w2).shape}")
 
print("Vocab:", len(word2vec_model.wv.vocab))

# Print the size of the word2vec vector for one word
w3 = bal_train_dataset[1][3]
print(f"Length of the vector for w3 \'{w3}\':", len(word2vec_model.wv.get_vector(w3)))

Shape of w1 'cocksucker': (100,)
Shape of w2 'piss': (100,)
Vocab: 4679
Length of the vector for w3 'white': 100


In [20]:
# Remove words not in word2vec model
# Params: 
#   Word2Vec Model  - @model:           Word2Vec Model
#   List            - @all_comments:    Pre-processed tokens (2D List)
# Output: List - Tokens with only words in model's vocab (2D List)
def word2vec_remove_words_outside_vocab(model, all_comments):
    # Remove words not in w2v cbow model vocab
    doc = []
    for comment in all_comments:
        temp = []
        for word in comment:
            if word in model.wv.vocab:
                temp.append(word)
        doc.append(temp)
    return doc

# Average word vectors of each comment
# Params:
#   Word2Vec Model  - @model:             Word2Vec Model 
#   List            - @comment_vocab:     Tokens with only words in model's vocab (2D List)
# Output: 
#   Numpy Array of average vector of each comment
def word2vec_average_vectors(model, comment_vocab):

    average_list = []

    for comment in comment_vocab:

        comment_vectors = []

        for word in comment:
            comment_vectors.append((model.wv.get_vector(word)))
        
        comment_vectors_np = np.asarray(comment_vectors, dtype='float32')
        mean = np.mean(comment_vectors_np, keepdims=True)
        average_list.append(mean)
    
    return np.asarray(average_list, dtype='float32')

In [21]:
# Remove words that are not in Word2Vec vocab
word2vec_removed_words = word2vec_remove_words_outside_vocab(word2vec_model, bal_train_dataset)

# Print word2vec_removed_words to compare against original text
print(word2vec_removed_words[:2])
print(bal_train_dataset[:2])

[['cocksucker', 'piss', 'around', 'work'], ['gay', 'white', 'two', 'way', 'erase', 'comment', 'ww', 'holocaust', 'jew', 'head', 'go', 'meeting', 'doubt', 'word', 'bible', 'homosexuality', 'sin', 'make', 'forehead', 'go', 'mass', 'gay', 'pal', 'first', 'last', 'warn', 'fuck', 'gay', 'wont', 'appreciate', 'nazi', 'would', 'write', 'page', 'dont', 'wish', 'talk', 'anymore', 'dark', 'side']]
[['cocksucker', 'piss', 'around', 'work'], ['gay', 'antisemmitian', 'archangel', 'white', 'tiger', 'meow', 'greetingshhh', 'uh', 'two', 'way', 'erase', 'comment', 'ww', 'holocaust', 'brutally', 'slay', 'jew', 'gaysgypsysslavsanyone', 'antisemitian', 'shave', 'head', 'bald', 'go', 'skinhead', 'meeting', 'doubt', 'word', 'bible', 'homosexuality', 'deadly', 'sin', 'make', 'pentagram', 'tatoo', 'forehead', 'go', 'satanistic', 'mass', 'gay', 'pal', 'first', 'last', 'warn', 'fuck', 'gay', 'wont', 'appreciate', 'nazi', 'shwain', 'would', 'write', 'page', 'dont', 'wish', 'talk', 'anymore', 'beware', 'dark', 's

In [23]:
# Calculate the average of each comment
word2vec_average = word2vec_average_vectors(word2vec_model, word2vec_removed_words)
# Warning appears because some documents are empty

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [24]:
# Constants for keras model
NUM_WORDS = 20000
MAX_LEN = 100

In [25]:
# Pad vectors as needed that cannot be used with Keras' sequential padding
# Params: 
    # Numpy array - @vector_array: Vectors of data to be used in keras converted to np array
# Outputs: 
    # Numpy array - @padded_array: Padded vectors
def pad(vector_array, max_len):
    padded_array = np.zeros((max_len, vector_array.shape[-1]))
    padded_array[:len(vector_array),:] = vector_array
    return padded_array

# Function taken from utils, slightly modified
def modified_build_model():
    EPOCHS = 30
    INIT_LR = 1e-3

    model = Sequential()

    #model.add(Embedding(num_words, 128))
    model.add(Dropout(0.4))
    model.add(Conv1D(128, 7, padding="valid", activation="relu", strides=3))
    model.add(Conv1D(128, 7, padding="valid", activation="relu", strides=3))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(6, activation='softmax'))

    adam = tf.keras.optimizers.Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)

    model.compile(loss='binary_crossentropy',
                optimizer=adam,
                metrics=['accuracy'])

    return model

In [26]:
# Pad word2vec_average to use for training
word2vec_train_x = np.stack(list(map(lambda x: pad(x, MAX_LEN), word2vec_average)))

In [27]:
# Train model
model_w2v = modified_build_model()

model_w2v.fit(word2vec_train_x, bal_train_y, batch_size=60, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1c37dc01760>

In [28]:
# Prepare test dataset

# Remove words that are not in Word2Vec vocab
word2vec_remove_test = word2vec_remove_words_outside_vocab(word2vec_model, bal_test_dataset)

# Calculate the average of each comment
word2vec_average_test = word2vec_average_vectors(word2vec_model, word2vec_remove_test)

# Pad word2vec_average to use for training
word2vec_test_x = np.stack(list(map(lambda x: pad(x, MAX_LEN), word2vec_average_test)))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [29]:
# Evaluate model for Doc2Vec
model_w2v.evaluate(word2vec_test_x, bal_test_y, batch_size=60)



[0.4596764147281647, 0.009314989671111107]

# Doc2Vec

In [30]:
# Gensim Doc2Vec constants
D2V_SIZE = 100          # default is 100
D2V_WINDOW_SIZE = 5     # default is 5
D2V_MIN_COUNT = 5       # default is 5
D2V_DM_MEAN = 1
D2V_DBOW_WORDS = 0
D2V_EPOCHS = 10

In [31]:
# Function to create and save Doc2Vec model
# Params: 
    # @documents, @vector_size, @window, @min_count, @dm_mean and @dbow_words are Doc2Vec model params
    # TaggedDocument    - @documents:   input corpus with pre-processed tokens in 2D list form
    # Int               - @size:        dimensionality of word vectors (typically between 100-300)
    # Int               - @window: max distance between current and predicted word in a sentence
    # Int               - @min_count:   ignores all words with total frequency lower than this
    # Binary            - @dm_mean:     sum or mean of word vectors; 0 - sum, 1 - mean
    # Binary            - @dbow_words:  training algorithm, 0 - bow, 1 - skip-gram and bow 
    # Str               - @file_name:   model name
# Output: Model file in directory/repo 
def doc2vec_create_model(documents, size, window, min_count, dm_mean, dbow_words, file_name):
    model = Doc2Vec(documents=documents, vector_size=size, window=window, min_count=min_count, dm_mean=dm_mean, dbow_words=dbow_words)
    model.save("{0}.model".format(file_name))

# Function to load Doc2Vec model
# Params: Str - file name
# Returns: Model - doc2vec model
def doc2vec_load_model(file_name):
    return Word2Vec.load("{0}.model".format(file_name))

In [32]:
# Set up comments for Gensim Doc2Vec Model
comments = [TaggedDocument(comment, [i]) for i, comment in enumerate(bal_train_dataset)]

In [35]:
# File path to save the model
SAVE_DOC2VEC_MODEL = "models/doc2vec_model.model"

# Create Doc2Vec model
doc2vec_create_model(comments, D2V_SIZE, D2V_WINDOW_SIZE, D2V_MIN_COUNT, D2V_DM_MEAN, D2V_DBOW_WORDS, DOC2VEC_MODEL)

# Initialize model
doc2vec_model = doc2vec_load_model(DOC2VEC_MODEL)

# Train model
doc2vec_model.train(comments, total_examples=doc2vec_model.corpus_count, epochs=D2V_EPOCHS)

# Save trained model
doc2vec_model.save(SAVE_DOC2VEC_MODEL)

In [37]:
# Load trained model
doc2vec_model = doc2vec_load_model(DOC2VEC_MODEL)

In [39]:
# Printing vectors and vocab
w1 = bal_train_dataset[0][0] # 1st word of 1st document
w2 = bal_train_dataset[0][1] # 2nd word of 1st document
print(f"Shape of w1 \'{w1}\': {doc2vec_model.wv.get_vector(w1).shape}")
print(f"Shape of w2 \'{w2}\': {doc2vec_model.wv.get_vector(w2).shape}")

print("Vocab:", len(doc2vec_model.wv.vocab))

# Print the size of the word2vec vector for one word
w3 = bal_train_dataset[1][3]
print(f"Length of the vector for w3 \'{w3}\':", len(doc2vec_model.wv.get_vector(w3)))

# Print the vector of 1st document
print("Vector of doc 0:", doc2vec_model.docvecs[0].shape)

Shape of w1 'cocksucker': (100,)
Shape of w2 'piss': (100,)
Vocab: 4679
Length of the vector for w3 'white': 100
Vector of doc 0: (100,)


In [42]:
# Assessing the model with Gensim Doc2Vec tutorial

# Infer new vectors for each document of the training corpus, compare them against the actual vectors, then return the rank of the document based on self-similarity
ranks = []
second_ranks = []

for doc_id in range(len(comments)):
    inferred_vector = doc2vec_model.infer_vector(comments[doc_id].words)
    sims = doc2vec_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 5145, 1: 367, 2: 196, 3: 118, 4: 60, 5: 56, 6: 46, 8: 43, 7: 35, 10: 29, 13: 26, 9: 23, 12: 22, 17: 21, 11: 19, 19: 14, 15: 14, 14: 14, 16: 13, 22: 13, 20: 12, 27: 12, 26: 12, 18: 11, 35: 11, 24: 11, 32: 11, 25: 10, 21: 10, 29: 10, 28: 10, 39: 10, 38: 9, 31: 8, 34: 7, 37: 7, 23: 7, 30: 7, 47: 7, 58: 6, 40: 6, 43: 5, 51: 5, 33: 5, 45: 5, 64: 5, 91: 5, 46: 5, 41: 5, 70: 5, 87: 4, 48: 4, 80: 4, 263: 4, 79: 4, 94: 4, 71: 4, 110: 4, 42: 4, 95: 4, 115: 4, 53: 4, 81: 3, 36: 3, 430: 3, 57: 3, 107: 3, 100: 3, 84: 3, 44: 3, 63: 3, 66: 3, 92: 3, 61: 3, 55: 3, 86: 3, 105: 3, 102: 3, 54: 3, 111: 3, 83: 3, 56: 3, 50: 3, 60: 3, 49: 3, 293: 3, 284: 2, 199: 2, 132: 2, 471: 2, 933: 2, 2901: 2, 2121: 2, 198: 2, 203: 2, 231: 2, 74: 2, 357: 2, 127: 2, 90: 2, 652: 2, 185: 2, 178: 2, 176: 2, 98: 2, 138: 2, 162: 2, 108: 2, 302: 2, 853: 2, 89: 2, 422: 2, 301: 2, 160: 2, 67: 2, 171: 2, 950: 2, 112: 2, 135: 2, 282: 2, 191: 2, 633: 2, 147: 2, 144: 2, 211: 2, 131: 2, 62: 2, 113: 2, 137: 2, 303: 2, 272:

In [45]:
# Assessing the model with Gensim Doc2Vec tutorial

# Test most & least similar documents against random document
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(comments[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % doc2vec_model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(comments[sims[index][0]].words)))

Document (7131): «really dont think understand come idea bad right away kind community go bad idea go away instead help rewrite»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w5,mc5,s0.001,t3):

MOST (185, 0.785494327545166): «people pretty overzealous whole free thing get fuck life fuck nigger»

SECOND-MOST (6424, 0.7661423683166504): «bad behaviour»

MEDIAN (5526, 0.3718322515487671): «image requestedpeople kansasamerican football people»

LEAST (899, -0.7124570608139038): «fuck u weak as niggaz»



In [46]:
# Prepare training data for model
# Convert doc2vec vectors to numpy array for Keras to use
d2v_train_x = np.array([doc2vec_model.docvecs[i] for i, comment in enumerate(comments)])
# Pad train_x
d2v_padded_x = np.stack(list(map(lambda x: pad(x, MAX_LEN), d2v_train_x)))

In [47]:
# Fit model for Doc2Vec
model_d2v = modified_build_model()

model_d2v.fit(d2v_padded_x, bal_train_y, batch_size=60, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1c37e4abdc0>

In [50]:
# Infer vectors for test_x based on Doc2Vec model
def prepare_vectors(test_x):
    
    vector_x = [doc2vec_model.infer_vector(comment) for comment in test_x]

    return vector_x

In [51]:
# Prepare test_x for evaluation and prediction
d2v_vector_x = prepare_vectors(bal_test_dataset)

# Convert doc2vec vectors to numpy array for Keras to use
d2v_np_x = np.array(d2v_vector_x)

# Pad test_x
d2v_test_x = np.stack(list(map(lambda x: pad(x, max_len=100), d2v_np_x)))

In [52]:
# Evaluate model for Doc2Vec
model_d2v.evaluate(d2v_test_x, bal_test_y, batch_size=60)



[0.47548913955688477, 0.30194899439811707]