In [61]:
import pickle

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [62]:
# Word2Vec constants (hyper-parameters for the gensim Word2Vec model)
W2V_SIZE = 100          # dimensionality of word vectors; default is 100
W2V_WINDOW_SIZE = 5     # max distance between current and predicted word; default is 5
W2V_MIN_COUNT = 5       # words with total frequency lower than this are ignored; default is 5
W2V_SG = 0              # training algorithm, 0 - CBOW, 1 - skip-gram; default is 0
W2V_EPOCHS = 10         # number of training epochs

In [63]:
# Get lemmatized comments (a list of token lists, one per comment).
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle can execute arbitrary code on load -- only load files you trust.
with open("comment_lemma.pickle", "rb") as comment_lemma_file:
    comment_lemma = pickle.load(comment_lemma_file)

In [64]:
def word2vec_create_model(sentences, size, window, min_count, sg, file_name):
    """Build a gensim Word2Vec model and save it as "<file_name>.model".

    Every argument except `file_name` is forwarded to the Word2Vec constructor.

    Params:
        sentences (list): tokens that have been fully pre-processed
        size (int):       dimensionality of word vectors (typically between 100-300)
        window (int):     max distance between current and predicted word in a sentence
        min_count (int):  ignores all words with total frequency lower than this
        sg (int):         training algorithm, 0 - CBOW, 1 - skip-gram
        file_name (str):  model name (without the ".model" suffix)

    Returns:
        None. The output is the model file written relative to the working directory.
    """
    w2v_kwargs = dict(sentences=sentences, size=size, window=window, min_count=min_count, sg=sg)
    trained_model = Word2Vec(**w2v_kwargs)
    target_path = "{0}.model".format(file_name)
    trained_model.save(target_path)

def word2vec_load_model(file_name):
    """Load a Word2Vec model that was saved as "<file_name>.model".

    Params:
        file_name (str): model name (without the ".model" suffix)

    Returns:
        The model object returned by Word2Vec.load.
    """
    model_path = "{0}.model".format(file_name)
    loaded_model = Word2Vec.load(model_path)
    return loaded_model

In [66]:
# Create the Word2Vec CBOW model and write it to "word2vec_cbow_model.model"
word2vec_create_model(
    sentences=comment_lemma,
    size=W2V_SIZE,
    window=W2V_WINDOW_SIZE,
    min_count=W2V_MIN_COUNT,
    sg=W2V_SG,
    file_name="word2vec_cbow_model",
)

In [67]:
# Load the Word2Vec CBOW model back from "word2vec_cbow_model.model"
# (the file written by word2vec_create_model).
word2vec_cbow_model = word2vec_load_model("word2vec_cbow_model")

In [68]:
# Train the loaded model on the full corpus.
# NOTE(review): the model was created with `sentences` passed to the constructor, which
# presumably already trained it once -- confirm that these additional epochs are intended.
sentences_count = len(comment_lemma)
# Use the configured epoch count rather than a hard-coded literal.
word2vec_cbow_model.train(comment_lemma, total_examples=sentences_count, epochs=W2V_EPOCHS)

(3127332, 4703120)

In [69]:
# Show the first 100 vocabulary words, then the first two lemmatized comments
vocab = [*word2vec_cbow_model.wv.vocab]
print(vocab[:100], comment_lemma[:2], sep="\n")

['cocksucker', 'before', 'you', 'piss', 'around', 'on', 'my', 'work', 'be', 'gay', 'or', 'white', 'there', 'two', 'way', 'why', 'do', 'erase', 'comment', 'about', 'ww', 'that', 'holocaust', 'of', 'jew', 'and', 'not', 'if', 'than', 'your', 'head', 'go', 'to', 'the', 'meeting', 'doubt', 'word', 'bible', 'homosexuality', 'a', 'sin', 'make', 'forehead', 'mass', 'with', 'pal', 'first', 'last', 'warn', 'fuck', 'i', 'wont', 'appreciate', 'any', 'more', 'nazi', 'would', 'write', 'in', 'page', 'dont', 'wish', 'talk', 'anymore', 'dark', 'side', 'stupid', 'peace', 'shit', 'stop', 'delete', 'stuff', 'asshole', 'die', 'fall', 'hole', 'hell', 'hi', 'back', 'again', 'undo', 'edits', 'pair', 'weiner', 'think', 'fagget', 'get', 'burn', 'hate', 'm', 'sorry', 'we', 'cant', 'have', 'sex', 'im', 'run', 'out', 'reply', 'above']
[['cocksucker', 'before', 'you', 'piss', 'around', 'on', 'my', 'work'], ['you', 'be', 'gay', 'or', 'antisemmitian', 'archangel', 'white', 'tiger', 'meow', 'greetingshhh', 'uh', 'ther

In [70]:
# Test: sanity-check the model on a couple of known words
w1 = "cocksucker"
w2 = "piss"
print(word2vec_cbow_model.wv.get_vector(w1).shape)
# Vocab size. The placeholder must be filled with .format(); passing the count as a
# second print() argument printed the literal "{0}" followed by the number.
print("Vocab: {0}".format(len(word2vec_cbow_model.wv.vocab)))
# Print the size of the word2vec vector for one word
print("Length of the vector generated for a word")
print(len(word2vec_cbow_model.wv.get_vector('you')))

(100,)
Vocab: {0} 4784
Length of the vector generated for a word
100


In [71]:
def word2vec_remove_words_outside_vocab(model, all_comments):
    """Drop every token that is not in the Word2Vec model's vocabulary.

    Params:
        model (Word2Vec model): model whose `wv.vocab` defines the known words
        all_comments (list):    pre-processed tokens (2D list, one inner list per comment)

    Returns:
        list: 2D list with the same number of comments, each holding only the
        tokens found in the model's vocab (order and duplicates are kept; a
        comment with no known tokens becomes an empty list).
    """
    return [
        [token for token in tokens if token in model.wv.vocab]
        for tokens in all_comments
    ]

def word2vec_average_vectors(model, comment_vocab):
    """Average the word vectors of each comment into one vector per comment.

    The previous body passed the bound method `model.wv.get_vector` straight to
    `np.mean` and never read `comment_vocab`, so it could not produce a result.

    Params:
        model (Word2Vec model): model providing `wv.get_vector(word)` and `wv.vector_size`
        comment_vocab (list):   tokens with only words in the model's vocab (2D list);
                                filter with word2vec_remove_words_outside_vocab first,
                                because every token is looked up with `get_vector`

    Returns:
        numpy.ndarray of shape (len(comment_vocab), model.wv.vector_size). Row i is
        the element-wise mean of the vectors of comment i; a comment with no
        tokens is left as an all-zero row.
    """
    keyed_vectors = model.wv
    averaged = np.zeros((len(comment_vocab), keyed_vectors.vector_size))
    for row, tokens in enumerate(comment_vocab):
        if not tokens:
            # Nothing to average: keep the zero row rather than taking the mean of an empty list.
            continue
        averaged[row] = np.mean([keyed_vectors.get_vector(token) for token in tokens], axis=0)
    return averaged

# TODO: average word vectors of each comment weighted by TF-IDF (not implemented yet)

In [72]:
# Preview the filtered tokens of the first three comments. Each comment is filtered
# independently, so slicing before the call prints the same result without
# processing the whole corpus.
print(word2vec_remove_words_outside_vocab(word2vec_cbow_model, comment_lemma[:3]))

[['cocksucker', 'before', 'you', 'piss', 'around', 'on', 'my', 'work'], ['you', 'be', 'gay', 'or', 'white', 'there', 'be', 'two', 'way', 'why', 'you', 'do', 'erase', 'my', 'comment', 'about', 'ww', 'that', 'holocaust', 'be', 'of', 'jew', 'and', 'not', 'if', 'you', 'be', 'than', 'your', 'head', 'and', 'go', 'to', 'the', 'meeting', 'if', 'you', 'doubt', 'word', 'of', 'the', 'bible', 'that', 'homosexuality', 'be', 'a', 'sin', 'make', 'a', 'on', 'your', 'forehead', 'go', 'to', 'the', 'mass', 'with', 'your', 'gay', 'pal', 'first', 'and', 'last', 'warn', 'you', 'fuck', 'gay', 'i', 'wont', 'appreciate', 'if', 'any', 'more', 'nazi', 'would', 'write', 'in', 'my', 'page', 'i', 'dont', 'wish', 'to', 'talk', 'to', 'you', 'anymore', 'of', 'the', 'dark', 'side'], ['stupid', 'peace', 'of', 'shit', 'stop', 'delete', 'my', 'stuff', 'asshole', 'go', 'die', 'and', 'fall', 'in', 'a', 'hole', 'go', 'to', 'hell']]


In [73]:
# Gensim Doc2Vec constants (hyper-parameters for the gensim Doc2Vec model)
D2V_SIZE = 100          # dimensionality of the vectors; default is 100
D2V_WINDOW_SIZE = 5     # max distance between current and predicted word; default is 5
D2V_MIN_COUNT = 5       # words with total frequency lower than this are ignored; default is 5
D2V_DM_MEAN = 1         # sum or mean of word vectors; 0 - sum, 1 - mean
D2V_DBOW_WORDS = 0      # 0 - bow, 1 - skip-gram and bow (see doc2vec_create_model)
D2V_EPOCHS = 10         # NOTE(review): not passed to doc2vec_create_model in the visible cells -- confirm it is used

In [82]:
def doc2vec_create_model(documents, size, window, min_count, dm_mean, dbow_words, file_name):
    """Build a gensim Doc2Vec model and save it as "<file_name>.model".

    Every argument except `file_name` is forwarded to the Doc2Vec constructor
    (`size` is passed as `vector_size`). The `dm` argument is not passed, so
    gensim's default training mode applies.

    Params:
        documents (list of TaggedDocument): input corpus with pre-processed tokens
        size (int):        dimensionality of the vectors (typically between 100-300)
        window (int):      max distance between current and predicted word in a sentence
        min_count (int):   ignores all words with total frequency lower than this
        dm_mean (int):     sum or mean of word vectors; 0 - sum, 1 - mean
        dbow_words (int):  training algorithm, 0 - bow, 1 - skip-gram and bow
        file_name (str):   model name (without the ".model" suffix)

    Returns:
        None. The output is the model file written relative to the working directory.
    """
    d2v_kwargs = {
        "documents": documents,
        "vector_size": size,
        "window": window,
        "min_count": min_count,
        "dm_mean": dm_mean,
        "dbow_words": dbow_words,
    }
    trained_model = Doc2Vec(**d2v_kwargs)
    target_path = "{0}.model".format(file_name)
    trained_model.save(target_path)

def doc2vec_load_model(file_name):
    """Load a Doc2Vec model that was saved as "<file_name>.model".

    Loads through Doc2Vec.load (previously Word2Vec.load) so the loader matches
    the class that doc2vec_create_model saves.

    Params:
        file_name (str): model name (without the ".model" suffix)

    Returns:
        The model object returned by Doc2Vec.load.
    """
    return Doc2Vec.load("{0}.model".format(file_name))

In [83]:
# Set up comments for the Gensim Doc2Vec model: tag each comment with its position in comment_lemma
comments = [TaggedDocument(tokens, [index]) for index, tokens in enumerate(comment_lemma)]

# Create the Doc2Vec model and write it to "doc2vec_dbow_model.model"
# NOTE(review): doc2vec_create_model does not pass `dm`, so the model uses gensim's default
# training mode -- confirm that matches the "dbow" in the file name.
doc2vec_create_model(
    documents=comments,
    size=D2V_SIZE,
    window=D2V_WINDOW_SIZE,
    min_count=D2V_MIN_COUNT,
    dm_mean=D2V_DM_MEAN,
    dbow_words=D2V_DBOW_WORDS,
    file_name="doc2vec_dbow_model",
)

In [84]:
# Load the saved Doc2Vec model; as the cell's last expression, its repr is displayed.
# The result is not assigned to a name, so later cells cannot reuse it.
doc2vec_load_model("doc2vec_dbow_model")

<gensim.models.doc2vec.Doc2Vec at 0x1c2e55a3850>