In [1]:
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
import multiprocessing
import pandas as pd
import numpy as np
import pickle
import random
from time import time

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
with open('data/train.pkl', 'rb') as f:
    train = pickle.load(f)

In [25]:
def clean_descriptions(tokens):
    '''
    Remove stopwords;

    
    Remove punctuation
    '''
    stop_words = stopwords.words('english')
    res = [w.lower() for w in tokens if not w in stop_words] ## Remove stopwords
    res_punc = [w for w in res if w.isalpha()] ## Remove punctuation
    return res_punc

In [26]:
tokens = [word_tokenize(x) for x in train]
clean_tokens = [clean_descriptions(t) for t in tokens]

In [27]:
cores = multiprocessing.cpu_count() ## numbers of cores in the computer
model = Word2Vec(min_count=1,
                window=5,
                size=50,
                sample=6e-5,
                alpha=0.03,
                min_alpha=0.0007,
                negative=20,
                workers=cores-1)

In [28]:
model.build_vocab(clean_tokens)

In [29]:
t = time()
model.train(clean_tokens, total_examples=model.corpus_count, epochs=100)
print(f"Time to train the model: {round((time() - t) / 60, 4)} mins")

Time to train the model: 0.5962 mins


In [30]:
''' Checking for similar words '''
model.wv.most_similar(['guns'], topn=10)

[('gun', 0.620539128780365),
 ('shui', 0.6091277003288269),
 ('owners', 0.6024693250656128),
 ('mass', 0.5929380655288696),
 ('feng', 0.5868189334869385),
 ('carry', 0.5839115381240845),
 ('shootings', 0.5771595239639282),
 ('educate', 0.5662846565246582),
 ('nightclubs', 0.5634212493896484),
 ('educators', 0.5628822445869446)]

In [31]:
model.save('models/word2vec_model.model')

### Load test data

In [38]:
with open('data/test.pkl', 'rb') as f:
    test = pickle.load(f)

In [45]:
test = random.sample(test, 100)

### Prepare test data

In [46]:
tokens = [word_tokenize(x) for x in test]
clean_test_data = [clean_descriptions(t) for t in tokens]

In [47]:
words, vectors = [], []
for desc in clean_test_data: ## Iterate over descriptions in test data
    for word in desc: ## Iterate over words in descriptions
        try:
            vectors.append(model.wv.get_vector(word))
            words.append(word)
        except KeyError:
            print(f'Word {word} not found in vocab')

Word logline not found in vocab
Word cho not found in vocab
Word disbanded not found in vocab
Word garagiste not found in vocab
Word fried not found in vocab
Word reposted not found in vocab
Word steenburgen not found in vocab
Word chastising not found in vocab
Word maye not found in vocab
Word alunageorge not found in vocab
Word cosima not found in vocab
Word deke not found in vocab
Word dickerson not found in vocab
Word attic not found in vocab
Word romancing not found in vocab


In [48]:
len(vectors)

911

In [50]:
np.savetxt('data/words.txt', words, fmt='%s')
np.save('data/vectors.npy', vectors)