In [1]:
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
import multiprocessing
import pandas as pd
import numpy as np
import pickle
import random
from time import time

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
with open('data/train.pkl', 'rb') as f:
    train = pickle.load(f)

In [3]:
def clean_descriptions(tokens):
    '''
    Remove stopwords;

    
    Remove punctuation
    '''
    stop_words = stopwords.words('english')
    res = [w.lower() for w in tokens if not w in stop_words] ## Remove stopwords
    res_punc = [w for w in res if w.isalpha()] ## Remove punctuation
    return res_punc

In [4]:
tokens = [word_tokenize(x) for x in train]
clean_tokens = [clean_descriptions(t) for t in tokens]

In [5]:
cores = multiprocessing.cpu_count() ## numbers of cores in the computer
model = Word2Vec(min_count=1,
                window=5,
                size=50,
                sample=6e-5,
                alpha=0.03,
                min_alpha=0.0007,
                negative=20,
                workers=cores-1)

In [6]:
model.build_vocab(clean_tokens)

In [7]:
t = time()
model.train(clean_tokens, total_examples=model.corpus_count, epochs=100)
print(f"Time to train the model: {round((time() - t) / 60, 4)} mins")

Time to train the model: 0.6256 mins


In [9]:
''' Checking for similar words '''
model.wv.most_similar(['obama'], topn=10)

[('barack', 0.7298361659049988),
 ('donald', 0.6451386213302612),
 ('trump', 0.642876148223877),
 ('administration', 0.6392666697502136),
 ('president', 0.6343703866004944),
 ('environmental', 0.5987725257873535),
 ('congress', 0.5691197514533997),
 ('deliver', 0.5623358488082886),
 ('policy', 0.5582756996154785),
 ('immigration', 0.5485448241233826)]

In [10]:
model.save('models/word2vec_model.model')

### Load test data

In [11]:
with open('data/test.pkl', 'rb') as f:
    test = pickle.load(f)

### Prepare test data

In [12]:
'''
For a batter visualization
we're going to use 100 random headlines
'''
test = random.sample(test, 100)

In [13]:
tokens = [word_tokenize(x) for x in test]
clean_test_data = [clean_descriptions(t) for t in tokens]

In [14]:
words, vectors = [], []
for desc in clean_test_data: ## Iterate over descriptions in test data
    for word in desc: ## Iterate over words in descriptions
        try:
            vectors.append(model.wv.get_vector(word))
            words.append(word)
        except KeyError:
            print(f'Word {word} not found in vocab')

Word unedited not found in vocab
Word hashimoto not found in vocab
Word illusions not found in vocab
Word mantel not found in vocab
Word romanticizing not found in vocab
Word boone not found in vocab
Word drunkenly not found in vocab
Word durango not found in vocab
Word gustave not found in vocab
Word sats not found in vocab
Word patriarch not found in vocab
Word weaponizes not found in vocab


In [15]:
len(vectors)

883

In [16]:
np.savetxt('data/words.txt', words, fmt='%s')
np.save('data/vectors.npy', vectors)