In [1]:
import numpy as np
import pickle
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA
from scipy.spatial import cKDTree
from sklearn.datasets import fetch_20newsgroups
from gensim.parsing.preprocessing import preprocess_documents

This notebook walks through how the data was gathered and preprocessed for this project. It can also serve as a guide for substituting other datasets or word-vector models in place of the ones used here.

## Fetch `20newsgroups` dataset

In [2]:
# Download (or load from the local cache) every split of the corpus, with
# headers, footers and quoted replies stripped from each post.
newsgroups = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    subset='all',
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
%%time

# Run gensim's `preprocess_documents` over every raw post; the result holds
# one entry per post, in the same order as `newsgroups.data`.
text_preprocessed = preprocess_documents(newsgroups.data)

CPU times: user 13.4 s, sys: 62.9 ms, total: 13.4 s
Wall time: 13.5 s


In [4]:
# Store each tokenized document as its own array inside a 1-D object array.
# The container is allocated explicitly because calling np.array() on a list
# of different-length arrays raises a ValueError on recent NumPy releases,
# and would silently produce a 2-D array if every document had equal length.
tokenized_docs = list(text_preprocessed)
text_preprocessed = np.empty(len(tokenized_docs), dtype=object)
for doc_position, doc_tokens in enumerate(tokenized_docs):
    # Integer-index assignment stores the per-document array as one element.
    text_preprocessed[doc_position] = np.array(doc_tokens)

In [5]:
def element_length(x):
    """Return the number of items in ``x``.

    Thin named wrapper around the built-in ``len`` so that it can be handed
    to helpers that expect a plain Python function.

    Parameters
    ----------
    x : any object supporting ``len()``

    Returns
    -------
    int
        ``len(x)``.
    """
    size = len(x)
    return size

In [6]:
# Element-wise version of `element_length`. Declaring `otypes` fixes the
# output dtype up front, so NumPy does not make an extra probing call on the
# first element and does not raise when given an empty array.
elv = np.vectorize(element_length, otypes=[int])

In [7]:
# Apply `element_length` to every entry of `text_preprocessed`, giving the
# length of each preprocessed document.
doc_lengths = elv(text_preprocessed)

In [8]:
# Positions of the documents that ended up with zero length after
# preprocessing; these are the entries removed in the following cells.
is_empty_doc = doc_lengths == 0
idx_del = np.nonzero(is_empty_doc)[0]

In [9]:
# Drop the zero-length documents identified by `idx_del`.
text_preprocessed = np.delete(text_preprocessed, idx_del)

In [10]:
# Wrap the raw posts in an object array so they can be filtered by index.
# dtype=object keeps each post as an ordinary Python str; without it NumPy
# builds a fixed-width unicode array in which every entry is padded to the
# length of the longest post, which wastes a very large amount of memory.
corpus_raw = np.array(newsgroups.data, dtype=object)

In [11]:
# Remove the same positions from the raw posts that were removed from
# `text_preprocessed`, so both collections stay index-aligned.
corpus_raw = np.delete(corpus_raw, idx_del)

In [12]:
# Persist the *filtered* raw posts. Re-assigning `corpus_raw` from
# `newsgroups.data` here would throw away the `idx_del` filtering applied in
# the previous cells and leave the raw pickle misaligned with the
# preprocessed one. The posts are converted back to a plain list of str so
# the pickled object has the same type as `newsgroups.data`.
corpus_raw = [str(post) for post in corpus_raw]

with open('hltm_welda/model/data/newsgroups_raw_data.pickle', 'wb') as f:
    pickle.dump(obj=corpus_raw, file=f)

In [13]:
# Serialize the filtered, preprocessed documents for use by the model code.
with open('hltm_welda/model/data/newsgroups_preprocessed_data.pickle', 'wb') as out_file:
    pickle.dump(text_preprocessed, out_file)

## Fetch Google's pretrained `word2vec` model (Google News vectors)

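Google's pretrained word2vec model can be found [here](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit). Note that the file loaded below, `GoogleNews-vectors-negative300.bin`, contains the vectors trained on the Google News corpus, not on Wikipedia. Decompress the downloaded file before loading it, and update the file path in the next cell to match where you saved it.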

In [3]:
# Load the pretrained vectors from the binary word2vec-format file.
# Adjust the path to wherever the decompressed file was saved.
w2v_300d = KeyedVectors.load_word2vec_format(
    '~/Downloads/GoogleNews-vectors-negative300.bin', binary=True
)

In [5]:
# Alphabetically sorted vocabulary; this order defines the row order of the
# vector matrix built in the following cells.
w2v_vocab_sorted = sorted(w2v_300d.vocab)

In [6]:
# Pre-allocate one row per vocabulary word. The column count is read from
# the loaded model rather than hard-coded, so a model with a different
# embedding size can be substituted without editing this cell.
w2v_300d_vocab_sorted_vects = np.zeros(
    shape=(len(w2v_vocab_sorted), w2v_300d.vector_size),
    dtype=np.float64,
)

In [8]:
%%time

# Copy each word's vector into the row matching its position in the sorted
# vocabulary. The KeyedVectors object is indexed directly: its `.wv`
# attribute is only a deprecated alias for the object itself in gensim 3.x
# and is removed in gensim 4.
for index, word in enumerate(w2v_vocab_sorted):
    w2v_300d_vocab_sorted_vects[index] = w2v_300d[word]

CPU times: user 23.4 s, sys: 8.11 s, total: 31.5 s
Wall time: 36.5 s


In [21]:
# Two-component PCA with a fixed seed so the projection is reproducible.
pca = PCA(n_components=2, random_state=42)

In [22]:
%%time

# Fit the PCA on the full vector matrix and project every word to 2-D;
# row i of the result corresponds to w2v_vocab_sorted[i].
w2v_pca = pca.fit_transform(w2v_300d_vocab_sorted_vects)

CPU times: user 1min 31s, sys: 1min 4s, total: 2min 35s
Wall time: 2min 25s


In [23]:
%%time

# Map each word to its 2-D PCA coordinates by pairing the sorted vocabulary
# with the rows of the projection, which share the same order.
w2v_pca_dict = dict(zip(w2v_vocab_sorted, w2v_pca))

CPU times: user 3.12 s, sys: 1.97 s, total: 5.1 s
Wall time: 6.1 s


In [24]:
from scipy.spatial import cKDTree

In [25]:
%%time

# Spatial index over the 2-D projections for nearest-neighbour lookups;
# point i in the tree is row i of `w2v_pca`.
tree = cKDTree(w2v_pca, leafsize=16, compact_nodes=True, balanced_tree=True)

CPU times: user 5.23 s, sys: 57.4 ms, total: 5.29 s
Wall time: 5.29 s


In [26]:
# Integer id -> word. Ids follow the *sorted* vocabulary because that is the
# row order of `w2v_pca` and therefore of the points stored in `tree`;
# enumerating the model's own vocab order instead would make the indices
# returned by tree queries resolve to the wrong words.
w2v_id2token = {
    index: word
    for index, word
    in enumerate(w2v_vocab_sorted)
}

In [27]:
# Inverse lookup of `w2v_id2token`: word -> integer id.
w2v_token2id = {token: token_id for token_id, token in w2v_id2token.items()}

## Save to `hl_welda_tmp` project

In [28]:
# Serialize the word -> 2-D coordinates mapping.
with open('hltm_welda/model/data/w2v_wikipedia_pca_dict.pickle', 'wb') as out_file:
    pickle.dump(w2v_pca_dict, out_file)

In [29]:
# Serialize the KD-tree built over the 2-D projections.
with open('hltm_welda/model/data/cKDTree_w2v_wikipedia_pca.pickle', 'wb') as out_file:
    pickle.dump(tree, out_file)

In [30]:
# Serialize the id -> word lookup.
with open('hltm_welda/model/data/w2v_wikipedia_id2token.pickle', 'wb') as out_file:
    pickle.dump(w2v_id2token, out_file)

In [31]:
# Serialize the word -> id lookup.
with open('hltm_welda/model/data/w2v_wikipedia_token2id.pickle', 'wb') as out_file:
    pickle.dump(w2v_token2id, out_file)