In [11]:
import multiprocessing
import os
import re
import nltk
import gensim.models.word2vec as w2v
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [12]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\0x6f736f646f\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\0x6f736f646f\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
with open("../Data/Combined.txt", encoding="utf-8") as f:
    data = f.read()

In [16]:
tokenized_sent = nltk.tokenize.word_tokenize(language="english", text=data)

In [17]:
def sentence_to_word(corpus):
    clean = re.sub("[^a-zA-Z]"," ", corpus)
    words = clean.split()
    words = [word.lower() for word in words]
    return words

In [18]:
sentences = []
for tokenized_s in tokenized_sent:
    if len(tokenized_s) > 0:
        sentences.append(sentence_to_word(tokenized_s))

In [19]:
token_count = sum([len(sentence) for sentence in sentences])
print("The corpus contains {0:,} tokens".format(token_count))

The corpus contains 2,830,704 tokens


In [20]:
num_features = 10000
min_word_count = 4
num_workers = multiprocessing.cpu_count()
context_size = 8
downsampling = 1e-3
seed = 42

In [21]:
model = w2v.Word2Vec(sg=2, seed=seed, workers=num_workers, size=num_features, min_count=min_word_count, window=context_size, sample=downsampling)

  "C extension not loaded, training will be slow. "


In [22]:
model.build_vocab(sentences)

In [23]:
print("len of vocab is %d" %(len(model.wv.vocab)))

len of vocab is 33568


In [24]:
model.train(sentences,total_examples=model.corpus_count, epochs=model.iter)

  """Entry point for launching an IPython kernel.


(10425565, 14153520)

In [25]:
model.save(os.path.join("../model", "model.w2v"))

In [None]:
model = w2v.Word2Vec.load(os.path.join("../model","model.w2v"))

In [None]:
pca = PCA(n_components=2)
all_word_vectors = model.wv.syn0
all_word_vectors_2d = pca.fit_transform(all_word_vectors)

In [None]:
sns.set_context("poster")
data.plot.scatter("x","y",s=10,figsize=(20,12))

In [None]:
def plot_region(x_bounds,y_bounds):
    slice = data[
        (x_bounds[0] <= data.x) &
        (data.x <= x_bounds[1]) & 
        (y_bounds[0] <= data.y) &
        (data.y <= y_bounds[1])
    ]
#     print slice
    
    ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
    for i, point in slice.iterrows():
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

In [None]:
plot_region((0,0.05),(0,0.2))

In [None]:
model.most_similar("hili")

In [None]:
model.most_similar("taifa")

In [None]:
model.most_similar("lugha")