# Bgt2Vec

Adapted from original code © Yuriy Guts, 2016.

## Imports

In [None]:
from __future__ import absolute_import, division, print_function

In [None]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [None]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
%pylab inline

**Set up logging**

In [None]:
# Emit log records at INFO level and above, formatted as
# "timestamp : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

**Download NLTK tokenizer models (only the first time)**

In [None]:
# Fetch the "punkt" resource through the NLTK downloader.
nltk.download("punkt")

## Prepare Corpus

In [None]:
# Switch the working directory to the folder holding the preprocessed data so
# that the relative file name used by read_csv resolves there.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
os.chdir(r"C:\Users\Sultan\Desktop\data\PreprocessedData")

In [None]:
# Load the combined dataset using pandas' Python parsing engine.
df = pd.read_csv('CombinedData.csv', engine='python')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Give all three columns explicit names, then preview the result.
df.columns = ['word','organization','year']
df.head()

In [None]:
# Concatenate every entry of the "word" column into one string,
# with a single space between consecutive entries.
corpus = ' '.join(df.word)

In [None]:
# Peek at the first 196 characters of the joined corpus.
corpus[:196]

In [None]:
# Switch to the directory where the plain-text corpus is stored; later cells
# rely on this working directory for their relative paths.
os.chdir(r"C:\Users\Sultan\Desktop\data\PreprocessedData\TextFiles")

# Write the corpus to a text file. Mode "w" (not "a") is used so that
# re-running this cell replaces the file instead of appending another copy of
# the corpus, and the `with` block closes the handle even if the write fails.
# NOTE(review): the file is later read back as UTF-8 while this write uses the
# platform default encoding -- confirm the corpus is plain ASCII, or pass an
# explicit encoding here.
with open("CombinedData.txt", "w") as text_data:
    text_data.write(corpus)

**Load files**

In [None]:
# Name of the corpus text file, resolved against the current working directory.
bgt_filename = "CombinedData.txt"

In [None]:
# Read the whole corpus file, decoded as UTF-8, into a single unicode string
# and report its length once the file has been closed.
corpus_raw = u""
print("Reading '{0}'...".format(bgt_filename))
with codecs.open(bgt_filename, mode="r", encoding="utf-8") as book_file:
    corpus_raw = corpus_raw + book_file.read()
print("Corpus is now {0} characters long".format(len(corpus_raw)))
print()

**Split the corpus into sentences**

In [None]:
# Load the English Punkt sentence tokenizer from the NLTK data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Split the raw corpus text into sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [None]:
def sentence_to_wordlist(raw):
    """Return the alphabetic tokens of *raw* as a list of strings.

    Every character other than an ASCII letter is treated as a separator, so
    digits and punctuation never appear in the result; case is preserved.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return letters_only.split()

In [None]:
# Tokenise every non-empty sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [None]:
# Total number of word tokens across all sentences.
# NOTE(review): the list (rather than a generator) is kept on purpose --
# `%pylab inline` earlier in the notebook presumably rebinds `sum` to numpy's
# version, which does not handle generators the same way; confirm before
# simplifying.
token_count = sum([len(sentence) for sentence in sentences])
print("corpus contains {0:,} tokens".format(token_count))

## Train Word2Vec

In [None]:
# Hyperparameters handed to the Word2Vec constructor in the next cell.

# Dimensionality of the resulting word vectors.
num_features = 300

# Minimum word count threshold (passed to Word2Vec as min_count).
min_word_count = 3

# Number of threads to run in parallel: one per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
downsampling = 1e-3

# Seed for the RNG.
# NOTE(review): gensim documents that a fully reproducible run additionally
# requires a single worker thread (and, on Python 3, a fixed PYTHONHASHSEED);
# with num_workers > 1 the seed alone presumably does not guarantee identical
# results -- confirm against the installed gensim version.
seed = 1

In [None]:
# Skip-gram (sg=1) Word2Vec model configured from the hyperparameters defined
# above. No sentences are passed here; the vocabulary is built and the model
# trained in the cells that follow.
bgt2vec = w2v.Word2Vec(
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    sg=1,
    workers=num_workers,
    seed=seed,
)

In [None]:
# Build the model vocabulary from the tokenised sentences.
bgt2vec.build_vocab(sentences)

In [None]:
# Report the number of entries in the model vocabulary.
print("Word2Vec vocabulary length:", len(bgt2vec.wv.vocab))

**Start training**

In [None]:
# Train for 50 epochs over the sentences; total_examples reuses the example
# count stored on the model itself (corpus_count).
bgt2vec.train(sentences, total_examples=bgt2vec.corpus_count, epochs=50)

**Save to file, can be useful later**

In [None]:
# Make sure the output directory exists. Creating first and inspecting the
# failure afterwards avoids the check-then-create race, and a non-directory
# already named "trained" is reported here rather than when the model is saved.
try:
    os.makedirs("trained")
except OSError:
    if not os.path.isdir("trained"):
        raise

In [None]:
# Persist the trained model to trained/bgt2vec.w2v (relative to the working directory).
bgt2vec.save(os.path.join("trained", "bgt2vec.w2v"))

## Explore the trained model.

In [None]:
# Reload the saved model into `bgt2vec`, the name every following cell uses,
# so exploration can start from the file without retraining.
bgt2vec = w2v.Word2Vec.load(os.path.join("trained", "bgt2vec.w2v"))
# Previous variable name, kept as an alias for backward compatibility.
thrones2vec = bgt2vec

### Compress the word vectors into 2D space and plot them

In [None]:
# t-SNE reducer targeting 2 output dimensions, with a fixed random_state.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [None]:
# Matrix holding the learned vector of every vocabulary word.
# NOTE(review): `syn0` looks like the legacy attribute name; newer gensim
# releases expose the same matrix as `wv.vectors` -- confirm against the
# installed version before upgrading.
all_word_vectors_matrix = bgt2vec.wv.syn0

**Train t-SNE**

In [None]:
# Fit t-SNE on the word-vector matrix and keep the resulting 2D embedding.
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

**Plot the big picture**

In [None]:
# One row per vocabulary word: the word itself plus its 2D t-SNE coordinates,
# looked up through the word's index in the vocabulary.
points = pd.DataFrame(
    [
        (
            word,
            all_word_vectors_matrix_2d[entry.index][0],
            all_word_vectors_matrix_2d[entry.index][1],
        )
        for word, entry in bgt2vec.wv.vocab.items()
    ],
    columns=["word", "x", "y"],
)

In [None]:
# Preview the first 10 rows of the coordinate table.
points.head(10)

In [None]:
# Apply seaborn's "poster" plotting context.
sns.set_context("poster")

In [None]:
# Scatter plot of every word's 2D position.
points.plot.scatter("x", "y", s=10, figsize=(20, 12))

**Zoom in to some interesting places**

In [None]:
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words whose 2D position lies inside a box.

    Reads the module-level ``points`` frame (columns ``word``, ``x``, ``y``).

    Args:
        x_bounds: ``(low, high)`` pair for the x axis; both ends inclusive.
        y_bounds: ``(low, high)`` pair for the y axis; both ends inclusive.
    """
    # Named ``region`` rather than ``slice`` so the builtin is not shadowed.
    region = points[
        (x_bounds[0] <= points.x) &
        (points.x <= x_bounds[1]) &
        (y_bounds[0] <= points.y) &
        (points.y <= y_bounds[1])
    ]

    ax = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Label each dot with its word, nudged slightly off the marker.
    for _, point in region.iterrows():
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

**Related words end up together**

In [None]:
# Zoom in on the box 5 <= x <= 10, -0.5 <= y <= -0.1.
plot_region(x_bounds=(5, 10), y_bounds=(-0.5, -0.1))

### Explore semantic similarities between words

**Words closest to the given word**

In [None]:
# Query through the model's keyed vectors (.wv), as the vocabulary lookups
# elsewhere in this notebook do; the model-level method is deprecated.
bgt2vec.wv.most_similar("guilford")

In [None]:
# Query through the model's keyed vectors (.wv), as the vocabulary lookups
# elsewhere in this notebook do; the model-level method is deprecated.
bgt2vec.wv.most_similar("budget")

**Linear relationships between word pairs**

In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the module-level ``bgt2vec`` model with ``end2`` and ``start1`` as
    positive terms and ``end1`` as the negative term, prints the resulting
    relation and returns the top-ranked word.

    Args:
        start1: First word of the known pair.
        end1: Second word of the known pair.
        end2: Second word of the pair to complete.

    Returns:
        The highest-ranked word returned by the cosmul similarity query.
    """
    # Go through ``.wv`` (as the vocabulary lookups in this notebook do)
    # instead of the deprecated model-level method.
    similarities = bgt2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1],
    )
    start2 = similarities[0][0]
    # Explicit keywords instead of ``**locals()`` so the message cannot pick
    # up unrelated local names.
    print(
        "{start1} is related to {end1}, as {start2} is related to {end2}".format(
            start1=start1, end1=end1, start2=start2, end2=end2
        )
    )
    return start2

In [None]:
# Ask: "guilford" is to "budget" as which word is to "documents"?
nearest_similarity_cosmul("guilford","budget","documents")