## Gensim Word2Vec example

from : https://rare-technologies.com/word2vec-tutorial/

In [43]:
import gensim
import os
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import string


In [60]:
class SentenceReader:
    
    def __init__(self, dir_name, file_names = None):
        self.dir_name = dir_name
        self.file_names = file_names
    
    def __iter__(self):
        if self.file_names:
            for file_name in self.file_names:
                with open("{}/{}".format(self.dir_name,file_name), "r") as file:
                    for line in file:
                        quote = line.split("+++$+++")[-1]
                        yield self.parse_line(quote)
        else:
            for file_name in os.listdir(self.dir_name):
                if "." != file_name[0]:
                    with open(os.path.join(self.dir_name, file_name)) as file:
                        for line in file:
                            quote = line.split("+++$+++")[-1]
                            yield self.parse_line(quote)
                            
    def parse_line(self, line):
        line = line.lower()
        words = line.split()
        clean = [self.parse_word(word) for word in words]
        return clean
    
    @staticmethod
    def parse_word(word):
        return "".join(ch for ch in word if ch in string.ascii_lowercase)
                        

In [61]:
reader = SentenceReader("cornell movie-dialogs corpus", ["utf-8_movie_lines.txt"])
# corpus at: http://www.mpi-sws.org/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [62]:
model = gensim.models.Word2Vec(iter = 1)

In [63]:
model.build_vocab(reader)

In [64]:
model.train(reader)

2291060

In [65]:
model["god"]

array([ 0.1635423 , -0.58971685,  0.29543209, -0.17094728, -0.42626172,
       -0.57157815, -0.22132151, -0.05173911,  0.32470861,  0.05904179,
        0.64839745,  0.31419957, -0.77372396, -0.5285005 , -0.39857912,
        0.20024008,  0.44225854,  0.50174326, -0.5831908 , -0.2788851 ,
       -0.08613026, -0.42187759, -0.02481276, -0.43225208, -0.33815849,
        0.72863638,  0.59567386, -0.61421734, -0.22237566,  0.64171827,
        1.48421168, -0.37601259,  0.83808547, -0.20386536,  0.03576282,
       -0.29986972,  0.16733167,  0.47755608, -0.43766591,  0.6023891 ,
        0.35160473, -0.42721191,  0.14636001, -0.22811335, -0.41458517,
       -0.72565502, -0.39204457,  0.50356185,  0.42858922,  0.11908549,
       -0.52616674,  0.15215506,  0.37406671,  0.72888631,  0.78472912,
       -0.37038398,  0.54259282,  0.39926222, -0.3164078 ,  0.44766462,
        0.66929013, -0.52785265, -0.68453068,  1.63732898,  0.27019677,
        0.49352893, -0.51375264, -0.02014878,  0.39105985,  0.08

In [66]:
Vectors = np.asarray([model[key] for key in model.vocab.keys()])

In [67]:
i = 0
for key in model.vocab.keys():
    print(key)
    i += 1
    if i == 5:
        break


recovered
stimulating
info
miserable


In [68]:
tsne_rep = TSNE()

In [69]:
embedding = tsne_rep.fit_transform(Vectors[0: 100])

In [70]:
embedding_x = [v[0] for v in embedding]
embedding_y = [v[1] for v in embedding]
embedding_labels = [key for key in model.vocab.keys()]

In [73]:
plt.clf()
plt.xlim(min(embedding_x) , max(embedding_x))
plt.ylim(min(embedding_y) , max(embedding_y))

plt.scatter(embedding_x, embedding_y)
for i, tag in enumerate(embedding_labels[0: 100]):
    plt.annotate(tag, (embedding_x[i], embedding_y[i]), size = "xx-small")

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 25
fig_size[1] = 10
plt.rcParams["figure.figsize"] = fig_size
plt.margins(0.01,0)


plt.savefig("movie quotes word embedding", dpi = 1000)