## Gensim Word2Vec example

from : https://rare-technologies.com/word2vec-tutorial/

In [1]:
import gensim
import os
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import string


In [2]:
class SentenceReader:
    
    def __init__(self, dir_name, file_names = None):
        self.dir_name = dir_name
        self.file_names = file_names
    
    def __iter__(self):
        if self.file_names:
            for file_name in self.file_names:
                with open("{}/{}".format(self.dir_name,file_name), "r") as file:
                    for line in file:
                        quote = line.split("+++$+++")[-1]
                        yield self.parse_line(quote)
        else:
            for file_name in os.listdir(self.dir_name):
                if "." != file_name[0]:
                    with open(os.path.join(self.dir_name, file_name)) as file:
                        for line in file:
                            quote = line.split("+++$+++")[-1]
                            yield self.parse_line(quote)
                            
    def parse_line(self, line):
        line = line.lower()
        words = line.split()
        clean = [self.parse_word(word) for word in words]
        return clean
    
    @staticmethod
    def parse_word(word):
        return "".join(ch for ch in word if ch in string.ascii_lowercase)
                        

In [3]:
reader = SentenceReader("cornell movie-dialogs corpus", ["utf-8_movie_lines.txt"])
# corpus at: http://www.mpi-sws.org/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [4]:
model = gensim.models.Word2Vec(iter = 5)

In [5]:
model.build_vocab(reader)

In [6]:
model.train(reader)

11451210

In [7]:
model["god"]

array([-0.17159574, -1.38211524, -0.65839279,  0.66371918,  0.25008303,
       -0.87137908,  0.46099725, -0.40143147,  0.42487606,  0.67892998,
       -0.47953248, -0.57823902,  0.89605874,  1.28453493, -2.19975948,
        1.50941861, -0.73519313,  1.4789629 , -0.0514161 ,  0.17376542,
       -0.77308714,  1.57945395, -0.62382013,  0.45165995,  0.27362987,
        0.20029201,  0.10952923,  0.33495393, -1.5909481 , -0.74223483,
       -0.21992083,  0.66005391, -0.15553699,  1.2604543 , -1.928038  ,
        2.13802052, -0.12578124,  0.64992714,  0.70353693,  0.26033747,
        0.93664891, -0.77797729, -1.19883513, -0.873191  ,  0.57891065,
       -1.00570858, -0.21571285, -0.19170851,  0.97861749, -0.38185877,
       -1.48081958, -0.86015868,  0.75768232,  0.77532715,  0.24341521,
        0.2304181 ,  3.11559248,  0.24336527, -0.5964914 ,  0.1098717 ,
        0.60708249, -0.58356047, -0.94773459,  0.81611025,  0.73523206,
       -1.11317515, -0.07464366, -0.10873374, -0.54244339,  0.08

In [8]:
Vectors = np.asarray([model[key] for key in model.vocab.keys()])

In [9]:
i = 0
for key in model.vocab.keys():
    print(key)
    i += 1
    if i == 5:
        break


viceroy
ally
cycle
foremost


In [10]:
tsne_rep = TSNE()

In [11]:
words_qty = 10000

In [12]:
embedding = tsne_rep.fit_transform(Vectors[0: words_qty])

In [13]:
embedding_x = [v[0] for v in embedding]
embedding_y = [v[1] for v in embedding]
embedding_labels = [key for key in model.vocab.keys()]

In [14]:
plt.clf()
plt.xlim(min(embedding_x) , max(embedding_x))
plt.ylim(min(embedding_y) , max(embedding_y))

plt.scatter(embedding_x, embedding_y, marker = "")
for i, tag in enumerate(embedding_labels[0: words_qty]):
    plt.annotate(tag, (embedding_x[i], embedding_y[i]), size = 0.1)

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 250
fig_size[1] = 100
plt.rcParams["figure.figsize"] = fig_size
plt.margins(0.01,0)


plt.savefig("movie quotes word embedding 10,000", dpi = 1000)

In [15]:
plt.clf()
plt.xlim(min(embedding_x) , max(embedding_x))
plt.ylim(min(embedding_y) , max(embedding_y))

plt.scatter(embedding_x, embedding_y, marker = "")
for i, tag in enumerate(embedding_labels[0: 5000]):
    plt.annotate(tag, (embedding_x[i], embedding_y[i]), size = 0.1)

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 250
fig_size[1] = 100
plt.rcParams["figure.figsize"] = fig_size
plt.margins(0.01,0)


plt.savefig("movie quotes word embedding 5,000", dpi = 1000)

In [16]:
plt.clf()
plt.xlim(min(embedding_x) , max(embedding_x))
plt.ylim(min(embedding_y) , max(embedding_y))

plt.scatter(embedding_x, embedding_y, marker = "")
for i, tag in enumerate(embedding_labels[0: 1000]):
    plt.annotate(tag, (embedding_x[i], embedding_y[i]), size = 0.1)

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 250
fig_size[1] = 100
plt.rcParams["figure.figsize"] = fig_size
plt.margins(0.01,0)


plt.savefig("movie quotes word embedding 1,000", dpi = 1000)