## Gensim Word2Vec example

from : https://rare-technologies.com/word2vec-tutorial/

In [1]:
import gensim
import os
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import string


In [2]:
class SentenceReader:
    
    def __init__(self, dir_name, file_names = None):
        self.dir_name = dir_name
        self.file_names = file_names
    
    def __iter__(self):
        if self.file_names:
            for file_name in self.file_names:
                print(file_name)
                with open("{}/{}".format(self.dir_name,file_name), "r") as file:
                    i = 0
                    for line in file:
                        if i % 1000000 == 0:
                            print(".", end = "")
                        if i == 5000000:
                            raise StopIteration
                        yield self.parse_line(line)
                        i += 1
                    print("\n")
        else:
            for file_name in os.listdir(self.dir_name):
                if "." != file_name[0]:
                    with open(os.path.join(self.dir_name, file_name)) as file:
                        for line in file:
                            yield self.parse_line(line)
                            
    def parse_line(self, line):
        line = line.lower()
        words = line.split()
        clean = [self.parse_word(word) for word in words]
        return clean
    
    @staticmethod
    def parse_word(word):
        return "".join(ch for ch in word if ch in string.ascii_lowercase)
                        

In [3]:
reader1 = SentenceReader("Sentences", ["utf-8_movie_quotes.txt", "utf-8_supreme_court_quotes.txt", "utf-8_congress1.txt"])
# corpus at: http://www.mpi-sws.org/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [4]:
model = gensim.models.Word2Vec(iter = 2, min_count = 25)

In [5]:
model.build_vocab(reader1)

utf-8_movie_quotes.txt
.

utf-8_supreme_court_quotes.txt
.

utf-8_congress1.txt
......

In [6]:
model.train(reader1)

utf-8_movie_quotes.txt
.

utf-8_supreme_court_quotes.txt
.

utf-8_congress1.txt
......utf-8_movie_quotes.txt
.

utf-8_supreme_court_quotes.txt
.

utf-8_congress1.txt
......

172918209

In [7]:
model["god"]

array([ 3.45946646,  1.74204123,  2.08415318,  3.52995658, -2.45585394,
        1.5413512 , -1.34915471, -0.32728785,  1.30443788, -1.84741306,
       -1.40759993, -0.62682223, -0.51812202, -2.85968113,  3.78868079,
       -0.24303038,  0.42511192,  0.62268603, -2.61488843, -2.01616597,
        3.9499526 ,  1.76219869, -2.41688704, -0.21294375,  0.58989346,
        2.73736143, -0.12912902, -6.74059296,  1.19471288,  3.59105897,
        2.61028314, -2.77319026, -0.85091722,  1.19799924,  2.93369222,
        1.40380621,  0.93731457, -0.54000568,  0.50826752,  1.8099215 ,
       -4.21683359, -1.70523465, -8.06800652, -1.85582972,  2.07072902,
        0.83078301,  1.45601261, -1.38770604,  2.18412066, -0.46504655,
       -2.65539408,  2.17604017,  1.3139751 , -2.26392579,  0.68120408,
        1.85080278,  1.44322467,  0.7653352 ,  0.75531322,  1.7513535 ,
       -1.18536985,  7.03028584, -0.25995138,  2.64513445, -5.00492716,
       -2.22460008,  0.18032372,  0.08367028,  0.64354825, -1.09

In [14]:
keys = model.vocab.keys()
Vectors = np.asarray([model[key] for key in keys])

In [15]:
i = 0
for key in keys:
    print(key)
    i += 1
    if i == 5:
        break


backdrop
periods
dil
delegates


In [16]:
tsne_rep = TSNE()

In [17]:
words_qty = 10000

In [18]:
embedding = tsne_rep.fit_transform(Vectors)

In [19]:
embedding_x = [v[0] for v in embedding]
embedding_y = [v[1] for v in embedding]

In [24]:
plt.clf()
plt.xlim(min(embedding_x) , max(embedding_x))
plt.ylim(min(embedding_y) , max(embedding_y))

plt.scatter(embedding_x, embedding_y, marker = "")
for i, tag in enumerate(list(keys)[0: words_qty]):
    plt.annotate(tag, (embedding_x[i], embedding_y[i]), size = 0.1)

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 250
fig_size[1] = 100
plt.rcParams["figure.figsize"] = fig_size
plt.margins(0.01,0)


plt.savefig("3 datasets word embedding 10,000", dpi = 1000)

In [25]:
plt.clf()
plt.xlim(min(embedding_x) , max(embedding_x))
plt.ylim(min(embedding_y) , max(embedding_y))

plt.scatter(embedding_x, embedding_y, marker = "")
for i, tag in enumerate(list(keys)[0: 5000]):
    plt.annotate(tag, (embedding_x[i], embedding_y[i]), size = 0.1)

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 250
fig_size[1] = 100
plt.rcParams["figure.figsize"] = fig_size
plt.margins(0.01,0)


plt.savefig("3 datasets word embedding 5,000", dpi = 1000)

In [26]:
plt.clf()
plt.xlim(min(embedding_x) , max(embedding_x))
plt.ylim(min(embedding_y) , max(embedding_y))

plt.scatter(embedding_x, embedding_y, marker = "")
for i, tag in enumerate(list(keys)[0: 1000]):
    plt.annotate(tag, (embedding_x[i], embedding_y[i]), size = 0.1)

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 250
fig_size[1] = 100
plt.rcParams["figure.figsize"] = fig_size
plt.margins(0.01,0)


plt.savefig("3 datasets word embedding 1,000", dpi = 1000)

In [27]:
plt.clf()
plt.xlim(min(embedding_x) , max(embedding_x))
plt.ylim(min(embedding_y) , max(embedding_y))

plt.scatter(embedding_x, embedding_y, marker = "")
for i, tag in enumerate(list(keys)[0: 2500]):
    plt.annotate(tag, (embedding_x[i], embedding_y[i]), size = 0.1)

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 250
fig_size[1] = 100
plt.rcParams["figure.figsize"] = fig_size
plt.margins(0.01,0)


plt.savefig("3 datasets word embedding 2,500", dpi = 1000)

In [28]:
"queen" in model.vocab.keys()

True

In [29]:
search = ["man", "woman", "king", "queen"]
indices = list(map(list(keys).index, search))
indices

[10486, 6621, 6800, 11945]

In [None]:
plt.clf()
for i, j in enumerate(indices):
    print(i, j)
    plt.annotate(search[i], (embedding_x[j], embedding_y[j]), size = 0.1)
plt.savefig("man-woman vector", dpi = 1000)

In [30]:
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('silence', 0.5810280442237854)]