In [33]:
import numpy as np
import pandas as pd
import gensim
import os
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

In [37]:
# Build the training corpus: one list of tokens per sentence, gathered from
# every file found in the Dataset directory.
story = []

# Sort the listing so the corpus order does not depend on the filesystem.
for filename in sorted(os.listdir('Dataset')):
    path = os.path.join("Dataset", filename)
    if not os.path.isfile(path):
        # Skip sub-directories, which open() cannot read.
        continue
    # The context manager closes the file handle even if reading fails.
    with open(path) as file:
        corpus = file.read()
    for sentence in sent_tokenize(corpus):
        # simple_preprocess turns the sentence into a list of lowercase tokens.
        # Stop words are kept: the stop-word removal step is intentionally disabled.
        story.append(simple_preprocess(sentence))

In [8]:
# Report how many tokenized sentences were collected
sentence_count = len(story)
print(sentence_count)

145020


In [10]:
# Create the Word2Vec model; no corpus is passed here, so the vocabulary is
# built and the model is trained in separate steps.
model = gensim.models.Word2Vec(
    vector_size=100,  # dimensionality of each word vector
    window=10,        # maximum distance between the current and predicted word within a sentence
    min_count=2,      # ignore all words with total frequency lower than this
    workers=4,        # worker threads used for training (faster on multicore machines)
)

In [11]:
# Build the model's vocabulary from the tokenized sentences
model.build_vocab(corpus_iterable=story)

In [12]:
# Train on the corpus, reusing the sentence count and epoch setting stored on the model
model.train(
    corpus_iterable=story,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(6569022, 8628190)

In [13]:
# Words whose vectors are most similar to "daenerys", with their similarity scores
# (topn=10 is the library default, spelled out here)
model.wv.most_similar("daenerys", topn=10)

[('stormborn', 0.7923920750617981),
 ('targaryen', 0.7917531132698059),
 ('princess', 0.7315335273742676),
 ('elia', 0.7229293584823608),
 ('unburnt', 0.7199075222015381),
 ('rhaegar', 0.6851088404655457),
 ('aegon', 0.6839161515235901),
 ('myrcella', 0.6771818995475769),
 ('queen', 0.675822377204895),
 ('jenny', 0.6747910976409912)]

In [17]:
# "jon" is expected to be the odd one out: the remaining names are the Stark siblings.
# The name is spelled "rickon" (not "rikon"); doesnt_match ignores words that are
# missing from the vocabulary, so a misspelled name silently drops out of the comparison.
model.wv.doesnt_match(["jon", "rickon", "robb", "arya", "sansa", "bran"])

'jon'

In [18]:
# "bronn" is expected to be the odd one out among these names
model.wv.doesnt_match([
    "cersei",
    "jaime",
    "bronn",
    "tyrion",
])

'bronn'

In [16]:
# Look up the embedding vector learned for "jon"
model.wv.get_vector("jon")

array([-0.444146  , -0.7432829 , -1.2598553 , -0.92181116,  0.10688794,
       -0.60944104,  0.50321895,  0.48778677, -2.1468153 , -0.3138522 ,
       -1.518409  ,  0.4818032 ,  0.43263495, -2.3630474 ,  0.01018729,
       -0.32954967,  0.5164952 ,  3.47644   ,  1.0148877 , -0.5321463 ,
        2.1642606 , -2.5362906 , -0.8403213 ,  1.9357666 , -0.22616406,
       -1.56578   ,  2.0944457 ,  0.7115423 ,  0.72849387, -0.8910567 ,
       -2.1558921 , -1.0010891 , -0.0887501 , -1.7853795 ,  1.7669979 ,
        0.6649412 ,  1.0149761 ,  0.00538142,  0.46163526, -1.3363612 ,
       -1.0228379 , -1.7552032 ,  0.28748938,  1.0717545 ,  1.0243027 ,
       -3.2778132 , -0.01607786,  1.3288836 , -1.6610186 , -0.03832102,
        0.03196004, -0.4541148 ,  2.0480604 , -0.5190655 , -0.61130357,
       -0.7441726 , -0.72403413, -1.9356238 ,  0.13987395, -0.19436942,
        1.5313241 ,  0.95564854,  0.52567005, -1.6767005 ,  0.2504308 ,
       -0.19599454,  0.71388507,  3.2163851 ,  0.24078506, -0.20

In [19]:
# Arya and Sansa are sisters, so a high similarity is expected
model.wv.similarity(w1="arya", w2="sansa")

0.853336

In [20]:
# Tywin and Sansa are quite different characters, so a low similarity is expected
model.wv.similarity(w1="tywin", w2="sansa")

0.24163988

#### To visualize the word vectors, we can use PCA to reduce their dimensions


In [21]:
# get_normed_vectors returns all word vectors normalized to unit length
normed_vectors = model.wv.get_normed_vectors()
normed_vectors

array([[-0.10899297, -0.02549146,  0.11918868, ..., -0.02971656,
         0.07672407,  0.15471824],
       [-0.12935168, -0.15842023,  0.16181141, ...,  0.04149918,
        -0.21425322,  0.04247783],
       [ 0.12881252,  0.00394801,  0.00448241, ...,  0.0533753 ,
        -0.02457772, -0.03714726],
       ...,
       [ 0.05603361,  0.02748746, -0.09801928, ..., -0.15355676,
         0.06986222, -0.1886574 ],
       [ 0.00258152,  0.01536783,  0.18698503, ...,  0.06992401,
         0.1460495 , -0.03570397],
       [-0.06998096,  0.03929054,  0.09199707, ..., -0.07180195,
         0.1109774 , -0.04749938]], dtype=float32)

In [22]:
# Shape of the normalized vector matrix
np.shape(model.wv.get_normed_vectors())

(17453, 100)

In [25]:
# index_to_key gives the word stored at each vocabulary index
y = model.wv.index_to_key
vocab_size = len(y)
print(vocab_size)

17453


In [26]:
from sklearn.decomposition import PCA

# Reduce the word vectors to 3 principal components.
# random_state is fixed so that, if scikit-learn selects its randomized SVD
# solver, the projection is the same on every run.
pca = PCA(n_components=3, random_state=42)

In [27]:
# Fit PCA on the unit-length word vectors and keep their projection
unit_vectors = model.wv.get_normed_vectors()
X = pca.fit_transform(unit_vectors)

In [29]:
# Shape of the PCA-projected matrix
np.shape(X)

(17453, 3)

In [28]:
# Peek at the first five projected rows
X[0:5]

array([[-0.15509878,  0.59484065, -0.06524827],
       [-0.16353396,  0.34728566,  0.00340085],
       [ 0.30946556,  0.5712521 ,  0.21582179],
       [-0.00132891,  0.36319765, -0.14365149],
       [ 0.12089884,  0.55755407,  0.2929132 ]], dtype=float32)

In [32]:
import plotly.express as px

# Number of words to plot. X and y are sliced with the same value so that each
# plotted point is coloured by the word at the same index. Larger slices
# (e.g. 500) require more computation to render.
N_WORDS = 50

fig = px.scatter_3d(
    X[:N_WORDS], x=0, y=1, z=2,
    color=y[:N_WORDS],
    title=f"PCA projection of the first {N_WORDS} vocabulary words",
)
fig.show()