In [44]:
import numpy as np
import pandas as pd


Gensim is an open-source Python library for topic modeling, document indexing, and similarity retrieval with large corpora. It is designed to be fast and efficient, and it is commonly used in a variety of natural language processing (NLP) tasks.

In [45]:
!pip install gensim



In [46]:
import gensim
import os

In [47]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [48]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

story =[]
for filename in os.listdir('data'):  # fetching the files from the data folder
    f = open(os.path.join('data',filename), encoding='utf-8',errors='ignore') # opening the individual file
    corpus = f.read()   # reading those individual files
    raw_sent = sent_tokenize(corpus)  #converting the whole corpus into  sentences
    for sent  in raw_sent:
      story.append(simple_preprocess(sent)) # all those basic preprocessing is done by this function


In [49]:
len(story)

107386

In [50]:
story[0]

['george',
 'martin',
 'dance',
 'with',
 'dragons',
 'book',
 'five',
 'of',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'dedication',
 'this',
 'one',
 'is',
 'for',
 'my',
 'fans',
 'for',
 'lodey',
 'trebla',
 'stego',
 'pod',
 'caress',
 'yags',
 'ray',
 'and',
 'mr',
 'kate',
 'chataya',
 'mormont',
 'mich',
 'jamie',
 'vanessa',
 'ro',
 'for',
 'stubby',
 'louise',
 'agravaine',
 'wert',
 'malt',
 'jo',
 'mouse',
 'telisiane',
 'blackfyre',
 'bronn',
 'stone',
 'coyotes',
 'daughter',
 'and',
 'the',
 'rest',
 'of',
 'the',
 'madmen',
 'and',
 'wild',
 'women',
 'of',
 'the',
 'brotherhood',
 'without',
 'banners',
 'for',
 'my',
 'website',
 'wizards',
 'elio',
 'and',
 'linda',
 'lords',
 'of',
 'westeros',
 'winter',
 'and',
 'fabio',
 'of',
 'wic',
 'and',
 'gibbs',
 'of',
 'dragonstone',
 'who',
 'started',
 'it',
 'all',
 'for',
 'men',
 'and',
 'women',
 'of',
 'asshai',
 'in',
 'spain',
 'who',
 'sang',
 'to',
 'us',
 'of',
 'bear',
 'and',
 'maiden',
 'fair',
 'and',
 'th

In [51]:
# model building
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,# accept those sentences which has minimum of 2 words
)

In [52]:
# vocabulary building
model.build_vocab(story)

In [53]:
# training my deep learning model
model.train(story,total_examples=model.corpus_count,epochs=model.epochs)

# total_examples means total number of sentence in corpus
# epochs=model.epochs has default value of 5

(4360914, 5745650)

now model has been trained

In [54]:
model.wv.most_similar('daenerys')

[('stormborn', 0.861239492893219),
 ('targaryen', 0.8134762048721313),
 ('unburnt', 0.8126688599586487),
 ('arryn', 0.8108766078948975),
 ('murdered', 0.7996143102645874),
 ('agreed', 0.7913310527801514),
 ('ii', 0.78371661901474),
 ('unworthy', 0.7758724093437195),
 ('widow', 0.7674218416213989),
 ('answered', 0.7649771571159363)]

In [55]:
model.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran'])



'robb'

In [56]:
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])

'bronn'

In [57]:
model.wv['king']

array([ 1.0132014 , -1.1051873 ,  2.227161  , -0.30017388, -1.5077984 ,
        1.5185739 ,  1.5378664 , -0.29535833, -0.22952658, -0.582897  ,
        0.6497133 , -0.3040155 ,  0.1155894 , -2.0190086 ,  0.84173167,
       -2.3454762 , -1.292824  , -0.4546378 , -0.60126466, -1.3703802 ,
       -0.7476467 ,  0.6958061 , -0.26881745,  0.53680634, -2.5885623 ,
       -1.9794685 , -1.5206012 , -0.11036785,  0.5368849 , -0.23803054,
       -0.9732337 , -1.9746754 ,  2.6130133 , -2.9629896 , -0.57524735,
       -2.0843015 ,  2.0241163 , -0.06795266, -0.6172719 ,  1.7504416 ,
        1.0925435 , -1.851664  ,  0.31837258,  3.27329   , -0.05807016,
       -1.34654   ,  1.0112865 , -2.3099744 ,  1.0120132 ,  2.761344  ,
        0.2775448 , -1.3822366 , -0.696477  ,  0.84353805,  0.69697684,
       -0.8331827 , -1.120654  ,  1.6400014 , -2.930317  , -0.31162918,
       -0.2856759 ,  0.8114016 ,  0.1548145 , -0.3891606 , -0.03937927,
       -1.803093  ,  2.0159674 ,  0.6425195 , -0.69575274, -1.14

In [58]:
model.wv.similarity('arya','sansa')

0.85330397

In [59]:
model.wv.similarity('cersei','sansa')

0.79205096

In [60]:
model.wv.similarity('tywin','sansa')

0.22612962

In [61]:
model.wv.get_normed_vectors()

array([[ 0.08132631,  0.06877049,  0.1024591 , ..., -0.12582901,
         0.03584458,  0.10531598],
       [-0.13205199,  0.10975498,  0.02926873, ..., -0.04413924,
         0.00130307,  0.07883435],
       [-0.07475775, -0.10678311, -0.11102105, ...,  0.16769278,
         0.25722733, -0.28938177],
       ...,
       [ 0.02761368, -0.2512104 ,  0.00636702, ...,  0.20590442,
         0.05694348, -0.05542291],
       [-0.10349599,  0.19588105, -0.06584194, ...,  0.0378728 ,
         0.06045563,  0.10864859],
       [-0.05696902,  0.03368986,  0.06079393, ..., -0.13754886,
         0.02386096,  0.03352608]], dtype=float32)

In [62]:
model.wv.get_normed_vectors()

array([[ 0.08132631,  0.06877049,  0.1024591 , ..., -0.12582901,
         0.03584458,  0.10531598],
       [-0.13205199,  0.10975498,  0.02926873, ..., -0.04413924,
         0.00130307,  0.07883435],
       [-0.07475775, -0.10678311, -0.11102105, ...,  0.16769278,
         0.25722733, -0.28938177],
       ...,
       [ 0.02761368, -0.2512104 ,  0.00636702, ...,  0.20590442,
         0.05694348, -0.05542291],
       [-0.10349599,  0.19588105, -0.06584194, ...,  0.0378728 ,
         0.06045563,  0.10864859],
       [-0.05696902,  0.03368986,  0.06079393, ..., -0.13754886,
         0.02386096,  0.03352608]], dtype=float32)

In [63]:
y = model.wv.index_to_key

In [64]:
len(y)

15094

In [65]:
#Principal component analysis (PCA) is an unsupervised machine learning technique that is used for dimensionality reduction.
from sklearn.decomposition import PCA

In [66]:
pca = PCA(n_components=3)

In [67]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [69]:
X.shape # here we reduced the dimension from 100 to 3

(15094, 3)

In [70]:
import plotly.express as px
fig = px.scatter_3d(X[:500],x=0,y=1,z=2, color=y[:500])
fig.show()