In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import gensim
import nltk
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import itertools


In [3]:
# Download the dataset from Kaggle and extract the .txt files into /content/data:
# https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books?resource=download

In [4]:
# List the entries of the data directory to check that the downloaded files are there
os.listdir("/content/data")

['001ssb.txt', '004ssb.txt', '005ssb.txt', '002ssb.txt', '003ssb.txt']

In [None]:
# sent_tokenize only needs the Punkt sentence tokenizer, so fetch just that
# instead of the entire NLTK collection that "all" pulls in.
# Newer NLTK releases load the "punkt_tab" variant while older ones load
# "punkt", so request both to work with either.
for resource in ("punkt", "punkt_tab"):
  nltk.download(resource)

In [7]:
# Read every book, split it into sentences, and tokenize each sentence.
# `ls` ends up with one entry per file; each entry is that file's list of
# token lists (one token list per sentence).
data_dir = "/content/data/"
ls = []
for file in os.listdir(data_dir):
  print(file)
  # Use a context manager so the file handle is closed as soon as the text
  # has been read, even if read() raises.
  # NOTE(review): "unicode_escape" also interprets backslash sequences in the
  # text; confirm this is the intended decoding for these files.
  with open(os.path.join(data_dir, file), encoding="unicode_escape") as f:
    corpus = f.read()
  sentences = sent_tokenize(corpus)
  # simple_preprocess converts one sentence into a list of tokens
  processed_sent = [simple_preprocess(sentence) for sentence in sentences]
  ls.append(processed_sent)

001ssb.txt
004ssb.txt
005ssb.txt
002ssb.txt
003ssb.txt


In [9]:
# Merge the per-book sentence lists into one flat list of tokenized sentences
complete_story = list(itertools.chain.from_iterable(ls))

In [10]:
# Number of tokenized sentences across all books
len(complete_story)

145020

In [12]:
# Set up the Word2Vec model without a corpus; vocabulary building and
# training happen in the following cells.
model = gensim.models.Word2Vec(
  vector_size=150,
  min_count=5,
  window=10,
)


In [15]:
# Build the model's vocabulary from the tokenized sentences
model.build_vocab(corpus_iterable=complete_story)

In [16]:
# Number of sentences the model counted while building its vocabulary
model.corpus_count

145020

In [17]:
# Total number of word tokens in the corpus (not the vocabulary size)
model.corpus_total_words

1725638

In [18]:
# Train the model on the full set of tokenized sentences
model.train(
  corpus_iterable=complete_story,
  total_examples=model.corpus_count,
  epochs=5,
)

(6481726, 8628190)

In [20]:
# Look up the embedding of a vocabulary word, e.g. "king"
model.wv.get_vector("king")

array([ 1.3106287e+00,  3.1381755e+00, -5.7397455e-01,  7.1758795e-01,
       -7.5017130e-01, -1.9451336e+00, -3.8772634e-01, -1.3240814e+00,
       -1.2066334e-01,  6.5825492e-01, -2.8941852e-01, -7.3162347e-02,
       -4.0683004e-01, -1.2361552e+00, -3.7670376e+00,  1.6607195e+00,
        1.5863153e+00,  2.5746384e+00, -1.9089932e+00,  1.2175533e+00,
       -3.1809521e+00,  2.4958861e+00, -8.8660091e-01, -1.1168407e+00,
        1.2353822e+00,  1.0060754e+00,  8.8354313e-01, -3.3197992e+00,
       -7.8650081e-01, -2.0317009e-01,  1.6143212e+00,  6.4539796e-01,
       -1.9752520e-01, -1.6645670e+00,  8.2082343e-01, -4.2723730e-01,
        7.6190287e-01, -5.2721941e-01,  2.6124749e+00,  1.2489467e+00,
        2.1795642e+00,  1.4089648e-01, -6.5693915e-01,  5.5869341e-01,
       -9.7269975e-02,  3.1149057e-01,  6.8966663e-01,  9.5656615e-01,
        6.5776318e-01, -2.4244046e+00, -2.7918520e+00, -9.2992586e-01,
        2.5865774e+00, -9.4470076e-02, -4.2614323e-01, -2.8686243e-01,
      

In [21]:
# Number of components in a single word vector
model.wv["king"].shape[0]

150

In [23]:
# Looking up a word that is not in the vocabulary, e.g. "anjana", raises KeyError.
# Catch and print it so the demonstration does not abort a Restart & Run All.
try:
  model.wv["anjana"]
except KeyError as err:
  print(f"KeyError: {err}")

KeyError: "Key 'anjana' not present"

In [22]:
# Words most similar to "king", with their similarity scores
model.wv.most_similar(positive=["king"], topn=10)

[('baratheon', 0.6598556041717529),
 ('throne', 0.6511567831039429),
 ('prince', 0.6505590081214905),
 ('realm', 0.6503951549530029),
 ('victory', 0.6088259220123291),
 ('usurper', 0.6074551939964294),
 ('council', 0.5823007225990295),
 ('tourney', 0.5798454284667969),
 ('conqueror', 0.578585684299469),
 ('rebellion', 0.5724166631698608)]

In [29]:
# Find the odd one out: the word that fits least with the others in the list
model.wv.doesnt_match(
  ["doctor", "king", "wealth", "palace"]
)



'king'

In [32]:
# Shape of the matrix returned by get_normed_vectors():
# one row per vocabulary word, one column per vector dimension
model.wv.get_normed_vectors().shape

(11760, 150)

In [34]:
# Vocabulary size: number of words the model holds a vector for
len(model.wv.index_to_key)

11760

In [35]:
# Save the trained model to "word2vec.model" in the current working directory
# (it can be reloaded later with gensim.models.Word2Vec.load)
model.save("word2vec.model")