In [23]:
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
import gensim

In [2]:
# The pretrained model was trained on part of the Google News corpus (about 100 billion words).
# It provides 300-dimensional vectors for 3 million words and phrases.

In [3]:
word_emb = api.load('word2vec-google-news-300')

In [4]:
word_emb.get_index('car')

385

In [5]:
word_emb.get_vector('car').shape

(300,)

In [6]:
word_emb['man'];

In [7]:
word_emb.most_similar('man')

[('woman', 0.7664012908935547),
 ('boy', 0.6824871301651001),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903203964233),
 ('girl', 0.5921714305877686),
 ('suspected_purse_snatcher', 0.571636438369751),
 ('robber', 0.5585119128227234),
 ('Robbery_suspect', 0.5584409832954407),
 ('teen_ager', 0.5549196600914001),
 ('men', 0.5489763021469116)]

In [8]:
# so it contains vectors of words 

In [9]:
# word2vec has two training architectures => 1. CBOW   2. Skip-gram

# Custom word2vec embeddings

In [10]:
import nltk
import pandas as pd
import numpy as np
import os

In [11]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [13]:
# creating a list to keep the entire data
# the data used is game of thrones books (txt files) from kaggle
# link -> https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books
story = []

In [14]:
# list of text files

for file in os.listdir("./gotbooks"):
    print(file)

003ssb.txt
001ssb.txt
005ssb.txt
002ssb.txt
004ssb.txt


In [15]:
folder_path = "./gotbooks"

In [18]:
# function to tokenize and preprocess every file in a folder and collect the resulting sentences in a list

def preprocessor(folder_path_of_files):
    """Tokenize every text file in a folder into preprocessed sentences.

    Each regular file found in the folder is read in full, split into
    sentences with ``sent_tokenize``, and every sentence is converted to a
    list of tokens with ``simple_preprocess``.

    Parameters
    ----------
    folder_path_of_files : str
        Path of the folder that contains the text files.

    Returns
    -------
    list of list of str
        One token list per sentence, with files processed in the order
        returned by ``os.listdir``.
    """
    sentences = []
    # Use the argument, not the notebook-level `folder_path` global, so the
    # function works for any folder and on a fresh kernel.
    for filename in os.listdir(folder_path_of_files):
        file_path = os.path.join(folder_path_of_files, filename)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); open()
        # would raise on them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding='unicode_escape') as f:
            corpus = f.read()
        # Tokenize each sentence and keep it as its own token list.
        for sent in sent_tokenize(corpus):
            sentences.append(simple_preprocess(sent))

    return sentences

In [19]:
story = preprocessor(folder_path)

In [20]:
story[0]

['storm',
 'of',
 'swords',
 'book',
 'three',
 'of',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'by',
 'george']

In [22]:
story[3]

['with',
 'such',
 'structure',
 'the',
 'narrative',
 'cannot',
 'be',
 'strictly',
 'sequential',
 'sometimes',
 'important',
 'things',
 'are',
 'happening',
 'simultaneously',
 'thousand',
 'leagues',
 'apart']

#### Build the custom word2vec

In [39]:
# Create an untrained Word2Vec model using CBOW (gensim's default architecture, sg = 0)
custom_model = gensim.models.Word2Vec(window= 10, min_count= 5, vector_size= 150,
                                      workers = 12, epochs= 5)

# window      => maximum distance between the current and the predicted word within a sentence
# min_count   => ignore all words whose total frequency in the corpus is lower than 5
# vector_size => dimensionality of each word vector
# sg          => by default it uses CBOW (sg = 0); to use skip-gram set parameter sg = 1
# workers     => number of worker threads used during training (choose <= no_of_cpu_cores)
# epochs      => number of passes over the corpus during training

In [40]:
# check how many CPU cores are available (useful when choosing `workers`)
import os
print(os.cpu_count())

18


In [41]:
# build the vocab
custom_model.build_vocab(story)

In [42]:
custom_model.corpus_count
# no. of sentences we have

145020

In [43]:
# total number of words (tokens) in the corpus -- not the number of unique words
custom_model.corpus_total_words

1725638

In [44]:
print(custom_model.epochs)

5


In [45]:
# train the model now
custom_model.train(story, total_examples=custom_model.corpus_count, epochs = custom_model.epochs)

(6482123, 8628190)

In [46]:
# check

custom_model.wv.most_similar('stark')

[('arryn', 0.7772131562232971),
 ('brandon', 0.7617887854576111),
 ('winterfell', 0.7604482173919678),
 ('greyjoy', 0.7175905704498291),
 ('robb', 0.7124232649803162),
 ('bastard', 0.6962007880210876),
 ('eddard', 0.692389726638794),
 ('tully', 0.6897363662719727),
 ('murdered', 0.6841720342636108),
 ('roose', 0.653350830078125)]

In [49]:
custom_model.wv.most_similar('dwarf')

[('imp', 0.7622819542884827),
 ('hound', 0.7214922904968262),
 ('wench', 0.6894556879997253),
 ('tyrion', 0.6866094470024109),
 ('varys', 0.6403233408927917),
 ('question', 0.6388692855834961),
 ('reek', 0.6375152468681335),
 ('eunuch', 0.6366840600967407),
 ('littlefinger', 0.6344404220581055),
 ('joff', 0.6325963139533997)]

In [51]:
custom_model.wv['king'];

In [52]:
custom_model.wv['king'].shape

(150,)

In [53]:
# Note: we have not removed the stopwords from the data, which we should have done