#Word2vec on google-news

In [2]:
!pip install gensim



In [3]:
# Loading the pretrained word2vec model
import gensim.downloader as api
model = api.load('word2vec-google-news-300')



In [4]:
# Shows vector representation of word 'man'
model['man']

array([ 0.32617188,  0.13085938,  0.03466797, -0.08300781,  0.08984375,
       -0.04125977, -0.19824219,  0.00689697,  0.14355469,  0.0019455 ,
        0.02880859, -0.25      , -0.08398438, -0.15136719, -0.10205078,
        0.04077148, -0.09765625,  0.05932617,  0.02978516, -0.10058594,
       -0.13085938,  0.001297  ,  0.02612305, -0.27148438,  0.06396484,
       -0.19140625, -0.078125  ,  0.25976562,  0.375     , -0.04541016,
        0.16210938,  0.13671875, -0.06396484, -0.02062988, -0.09667969,
        0.25390625,  0.24804688, -0.12695312,  0.07177734,  0.3203125 ,
        0.03149414, -0.03857422,  0.21191406, -0.00811768,  0.22265625,
       -0.13476562, -0.07617188,  0.01049805, -0.05175781,  0.03808594,
       -0.13378906,  0.125     ,  0.0559082 , -0.18261719,  0.08154297,
       -0.08447266, -0.07763672, -0.04345703,  0.08105469, -0.01092529,
        0.17480469,  0.30664062, -0.04321289, -0.01416016,  0.09082031,
       -0.00927734, -0.03442383, -0.11523438,  0.12451172, -0.02

In [5]:
# vector contains 300 numbers
model['man'].shape

(300,)

In [6]:
# Shows words which are similar/related to 'man' vector
model.most_similar('man')

[('woman', 0.7664012908935547),
 ('boy', 0.6824871301651001),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903203964233),
 ('girl', 0.5921714305877686),
 ('suspected_purse_snatcher', 0.571636438369751),
 ('robber', 0.5585119128227234),
 ('Robbery_suspect', 0.5584409832954407),
 ('teen_ager', 0.5549196600914001),
 ('men', 0.5489763021469116)]

In [7]:
# Shows cosine similarity between 'man' & 'woman'
model.similarity('man', 'woman')

np.float32(0.76640123)

In [8]:
# Shows the odd one out (here monkey)
model.doesnt_match(['PHP','java','monkey'])

'monkey'

In [9]:
# Vector arithmetic
vec = model['king'] - model['man'] + model['women']
model.most_similar([vec])

[('king', 0.6478992700576782),
 ('queen', 0.535493791103363),
 ('women', 0.5233659148216248),
 ('kings', 0.5162314772605896),
 ('queens', 0.4995364248752594),
 ('kumaris', 0.49238473176956177),
 ('princes', 0.46233269572257996),
 ('monarch', 0.45280295610427856),
 ('monarchy', 0.4293173849582672),
 ('kings_princes', 0.42342400550842285)]

# Word2Vec on Game of thrones books dataset

In [60]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [61]:
# Function to remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in stop_words:
            new_text.append(word)
    return " ".join(new_text)

In [62]:
import os
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

story = []
for filename in os.listdir('books'):
    # Open text files present in books directory
    f = open(os.path.join('books', filename), encoding='latin-1')
    # Store text content of file in corpus
    corpus = f.read()
    # Perform Sentence tokenization on the corpus of text
    raw_sent = sent_tokenize(corpus)
    # add the sentences in the story list by performing some simple preprocessing and removing stopwords
    for sent in raw_sent:
      sent = remove_stopwords(sent)
      story.append(simple_preprocess(sent))

In [64]:
print(story[0])

['game', 'of', 'thrones', 'book', 'one', 'song', 'ice', 'fire', 'by', 'george', 'martin', 'prologue', 'we', 'start', 'back', 'gared', 'urged', 'woods', 'began', 'grow', 'dark', 'around', 'them']


In [65]:
# Creating the Word2Vec model
model = gensim.models.Word2Vec(
    window=10, #window size
    min_count=2, #sentences should have min 2 words
)

# Extracting unique words form corpus using build_vocab function
model.build_vocab(story)

# Training the deep learning model
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(4996082, 5308310)

In [66]:
# Checking words similar to daenerys
model.wv.most_similar('daenerys')

[('stormborn', 0.8643744587898254),
 ('targaryen', 0.8311626315116882),
 ('unburnt', 0.8171461224555969),
 ('queen', 0.8155773282051086),
 ('dorne', 0.7992861866950989),
 ('myrcella', 0.7990050315856934),
 ('adultery', 0.791711688041687),
 ('princess', 0.7777925133705139),
 ('elia', 0.7746576070785522),
 ('viserys', 0.7615095973014832)]

In [67]:
# Checking odd man out
model.wv.doesnt_match(['jon', 'rikon', 'robb', 'arya', 'sansa', 'bran'])



'jon'

In [68]:
# Checking odd man out
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])

'bronn'

In [69]:
# Checking vector representation of jon
model.wv['jon']

array([-0.37333268,  1.7559718 ,  1.67618   ,  2.3724275 , -0.6911467 ,
        0.1577172 , -1.402629  ,  0.7841473 ,  0.26537195, -1.7736396 ,
       -0.34880686, -0.8164014 ,  0.6294275 ,  0.16258931, -3.0746145 ,
       -0.96327513,  2.7218907 , -2.1940055 ,  0.8365205 ,  0.1791577 ,
        0.00960632, -1.5814421 , -0.12501015,  1.2620187 ,  2.102936  ,
        0.6265236 , -1.1301401 ,  1.0289282 ,  1.5124362 ,  1.040278  ,
        0.985531  , -1.0007259 ,  1.5073858 , -1.9560288 , -0.20401745,
       -0.39278445,  1.878159  , -0.10580686, -0.2741514 , -0.9786446 ,
       -0.31285724,  0.7454806 ,  0.7772452 , -0.9364487 , -0.8988815 ,
       -1.3085512 , -0.27379122, -0.35242218, -1.6736205 ,  0.0251095 ,
       -0.21426958,  0.502376  , -2.552921  ,  1.6141907 , -1.1325587 ,
       -1.3693942 ,  0.42351422,  0.06166199, -1.462749  , -0.1320718 ,
       -0.45961168,  0.47507977,  0.10000948,  1.722399  ,  0.32052237,
        0.20580088,  1.7583212 ,  0.49775842, -0.2500746 ,  0.03

In [70]:
# Checking similarity between arya & sansa
model.wv.similarity('arya', 'sansa') #0.83077496

np.float32(0.79935926)

In [71]:
# Get vector representation of all the words
model.wv.get_normed_vectors()

array([[ 0.00593347,  0.00887729,  0.09464634, ..., -0.04558886,
        -0.11038836,  0.03785123],
       [-0.08932988,  0.10151627,  0.10917999, ..., -0.05245783,
        -0.11135022,  0.08939289],
       [-0.0182923 ,  0.0746931 , -0.11632224, ...,  0.08262765,
         0.06957389, -0.1621232 ],
       ...,
       [-0.08539195,  0.12631853,  0.02866963, ..., -0.12534562,
        -0.03856654,  0.05141809],
       [-0.00860489,  0.11817105,  0.08698175, ..., -0.05211031,
        -0.08067939,  0.00346928],
       [ 0.00604317,  0.0601439 ,  0.0366868 , ...,  0.09437905,
         0.0288917 ,  0.01628141]], dtype=float32)

In [72]:
# Get the word corresponding to each vector representation
y = model.wv.index_to_key
y

['the',
 'he',
 'said',
 'lord',
 'you',
 'it',
 'would',
 'one',
 'ser',
 'him',
 'she',
 'could',
 'man',
 'her',
 'that',
 'king',
 'men',
 'back',
 'and',
 'well',
 'like',
 'jon',
 'me',
 'them',
 'they',
 'his',
 'father',
 'old',
 'hand',
 'even',
 'tyrion',
 'never',
 'know',
 'see',
 'there',
 'made',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'no',
 'if',
 'time',
 'long',
 'might',
 'us',
 'come',
 'we',
 'face',
 'still',
 'when',
 'head',
 'red',
 'way',
 'boy',
 'page',
 'but',
 'must',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 'little',
 'took',
 'came',
 'though',
 'say',
 'now',
 'three',
 'what',
 'away',
 'dead',
 'my',
 'son',
 'blood',
 'take',
 'go',
 'your',
 'half',
 'make',
 'this',
 'arya',
 'll',
 'saw',
 'all',
 'day',
 'white',
 'jaime',
 'first',
 'look',
 'want',
 'much',
 'enough',
 'sword',
 'tell',
 'girl',
 'bran',
 'great',
 'again',
 'looked',
 'left',
 'knew',
 'asked',
 'gave',
 'maester',
 'called',
 'wall',
 'every',
 'heard',
 's

In [73]:
# Reduce dimensions of vectors to 3 for visulization
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X = pca.fit_transform(model.wv.get_normed_vectors())

In [74]:
# Plot the vectors of 100 words
import plotly.express as px
fig = px.scatter_3d(X[200:300], x=0, y=1, z=2, color=y[200:300])
fig.show()