In [246]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operation
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [247]:
df = pd.read_csv('game-of-thrones.csv')
df.shape

(33198, 5)

In [248]:
df.head()

Unnamed: 0,Text,Speaker,Episode,Season,Show
0,[First scene opens with three Rangers riding t...,,e1-Winter is Coming,season-01,Game-of-Thrones
1,What d’you expect? They’re savages. One lot s...,WAYMAR ROYCE,e1-Winter is Coming,season-01,Game-of-Thrones
2,I’ve never seen wildlings do a thing like thi...,WILL,e1-Winter is Coming,season-01,Game-of-Thrones
3,How close did you get?,WAYMAR ROYCE,e1-Winter is Coming,season-01,Game-of-Thrones
4,Close as any man would.,WILL,e1-Winter is Coming,season-01,Game-of-Thrones


In [249]:
nlp = spacy.load('en_core_web_sm') # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)


In [250]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['Text'])

In [251]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.88 mins


In [252]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(24710, 1)

In [253]:
from gensim.models.phrases import Phrases, Phraser


In [254]:
sent = [row.split() for row in df_clean['clean']]

In [258]:
phrases = Phrases(sent, min_count=5, progress_per=10000)


INFO - 08:04:25: collecting all words and their counts
INFO - 08:04:25: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 08:04:25: PROGRESS: at sentence #10000, processed 66539 words and 50089 word types
INFO - 08:04:25: PROGRESS: at sentence #20000, processed 141631 words and 93894 word types
INFO - 08:04:25: collected 113055 token types (unigram + bigrams) from a corpus of 179101 words and 24710 sentences
INFO - 08:04:25: merged Phrases<113055 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
INFO - 08:04:25: Phrases lifecycle event {'msg': 'built Phrases<113055 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 0.27s', 'datetime': '2023-03-01T08:04:25.938154', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [259]:
bigram = Phraser(phrases)
sentences = bigram[sent]

INFO - 08:04:30: exporting phrases from Phrases<113055 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
INFO - 08:04:30: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<587 phrases, min_count=5, threshold=10.0> from Phrases<113055 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000> in 0.27s', 'datetime': '2023-03-01T08:04:30.392095', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [260]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

9045

In [261]:
word_freq

defaultdict(int,
            {'scene': 31,
             'open': 202,
             'ranger': 49,
             'ride': 317,
             'tunnel': 39,
             'leave': 673,
             'wall': 425,
             'go': 649,
             'wood': 100,
             'eerie': 2,
             'music': 14,
             'background': 27,
             'split': 9,
             'find': 472,
             'campsite': 3,
             'mutilated': 1,
             'body': 177,
             'include': 24,
             'child': 342,
             'hang': 68,
             'tree': 82,
             'branch': 7,
             'bird': 38,
             'eye': 263,
             'view': 28,
             'show': 51,
             'arrange': 19,
             'shield': 57,
             'like': 870,
             'pattern': 1,
             'd': 127,
             'expect': 77,
             'savage': 28,
             'lot': 111,
             'steal': 87,
             'goat': 24,
             'know': 1432,
             

In [262]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['man', 's', 'look', 'know', 'want', 'come', 'walk', 'tyrion', 'lord', 'king']

In [263]:
import multiprocessing

from gensim.models import Word2Vec

In [264]:
cores = multiprocessing.cpu_count() # Count the number of cores in a comput

In [265]:
cores

8

In [266]:
w2v_model = Word2Vec(min_count=20, window=2, vector_size=500, sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20, workers=cores-1)

INFO - 08:06:00: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=500, alpha=0.03>', 'datetime': '2023-03-01T08:06:00.683706', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [267]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 08:06:02: collecting all words and their counts
INFO - 08:06:02: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 08:06:02: PROGRESS: at sentence #10000, processed 63096 words, keeping 6084 word types
INFO - 08:06:02: PROGRESS: at sentence #20000, processed 133333 words, keeping 8252 word types
INFO - 08:06:02: collected 9045 word types from a corpus of 168555 raw words and 24710 sentences
INFO - 08:06:02: Creating a fresh vocabulary
INFO - 08:06:02: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 1439 unique words (15.91% of original 9045, drops 7606)', 'datetime': '2023-03-01T08:06:02.627180', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'prepare_vocab'}
INFO - 08:06:02: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 138037 word corpus (81.89% of original 168555, drops 30518)', 'datetime': '2023-03-01T08:0

Time to build vocab: 0.01 mins


In [268]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=200, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 08:06:08: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 1439 vocabulary and 500 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-03-01T08:06:08.726665', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'train'}
INFO - 08:06:09: EPOCH 0: training on 168555 raw words (46682 effective words) took 0.4s, 113769 effective words/s
INFO - 08:06:09: EPOCH 1: training on 168555 raw words (46711 effective words) took 0.4s, 117412 effective words/s
INFO - 08:06:10: EPOCH 2: training on 168555 raw words (46617 effective words) took 0.4s, 108121 effective words/s
INFO - 08:06:10: EPOCH 3: training on 168555 raw words (46612 effective words) took 0.3s, 135887 effective words/s
INFO - 08:06:10: EPOCH 4: training on 168555 raw words (46748 effective words) took 0.4s, 133002 effective words/s
INFO - 08:06:11: EPOCH 

INFO - 08:06:37: EPOCH 68: training on 168555 raw words (46620 effective words) took 0.4s, 132455 effective words/s
INFO - 08:06:37: EPOCH 69: training on 168555 raw words (46664 effective words) took 0.3s, 151161 effective words/s
INFO - 08:06:37: EPOCH 70: training on 168555 raw words (46564 effective words) took 0.4s, 123135 effective words/s
INFO - 08:06:38: EPOCH 71: training on 168555 raw words (46754 effective words) took 0.3s, 140207 effective words/s
INFO - 08:06:38: EPOCH 72: training on 168555 raw words (46599 effective words) took 0.3s, 167068 effective words/s
INFO - 08:06:38: EPOCH 73: training on 168555 raw words (46839 effective words) took 0.3s, 135029 effective words/s
INFO - 08:06:39: EPOCH 74: training on 168555 raw words (46677 effective words) took 0.4s, 122979 effective words/s
INFO - 08:06:39: EPOCH 75: training on 168555 raw words (46665 effective words) took 0.6s, 72194 effective words/s
INFO - 08:06:40: EPOCH 76: training on 168555 raw words (46859 effective 

INFO - 08:07:05: EPOCH 139: training on 168555 raw words (46753 effective words) took 0.4s, 129962 effective words/s
INFO - 08:07:06: EPOCH 140: training on 168555 raw words (46512 effective words) took 0.4s, 113622 effective words/s
INFO - 08:07:07: EPOCH 141: training on 168555 raw words (46679 effective words) took 0.7s, 69328 effective words/s
INFO - 08:07:07: EPOCH 142: training on 168555 raw words (46431 effective words) took 0.3s, 133541 effective words/s
INFO - 08:07:07: EPOCH 143: training on 168555 raw words (46671 effective words) took 0.3s, 134877 effective words/s
INFO - 08:07:08: EPOCH 144: training on 168555 raw words (46878 effective words) took 0.4s, 129426 effective words/s
INFO - 08:07:08: EPOCH 145: training on 168555 raw words (46632 effective words) took 0.4s, 127113 effective words/s
INFO - 08:07:08: EPOCH 146: training on 168555 raw words (46721 effective words) took 0.4s, 123520 effective words/s
INFO - 08:07:09: EPOCH 147: training on 168555 raw words (46966 e

Time to train the model: 1.36 mins


In [269]:
w2v_model.init_sims(replace=True)


Call to deprecated `init_sims` (Gensim 4.0.0 implemented internal optimizations that make calls to init_sims() unnecessary. init_sims() is now obsoleted and will be completely removed in future versions. See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4).



In [311]:
w2v_model.wv.most_similar(positive=["baratheon"])

[('stannis', 0.23953251540660858),
 ('short', 0.23146995902061462),
 ('lannister', 0.22861948609352112),
 ('walk_forward', 0.2242448627948761),
 ('advance', 0.2190706729888916),
 ('banner', 0.21905829012393951),
 ('form', 0.21278077363967896),
 ('stannis_baratheon', 0.2127538025379181),
 ('party', 0.20951353013515472),
 ('mace', 0.20861847698688507)]

In [309]:
w2v_model.wv.most_similar(positive=["lannister"])

[('tywin', 0.27474284172058105),
 ('son', 0.2568546533584595),
 ('tywin_lannister', 0.2560979723930359),
 ('refuse', 0.25184348225593567),
 ('frey', 0.24865394830703735),
 ('family', 0.23525550961494446),
 ('iron_bank', 0.23485049605369568),
 ('stark', 0.2340048998594284),
 ('tyrell', 0.2310585081577301),
 ('nephew', 0.22907152771949768)]

In [310]:
w2v_model.wv.most_similar(positive=["stark"])

[('winterfell', 0.34082770347595215),
 ('bolton', 0.3261927366256714),
 ('tully', 0.29228541254997253),
 ('frey', 0.2845751941204071),
 ('banner', 0.28376504778862),
 ('warden_north', 0.2498905062675476),
 ('catelyn_stark', 0.246596097946167),
 ('shield', 0.23977579176425934),
 ('lannister', 0.2340048998594284),
 ('house_stark', 0.22942805290222168)]

In [314]:
w2v_model.wv.most_similar(positive=["targaryen"])

[('westero', 0.3207218050956726),
 ('mad_king', 0.31438544392585754),
 ('rebellion', 0.2999190092086792),
 ('seven_kingdom', 0.27919113636016846),
 ('aegon', 0.2781907021999359),
 ('rhaegar', 0.2578057646751404),
 ('kingdom', 0.25489428639411926),
 ('realm', 0.24933253228664398),
 ('century', 0.24461926519870758),
 ('year_ago', 0.2335478514432907)]

In [317]:
w2v_model.wv.most_similar(positive=["robb"])

[('talisa', 0.5508999824523926),
 ('catelyn', 0.43172067403793335),
 ('roslin', 0.42848455905914307),
 ('walder', 0.39282119274139404),
 ('rickard_karstark', 0.378397136926651),
 ('frey', 0.3640487790107727),
 ('edmure', 0.357572078704834),
 ('roose', 0.35129210352897644),
 ('brynden', 0.3361089527606964),
 ('twin', 0.3055846095085144)]

In [322]:
w2v_model.wv.most_similar(positive=["hound"])

[('polliver', 0.4304107427597046),
 ('arya', 0.4226236939430237),
 ('beric', 0.3682827353477478),
 ('fall_ground', 0.3541356027126312),
 ('gut', 0.34398970007896423),
 ('blow', 0.34219831228256226),
 ('sandor', 0.338631272315979),
 ('brienne', 0.33013981580734253),
 ('stumble', 0.3244004547595978),
 ('soldier', 0.32019343972206116)]

In [328]:
w2v_model.wv.most_similar(positive=["jon"])

[('tormund', 0.5142539739608765),
 ('ygritte', 0.46753817796707153),
 ('edd', 0.449279248714447),
 ('sam', 0.4248664975166321),
 ('davos', 0.4190618395805359),
 ('olly', 0.4113067090511322),
 ('grenn', 0.4062921106815338),
 ('orell', 0.4014999270439148),
 ('shoulder', 0.3761765956878662),
 ('qhorin', 0.36831140518188477)]

In [360]:
w2v_model.wv.most_similar(positive=["white_walker"])

[('army_dead', 0.4678243100643158),
 ('child_forest', 0.46315449476242065),
 ('dragonglass', 0.36464279890060425),
 ('walker', 0.3539915382862091),
 ('leaf', 0.35024288296699524),
 ('ice', 0.31917911767959595),
 ('thousand_year', 0.3183048963546753),
 ('meera', 0.2869231104850769),
 ('sam', 0.28154855966567993),
 ('wildling', 0.2746915817260742)]

In [353]:
w2v_model.wv.most_similar(positive=["castle_black"])

[('mance', 0.4235215187072754),
 ('lord_commander', 0.36043381690979004),
 ('wildling', 0.36023959517478943),
 ('night_watch', 0.34556323289871216),
 ('wall', 0.3302054703235626),
 ('mance_rayder', 0.30206263065338135),
 ('orell', 0.29435157775878906),
 ('north', 0.28205788135528564),
 ('qhorin', 0.2761329710483551),
 ('pyp', 0.2655641436576843)]

In [381]:
w2v_model.wv.most_similar(positive=["winterfell"])

[('stark', 0.34082770347595215),
 ('north', 0.3393135964870453),
 ('raven', 0.3218349814414978),
 ('banner', 0.31451213359832764),
 ('warden_north', 0.2974699139595032),
 ('bolton', 0.29463398456573486),
 ('bran_rickon', 0.27875667810440063),
 ('bannerman', 0.27061814069747925),
 ('roose_bolton', 0.2639978229999542),
 ('umber', 0.25930649042129517)]

In [393]:
w2v_model.wv.most_similar(positive=['king', 'targaryen'], negative=["baratheon"])

[('father', 0.3520020842552185),
 ('queen', 0.319766104221344),
 ('think', 0.289492130279541),
 ('capital', 0.28795862197875977),
 ('know', 0.28746360540390015),
 ('realm', 0.2849137783050537),
 ('believe', 0.2702147662639618),
 ('swear', 0.2672134339809418),
 ('want', 0.2651638388633728),
 ('grace', 0.26504966616630554)]

In [394]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.express as px
import plotly.graph_objs as go

import nltk
nltk.download('averaged_perceptron_tagger')

def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.index_to_key:
        tokens.append(model.wv.get_vector(word))
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(np.array(tokens))

#     fig = px.scatter_3d(
#         new_values, x=0, y=1, z=2,
#         color=df.Speaker, labels={'color': 'species'}
#     )
#     fig.update_traces(marker_size=8)
#     fig.show()

    

    x = []
    y = []

    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    fig = go.Figure()

    for i in range(len(x)):
        fig.add_trace(go.Scatter(x=[x[i]], y=[y[i]],
                        mode='markers+text',
                        text=[labels[i]],
                        textposition='bottom center',
                        marker=dict(size=10)))

    fig.update_layout(
        title="2D tsne plot all wordvecs",
        xaxis_title="X-axis",
        yaxis_title="Y-axis",
        showlegend=False,
        width=800,
        height=800,
        margin=dict(l=40, r=40, t=80, b=40),
    )

    fig.show()

    
    
#     x = []
#     y = []
#     for value in new_values:
#         x.append(value[0])
#         y.append(value[1])
        
#     plt.figure(figsize=(16, 16)) 
#     for i in range(len(x)):
#         plt.scatter(x[i],y[i])
#         plt.annotate(labels[i],
#                      xy=(x[i], y[i]),
#                      xytext=(5, 2),
#                      textcoords='offset points',
#                      ha='right',
#                      va='bottom')
#     plt.show()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [395]:
tsne_plot(w2v_model)

In [396]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [398]:

fig = go.Figure()

for i in range(len(x)):
    
   # returns a document of object
    doc = nlp(labels[i])

    # checking if it is a noun or not
    if(doc[0].tag_ in ['NNP', 'NNPS']):
        fig.add_trace(go.Scatter(x=[x[i]], y=[y[i]],
                                 mode='markers+text',
                                 text=[labels[i]],
                                 textposition='bottom center',
                                 marker=dict(size=10)))
    else:
        pass
    
   

fig.update_layout(
    title="Tsne proper nouns only v1",
    xaxis_title="X-axis",
    yaxis_title="Y-axis",
    showlegend=False,
    width=800,
    height=800,
    margin=dict(l=40, r=40, t=80, b=40),
)

fig.show()



In [399]:
all_characters = []
for s in df[df["Speaker"].notnull()]["Speaker"].unique():
    all_characters.append(s.split()[0].lower())


fig = go.Figure()

for i in range(len(x)):
    if(labels[i].strip() in all_characters):
        doc = nlp(labels[i])

        # checking if it is a noun or not
        if(doc[0].tag_ in ['NNP', 'NNPS']):
            fig.add_trace(go.Scatter(x=[x[i]], y=[y[i]],
                                     mode='markers+text',
                                     text=[labels[i]],
                                     textposition='bottom center',
                                     marker=dict(size=10)))
    else:
        pass
    
   

fig.update_layout(
    title="Tsne proper nouns only v2",
    xaxis_title="X-axis",
    yaxis_title="Y-axis",
    showlegend=False,
    width=800,
    height=800,
    margin=dict(l=40, r=40, t=80, b=40),
)

fig.show()



In [198]:
nlp("hug")[0].tag_

'NNP'

In [237]:
all_characters = []
for s in df[df["Speaker"].notnull()]["Speaker"].unique():
    all_characters.append(s.split()[0].lower())

In [238]:
all_characters

['waymar',
 'will',
 'gared',
 'royce',
 'jon',
 'septa',
 'sansa',
 'septa',
 'ned',
 'robb',
 'jon/robb',
 'cassel',
 'catelyn',
 'will',
 'will',
 'jon',
 'bran',
 'theon',
 'robb',
 'bran',
 'jaime',
 'cersei',
 'maester',
 'luwin',
 'bran',
 'arya',
 'robert',
 'arya',
 'tyrion',
 'ros',
 'viserys',
 'viserys',
 'daenerys',
 'maid',
 'illyrio',
 'benjen',
 'tyrion',
 'sansa',
 'cersei',
 'a',
 'jorah',
 'khal',
 'the',
 'yoren',
 'arya',
 'bran',
 'osha',
 'rickon',
 'maester',
 'several',
 'catelyn',
 'robb',
 'marillion',
 'joffrey',
 'sansa',
 'sandor',
 'jonos',
 'galbart',
 'rickard',
 'theon',
 'greatjon',
 'all',
 'stark',
 'jaime',
 'lancel',
 'cersei',
 'tywin',
 'tyrion',
 'leo',
 'addam',
 'kevan',
 'daenerys',
 'jorah',
 'mirri',
 'samwell',
 'jon',
 'shae',
 'sam,',
 'grenn',
 'pyp',
 'grand',
 'ros',
 'varys',
 'petyr',
 'hot',
 'lommy',
 'gendry',
 'jeor',
 'rakharo',
 'doreah',
 'irri',
 'viserys',
 'myrcella',
 'cersei',
 'eddard',
 'robert',
 'benjen',
 'assassin

In [400]:
import pickle
filename = 'w2v_model.sav'
pickle.dump(w2v_model, open(filename, 'wb'))
