In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operation
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
df = pd.read_csv('game-of-thrones.csv')
df.shape

(33198, 5)

In [5]:
df.head()

Unnamed: 0,Text,Speaker,Episode,Season,Show
0,[First scene opens with three Rangers riding t...,,e1-Winter is Coming,season-01,Game-of-Thrones
1,What d’you expect? They’re savages. One lot s...,WAYMAR ROYCE,e1-Winter is Coming,season-01,Game-of-Thrones
2,I’ve never seen wildlings do a thing like thi...,WILL,e1-Winter is Coming,season-01,Game-of-Thrones
3,How close did you get?,WAYMAR ROYCE,e1-Winter is Coming,season-01,Game-of-Thrones
4,Close as any man would.,WILL,e1-Winter is Coming,season-01,Game-of-Thrones


In [3]:
nlp = spacy.load('en_core_web_sm') # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)


In [4]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['Text'])

In [8]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.45 mins


In [10]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(24710, 1)

In [11]:
from gensim.models.phrases import Phrases, Phraser


In [12]:
sent = [row.split() for row in df_clean['clean']]

In [13]:
phrases = Phrases(sent, min_count=30, progress_per=10000)


INFO - 19:12:06: collecting all words and their counts
INFO - 19:12:06: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 19:12:07: PROGRESS: at sentence #10000, processed 66539 words and 50089 word types
INFO - 19:12:07: PROGRESS: at sentence #20000, processed 141631 words and 93894 word types
INFO - 19:12:07: collected 113055 token types (unigram + bigrams) from a corpus of 179101 words and 24710 sentences
INFO - 19:12:07: merged Phrases<113055 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 19:12:07: Phrases lifecycle event {'msg': 'built Phrases<113055 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.17s', 'datetime': '2023-02-28T19:12:07.102175', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [17]:
bigram = Phraser(phrases)
sentences = bigram[sent]

INFO - 19:13:52: exporting phrases from Phrases<113055 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 19:13:52: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<54 phrases, min_count=30, threshold=10.0> from Phrases<113055 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.16s', 'datetime': '2023-02-28T19:13:52.984477', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [16]:
sentences

NameError: name 'sentences' is not defined

In [18]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

8559

In [19]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['man', 's', 'look', 'know', 'come', 'want', 'walk', 'king', 'lord', 'tyrion']

In [20]:
import multiprocessing

from gensim.models import Word2Vec

In [21]:
cores = multiprocessing.cpu_count() # Count the number of cores in a comput

In [22]:
cores

8

In [124]:
w2v_model = Word2Vec(min_count=20, window=2, vector_size=500, sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20, workers=cores-1)

INFO - 20:54:05: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=500, alpha=0.03>', 'datetime': '2023-02-28T20:54:05.521399', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [125]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 20:54:05: collecting all words and their counts
INFO - 20:54:05: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 20:54:05: PROGRESS: at sentence #10000, processed 65362 words, keeping 5695 word types
INFO - 20:54:05: PROGRESS: at sentence #20000, processed 138549 words, keeping 7781 word types
INFO - 20:54:05: collected 8559 word types from a corpus of 175115 raw words and 24710 sentences
INFO - 20:54:05: Creating a fresh vocabulary
INFO - 20:54:06: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 1440 unique words (16.82% of original 8559, drops 7119)', 'datetime': '2023-02-28T20:54:06.002788', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'prepare_vocab'}
INFO - 20:54:06: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 148923 word corpus (85.04% of original 175115, drops 26192)', 'datetime': '2023-02-28T20:5

Time to build vocab: 0.01 mins


In [127]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=200, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 20:54:15: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 1440 vocabulary and 500 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-02-28T20:54:15.975195', 'gensim': '4.3.0', 'python': '3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'train'}
INFO - 20:54:16: EPOCH 0: training on 175115 raw words (50024 effective words) took 0.3s, 154962 effective words/s
INFO - 20:54:16: EPOCH 1: training on 175115 raw words (50285 effective words) took 0.3s, 155327 effective words/s
INFO - 20:54:17: EPOCH 2: training on 175115 raw words (50250 effective words) took 0.3s, 148487 effective words/s
INFO - 20:54:17: EPOCH 3: training on 175115 raw words (50284 effective words) took 0.4s, 122414 effective words/s
INFO - 20:54:18: EPOCH 4: training on 175115 raw words (50005 effective words) took 0.5s, 97940 effective words/s
INFO - 20:54:18: EPOCH 5

INFO - 20:54:43: EPOCH 67: training on 175115 raw words (50193 effective words) took 0.3s, 144865 effective words/s
INFO - 20:54:43: EPOCH 68: training on 175115 raw words (50096 effective words) took 0.3s, 143695 effective words/s
INFO - 20:54:43: EPOCH 69: training on 175115 raw words (50125 effective words) took 0.3s, 145204 effective words/s
INFO - 20:54:44: EPOCH 70: training on 175115 raw words (50242 effective words) took 0.4s, 133634 effective words/s
INFO - 20:54:44: EPOCH 71: training on 175115 raw words (50489 effective words) took 0.5s, 95730 effective words/s
INFO - 20:54:45: EPOCH 72: training on 175115 raw words (49954 effective words) took 0.4s, 134919 effective words/s
INFO - 20:54:45: EPOCH 73: training on 175115 raw words (50332 effective words) took 0.4s, 136050 effective words/s
INFO - 20:54:45: EPOCH 74: training on 175115 raw words (50192 effective words) took 0.4s, 130255 effective words/s
INFO - 20:54:46: EPOCH 75: training on 175115 raw words (50165 effective 

INFO - 20:55:10: EPOCH 138: training on 175115 raw words (50193 effective words) took 0.3s, 147346 effective words/s
INFO - 20:55:10: EPOCH 139: training on 175115 raw words (50128 effective words) took 0.4s, 134699 effective words/s
INFO - 20:55:11: EPOCH 140: training on 175115 raw words (50401 effective words) took 0.5s, 96992 effective words/s
INFO - 20:55:11: EPOCH 141: training on 175115 raw words (50415 effective words) took 0.4s, 136683 effective words/s
INFO - 20:55:12: EPOCH 142: training on 175115 raw words (50460 effective words) took 0.3s, 145741 effective words/s
INFO - 20:55:12: EPOCH 143: training on 175115 raw words (50124 effective words) took 0.4s, 137015 effective words/s
INFO - 20:55:12: EPOCH 144: training on 175115 raw words (50004 effective words) took 0.3s, 144033 effective words/s
INFO - 20:55:13: EPOCH 145: training on 175115 raw words (50067 effective words) took 0.4s, 139372 effective words/s
INFO - 20:55:13: EPOCH 146: training on 175115 raw words (50431 e

Time to train the model: 1.3 mins


In [128]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [129]:
w2v_model.wv.most_similar(positive=["slave"])

[('master', 0.42362457513809204),
 ('speak_valyrian', 0.3817283511161804),
 ('yunkai', 0.373103529214859),
 ('kraznys', 0.3230419158935547),
 ('unsullie', 0.3153010904788971),
 ('meereen', 0.2728897035121918),
 ('valerian', 0.2709660530090332),
 ('dragon', 0.2700197100639343),
 ('dothraki', 0.257055401802063),
 ('astapor', 0.2508307993412018)]

In [130]:
w2v_model.sample()

TypeError: 'float' object is not callable

In [133]:
w2v_model.wv.most_similar(positive=['baratheon', 'stark'], negative=['robert'])

[('bolton', 0.297207236289978),
 ('house', 0.2921888828277588),
 ('tully', 0.2502659559249878),
 ('winterfell', 0.24744388461112976),
 ('banner', 0.24295684695243835),
 ('pledge', 0.24007676541805267),
 ('declare', 0.2158336192369461),
 ('tarth', 0.2122897207736969),
 ('catelyn', 0.21162854135036469),
 ('lannister', 0.20848864316940308)]

In [158]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.express as px
import plotly.graph_objs as go


def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.index_to_key:
        tokens.append(model.wv.get_vector(word))
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(np.array(tokens))

#     fig = px.scatter_3d(
#         new_values, x=0, y=1, z=2,
#         color=df.Speaker, labels={'color': 'species'}
#     )
#     fig.update_traces(marker_size=8)
#     fig.show()

    

    x = []
    y = []

    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    fig = go.Figure()

    for i in range(len(x)):
        fig.add_trace(go.Scatter(x=[x[i]], y=[y[i]],
                        mode='markers+text',
                        text=[labels[i]],
                        textposition='bottom center',
                        marker=dict(size=10)))

    fig.update_layout(
        title="Scatter Plot with Labels",
        xaxis_title="X-axis",
        yaxis_title="Y-axis",
        showlegend=False,
        width=800,
        height=800,
        margin=dict(l=40, r=40, t=80, b=40),
    )

    fig.show()

    
    
#     x = []
#     y = []
#     for value in new_values:
#         x.append(value[0])
#         y.append(value[1])
        
#     plt.figure(figsize=(16, 16)) 
#     for i in range(len(x)):
#         plt.scatter(x[i],y[i])
#         plt.annotate(labels[i],
#                      xy=(x[i], y[i]),
#                      xytext=(5, 2),
#                      textcoords='offset points',
#                      ha='right',
#                      va='bottom')
#     plt.show()


In [159]:
tsne_plot(w2v_model)

In [165]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [183]:

fig = go.Figure()

for i in range(len(x)):
    
   # returns a document of object
    doc = nlp(labels[i])

    # checking if it is a noun or not
    if(doc[0].tag_ in ['NNP', 'NNPS']):
        fig.add_trace(go.Scatter(x=[x[i]], y=[y[i]],
                                 mode='markers+text',
                                 text=[labels[i]],
                                 textposition='bottom center',
                                 marker=dict(size=10)))
    else:
        pass
    
   

fig.update_layout(
    title="Scatter Plot with Labels",
    xaxis_title="X-axis",
    yaxis_title="Y-axis",
    showlegend=False,
    width=800,
    height=800,
    margin=dict(l=40, r=40, t=80, b=40),
)

fig.show()



In [173]:
sorted(w2v_model.wv.index_to_key)

["'",
 'abandon',
 'able',
 'accept',
 'accompany',
 'act',
 'actually',
 'address',
 'admire',
 'admit',
 'advance',
 'advice',
 'advise',
 'aegon',
 'aemon',
 'afraid',
 'age',
 'ago',
 'agree',
 'agreement',
 'ah',
 'ahead',
 'aim',
 'air',
 'ale',
 'alive',
 'alliance',
 'alliser',
 'allow',
 'ally',
 'alongside',
 'andal',
 'angrily',
 'angry',
 'animal',
 'answer',
 'anymore',
 'apart',
 'apologize',
 'apology',
 'appear',
 'approach',
 'archer',
 'archmaester',
 'aren',
 'arm',
 'armor',
 'army',
 'arrive',
 'arrow',
 'arryn',
 'arya',
 'aside',
 'ask',
 'ass',
 'assume',
 'astapor',
 'atop',
 'attack',
 'attempt',
 'attention',
 'audience',
 'aunt',
 'avenge',
 'away',
 'ax',
 'axe',
 'aye',
 'baby',
 'back',
 'background',
 'bad',
 'baelish',
 'baelor',
 'bag',
 'balcony',
 'ball',
 'balon',
 'band',
 'banner',
 'bannerman',
 'bar',
 'baratheon',
 'bark',
 'barrel',
 'barristan',
 'bastard',
 'bath',
 'battle',
 'battlefield',
 'bay',
 'beach',
 'bear',
 'beast',
 'beat',
 'be

In [182]:
nlp("tiger")[0].tag_

'NNS'

In [191]:

fig = go.Figure()

for i in range(len(x)):
    
   # returns a document of object
    doc = nlp(labels[i].strip())

    # checking if it is a noun or not
    if("sigh" in labels[i]):
        print(labels[i], "<")
        print(doc[0].tag_)
        
    if(doc[0].tag_ in ['NNP']):
        fig.add_trace(go.Scatter(x=[x[i]], y=[y[i]],
                                 mode='markers+text',
                                 text=[labels[i]],
                                 textposition='bottom center',
                                 marker=dict(size=10)))
    else:
        pass
    
   

fig.update_layout(
    title="Scatter Plot with Labels",
    xaxis_title="X-axis",
    yaxis_title="Y-axis",
    showlegend=False,
    width=800,
    height=800,
    margin=dict(l=40, r=40, t=80, b=40),
)

fig.show()



sigh <
NNP
sight <
NN


In [198]:
nlp("hug")[0].tag_

'NNP'

In [194]:
"sigh <"[:-1].strip()

'sigh'

In [200]:
import nltk
nltk.download('averaged_perceptron_tagger')
  
# taking input text as India
text = "India"
ans = nltk.pos_tag()
  
# ans returns a list of tuple
val = ans[0][1]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


TypeError: tokens: expected a list of strings, got a string