![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)

# <center> Machine Learning Methods </center>
## <center> Lecture 24 - Introduction to NLP </center>
### <center> Wrod2Vec </center>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/MachineLearningMethod/24_NLP/MainWord2Vec.ipynb)

In [1]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rc('font', **{'size' : 16})

### Load IMDB Sentiment Analysis:
https://www.kaggle.com/kaushik3497/imdb-sentiment-analysis

In [2]:
N     = 25000
dData = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
dData = dData[:N]
dData

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


### Pre-processing:
* Remove HTML stuff.
* Remove punctuation and switch to lower case.

In [3]:
import re
from   bs4 import BeautifulSoup  

def PreprocessLine(text):
    
    text   = BeautifulSoup(text).get_text()                 #-- remove <br> and HTML
    lWords = re.sub("[^a-zA-Z]", " ", text).lower().split() #-- keep lower case letters
    
    return  lWords

### Split text into lines:
Using `tokenizer`

In [4]:
import nltk

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sText     = dData['review'].values[9]
lLines    = tokenizer.tokenize(sText.strip())

print(sText)
print('=' * 120)
print('=' * 120)
lLines

<br /><br />This movie is full of references. Like \Mad Max II\", \"The wild one\" and many others. The ladybug´s face it´s a clear reference (or tribute) to Peter Lorre. This movie is a masterpiece. We´ll talk much more about in the future."


['<br /><br />This movie is full of references.',
 'Like \\Mad Max II\\", \\"The wild one\\" and many others.',
 'The ladybug´s face it´s a clear reference (or tribute) to Peter Lorre.',
 'This movie is a masterpiece.',
 'We´ll talk much more about in the future."']

### Convert raw text to processed lines:

In [5]:
def Text2lines(sText):
    lRawLines = tokenizer.tokenize(sText.strip())
    lLines    = []
    
    for rawLine in lRawLines:
        if len(rawLine) > 0:
            lLines.append(PreprocessLine(rawLine))

    return lLines

In [6]:
lLines = []
for sText in dData['review'].values:
    lLines += Text2lines(sText)



In [7]:
len(lLines)

266885

### Train Word2Vec model:
Using `gensim`  
https://radimrehurek.com/gensim/index.html

In [8]:
from gensim.models import word2vec

d            = 300
minWordCount = 40
contextWin   = 5

oWord2Vec    = word2vec.Word2Vec(lLines, workers=4, vector_size=d, min_count=minWordCount, window=contextWin)



#### Check vocabulary size:

In [9]:
len(oWord2Vec.wv.key_to_index)

8306

In [10]:
oWord2Vec.wv.key_to_index

{'the': 0,
 'and': 1,
 'a': 2,
 'of': 3,
 'to': 4,
 'is': 5,
 'it': 6,
 'in': 7,
 'i': 8,
 'this': 9,
 'that': 10,
 's': 11,
 'was': 12,
 'as': 13,
 'for': 14,
 'with': 15,
 'movie': 16,
 'but': 17,
 'film': 18,
 't': 19,
 'you': 20,
 'on': 21,
 'not': 22,
 'he': 23,
 'are': 24,
 'his': 25,
 'have': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'they': 31,
 'by': 32,
 'an': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'there': 38,
 'her': 39,
 'or': 40,
 'just': 41,
 'about': 42,
 'out': 43,
 'if': 44,
 'has': 45,
 'what': 46,
 'some': 47,
 'good': 48,
 'can': 49,
 'more': 50,
 'she': 51,
 'when': 52,
 'very': 53,
 'up': 54,
 'time': 55,
 'no': 56,
 'even': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'story': 61,
 'only': 62,
 'really': 63,
 'see': 64,
 'their': 65,
 'had': 66,
 'we': 67,
 'were': 68,
 'me': 69,
 'well': 70,
 'than': 71,
 'much': 72,
 'get': 73,
 'bad': 74,
 'been': 75,
 'people': 76,
 'will': 77,
 'do': 78,
 'other': 79,
 'also': 80,
 'into': 81,
 'first

#### Which word from the given list doesn't go with the others?

In [11]:
oWord2Vec.wv.doesnt_match(['man', 'child', 'woman', 'film'])

'film'

#### Most similar:

In [12]:
oWord2Vec.wv.most_similar('film')

[('movie', 0.8893373012542725),
 ('documentary', 0.6703558564186096),
 ('picture', 0.6410948634147644),
 ('flick', 0.6283562779426575),
 ('sequel', 0.5940581560134888),
 ('cinema', 0.535230815410614),
 ('films', 0.5173639059066772),
 ('turkey', 0.5110447406768799),
 ('masterpiece', 0.5033935308456421),
 ('product', 0.49864691495895386)]

### A single word embedding:

In [13]:
word = 'this'
vZ   = oWord2Vec.wv[word]
vZ.shape

(300,)

### Algebra with words:

In [14]:
vWord = oWord2Vec.wv['actor'] - oWord2Vec.wv['man'] +  oWord2Vec.wv['woman']
oWord2Vec.wv.most_similar(positive=[vWord], topn=1)

# oWord2Vec.wv.most_similar(positive=['actor', 'woman'], negative=['man'])

[('actress', 0.8335807919502258)]

### Model words embedding:

In [15]:
#-- Embedding vectors (Vw)
mZ = oWord2Vec.wv.vectors
#-- Context vectors (Vc)
mC = oWord2Vec.syn1neg

mZ.shape, mC.shape

((8306, 300), (8306, 300))

### Compute $K$ clusters and print most similar words:

In [18]:
from sklearn.cluster import KMeans

K       = 10
# oKmeans = KMeans(n_clusters=K, n_init=1, init='random').fit(mZ)
oKmeans = KMeans(n_clusters=K).fit(mZ)
mMu     = oKmeans.cluster_centers_

In [19]:
bold = '\x1b[1m'
end  = '\x1b[0m'

for ii in range(K):
    print(bold + '--------------------------' + end)
    print(bold + f'Cluster {ii}:'             + end)
    display(oWord2Vec.wv.similar_by_vector(mMu[ii,:]))
    print(bold + '--------------------------' + end + '\n')

[1m--------------------------[0m
[1mCluster 0:[0m


[('it', 0.6963847279548645),
 ('i', 0.6258538365364075),
 ('just', 0.6148213744163513),
 ('you', 0.5909928679466248),
 ('actually', 0.5906994342803955),
 ('really', 0.5888749957084656),
 ('what', 0.5867460370063782),
 ('anyway', 0.5638018846511841),
 ('that', 0.5589742660522461),
 ('movie', 0.5473378896713257)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 1:[0m


[('truck', 0.8547428250312805),
 ('crashes', 0.8420207500457764),
 ('pool', 0.8314245343208313),
 ('riding', 0.8298056721687317),
 ('racing', 0.826576828956604),
 ('neck', 0.821727991104126),
 ('helicopter', 0.8204514384269714),
 ('tree', 0.8197736740112305),
 ('deserted', 0.8176237344741821),
 ('climbing', 0.8149627447128296)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 2:[0m


[('accept', 0.7897562384605408),
 ('relate', 0.7778269648551941),
 ('tell', 0.7591675519943237),
 ('ask', 0.7565035820007324),
 ('leave', 0.7556847333908081),
 ('find', 0.7475718855857849),
 ('explain', 0.7384020686149597),
 ('call', 0.7323665618896484),
 ('appreciate', 0.7301350831985474),
 ('understand', 0.7289038896560669)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 3:[0m


[('relations', 0.8446028232574463),
 ('beliefs', 0.8340627551078796),
 ('plight', 0.82899010181427),
 ('conflicts', 0.8280780911445618),
 ('awareness', 0.816286563873291),
 ('desperation', 0.8118703961372375),
 ('ideals', 0.8074533939361572),
 ('interests', 0.7975994348526001),
 ('views', 0.7928076982498169),
 ('existence', 0.790086030960083)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 4:[0m


[('thomas', 0.9392228126525879),
 ('julie', 0.9353925585746765),
 ('miller', 0.9344971776008606),
 ('lloyd', 0.924724280834198),
 ('patrick', 0.9228273034095764),
 ('burke', 0.922105073928833),
 ('raymond', 0.9198663830757141),
 ('keith', 0.9186036586761475),
 ('hugh', 0.9183512926101685),
 ('drake', 0.9160119891166687)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 5:[0m


[('uk', 0.7897157669067383),
 ('code', 0.7204399704933167),
 ('disc', 0.7199437618255615),
 ('columbia', 0.7165877223014832),
 ('bbc', 0.7083879113197327),
 ('blockbusters', 0.7080051898956299),
 ('broadcast', 0.7034980654716492),
 ('august', 0.6959718465805054),
 ('channel', 0.6902786493301392),
 ('th', 0.6857770681381226)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 6:[0m


[('wonderland', 0.8054859042167664),
 ('district', 0.8039807081222534),
 ('mobster', 0.7998520731925964),
 ('investigator', 0.7947738170623779),
 ('hugo', 0.7935276627540588),
 ('tourist', 0.7897177934646606),
 ('gangs', 0.7884373664855957),
 ('poet', 0.7870345115661621),
 ('gypsy', 0.7847400903701782),
 ('abu', 0.7842482328414917)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 7:[0m


[('uneven', 0.8552534580230713),
 ('bland', 0.8447214961051941),
 ('unrealistic', 0.8260059952735901),
 ('riveting', 0.824479341506958),
 ('melodramatic', 0.8235369920730591),
 ('formulaic', 0.8180772662162781),
 ('gripping', 0.8161770701408386),
 ('stylish', 0.8158475756645203),
 ('compelling', 0.8153582811355591),
 ('unreal', 0.8121597170829773)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 8:[0m


[('wanted', 0.6428464651107788),
 ('understood', 0.6208462119102478),
 ('appears', 0.6137144565582275),
 ('forgot', 0.6033156514167786),
 ('got', 0.6017614603042603),
 ('took', 0.5991787314414978),
 ('hated', 0.5886221528053284),
 ('did', 0.587199330329895),
 ('showed', 0.5856050252914429),
 ('missed', 0.5772632956504822)]

[1m--------------------------[0m

[1m--------------------------[0m
[1mCluster 9:[0m


[('visuals', 0.8131970763206482),
 ('dialog', 0.7800335884094238),
 ('dialogue', 0.7739783525466919),
 ('pacing', 0.7617191672325134),
 ('atmosphere', 0.7530574202537537),
 ('cinematography', 0.7408364415168762),
 ('storyline', 0.7345183491706848),
 ('imagery', 0.730393648147583),
 ('storytelling', 0.7242763042449951),
 ('editing', 0.7229069471359253)]

[1m--------------------------[0m

