![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)

# <center> Machine Learning Methods </center>
## <center> Lecture 25 - Introduction to NLP</center>
### <center> Wrod2Vec </center>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/MachineLearningMethod/25_NLP/MainWord2Vec.ipynb)

In [1]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rc('font', **{'size' : 16})

### Load IMDB Sentiment Analysis:
https://www.kaggle.com/kaushik3497/imdb-sentiment-analysis

In [2]:
N     = 25000
dData = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
dData = dData[:N]
dData

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


### Pre-processing:
* Remove HTML stuff.
* Remove punctuation and switch to lower case.

In [3]:
import re
from   bs4 import BeautifulSoup  

def PreprocessLine(text):
    
    #-- Remove <br> and HTML:
    text   = BeautifulSoup(text).get_text()
    
    #-- Keep lower case letters:
    lWords = re.sub("[^a-zA-Z]", " ", text).lower().split() 
    
    return  " ".join(lWords)

In [4]:
import re
import nltk
from   nltk.corpus import stopwords
from   nltk.stem   import WordNetLemmatizer
from   bs4         import BeautifulSoup  

oWordNetLemmatizer = WordNetLemmatizer()
sStopWords         = set(stopwords.words('english')) 

def PreprocessLine(text, printFlag=False):
        
    if printFlag == False:
        print2 = lambda str: None
    else:
        print2 = lambda str: print(str)
        
    print2('Original text:')
    print2(text)
    print2('----------------------------------------------------------\n')
    
    print2('Remove <br> and HTML:')
    
    print2(text)
    print2('----------------------------------------------------------\n')
    
    print2('Keep lower case letters:')
    lWords = re.sub("[^a-zA-Z]", " ", text).lower().split() 
    print2(lWords)
    print2('----------------------------------------------------------\n')
    
#     print2('Lemmatization:')
#     lWords = [oWordNetLemmatizer.lemmatize(word) for word in lWords if word not in sStopWords]   
#     print2(lWords)
#     print2('----------------------------------------------------------\n')
    
    return  lWords

### Split text into lines:
Using `tokenizer`

In [5]:
import nltk

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

sText  = dData['review'].values[9]
lLines = tokenizer.tokenize(sText.strip())

print(sText)
print('=================')
print('=================')
lLines

<br /><br />This movie is full of references. Like \Mad Max II\", \"The wild one\" and many others. The ladybug´s face it´s a clear reference (or tribute) to Peter Lorre. This movie is a masterpiece. We´ll talk much more about in the future."


['<br /><br />This movie is full of references.',
 'Like \\Mad Max II\\", \\"The wild one\\" and many others.',
 'The ladybug´s face it´s a clear reference (or tribute) to Peter Lorre.',
 'This movie is a masterpiece.',
 'We´ll talk much more about in the future."']

### Convert raw text to processed lines:

In [6]:
def Text2lines(text):
    lRawLines = tokenizer.tokenize(text.strip())
    lLines = []
    
    for rawLine in lRawLines:
        if len(rawLine) > 0:
            lLines.append(PreprocessLine(rawLine))

    return lLines

In [7]:
lLines = []
for text in dData['review'].values:
    lLines += Text2lines(text)

### Train Word2Vec model:
Using `gensim`  
https://radimrehurek.com/gensim/index.html

In [8]:
from gensim.models import word2vec

d            = 300
minWordCount = 40
contextWin   = 5

oWord2Vec = word2vec.Word2Vec(lLines, workers=4, size=d, min_count=minWordCount, window=contextWin)

#-- If you don't plan to train the model any further, calling 
#-- init_sims will make the model much more memory-efficient.
oWord2Vec.init_sims(replace=True)

#### Check vocabulary size:

In [9]:
len(oWord2Vec.wv.vocab)

8308

#### Which word from the given list doesn't go with the others?

In [10]:
oWord2Vec.wv.doesnt_match(['man', 'child', 'woman', 'film'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'film'

#### Most similar:

In [11]:
oWord2Vec.wv.most_similar('film')

[('movie', 0.8880027532577515),
 ('documentary', 0.6742621064186096),
 ('picture', 0.6466796398162842),
 ('flick', 0.6288743615150452),
 ('sequel', 0.5803209543228149),
 ('masterpiece', 0.5269690155982971),
 ('films', 0.5204336047172546),
 ('segment', 0.5121288895606995),
 ('thriller', 0.5106791853904724),
 ('cinema', 0.5068612098693848)]

### A single word embedding:

In [12]:
word = 'this'
vZ   = oWord2Vec.wv[word]
vZ.shape

(300,)

### Algebra with words:

In [13]:
vWord = oWord2Vec.wv['actor'] - oWord2Vec.wv['man'] +  oWord2Vec.wv['woman']
oWord2Vec.wv.most_similar(positive=[vWord], topn=1)

# oWord2Vec.wv.most_similar(positive=['actor', 'woman'], negative=['man'])

[('actress', 0.8203716278076172)]

### Model words embedding:

In [14]:
#-- Embedding vectors (Vw)
mZ = oWord2Vec.wv.vectors
#-- Context vectors (Vc)
mC = oWord2Vec.trainables.syn1neg

mZ.shape, mC.shape

((8308, 300), (8308, 300))

### Compute $K$ clusters and print most similar words:

In [15]:
from sklearn.cluster import KMeans

K       = 10
oKmeans = KMeans(n_clusters=K, n_init=1, init='random').fit(mZ)
mMu     = oKmeans.cluster_centers_

In [16]:
for ii in range(K):
    print('--------------------------')
    print(f'Cluster {ii}:')
    display(oWord2Vec.wv.similar_by_vector(mMu[ii,:]))
    print('--------------------------\n')

--------------------------
Cluster 0:


[('ask', 0.6485928297042847),
 ('say', 0.6430670022964478),
 ('guess', 0.6388207674026489),
 ('leave', 0.6332195997238159),
 ('warn', 0.6293051838874817),
 ('forgive', 0.6214397549629211),
 ('buy', 0.6196361780166626),
 ('think', 0.6162622570991516),
 ('understand', 0.6160597205162048),
 ('regret', 0.614203929901123)]

--------------------------

--------------------------
Cluster 1:


[('climbing', 0.8881481289863586),
 ('racing', 0.8640392422676086),
 ('spaceship', 0.8573546409606934),
 ('mountain', 0.8503150939941406),
 ('laser', 0.8500832915306091),
 ('crashes', 0.8498013019561768),
 ('ranch', 0.8490526080131531),
 ('flames', 0.8454511165618896),
 ('windows', 0.8430398106575012),
 ('yard', 0.8426832556724548)]

--------------------------

--------------------------
Cluster 2:


[('natives', 0.823610782623291),
 ('groups', 0.8109252452850342),
 ('methods', 0.8036380410194397),
 ('inmates', 0.8035577535629272),
 ('muslims', 0.8012620806694031),
 ('cities', 0.7992398738861084),
 ('workers', 0.7922746539115906),
 ('companies', 0.7908176183700562),
 ('dancers', 0.7901692390441895),
 ('slaves', 0.787493109703064)]

--------------------------

--------------------------
Cluster 3:


[('imagery', 0.8432190418243408),
 ('aesthetic', 0.8067528009414673),
 ('storytelling', 0.8001387119293213),
 ('blend', 0.7867540121078491),
 ('atmosphere', 0.7765041589736938),
 ('artistry', 0.7763543128967285),
 ('scope', 0.7749865055084229),
 ('flair', 0.7732428312301636),
 ('technique', 0.7716243267059326),
 ('symbolism', 0.758178174495697)]

--------------------------

--------------------------
Cluster 4:


[('unsettling', 0.8582496643066406),
 ('improbable', 0.8398882150650024),
 ('preposterous', 0.8398847579956055),
 ('unpredictable', 0.8349109888076782),
 ('restrained', 0.8341395854949951),
 ('stylized', 0.8336154818534851),
 ('simplistic', 0.82525634765625),
 ('clumsy', 0.8245313167572021),
 ('vivid', 0.8210886120796204),
 ('poetic', 0.8150763511657715)]

--------------------------

--------------------------
Cluster 5:


[('policeman', 0.897533118724823),
 ('dictator', 0.8954358100891113),
 ('millionaire', 0.8869168758392334),
 ('dealer', 0.8862613439559937),
 ('salesman', 0.8700532913208008),
 ('abu', 0.8655052185058594),
 ('playboy', 0.8649786114692688),
 ('investigator', 0.8608105778694153),
 ('attorney', 0.8553116917610168),
 ('preacher', 0.8551937341690063)]

--------------------------

--------------------------
Cluster 6:


[('ideals', 0.8482915163040161),
 ('corruption', 0.8396477699279785),
 ('turmoil', 0.8337985277175903),
 ('destruction', 0.8320393562316895),
 ('beliefs', 0.8309690952301025),
 ('addiction', 0.8285077214241028),
 ('dilemma', 0.8259705305099487),
 ('awareness', 0.8209913969039917),
 ('plight', 0.8209081292152405),
 ('involvement', 0.8199225664138794)]

--------------------------

--------------------------
Cluster 7:


[('columbia', 0.8638345003128052),
 ('august', 0.8076130151748657),
 ('wwe', 0.7895396947860718),
 ('revival', 0.7722963690757751),
 ('uk', 0.7620391845703125),
 ('region', 0.7544271349906921),
 ('corporation', 0.7538974285125732),
 ('fifties', 0.7523468732833862),
 ('boogeyman', 0.7453266978263855),
 ('code', 0.7446240782737732)]

--------------------------

--------------------------
Cluster 8:


[('everett', 0.9420100450515747),
 ('keith', 0.9394093155860901),
 ('mitchell', 0.932623565196991),
 ('thomas', 0.930459201335907),
 ('rooney', 0.9293650388717651),
 ('miller', 0.9288190603256226),
 ('stevens', 0.9278692007064819),
 ('collins', 0.9263511896133423),
 ('burke', 0.9262815713882446),
 ('bennett', 0.9243419170379639)]

--------------------------

--------------------------
Cluster 9:


[('informs', 0.8278855085372925),
 ('reveals', 0.7913388609886169),
 ('commits', 0.7677836418151855),
 ('blames', 0.7613875269889832),
 ('considers', 0.7564082741737366),
 ('admits', 0.7492868900299072),
 ('forgets', 0.7487760782241821),
 ('convinces', 0.7476508021354675),
 ('demonstrates', 0.7459062337875366),
 ('approached', 0.7453361749649048)]

--------------------------

