<a href="https://colab.research.google.com/github/DanielaManate/SentimentAnalysis_MADC_UBB/blob/master/SA3_1_Text_Representation_EG_TRAIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [0]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from gensim.models.fasttext import FastText

from sklearn.decomposition import PCA

import plotly.express as px

In [0]:
# Training data: a toy corpus of five sentences, each split on whitespace
# into a list of word tokens (the list-of-token-lists format passed to the models below).
sentences = [
    line.split()
    for line in (
        'we are going to use word2vec',
        'also we will use fasttext',
        'this is a sentence',
        'which embedding is the best for these sentences',
        'lets find out',
    )
]

In [0]:
# Optional (left disabled): pin Python's string-hash seed so that trained
# embeddings can be reproduced between runs.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it with %env inside an already-running kernel presumably does not
# change hash() for this session - confirm, and set it before the kernel
# launches if exact reproducibility is required.
# %env PYTHONHASHSEED=0

# 1. Word2Vec - train your own Model. Default = CBOW (sg=0)

In [0]:
# Train a Word2Vec model on the toy corpus
# size = 4      => each word is embedded in 4 dimensions
# min_count = 1 => ignores all words with total frequency lower than this,
#                  i.e. every word of the corpus is kept
# seed = 5, workers = 1 => fixed random seed and a single worker thread, so the
#                  result does not depend on random initialisation or on thread
#                  scheduling. Reproducibility across kernel restarts may
#                  additionally require PYTHONHASHSEED to be set before the
#                  interpreter starts (see the gensim Word2Vec documentation).
model_word2vec = Word2Vec(sentences, min_count=1, size=4, seed=5, workers=1)

In [5]:
# Collect every word the Word2Vec model knows, in the vocabulary's own
# iteration order, and display the resulting list
word_list = [word for word in model_word2vec.wv.vocab]
word_list

['we',
 'are',
 'going',
 'to',
 'use',
 'word2vec',
 'also',
 'will',
 'fasttext',
 'this',
 'is',
 'a',
 'sentence',
 'which',
 'embedding',
 'the',
 'best',
 'for',
 'these',
 'sentences',
 'lets',
 'find',
 'out']

In [6]:
# Access vector for one word
# 'sentence' is part of the training corpus, so the lookup succeeds; the
# returned array has 4 components because the model was trained with size=4.
model_word2vec.wv['sentence']

array([ 0.01587466, -0.06909975,  0.03416497,  0.05477782], dtype=float32)

## What happens if we try to get the embedding of a word that wasn't seen during training?

In [0]:
# 'sentiment' does not occur in the training sentences, so Word2Vec has no
# vector for it and the lookup raises a KeyError. The error is caught and its
# message printed so the cell demonstrates the failure without interrupting
# a "Run All".
# Expected message: "word 'sentiment' not in vocabulary"
try:
    print(model_word2vec.wv['sentiment'])
except KeyError as err:
    print(err)

In [0]:
# 'random' does not occur in the training sentences either, so this lookup
# also raises a KeyError. The error is caught and its message printed so the
# cell demonstrates the failure without interrupting a "Run All".
# Expected message: "word 'random' not in vocabulary"
try:
    print(model_word2vec.wv['random'])
except KeyError as err:
    print(err)

*   If we try to get the embeddings of the words 'sentiment' and 'random', which did not appear in the training data, we get the following error: "word '...' not in vocabulary".
*   Word2Vec cannot produce embeddings for words that were not seen during training.



## Visualization

In [9]:
# Look up the vectors of all vocabulary words at once: indexing with the list
# of words returns a 2D array with one row per word, in word_list order,
# and one column per embedding dimension.
X_word2vec = model_word2vec.wv[word_list]
X_word2vec

array([[-0.07902575, -0.06842621,  0.07083109,  0.08555245],
       [ 0.09756257, -0.05231469, -0.08910951,  0.09866431],
       [ 0.06362847, -0.03827805, -0.00503045,  0.01877309],
       [-0.0491444 ,  0.04018881, -0.11751047,  0.07474335],
       [-0.04847693, -0.02630245,  0.03138343, -0.07643729],
       [ 0.02418801,  0.11990099,  0.07720531,  0.0222541 ],
       [ 0.02726939, -0.05427095, -0.03311149,  0.00594523],
       [-0.01241367,  0.09394597,  0.08032301, -0.10774031],
       [-0.02654121, -0.11094412,  0.05059164, -0.10145399],
       [ 0.03353186, -0.06959727,  0.08702777, -0.08627736],
       [ 0.06110029, -0.0543455 ,  0.1092888 ,  0.08007254],
       [ 0.09169698,  0.02508538, -0.03747644, -0.07516889],
       [ 0.01587466, -0.06909975,  0.03416497,  0.05477782],
       [-0.06119813,  0.11127528,  0.03100854,  0.07332224],
       [ 0.04481099, -0.00627586,  0.0255466 , -0.08535679],
       [-0.08459608,  0.00687029, -0.10617413,  0.11529242],
       [ 0.12194087, -0.

In [10]:
# Reduce the Word2Vec vectors to two principal components so that they can
# be drawn on a plane: build the PCA estimator first, then fit it and
# project the vectors in a single step.
reducer_word2vec = PCA(n_components=2)
pca_word2vec = reducer_word2vec.fit_transform(X_word2vec)
pca_word2vec

array([[ 4.82901968e-02, -3.27646062e-02],
       [ 2.78435033e-02, -1.34967715e-01],
       [-3.38228233e-02, -4.84774373e-02],
       [ 1.34071380e-01, -1.66592002e-02],
       [-4.65583839e-02,  5.28310500e-02],
       [ 1.47776138e-02,  1.08762912e-01],
       [-1.79692153e-02, -5.21891788e-02],
       [-6.26698434e-02,  1.63020298e-01],
       [-1.12177148e-01,  1.14603390e-05],
       [-1.33374840e-01,  1.69975515e-02],
       [-4.01110686e-02, -5.13209663e-02],
       [-7.70343989e-02,  2.41710581e-02],
       [-9.73050855e-03, -6.07987307e-02],
       [ 1.09025337e-01,  9.00992602e-02],
       [-9.33695957e-02,  4.07283083e-02],
       [ 1.64153561e-01, -4.60125282e-02],
       [-3.90255116e-02, -1.18142202e-01],
       [ 7.74538293e-02, -2.35672314e-02],
       [-1.05815634e-01, -4.59795073e-02],
       [-4.98274900e-02, -3.33824158e-02],
       [ 1.95904285e-01,  7.37552196e-02],
       [-1.59421116e-02,  1.32832646e-01],
       [ 6.59088492e-02, -3.89480144e-02]], dtype=floa

In [11]:
# Scatter plot of the 2D PCA projection: one point per vocabulary word,
# labelled with the word itself
fig = px.scatter(x=pca_word2vec[:, 0], y=pca_word2vec[:, 1],
                 text=word_list)
# Put each label above its point and make it large enough to read
fig.update_traces(textposition='top center',
                  textfont_size=14)
# Title and axis labels so the figure can be understood on its own and is
# not confused with the FastText plot further down
fig.update_layout(title='Word2Vec embeddings (2D PCA projection)',
                  xaxis_title='Principal component 1',
                  yaxis_title='Principal component 2')
fig.show()

# 2. FastText - train your own Model. Default = CBOW (sg=0)

In [0]:
# Train a FastText model on the same toy corpus
# size = 4      => each word is embedded in 4 dimensions
# min_count = 1 => every word of the corpus is kept
# Size of character ngrams default: min_n=3, max_n=6
# seed = 5, workers = 1 => fixed random seed and a single worker thread, so the
#                  result does not depend on random initialisation or on thread
#                  scheduling. Reproducibility across kernel restarts may
#                  additionally require PYTHONHASHSEED to be set before the
#                  interpreter starts (see the gensim documentation).
model_fasttext = FastText(sentences, min_count=1, size=4, seed=5, workers=1)

In [13]:
# Collect every word the FastText model knows, in the vocabulary's own
# iteration order, and print the list on a single line
word_list = [word for word in model_fasttext.wv.vocab]
print(word_list)

['we', 'are', 'going', 'to', 'use', 'word2vec', 'also', 'will', 'fasttext', 'this', 'is', 'a', 'sentence', 'which', 'embedding', 'the', 'best', 'for', 'these', 'sentences', 'lets', 'find', 'out']


In [14]:
# Access vector for one word
# 'sentence' is part of the training corpus; the printed vector has
# 4 components because the model was trained with size=4.
print(model_fasttext.wv['sentence'])

[ 0.00564454 -0.02561449 -0.02562428 -0.02249124]


## What happens if we try to get the embedding of a word that wasn't seen during training?

In [15]:
# 'sentiment' is not one of the training words, yet FastText still returns a
# 4-dimensional vector for it (unlike the Word2Vec lookup of the same word).
model_fasttext.wv['sentiment']

array([-0.03443928, -0.06827369, -0.06017304,  0.007128  ], dtype=float32)

In [0]:
# 'random' is not one of the training words. With the gensim version this
# notebook was written for, none of its character ngrams were seen during
# training either, so the lookup raises a KeyError. The error is caught and
# its message printed so the cell demonstrates the failure without
# interrupting a "Run All"; if a vector is returned instead, it is printed.
# Expected message: 'all ngrams for word random absent from model'
try:
    print(model_fasttext.wv['random'])
except KeyError as err:
    print(err)

*   If we try to get the embedding of the word 'sentiment' (which did not appear in the training data), FastText can still return its embedding, because some of the word's character n-grams were present in the training data.
*   For the word 'random', we get an error because all n-grams of the word are absent from the model.
*   FastText can produce embeddings for words that were not seen during training, as long as at least one of their character n-grams was present in the training data.



## Visualization

In [17]:
# Look up the FastText vectors of all vocabulary words at once: indexing with
# the list of words returns a 2D array with one row per word, in word_list
# order, and one column per embedding dimension.
X_fasttext = model_fasttext.wv[word_list]
X_fasttext

array([[ 1.21204033e-02,  4.99350205e-02, -8.78660530e-02,
        -6.07791878e-02],
       [ 6.21617176e-02,  8.45489427e-02, -1.37556955e-01,
         1.46440333e-02],
       [-2.61515211e-02, -1.09270103e-01, -3.84047963e-02,
         5.08586355e-02],
       [ 4.00994942e-02,  1.83750257e-01, -4.02536131e-02,
         4.12145779e-02],
       [-6.26747683e-02, -9.83016379e-03,  1.28589766e-02,
         3.35453488e-02],
       [ 1.34324366e-02,  4.72705103e-02, -1.83642488e-02,
         2.53529791e-02],
       [-2.73731463e-02,  1.74377933e-02,  5.19142635e-02,
         4.21980470e-02],
       [ 1.66560952e-02,  8.05674866e-03,  1.59828421e-02,
        -2.28994116e-02],
       [ 2.45919377e-02,  2.28262017e-03, -3.85213620e-03,
        -3.79294008e-02],
       [ 4.10076343e-02, -5.09262718e-02,  3.17586921e-02,
         1.36985490e-03],
       [ 1.19109683e-01, -2.28398312e-02, -1.76607296e-02,
         5.28974459e-02],
       [-9.80637074e-02,  3.04356329e-02,  1.69686362e-01,
      

In [18]:
# Reduce the FastText vectors to two principal components so that they can
# be drawn on a plane: build the PCA estimator first, then fit it and
# project the vectors in a single step.
reducer_fasttext = PCA(n_components=2)
pca_fasttext = reducer_fasttext.fit_transform(X_fasttext)
pca_fasttext

array([[-0.06378063,  0.05084066],
       [-0.16215678,  0.03011004],
       [ 0.01165616, -0.12420192],
       [-0.13164093,  0.12147529],
       [ 0.0303038 , -0.01728766],
       [-0.0437893 ,  0.01920552],
       [ 0.03027428,  0.00294396],
       [ 0.01348476,  0.01295085],
       [ 0.00353009,  0.01068245],
       [ 0.03002763, -0.05011297],
       [-0.0682824 , -0.06947778],
       [ 0.20815597,  0.12990944],
       [ 0.00097946, -0.02103066],
       [ 0.05801173,  0.01082189],
       [ 0.01170602, -0.00529915],
       [ 0.05024039, -0.04995183],
       [ 0.02708293, -0.03785813],
       [ 0.07808102,  0.02213352],
       [ 0.01759859,  0.00195458],
       [ 0.01011732, -0.03146114],
       [-0.02882611,  0.02822603],
       [-0.01541857, -0.06975606],
       [-0.06735545,  0.03518307]], dtype=float32)

In [19]:
# Scatter plot of the 2D PCA projection: one point per vocabulary word,
# labelled with the word itself
fig = px.scatter(x=pca_fasttext[:, 0], y=pca_fasttext[:, 1],
                 text=word_list)
# Put each label above its point and make it large enough to read
fig.update_traces(textposition='top center',
                  textfont_size=14)
# Title and axis labels so the figure can be understood on its own and is
# not confused with the Word2Vec plot further up
fig.update_layout(title='FastText embeddings (2D PCA projection)',
                  xaxis_title='Principal component 1',
                  yaxis_title='Principal component 2')
fig.show()