# Embedding
mathmetical representation of words

In [1]:
import gensim

In [2]:
from gensim.models import KeyedVectors

In [3]:
model = KeyedVectors.load_word2vec_format('./dataset/dataset/word2vec.6B.200d.txt')

In [7]:
model.most_similar(positive= ['person', 'queen'], negative=['king'])

[('woman', 0.6354276537895203),
 ('someone', 0.5549800992012024),
 ('she', 0.5433660745620728),
 ('herself', 0.5216142535209656),
 ('her', 0.5083276629447937),
 ('anyone', 0.5072499513626099),
 ('persons', 0.5012387633323669),
 ('one', 0.49033915996551514),
 ('child', 0.48775848746299744),
 ('every', 0.4856557548046112)]

In [13]:
model.similar_by_word('marvel')

[('comics', 0.766106128692627),
 ('superhero', 0.6700377464294434),
 ('superheroes', 0.6589516401290894),
 ('superman', 0.6061909198760986),
 ('comic', 0.5999482274055481),
 ('x-men', 0.5982449054718018),
 ('spider-man', 0.5939557552337646),
 ('universe', 0.5562811493873596),
 ('batman', 0.5428684949874878),
 ('team-up', 0.5420282483100891)]

In [14]:
import numpy as np
import pandas as pd
import nltk
import joblib

In [15]:
df = pd.read_csv('./dataset/dataset/imdb_labelled.txt', sep= '\t', header=None, names=['Review', 'Sentiment'])
df.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [19]:
df.shape

(748, 2)

In [20]:
df.drop_duplicates(inplace= True)

In [21]:
df.shape

(745, 2)

In [22]:
X= df['Review']
y= df['Sentiment']

In [23]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences  # padding helps to give same number of input to the model

In [24]:
tokenizer = Tokenizer()

In [25]:
tokenizer.fit_on_texts(X)

In [26]:
vocab_size = len(tokenizer.word_counts.keys())
vocab_size

3133

In [28]:
tokenizer.word_counts

OrderedDict([('a', 433),
             ('very', 65),
             ('slow', 6),
             ('moving', 4),
             ('aimless', 1),
             ('movie', 181),
             ('about', 50),
             ('distressed', 1),
             ('drifting', 1),
             ('young', 4),
             ('man', 13),
             ('not', 72),
             ('sure', 3),
             ('who', 38),
             ('was', 185),
             ('more', 31),
             ('lost', 4),
             ('the', 848),
             ('flat', 2),
             ('characters', 35),
             ('or', 41),
             ('audience', 5),
             ('nearly', 1),
             ('half', 6),
             ('of', 377),
             ('whom', 2),
             ('walked', 2),
             ('out', 41),
             ('attempting', 1),
             ('artiness', 1),
             ('with', 90),
             ('black', 9),
             ('white', 8),
             ('and', 434),
             ('clever', 5),
             ('camera', 10),
       

In [32]:
word_index = tokenizer.word_index
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'is': 5,
 'this': 6,
 'i': 7,
 'it': 8,
 'to': 9,
 'in': 10,
 'was': 11,
 'movie': 12,
 'film': 13,
 'that': 14,
 '0': 15,
 '1': 16,
 'for': 17,
 'as': 18,
 'but': 19,
 'with': 20,
 'one': 21,
 'on': 22,
 'you': 23,
 'are': 24,
 'not': 25,
 'bad': 26,
 "it's": 27,
 'very': 28,
 'all': 29,
 'just': 30,
 'so': 31,
 'good': 32,
 'at': 33,
 'an': 34,
 'be': 35,
 'there': 36,
 'about': 37,
 'have': 38,
 'by': 39,
 'like': 40,
 'from': 41,
 'if': 42,
 'acting': 43,
 'time': 44,
 'his': 45,
 'or': 46,
 'out': 47,
 'really': 48,
 'great': 49,
 'even': 50,
 'he': 51,
 'who': 52,
 'were': 53,
 'has': 54,
 'see': 55,
 'my': 56,
 'characters': 57,
 'well': 58,
 'most': 59,
 'how': 60,
 'more': 61,
 'no': 62,
 'only': 63,
 'when': 64,
 'ever': 65,
 'movies': 66,
 'plot': 67,
 'story': 68,
 'made': 69,
 'some': 70,
 '10': 71,
 'they': 72,
 'best': 73,
 'because': 74,
 'your': 75,
 'can': 76,
 'also': 77,
 "don't": 78,
 'films': 79,
 'than': 80,
 'its': 81,
 

In [30]:
X[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [31]:
tokens = tokenizer.texts_to_sequences(X)
tokens[0]

[3, 28, 28, 28, 287, 407, 1216, 12, 37, 3, 1217, 1218, 408, 143]

In [33]:
sentence_length = [len(x) for x in tokens]

min(sentence_length), max(sentence_length)

(1, 1400)

In [36]:
from collections import Counter
Counter(sentence_length)

Counter({14: 23,
         18: 18,
         29: 7,
         8: 37,
         21: 21,
         20: 30,
         3: 24,
         15: 30,
         10: 30,
         6: 38,
         11: 47,
         4: 26,
         16: 34,
         25: 14,
         17: 21,
         872: 1,
         12: 42,
         5: 36,
         19: 21,
         24: 15,
         34: 8,
         7: 35,
         23: 12,
         9: 37,
         2: 12,
         13: 25,
         26: 6,
         1: 3,
         37: 2,
         22: 14,
         27: 8,
         35: 4,
         200: 1,
         1400: 1,
         45: 4,
         28: 7,
         302: 1,
         43: 2,
         31: 8,
         55: 1,
         44: 2,
         33: 10,
         36: 5,
         69: 1,
         57: 1,
         32: 3,
         30: 6,
         73: 1,
         47: 1,
         38: 3,
         39: 1,
         53: 1,
         51: 1,
         42: 2,
         802: 1})

In [37]:
padded_tokens = pad_sequences(tokens, maxlen=50)

In [38]:
embedding_index = {}
f = open('./dataset/dataset/glove.6B.200d.txt', encoding='UTF-8')
for line in f:
    values = line.split()
    word = values[0]
    coef = np.asarray(values[1:], dtype= 'float32')
    embedding_index[word]= coef
f.close()

In [39]:
embedding_index['nepal']

array([ 0.020101 ,  0.41544  , -0.27556  ,  0.63967  ,  0.13188  ,
       -0.077937 ,  0.045322 , -0.23615  , -0.38871  , -0.24816  ,
       -0.11325  ,  0.62334  , -0.0043612,  0.095859 , -0.36703  ,
       -0.097942 , -0.028298 ,  0.17543  , -0.61044  ,  0.0072222,
        0.65129  ,  2.5718   , -0.45644  ,  0.51435  ,  0.33696  ,
       -0.41476  , -0.049719 ,  1.5525   ,  0.2805   ,  0.89074  ,
       -0.068298 , -0.86965  , -0.1083   ,  0.20397  , -0.28714  ,
       -0.0095998, -0.06447  , -0.046072 , -0.16697  ,  0.23467  ,
       -0.14176  ,  0.4919   ,  0.38737  , -0.13118  , -0.16852  ,
       -0.087953 , -0.27169  , -0.27162  , -0.46881  , -0.76117  ,
        0.10411  , -0.29944  , -0.088991 , -0.71948  , -0.23261  ,
        0.18182  , -0.16252  , -0.93978  ,  0.033649 ,  0.72291  ,
        0.46839  ,  0.30018  ,  0.47769  ,  1.3974   ,  0.057041 ,
        0.16294  , -0.3421   ,  0.069266 , -0.65844  , -0.90702  ,
       -0.48365  , -0.28711  , -0.22239  ,  0.91995  , -0.0400

In [40]:
embedding_matrix = np.zeros((len(word_index) + 1, 200))

In [41]:
embedding_matrix.shape

(3134, 200)

In [42]:
for word in word_index.items():
    word, i = word
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i-1] = embedding_vector

In [46]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense, Conv1D, MaxPool1D #lstm preserve the previous data
from keras.optimizers import Adam

In [55]:
rnn = Sequential()
rnn.add(Embedding(input_dim= vocab_size + 1,
                  output_dim = 200,
                  input_length= 50, weights= [embedding_matrix],
                  trainable = False
                 ))
rnn.add(Conv1D(filters=50, kernel_size=3, activation='relu'))
rnn.add(MaxPool1D())
rnn.add(LSTM(units= 50, activation='relu'))
rnn.add(Dense(units=20, activation='relu'))
rnn.add(Dense(units=1, activation='sigmoid'))

rnn.compile(loss= 'binary_crossentropy', optimizer= Adam(learning_rate= 1e-6), metrics=['acc'])

In [56]:
rnn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 200)           626800    
                                                                 
 conv1d_3 (Conv1D)           (None, 48, 50)            30050     
                                                                 
 max_pooling1d_3 (MaxPoolin  (None, 24, 50)            0         
 g1D)                                                            
                                                                 
 lstm_2 (LSTM)               (None, 50)                20200     
                                                                 
 dense_4 (Dense)             (None, 20)                1020      
                                                                 
 dense_5 (Dense)             (None, 1)                 21        
                                                      

In [57]:
rnn.fit(padded_tokens, y, epochs= 20, validation_split=0.15)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x2552ed86d50>