Walk into Embedding

In [14]:
import pandas as pd
import numpy as np
# Load the IMDB reviews CSV into a DataFrame; later cells read its `review`
# and `sentiment` columns.
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# own copy of the dataset before running.
imdb_csv_path = r"C:\Users\ADITYA\Desktop\IMDB Dataset.csv"
df = pd.read_csv(imdb_csv_path)

In [27]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
#
# The column is rebuilt and assigned back in a single step instead of being
# written through chained indexing (`df.sentiment[mask] = ...`). The chained
# form raises SettingWithCopyWarning and does not modify `df` at all when
# pandas Copy-on-Write is active.
#
# Values that are not keys of the mapping pass through unchanged, so running
# this cell a second time leaves already-encoded labels as they are.
# When every label is recognised, pandas infers an integer dtype for the
# result rather than keeping an object column of Python ints.
sentiment_codes = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(lambda label: sentiment_codes.get(label, label))

In [28]:
from sklearn.model_selection import train_test_split
# Split reviews and labels into a training part and a held-out part.
# No test_size is passed, so scikit-learn's default proportion applies;
# random_state=1 pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["review"].values,
    df["sentiment"].values,
    random_state=1,
)
print(y_train)

[0 0 1 ... 0 0 1]


In [29]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training reviews only; the held-out reviews
# are encoded with that same fitted tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(X_train))

# Turn each split into integer id sequences, then pad/truncate every sequence
# to 100 ids so both splits come out as fixed-width 2-D arrays.
x_tr_seq, x_val_seq = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=100)
    for texts in (X_train, X_test)
)
print(x_tr_seq)

[[    33    979      9 ...     34  37372    979]
 [   913      1    368 ...     42      4    639]
 [     1     16  12900 ...     18     21     75]
 ...
 [109356     35     27 ...   3948    455    155]
 [    37     18     54 ...     10    213     11]
 [  2375   1511     94 ...     69    543   1146]]


In [30]:
# Number of rows needed in the embedding table: one per word known to the
# tokenizer, plus one extra slot for the padding index.
size_of_vocabulary = 1 + len(tokenizer.word_index)
print(size_of_vocabulary)

109359


In [31]:
from keras.models import *
from keras.layers import *
from keras.callbacks import *

# Binary sentiment classifier:
#   embedding -> LSTM (all timesteps) -> max over time -> dense head -> sigmoid.
model = Sequential()

# Trainable 300-dimensional embeddings over inputs of 100 token ids.
model.add(Embedding(size_of_vocabulary, 300, input_length=100, trainable=True))

# return_sequences=True keeps the output of every timestep so the pooling
# layer below has a time axis to reduce over.
model.add(LSTM(128, return_sequences=True, dropout=0.2))

# Collapse the time axis by taking the per-feature maximum.
model.add(GlobalMaxPooling1D())

# Dense head ending in a single sigmoid unit for the binary label.
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Loss, optimizer and the accuracy metric (reported as "acc" / "val_acc").
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

# Callbacks: stop after 3 epochs without val_loss improvement, and keep the
# weights of the epoch with the best val_acc on disk.
# NOTE(review): the two callbacks watch different quantities (val_loss vs
# val_acc) -- confirm that this is intentional.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True, verbose=1)

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the table.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          32807700  
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 33,035,669
Trainable params: 33,035,669
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
# Train for up to 10 epochs; the callbacks stop early and checkpoint the best
# epoch.
#
# The labels are cast to float32 explicitly. They come from a DataFrame column
# that originally held strings, so the array can carry dtype=object even though
# every element is 0 or 1, and TensorFlow 2 refuses to convert object arrays
# to tensors.
# NOTE(review): the held-out split passed as validation_data also drives early
# stopping and checkpoint selection, so metrics on it are optimistic.
history = model.fit(
    np.asarray(x_tr_seq),
    np.asarray(y_train, dtype="float32"),
    batch_size=128,
    epochs=10,
    validation_data=(
        np.asarray(x_val_seq),
        np.asarray(y_test, dtype="float32"),
    ),
    verbose=1,
    callbacks=[es, mc],
)

Train on 37500 samples, validate on 12500 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.86568, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.86568 to 0.86784, saving model to best_model.h5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.86784
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.86784
Epoch 00004: early stopping


In [34]:
# Reload the checkpoint file that ModelCheckpoint wrote during training.
from keras.models import load_model
model = load_model('best_model.h5')

# Evaluate on the held-out split.
# - Labels are cast to float32 because the array can be dtype=object (the
#   source column originally held strings), which TensorFlow 2 cannot convert.
# - The loss is bound to a real name instead of `_`, since assigning to `_`
#   in a notebook overrides IPython's "last output" variable.
val_loss, val_acc = model.evaluate(
    x_val_seq,
    np.asarray(y_test, dtype="float32"),
    batch_size=128,
)
print(val_acc)

0.8678399920463562
