In [1]:
import numpy as np 
import pandas as pd 
import nltk
import os
import gc
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,GRU,SpatialDropout1D,Bidirectional
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import re
import warnings
warnings.filterwarnings("ignore")
#pd.set_option('display.max_colwidth',100)
pd.set_option('display.max_colwidth', -1)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Import DATA SET

In [2]:
#import dataset
data = pd.read_csv('dataset/cleand.csv')
# Keeping only the neccessary columns
data = data[['SentimentText','Sentiment']]

In [3]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['SentimentText'].values)
X = tokenizer.texts_to_sequences(data['SentimentText'].values)
X = pad_sequences(X)

In [4]:
Y = pd.get_dummies(data['Sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(16750, 741) (16750, 2)
(8250, 741) (8250, 2)


# Implementing LSTM

In [4]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 741, 128)          256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 741, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 5, batch_size=batch_size, verbose = 2)

Epoch 1/5
 - 1156s - loss: 0.4584 - acc: 0.7829
Epoch 2/5
 - 1142s - loss: 0.3512 - acc: 0.8566
Epoch 3/5
 - 1119s - loss: 0.3126 - acc: 0.8740
Epoch 4/5
 - 1135s - loss: 0.2801 - acc: 0.8893
Epoch 5/5
 - 1164s - loss: 0.2576 - acc: 0.8984


<keras.callbacks.History at 0x10aef198>

In [7]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.41
acc: 0.85


# Implementing CNN

In [6]:
acc = []
max_features= 2000
model2= Sequential()
model2.add(Embedding(max_features,100,input_length= X.shape[1]))
model2.add(Dropout(0.2))

model2.add(Conv1D(100,kernel_size=2,padding='same',activation='relu',strides=1))
model2.add(GlobalMaxPooling1D())

model2.add(Dense(256,activation='relu'))
model2.add(Dropout(0.2))

model2.add(Dense(2,activation='sigmoid'))


model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 741, 100)          200000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 741, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 741, 100)          20100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               25856     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 514       
Total para

In [7]:
%%time
history2=model2.fit(X_train, Y_train, validation_data=(X_test, Y_test),epochs=5, batch_size=32, verbose=2)

Train on 16750 samples, validate on 8250 samples
Epoch 1/5
 - 537s - loss: 0.4333 - acc: 0.7876 - val_loss: 0.3277 - val_acc: 0.8560
Epoch 2/5
 - 507s - loss: 0.2680 - acc: 0.8911 - val_loss: 0.3195 - val_acc: 0.8624
Epoch 3/5
 - 531s - loss: 0.1865 - acc: 0.9293 - val_loss: 0.3461 - val_acc: 0.8608
Epoch 4/5
 - 654s - loss: 0.1259 - acc: 0.9538 - val_loss: 0.4258 - val_acc: 0.8519
Epoch 5/5
 - 575s - loss: 0.0833 - acc: 0.9712 - val_loss: 0.4521 - val_acc: 0.8552
Wall time: 46min 45s


# CNN+GRU

In [8]:
max_features= 2000
model3= Sequential()
model3.add(Embedding(max_features,100,input_length= X.shape[1]))
model3.add(Conv1D(100,kernel_size=3,padding='same',activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Dropout(0.2))
model3.add(GRU(128,return_sequences=True))
model3.add(Dropout(0.3))
model3.add(Flatten())
model3.add(Dense(128,activation='relu'))
model3.add(Dropout(0.5))
model3.add(Dense(2,activation='softmax'))
model3.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 741, 100)          200000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 741, 100)          30100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 370, 100)          0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 370, 100)          0         
_________________________________________________________________
gru_1 (GRU)                  (None, 370, 128)          87936     
_________________________________________________________________
dropout_4 (Dropout)          (None, 370, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 47360)             0         
__________

In [9]:
%%time
history3=model3.fit(X_train, Y_train, validation_data=(X_test, Y_test),epochs=5, batch_size=32, verbose=2)

Train on 16750 samples, validate on 8250 samples
Epoch 1/5
 - 557s - loss: 0.4176 - acc: 0.8042 - val_loss: 0.3355 - val_acc: 0.8545
Epoch 2/5
 - 529s - loss: 0.2795 - acc: 0.8891 - val_loss: 0.3183 - val_acc: 0.8652
Epoch 3/5
 - 547s - loss: 0.2366 - acc: 0.9064 - val_loss: 0.3524 - val_acc: 0.8550
Epoch 4/5
 - 541s - loss: 0.1798 - acc: 0.9301 - val_loss: 0.4126 - val_acc: 0.8509
Epoch 5/5
 - 551s - loss: 0.1269 - acc: 0.9530 - val_loss: 0.4271 - val_acc: 0.8561
Wall time: 45min 27s


# implementing Bidirectional GRU

In [12]:
max_features= 2000
model4 = Sequential()

model4.add(Embedding(max_features, 100, input_length= X.shape[1]))
model4.add(SpatialDropout1D(0.2))
model4.add(Bidirectional(GRU(128)))
model4.add(Dropout(0.5))

model4.add(Dense(2, activation='softmax'))
model4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model4.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 741, 100)          200000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 741, 100)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               175872    
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 514       
Total params: 376,386
Trainable params: 376,386
Non-trainable params: 0
_________________________________________________________________


In [13]:
%%time
history4=model4.fit(X_train, Y_train, validation_data=(X_test, Y_test),epochs= 3, batch_size= 32, verbose=2)

Train on 16750 samples, validate on 8250 samples
Epoch 1/3
 - 22146s - loss: 0.4741 - acc: 0.7732 - val_loss: 0.3713 - val_acc: 0.8482
Epoch 2/3
 - 7434s - loss: 0.3050 - acc: 0.8782 - val_loss: 0.3640 - val_acc: 0.8436
Epoch 3/3
 - 2676s - loss: 0.2741 - acc: 0.8931 - val_loss: 0.3511 - val_acc: 0.8564
Wall time: 8h 58min 5s


# CNN LSTM

In [5]:
max_features= 2000
model5= Sequential()
model5.add(Embedding(max_features,100,input_length= X.shape[1]))
model5.add(Conv1D(100,kernel_size=2,padding='same',activation='relu'))
model5.add(MaxPooling1D(pool_size=2))
model5.add(Dropout(0.2))
model5.add(LSTM(128,return_sequences=True))
model5.add(Dropout(0.2))
model5.add(Flatten())
model5.add(Dense(128,activation='relu'))
model5.add(Dropout(0.2))
model5.add(Dense(2,activation='softmax'))
model5.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model5.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 741, 100)          200000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 741, 100)          20100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 370, 100)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 370, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 370, 128)          117248    
_________________________________________________________________
dropout_2 (Dropout)          (None, 370, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 47360)             0         
__________

In [6]:
%%time
history4=model5.fit(X_train, Y_train, validation_data=(X_test, Y_test),epochs= 3, batch_size= 32, verbose=2)

Train on 16750 samples, validate on 8250 samples
Epoch 1/3
 - 1033s - loss: 0.3918 - acc: 0.8152 - val_loss: 0.3274 - val_acc: 0.8572
Epoch 2/3
 - 990s - loss: 0.2707 - acc: 0.8907 - val_loss: 0.3342 - val_acc: 0.8549
Epoch 3/3
 - 1026s - loss: 0.2237 - acc: 0.9127 - val_loss: 0.3830 - val_acc: 0.8578
Wall time: 50min 52s


# Bidirectional LSTM

In [17]:
max_features= 2000
model6 = Sequential()

model6.add(Embedding(max_features, 100, input_length= X.shape[1]))
model6.add(SpatialDropout1D(0.25))
model6.add(Bidirectional(LSTM(128)))
model6.add(Dropout(0.5))

model6.add(Dense(2, activation='softmax'))
model6.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model6.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 741, 100)          200000    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 741, 100)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               234496    
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 514       
Total params: 435,010
Trainable params: 435,010
Non-trainable params: 0
_________________________________________________________________


In [18]:
%%time
history4=model6.fit(X_train, Y_train, validation_data=(X_test, Y_test),epochs= 3, batch_size= 32, verbose=2)

Train on 16750 samples, validate on 8250 samples
Epoch 1/3
 - 1807s - loss: 0.4320 - acc: 0.7999 - val_loss: 0.3447 - val_acc: 0.8509
Epoch 2/3
 - 1890s - loss: 0.3143 - acc: 0.8744 - val_loss: 0.3563 - val_acc: 0.8552
Epoch 3/3
 - 2119s - loss: 0.2792 - acc: 0.8890 - val_loss: 0.4329 - val_acc: 0.8465
Wall time: 1h 39min 11s
