In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.metrics import confusion_matrix,classification_report

In [14]:
data = pd.read_csv('pre_sentiment.csv')

In [15]:
data.dtypes

Unnamed: 0       int64
reviewText      object
review_clean    object
sentiment       object
tokenized       object
token_count      int64
dtype: object

In [16]:
data = data[data['token_count'] < 512]

In [17]:
data.shape

(72643, 6)

In [18]:
data = data[['reviewText', 'sentiment']]

In [19]:
import re

In [21]:
data['reviewText'] = data['reviewText'].astype(str)

In [22]:
data['reviewText'] = data['reviewText'].apply(lambda x: x.lower()) #lower caseing
data['reviewText'] = data['reviewText'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x))) # removing special chars

In [23]:
data.head()

Unnamed: 0,reviewText,sentiment
0,it diffuses a very mild light perfume just wha...,Positive
1,all time favorite wish they still carried this,Positive
2,one of my wifes favorites,Positive
3,if you have body acne this product is a must i...,Positive
4,i really is what i expected,Positive


In [26]:
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['reviewText'].values)
X = tokenizer.texts_to_sequences(data['reviewText'].values)
X = pad_sequences(X)

In [27]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)

(58114, 488) (58114, 2)


In [28]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(2500, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 488, 128)          320000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 488, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 575,194
Trainable params: 575,194
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
batch_size = 128
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
101/455 [=====>........................] - ETA: 25s - loss: 0.1443 - accuracy: 0.9455

KeyboardInterrupt: 

In [31]:
Y_pred = model.predict(X_test,batch_size = batch_size) 
Y_pred=np.argmax(Y_pred,axis=1)
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print("confusion matrix",confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

confusion matrix [[ 1341   861]
 [  460 11867]]
              precision    recall  f1-score   support

           0       0.74      0.61      0.67      2202
           1       0.93      0.96      0.95     12327

    accuracy                           0.91     14529
   macro avg       0.84      0.79      0.81     14529
weighted avg       0.90      0.91      0.91     14529



In [32]:
from sklearn.utils import resample
from sklearn.utils import shuffle

In [33]:
# Separate majority and minority classes
data_majority = data[data['sentiment'] == 'Positive']
data_minority = data[data['sentiment'] == 'Negative']

bias = data_minority.shape[0]/data_majority.shape[0]
# lets split train/test data first then 
train = pd.concat([data_majority.sample(frac=0.8,random_state=200),
         data_minority.sample(frac=0.8,random_state=200)])
test = pd.concat([data_majority.drop(data_majority.sample(frac=0.8,random_state=200).index),
        data_minority.drop(data_minority.sample(frac=0.8,random_state=200).index)])

train = shuffle(train)
test = shuffle(test)

In [34]:
print('positive data in training:',(train.sentiment == 'Positive').sum())
print('negative data in training:',(train.sentiment == 'Negative').sum())
print('positive data in test:',(test.sentiment == 'Positive').sum())
print('negative data in test:',(test.sentiment == 'Negative').sum())

positive data in training: 49435
negative data in training: 8679
positive data in test: 12359
negative data in test: 2170


In [37]:
# Separate majority and minority classes in training data for upsampling 
data_majority = train[train['sentiment'] == 'Positive']
data_minority = train[train['sentiment'] == 'Negative']

print("majority class before upsample:",data_majority.shape)
print("minority class before upsample:",data_minority.shape)

# Upsample minority class
data_minority_upsampled = resample(data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples= data_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])
 
# Display new class counts
print("After upsampling\n",data_upsampled.sentiment.value_counts(),sep = "")

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['reviewText'].values) # training with whole data

X_train = tokenizer.texts_to_sequences(data_upsampled['reviewText'].values)
X_train = pad_sequences(X_train,maxlen=488)
Y_train = pd.get_dummies(data_upsampled['sentiment']).values
print('x_train shape:',X_train.shape)

X_test = tokenizer.texts_to_sequences(test['reviewText'].values)
X_test = pad_sequences(X_test,maxlen=488)
Y_test = pd.get_dummies(test['sentiment']).values
print("x_test shape", X_test.shape)

majority class before upsample: (49435, 2)
minority class before upsample: (8679, 2)
After upsampling
Positive    49435
Negative    49435
Name: sentiment, dtype: int64
x_train shape: (98870, 488)
x_test shape (14529, 488)


In [40]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['reviewText'].values) # training with whole data

X_train = tokenizer.texts_to_sequences(data_upsampled['reviewText'].values)
X_train = pad_sequences(X_train,maxlen=488)
Y_train = pd.get_dummies(data_upsampled['sentiment']).values
print('x_train shape:',X_train.shape)

X_test = tokenizer.texts_to_sequences(test['reviewText'].values)
X_test = pad_sequences(X_test,maxlen=488)
Y_test = pd.get_dummies(test['sentiment']).values
print("x_test shape", X_test.shape)

x_train shape: (98870, 488)
x_test shape (14529, 488)


In [38]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(2500, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 488, 128)          320000    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 488, 128)         0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 575,194
Trainable params: 575,194
Non-trainable params: 0
_________________________________________________________________
None


In [39]:
import tensorflow as tf
tf.config.run_functions_eagerly(True)

In [41]:
batch_size = 128
# also adding weights
class_weights = {0: 1 ,
                1: 1.6/bias }
model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 1,
          class_weight=class_weights)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x19dd62f45b0>

In [42]:
Y_pred = model.predict(X_test,batch_size = batch_size) 
Y_pred=np.argmax(Y_pred,axis=1)
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print("confusion matrix",confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

confusion matrix [[ 1131  1039]
 [  363 11996]]
              precision    recall  f1-score   support

           0       0.76      0.52      0.62      2170
           1       0.92      0.97      0.94     12359

    accuracy                           0.90     14529
   macro avg       0.84      0.75      0.78     14529
weighted avg       0.90      0.90      0.90     14529



In [50]:
twt = ['Bad smell is coming from air conditioner']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the tweet to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=488, dtype='int32', value=0)

sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

1/1 - 0s - 33ms/epoch - 33ms/step
negative


In [51]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 0, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.35
acc: 0.90


In [53]:
model.save('model12.hdf5')