# Sentiment Analysis: 
the process of computationally identifying and categorizing opinions expressed in a piece of text, especially in order to determine whether the writer's attitude towards a particular topic, product, etc. is positive, negative, or neutral.

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [89]:
#short_pos= open("C:\\Data_jupyter\\short_reviews\\positive.txt","r").read()        
#short_neg= open("C:\\Data_jupyter\\short_reviews\\negative.txt","r").read()        

document=[]
with open("C:\\Data_jupyter\\short_reviews\\positive.txt") as fp:
    line=fp.readline()
    while line:
        document.append((line,"pos"))
        line=fp.readline()
    
with open("C:\\Data_jupyter\\short_reviews\\negative.txt") as fp:
    line=fp.readline()
    while line:
        document.append((line,"neg"))
        line=fp.readline()

In [90]:
labels=['text','sentiment']
data= pd.DataFrame.from_records(document,columns=labels)
data.head(3)

Unnamed: 0,text,sentiment
0,the rock is destined to be the 21st century's ...,pos
1,"the gorgeously elaborate continuation of "" the...",pos
2,effective but too-tepid biopic\n,pos


I am filtering the tweets so only valid texts and words remain. Then, I define the number of max features as 2000 and use Tokenizer to vectorize and convert text into Sequences so the Network can deal with it as input.

In [92]:
for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 'pos'].size)
print(data[ data['sentiment'] == 'neg'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                   lower=True,split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

10662
10662


Next, I compose the LSTM Network. Note that embed_dim, lstm_out, batch_size, droupout_x variables are hyperparameters, their values are somehow intuitive, can be and must be played with in order to achieve good results. Please also note that I am using softmax as activation function. The reason is that our Network is using categorical crossentropy, and softmax is just the right activation method for that.

In [93]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
#model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 46, 128)           256000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


Hereby I declare the train and test dataset.

In [94]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(8529, 46) (8529, 2)
(2133, 46) (2133, 2)


Here we train the Network. We should run much more than 7 epoch.

In [95]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 2)

Epoch 1/10
 - 122s - loss: 0.6094 - acc: 0.6599
Epoch 2/10
 - 111s - loss: 0.4665 - acc: 0.7781
Epoch 3/10
 - 136s - loss: 0.4029 - acc: 0.8164
Epoch 4/10
 - 109s - loss: 0.3466 - acc: 0.8471
Epoch 5/10
 - 111s - loss: 0.2955 - acc: 0.8747
Epoch 6/10
 - 111s - loss: 0.2510 - acc: 0.8900
Epoch 7/10
 - 113s - loss: 0.2122 - acc: 0.9118
Epoch 8/10
 - 115s - loss: 0.1853 - acc: 0.9243
Epoch 9/10
 - 112s - loss: 0.1636 - acc: 0.9329
Epoch 10/10
 - 112s - loss: 0.1393 - acc: 0.9462


<keras.callbacks.History at 0x14ed0683710>

Extracting a validation set, and measuring score and accuracy.

In [97]:
validation_size = 1200

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 1.13
acc: 0.72


Finally measuring the number of correct guesses. It is clear that finding negative tweets goes very well for the Network but deciding whether is positive is not really. My educated guess here is that the positive training set is dramatically smaller than the negative, hence the "bad" results for positive tweets.

In [98]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 66.34146341463415 %
neg_acc 74.35897435897436 %


I have created this kernel when I knew much less about LSTM & ML. It is a really basic, beginner level kernel, yet it had a huge audience in the past year. I had a lot of private questions and requests regarding this notebook and I tried my best to help and answer them . In the future I am not planning to answer custom questions and support/enhance this kernel in any ways. Thank you my folks :)

In [101]:
#twt = 'Meetings: Because none of us is as dumb as all of us.'
#twt='''The storyline keeps you hooked to the seats with the background music and with attention capturing lyrics.The movie is great for those who love and adore music.'''
twt='very bad movie.worst please do not watch it.sad unhappy bullshit'
#vectorizing the text by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the text to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=46, dtype='int32', padding='post', truncating='post', value=0)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
print(sentiment)
if(sentiment[1] > 0.50):
    print("negative")
elif (sentiment[0] > 0.50):
    print("positive")


[ 0.53936911  0.46063095]
positive
