### LSTM Sentiment Analysis

In [43]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
import re

In [2]:
data = pd.read_csv('data/Sentiment.csv')

In [3]:
data.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13871 entries, 0 to 13870
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         13871 non-null  int64  
 1   candidate                  13775 non-null  object 
 2   candidate_confidence       13871 non-null  float64
 3   relevant_yn                13871 non-null  object 
 4   relevant_yn_confidence     13871 non-null  float64
 5   sentiment                  13871 non-null  object 
 6   sentiment_confidence       13871 non-null  float64
 7   subject_matter             13545 non-null  object 
 8   subject_matter_confidence  13871 non-null  float64
 9   candidate_gold             28 non-null     object 
 10  name                       13871 non-null  object 
 11  relevant_yn_gold           32 non-null     object 
 12  retweet_count              13871 non-null  int64  
 13  sentiment_gold             15 non-null     obj

In [5]:
data['sentiment'].value_counts()

Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64

In [6]:
# data preprocessing
data = data[data['sentiment'] != 'Neutral']
data['text'] = data['text'].apply(lambda x :x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-Z0-9\s]', '', x)))

In [7]:
data['sentiment'].value_counts()

Negative    8493
Positive    2236
Name: sentiment, dtype: int64

In [9]:
# for idx, row in data.iterrows():
#     row[0] = row[0].replace('rt', ' ')

AttributeError: 'int' object has no attribute 'replace'

In [11]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [12]:
X

array([[   0,    0,    0, ..., 1303, 1387,  734],
       [   0,    0,    0, ...,  232,  714,   18],
       [   0,    0,    0, ...,  205,  367,  680],
       ...,
       [   0,    0,    0, ...,   72,   66,    4],
       [   0,    0,    0, ..., 1006, 1399,   74],
       [   0,    0,    0, ...,  195,    4,  712]], dtype=int32)

In [15]:
X.shape

(10729, 28)

In [50]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.35))
model.add(Dense(2, activation='softmax'))

In [51]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [52]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 28, 128)           256000    
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 28, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 196)               254800    
                                                                 
 dense_2 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [53]:
y = pd.get_dummies(data['sentiment']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2222)

In [54]:
print(X_train.shape)
print(X_test.shape)

(7510, 28)
(3219, 28)


In [55]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, verbose=2)

Epoch 1/10
235/235 - 49s - loss: 0.4336 - accuracy: 0.8144 - val_loss: 0.3697 - val_accuracy: 0.8490 - 49s/epoch - 207ms/step
Epoch 2/10
235/235 - 42s - loss: 0.3227 - accuracy: 0.8627 - val_loss: 0.3713 - val_accuracy: 0.8546 - 42s/epoch - 178ms/step
Epoch 3/10
235/235 - 38s - loss: 0.2777 - accuracy: 0.8820 - val_loss: 0.3658 - val_accuracy: 0.8546 - 38s/epoch - 161ms/step
Epoch 4/10
235/235 - 39s - loss: 0.2480 - accuracy: 0.8939 - val_loss: 0.3575 - val_accuracy: 0.8540 - 39s/epoch - 165ms/step
Epoch 5/10
235/235 - 48s - loss: 0.2234 - accuracy: 0.9075 - val_loss: 0.3925 - val_accuracy: 0.8447 - 48s/epoch - 204ms/step
Epoch 6/10
235/235 - 38s - loss: 0.2067 - accuracy: 0.9164 - val_loss: 0.4072 - val_accuracy: 0.8422 - 38s/epoch - 161ms/step
Epoch 7/10
235/235 - 35s - loss: 0.1889 - accuracy: 0.9229 - val_loss: 0.4305 - val_accuracy: 0.8465 - 35s/epoch - 148ms/step
Epoch 8/10
235/235 - 40s - loss: 0.1801 - accuracy: 0.9274 - val_loss: 0.4269 - val_accuracy: 0.8372 - 40s/epoch - 170

<keras.callbacks.History at 0x7fa80c8bfb80>

In [56]:
twt = ["I'm really angry because this is totally unacceptable"]
twt = tokenizer.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=28, dtype='int32', value=0)
print(twt)

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   81 1200  204   21    6  684]]


In [57]:
sentiment = model.predict(twt, batch_size=1, verbose=2)

if np.argmax(sentiment) == 0:
    print('negative')
else:
    print('positive')
print(sentiment)

1/1 - 0s - 419ms/epoch - 419ms/step
negative
[[0.99567884 0.0043212 ]]
