# Twitter Sentiment Classification using LSTM

## Importing the libraries

In [1]:
import re
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

## Loading and Inspecting the Dataset

In [2]:
# Load the training CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Report the (rows, columns) dimensions of the loaded frame.
print(f"{data.shape}")

(27481, 4)


In [4]:
# Preview the first five rows to check the columns and a few sample tweets.
data.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


## Text Preprocessing

In [5]:
# Keep only the model input ('text') and the label ('sentiment').
# .copy() detaches the result from the frame it was sliced from, so the column
# assignments that follow modify an independent DataFrame instead of
# triggering pandas' SettingWithCopyWarning.
data = data[['text', 'sentiment']].copy()
# Fill missing text with '' before casting: astype(str) on its own would turn a
# NaN into the literal string 'nan', which would later be tokenized as a word.
data['text'] = data['text'].fillna('').astype(str)

In [6]:
# Normalise case first so the character class below only needs lowercase letters.
data['text'] = data['text'].str.lower()
# Remove every character that is not a lowercase letter, a digit or whitespace.
# Note: the range is deliberately 'a-z' and not 'A-z' -- 'A-z' also spans the
# ASCII characters [ \ ] ^ _ ` that sit between 'Z' and 'a', so they would be
# kept. The raw string keeps '\s' a regex escape rather than an invalid
# Python string escape.
data['text'] = data['text'].str.replace(r'[^a-z0-9\s]', '', regex=True)

In [7]:
# Vocabulary cap passed to the tokenizer as num_words.
# (The spelling 'max_fatures' is kept because the model cell refers to this name.)
max_fatures = 4000

# Build the word index from the tweet texts, splitting on single spaces.
tokenizer = Tokenizer(split=' ', num_words=max_fatures)
tokenizer.fit_on_texts(data['text'].values)

# Encode each tweet as a sequence of word indices, then pad the sequences into
# a rectangular array (no maxlen is given, so the length comes from the data).
X = pad_sequences(tokenizer.texts_to_sequences(data['text'].values))

## Defining the model

In [8]:
embed_dim = 128   # size of each word-embedding vector
lstm_out = 196    # number of units in the LSTM layer

model = Sequential()
# Map each word index (vocabulary capped at max_fatures) to an embed_dim vector;
# input_length is the padded sequence length taken from X.
model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Three softmax outputs -- presumably one per sentiment class; confirm that the
# one-hot label matrix built from data['sentiment'] has exactly three columns.
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 34, 128)           512000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 34, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 3)                 591       
Total params: 767,391
Trainable params: 767,391
Non-trainable params: 0
_________________________________________________________________
None


## Splitting the Dataset

In [9]:
from sklearn.model_selection import train_test_split

# One-hot encode the sentiment labels for the categorical cross-entropy loss.
# The dtype is set explicitly because recent pandas versions return boolean
# dummy columns by default, while the loss expects numeric targets.
Y = pd.get_dummies(data['sentiment'], dtype='float32').values
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## Running the model

In [11]:
# Mini-batch size used for training.
batch_size = 32

# Train on the training split for 16 epochs; verbose=2 logs one line per epoch.
# NOTE(review): the saved log below starts at ~0.88 accuracy in epoch 1 and the
# execution count skips 10, which suggests this cell was run on an already
# trained model -- re-run from a fresh kernel to confirm the reported numbers.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=16,
    verbose=2,
)

Epoch 1/16
687/687 - 30s - loss: 0.3101 - accuracy: 0.8798
Epoch 2/16
687/687 - 25s - loss: 0.2886 - accuracy: 0.8861
Epoch 3/16
687/687 - 24s - loss: 0.2691 - accuracy: 0.8956
Epoch 4/16
687/687 - 24s - loss: 0.2505 - accuracy: 0.9024
Epoch 5/16
687/687 - 24s - loss: 0.2338 - accuracy: 0.9105
Epoch 6/16
687/687 - 24s - loss: 0.2175 - accuracy: 0.9190
Epoch 7/16
687/687 - 24s - loss: 0.2074 - accuracy: 0.9189
Epoch 8/16
687/687 - 24s - loss: 0.1923 - accuracy: 0.9269
Epoch 9/16
687/687 - 27s - loss: 0.1785 - accuracy: 0.9317
Epoch 10/16
687/687 - 31s - loss: 0.1717 - accuracy: 0.9350
Epoch 11/16
687/687 - 24s - loss: 0.1614 - accuracy: 0.9373
Epoch 12/16
687/687 - 24s - loss: 0.1525 - accuracy: 0.9422
Epoch 13/16
687/687 - 24s - loss: 0.1476 - accuracy: 0.9427
Epoch 14/16
687/687 - 24s - loss: 0.1352 - accuracy: 0.9508
Epoch 15/16
687/687 - 24s - loss: 0.1289 - accuracy: 0.9529
Epoch 16/16
687/687 - 25s - loss: 0.1247 - accuracy: 0.9544


<tensorflow.python.keras.callbacks.History at 0x21485799248>