In [26]:
import pandas as pd
import re                                     #regular expression: used to remove all symbols except alphanumeric from a string
import time

from keras.preprocessing.text import Tokenizer                           # to convert text -> words -> single number

from keras.preprocessing.sequence import pad_sequences                   #to pad the vectors, so as to make equal size for all


from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, SimpleRNN
from keras.utils import to_categorical                                  #interger encoded class labes -> one hot encoding array


from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('twitter_sentiment.csv')

In [3]:
data.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [4]:
data = data[['text','sentiment']]

In [5]:
data.sample(5)

Unnamed: 0,text,sentiment
613,"RT @daveweigel: I believe that the phrase ""If ...",Negative
9413,RT @jsc1835: Chris Christie - You want to incr...,Negative
2113,"RT @kharyp: When #HillaryClinton calls men ""Fa...",Negative
747,"Oooh, so Fox News also doesn't want Trump to b...",Neutral
7501,Perfect #GOPDebate https://t.co/r5OZZtaeAb,Positive


In [6]:
data['text']=data['text'].apply(lambda x: x.lower())      #converting all tweets to lower case

In [7]:
data.head()

Unnamed: 0,text,sentiment
0,rt @nancyleegrahn: how did everyone feel about...,Neutral
1,rt @scottwalker: didn't catch the full #gopdeb...,Positive
2,rt @tjmshow: no mention of tamir rice and the ...,Neutral
3,rt @robgeorge: that carly fiorina is trending ...,Positive
4,rt @danscavino: #gopdebate w/ @realdonaldtrump...,Positive


In [8]:
data['text']= data.text.apply(lambda x:x[x.find(':')+1:].strip())                              


#x.find(':') will return index of the first occurence of ':' in the text otherwise (if not found) returns -1
#x[x.find(':')+1:] selects the text just after the ':' index to the end of the text
#strip() function removes and starting and trailing whitespaces from the text

In [9]:
data.head()

Unnamed: 0,text,sentiment
0,how did everyone feel about the climate change...,Neutral
1,didn't catch the full #gopdebate last night. h...,Positive
2,no mention of tamir rice and the #gopdebate wa...,Neutral
3,that carly fiorina is trending -- hours after ...,Positive
4,#gopdebate w/ @realdonaldtrump delivered the h...,Positive


In [10]:
data['text']=data.text.apply(lambda x: re.sub('[^a-zA-Z0-9\s]','',x) )   

#re.sub = substitution.  "^"= Not
#means substitute all, Not(^) a to z, A to Z, 0 to 9 & white spaces

In [11]:
data.head()

Unnamed: 0,text,sentiment
0,how did everyone feel about the climate change...,Neutral
1,didnt catch the full gopdebate last night here...,Positive
2,no mention of tamir rice and the gopdebate was...,Neutral
3,that carly fiorina is trending hours after he...,Positive
4,gopdebate w realdonaldtrump delivered the high...,Positive


In [12]:
print("Neutral Sentiment observations:",data[data['sentiment']=='Neutral'].size)
print("Positive Sentiment observations:",data[data['sentiment']=='Positive'].size)
print("Negative Sentiment observations:",data[data['sentiment']=='Negative'].size)
print("Size of overall data", data.size)

Neutral Sentiment observations: 6284
Positive Sentiment observations: 4472
Negative Sentiment observations: 16986
Size of overall data 27742


# Text Preprocessing

In [13]:
frequent_words = 2000                                         #maximum number of unique words in vocabulary

tokeniser = Tokenizer(num_words = frequent_words, split=' ')  #creats object of tokenizer with considering most 2000 frequent words

tokeniser.fit_on_texts(data['text'].values)                   #learns from the data,creates a list of all words, then creates a 
                                                              #dictionary keeping count of each word, and finally considering 
                                                              #the most frequent 2000 words only

X = tokeniser.texts_to_sequences(data['text'].values)         #convert the text data into sequences of numbers,returns a list
                                                              #only for the most frequent 2000 words as learned earlier

X=pad_sequences(X)                                            #pads all the vectors by 0 in start, to the 
                                                              #maximum leangth in the list


# LSTM Model

In [27]:
Embd_vec_dim = 128                                              #tells length of the word embedded vector
lstm_out = 196                                                  #tells number of LSTM units in LSTM layer

model = Sequential()                                            #linear stack of layers

model.add(Embedding(frequent_words,Embd_vec_dim,input_length= X.shape[1]))    
                                                                #embedding layer for words(converted to numbers) vectorization

model.add(SpatialDropout1D(0.4))                               #Droupout layer, to prevent overfitting, randomly drops 40%
                                                                #features while training
    
model.add(LSTM(lstm_out,dropout=0.2, recurrent_dropout=0.2))   #LSTM with dropout

model.add(Dense(3, activation = 'softmax'))                     #softmax layer for 3 classes (positive, Neutral, Negative)


#############################################################################################################################


model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
                                           
#configures the model as defining loss, optimiser and metrics

In [28]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 28, 128)           256000    
                                                                 
 spatial_dropout1d_1 (Spati  (None, 28, 128)           0         
 alDropout1D)                                                    
                                                                 
 simple_rnn (SimpleRNN)      (None, 196)               63700     
                                                                 
 dense_1 (Dense)             (None, 3)                 591       
                                                                 
Total params: 320291 (1.22 MB)
Trainable params: 320291 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None




# Creating Train Test data

In [16]:
Y = pd.get_dummies(data['sentiment']).values         #one hot encoded array in the form [Negative, Neutral, Positive]

In [44]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state =42)  
                                                                    #random_state control randomness and reproducability of data

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(9709, 28) (9709, 3)
(4162, 28) (4162, 3)





# Model Training

In [18]:
batch_size =32                                       #Commonly batch size 16 -128
                                                     #for smaller dataset 16-32, for large dataset 64-128   
    
model.fit(X_train, Y_train, batch_size=batch_size, epochs=10, verbose=2)

net=time.time()-start

Epoch 1/10
304/304 - 46s - loss: 0.8479 - accuracy: 0.6286 - 46s/epoch - 151ms/step
Epoch 2/10
304/304 - 37s - loss: 0.7269 - accuracy: 0.6763 - 37s/epoch - 121ms/step
Epoch 3/10
304/304 - 38s - loss: 0.6664 - accuracy: 0.7010 - 38s/epoch - 126ms/step
Epoch 4/10
304/304 - 38s - loss: 0.6268 - accuracy: 0.7216 - 38s/epoch - 124ms/step
Epoch 5/10
304/304 - 878s - loss: 0.5901 - accuracy: 0.7380 - 878s/epoch - 3s/step
Epoch 6/10
304/304 - 32s - loss: 0.5628 - accuracy: 0.7526 - 32s/epoch - 104ms/step
Epoch 7/10
304/304 - 38s - loss: 0.5295 - accuracy: 0.7662 - 38s/epoch - 124ms/step
Epoch 8/10
304/304 - 37s - loss: 0.5085 - accuracy: 0.7761 - 37s/epoch - 123ms/step
Epoch 9/10
304/304 - 38s - loss: 0.4857 - accuracy: 0.7875 - 38s/epoch - 124ms/step
Epoch 10/10
304/304 - 38s - loss: 0.4619 - accuracy: 0.7973 - 38s/epoch - 125ms/step


In [20]:
print("Total time taken for model training: ",net/60)

Total time taken for model training:  20.321604013442993




# Validation Set

In [55]:
val_entries = 1000

X_validation = X_test[-val_entries:]             #selecting from last 1000 to the end
Y_validation = Y_test[-val_entries:]

X_test= X_test[:-val_entries]                    #selecting from first to before the last 1000
Y_test= Y_test[:-val_entries]


score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("Loss: %.2f" % (score))
print("accuracy: %.2f" % (acc))


#Note: while running this part of code more then twice it throws and error "OverflowError: cannot convert float infinity to integer"
#as because we are slicing X_test everytime we run, and create validation set, but after a time there is not sufficient entries
#available in the test set itself, so as to create validation set again

68/68 - 1s - loss: 1.0819 - accuracy: 0.3969 - 633ms/epoch - 9ms/step
Loss: 1.08
accuracy: 0.40


In [56]:
X_validation[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0, 1463,  230,  517,   70,  139,   75,
        699,   16,    1, 1999,  549,    3])

In [57]:
X_validation[0].reshape(1,28)

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 1463,  230,  517,   70,  139,   75,
         699,   16,    1, 1999,  549,    3]])

In [58]:
X_validation[0].shape


(28,)