# **PART II : Deep Learning**

---


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Read the tweet-sentiment CSV from the working directory; the trailing
# expression displays the frame's (rows, columns).
csv_path = 'Sentiment.csv'
df = pd.read_csv(csv_path)
df.shape

(13871, 21)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [11]:
# Column dtypes of the working frame.
# NOTE(review): the saved output lists 16 columns although 21 were loaded, so
# it was captured after the column-drop cell further down had already run; on
# a fresh top-to-bottom run this cell will list all 21 columns.
df.dtypes

id                             int64
candidate                     object
candidate_confidence         float64
relevant_yn                   object
relevant_yn_confidence       float64
sentiment                     object
sentiment_confidence         float64
subject_matter                object
subject_matter_confidence    float64
name                          object
retweet_count                  int64
text                          object
tweet_created                 object
tweet_id                       int64
tweet_location                object
user_timezone                 object
dtype: object

In [7]:
# Number of missing entries in each column.
df.isna().sum()

id                               0
candidate                       96
candidate_confidence             0
relevant_yn                      0
relevant_yn_confidence           0
sentiment                        0
sentiment_confidence             0
subject_matter                 326
subject_matter_confidence        0
candidate_gold               13843
name                             0
relevant_yn_gold             13839
retweet_count                    0
sentiment_gold               13856
subject_matter_gold          13853
text                             0
tweet_coord                  13850
tweet_created                    0
tweet_id                         0
tweet_location                3912
user_timezone                 4403
dtype: int64

***Dropping the columns in which almost every value is missing***

In [10]:
# Remove the "*_gold" annotation columns and tweet_coord: each of them is
# missing in more than 13,800 of the 13,871 rows.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore'
# skips labels that are already gone, so re-running this cell does not raise
# KeyError.
sparse_columns = [
    'candidate_gold',
    'relevant_yn_gold',
    'sentiment_gold',
    'subject_matter_gold',
    'tweet_coord',
]
df = df.drop(columns=sparse_columns, errors='ignore')

In [12]:
# Re-check the per-column missing counts for the columns that remain.
df.isna().sum()

id                              0
candidate                      96
candidate_confidence            0
relevant_yn                     0
relevant_yn_confidence          0
sentiment                       0
sentiment_confidence            0
subject_matter                326
subject_matter_confidence       0
name                            0
retweet_count                   0
text                            0
tweet_created                   0
tweet_id                        0
tweet_location               3912
user_timezone                4403
dtype: int64

In [14]:
# Fill the gaps in every column with that column's most frequent value.
# DataFrame.mode() returns one row per tied mode, so row 0 is taken as the
# fill value for each column.
most_frequent = df.mode().iloc[0]
df = df.fillna(value=most_frequent)

In [15]:
# Confirm that no column has missing entries left.
df.isna().sum()

id                           0
candidate                    0
candidate_confidence         0
relevant_yn                  0
relevant_yn_confidence       0
sentiment                    0
sentiment_confidence         0
subject_matter               0
subject_matter_confidence    0
name                         0
retweet_count                0
text                         0
tweet_created                0
tweet_id                     0
tweet_location               0
user_timezone                0
dtype: int64

###  **Q.1 Total Number of Positive and Negative sentiments**

---



In [6]:
# Number of tweets per sentiment label.
df.sentiment.value_counts()

Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64

In [None]:
# All rows labelled 'Positive'.
test_pos = df.loc[df['sentiment'].eq('Positive')]

In [18]:
# All rows labelled 'Negative'.
test_neg = df.loc[df['sentiment'].eq('Negative')]

In [23]:
# Report the size of each subset (one row per tweet).
n_positive, n_negative = len(test_pos), len(test_neg)
print('No.of Positive sentiments :', n_positive)
print('No.of Negative sentiments :', n_negative)

No.of Positive sentiments : 2236
No.of Negative sentiments : 8493


###  **Q.2 LSTM Model**

---



In [37]:
# Model input: the tweet text and its sentiment label.
# Only Positive/Negative tweets are kept because the network defined below
# ends in a two-unit softmax; leaving the Neutral rows in would one-hot encode
# to three label columns and no longer match that output layer.
# .copy() makes `data` an independent frame rather than a slice of `df`.
binary_labels = df['sentiment'].isin(['Positive', 'Negative'])
data = df.loc[binary_labels, ['text', 'sentiment']].copy()

In [38]:
# Strip the retweet marker from each tweet before tokenising.
# This is done with a vectorised regex rather than by assigning to the rows
# yielded by iterrows(): pandas does not guarantee that such writes reach the
# frame. The word boundaries and case-insensitive match remove only a
# standalone "RT"/"rt" token and leave the letters "rt" inside ordinary words
# untouched.
tweets = data['text'].str.replace(r'\brt\b', ' ', case=False, regex=True)

# Vocabulary cap handed to the tokenizer; the model cell reuses it as the
# embedding's input dimension.
vocabSize = 2000
tokenizer = Tokenizer(num_words=vocabSize, split=' ')
tokenizer.fit_on_texts(tweets.values)

# Convert each tweet to a list of word indices, then pad to one common length
# so the result is a 2-D array (its second dimension is read as the model's
# input length).
X = tokenizer.texts_to_sequences(tweets.values)
X = pad_sequences(X)

In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128  # output dimension of the embedding layer
lstm_out = 196   # number of units in the LSTM layer

# Embedding -> spatial dropout -> one LSTM -> two-unit softmax classifier.
model = Sequential([
    Embedding(vocabSize, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 28, 128)           256000    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 28, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
from sklearn.model_selection import train_test_split

# One-hot encode the sentiment labels (one column per distinct label).
Y = pd.get_dummies(data['sentiment']).to_numpy()

# Hold out 15% of the samples for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=42
)

# Show the feature/label shapes of the training set, then of the test set.
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(9119, 28) (9119, 2)
(1610, 28) (1610, 2)


In [41]:
# Mini-batch size; the evaluation cell reuses this value.
batch_size = 32

# Train for five epochs. The returned History object is bound to a name so the
# per-epoch loss/accuracy stay available for later inspection, and so its
# memory-address repr is not left behind as cell output.
history = model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=batch_size,
    verbose=2,
)

Epoch 1/5
285/285 - 47s - loss: 0.4175 - accuracy: 0.8223 - 47s/epoch - 166ms/step
Epoch 2/5
285/285 - 44s - loss: 0.3076 - accuracy: 0.8715 - 44s/epoch - 155ms/step
Epoch 3/5
285/285 - 44s - loss: 0.2735 - accuracy: 0.8857 - 44s/epoch - 153ms/step
Epoch 4/5
285/285 - 45s - loss: 0.2487 - accuracy: 0.8988 - 45s/epoch - 157ms/step
Epoch 5/5
285/285 - 44s - loss: 0.2253 - accuracy: 0.9048 - 44s/epoch - 153ms/step


<keras.callbacks.History at 0x7fe87cc73510>

In [42]:
# Evaluate on the held-out split; the model was compiled with a single metric,
# so evaluate() yields the loss followed by the accuracy.
test_metrics = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
score, acc = test_metrics
print("score: %.2f" % score)
print("acc: %.2f" % acc)

51/51 - 1s - loss: 0.3841 - accuracy: 0.8453 - 1s/epoch - 28ms/step
score: 0.38
acc: 0.85
