In [1]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

In [2]:
# Read the pre-processed tweet table, using '\n' as the record terminator.
df = pd.read_csv(
    'Processed_Data.csv',
    lineterminator='\n',
)
df

Unnamed: 0.1,Unnamed: 0,topic,tweet,sentiment,class,processed_tweets
0,0,#olympics,Aussies would be happy that the T20 series hap...,0.275000,Positive,aussie would happy series happen midst olympic...
1,1,#olympics,The worst thing about the #Olympics finishing ...,-0.133333,Negative,worst thing olympics finish whole week availab...
2,2,#olympics,#Olympics\n\nWe play for India: #Hockey captai...,0.000000,Neutral,olympics play india hockey captain ranirampal ...
3,3,#olympics,See the best moments from the #Tokyo2020 closi...,1.000000,Positive,see best moment tokyo close ceremony videoelep...
4,4,#olympics,Fabulous! #Olympics \n#LoveTheBBC \n\nTokyo Ol...,0.500000,Positive,fabulous olympics lovethebbc tokyo olympics bb...
...,...,...,...,...,...,...
140242,140242,Tokyo olympics,Congratulations to all our winners and partici...,0.500000,Positive,congratulation winner participant olympics win...
140243,140243,Tokyo olympics,I hope that I am wrong but I have seen no twee...,-0.500000,Negative,hope wrong see tweet government
140244,140244,Tokyo olympics,Tokyo passes the baton to Paris as strangest e...,0.000000,Neutral,tokyo pass baton paris strangest ever olympic ...
140245,140245,Tokyo olympics,"Paris plans to deliver inclusive, youth-centre...",0.000000,Neutral,paris plan deliver inclusive youth centre gend...


In [3]:
# Drop rows with any missing value, then rows whose processed text is the
# empty string. The comparison is vectorised (no per-row Python lambda), so
# the mask is always a boolean Series, even when the frame is empty.
df = df.dropna()
df = df[df['processed_tweets'] != ""]

In [4]:
# Summary statistics of the cleaned frame (row count, mean, spread, quartiles).
df.describe()

Unnamed: 0.1,Unnamed: 0,sentiment
count,139715.0,139715.0
mean,70096.224908,0.176834
std,40503.120258,0.317648
min,0.0,-1.0
25%,35003.5,0.0
50%,70027.0,0.0
75%,105194.5,0.375
max,140246.0,1.0


Converting the classes into two categories, Positive (1) and Negative (0); any class other than Positive, including Neutral, is labelled 0.

In [5]:
def ratio(x):
    """Map a sentiment class name to a binary label.

    Args:
        x: The class value; compared against the string 'Positive'.

    Returns:
        1 when ``x`` equals 'Positive', otherwise 0 (so every other class,
        e.g. 'Negative' or 'Neutral', is labelled 0).
    """
    return 1 if x == 'Positive' else 0

In [6]:
# Binary target: run `ratio` over every entry of the `class` column.
label = df['class'].map(ratio)
label

0         1
1         0
2         0
3         1
4         1
         ..
140242    1
140243    0
140244    0
140245    0
140246    1
Name: class, Length: 139715, dtype: int64

In [7]:
# Vocabulary cap handed to the tokenizer (and reused as the embedding input size).
vocabulary_size = 20000

# Build the word index from the processed tweets.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['processed_tweets'])

# Turn each tweet into a list of integer word ids, then pad/truncate every
# list to exactly 50 ids so the result is a rectangular array.
sequences = tokenizer.texts_to_sequences(df['processed_tweets'])
data = pad_sequences(sequences, maxlen=50)

In [8]:
# Inspect the padded word-id matrix produced by pad_sequences.
data

array([[    0,     0,     0, ...,    54,  1035,   134],
       [    0,     0,     0, ...,   422,   255,  2472],
       [    0,     0,     0, ...,  1282, 10026,   291],
       ...,
       [    0,     0,     0, ...,    10,    41,    47],
       [    0,     0,     0, ...,   906,  1338,    10],
       [    0,     0,     0, ...,   224,  1140,   326]], dtype=int32)

In [9]:
# Confirm the dimensions of the padded matrix (rows, sequence length).
print(data.shape)

(139715, 50)


Building the Conv1D + LSTM model first.

In [10]:
def create_conv_model():
    """Build and compile the Conv1D + LSTM binary classifier.

    Layer stack, in order: a 100-dimensional embedding over the module-level
    ``vocabulary_size`` for sequences of length 50, dropout at rate 0.2, a
    64-filter kernel-size-5 ReLU convolution, max-pooling with pool size 4,
    a 100-unit LSTM, and a single sigmoid output unit.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    layers = [
        Embedding(vocabulary_size, 100, input_length=50),
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(100),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [12]:
# Build a fresh model and train it for 3 epochs, with 40% of the rows set
# aside for validation via validation_split.
model_conv = create_conv_model()
model_conv.fit(
    data,
    np.asarray(label),
    validation_split=0.4,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f75a089b610>

In [13]:
# Rows 83829..139714 of the padded matrix, i.e. the last 40% of its 139715 rows.
# NOTE(review): this appears to be the same slice Keras holds out for
# validation_split=0.4 during fit, so it is the validation set rather than an
# independent test set — confirm.
test_data = data[83829:139715]
test_data

array([[    0,     0,     0, ...,   955,  5731,  1507],
       [    0,     0,     0, ...,    85,  6794,  4110],
       [    0,     0,     0, ...,  3004,   402, 12922],
       ...,
       [    0,     0,     0, ...,    10,    41,    47],
       [    0,     0,     0, ...,   906,  1338,    10],
       [    0,     0,     0, ...,   224,  1140,   326]], dtype=int32)

In [14]:
# `label` still carries the original row labels, which have gaps after the
# dropna/filter step (position 83829 holds label 84126). Select by position
# explicitly with .iloc so the labels line up with data[83829:139715].
test_label = label.iloc[83829:139715]
test_label

84126     1
84127     1
84128     0
84129     1
84130     1
         ..
140242    1
140243    0
140244    0
140245    0
140246    1
Name: class, Length: 55886, dtype: int64

In [16]:
# Score the trained model on the held-out slice; yields [loss, accuracy].
model_conv.evaluate(x=test_data, y=test_label)



[0.35947662591934204, 0.8741724491119385]