In [1]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split

Loading the Processed Dataset. 

In [2]:
df = pd.read_csv('Processed_Data.csv',lineterminator='\n')
df

Unnamed: 0.1,Unnamed: 0,topic,tweet,sentiment,class,processed_tweets
0,0,#olympics,Aussies would be happy that the T20 series hap...,0.275000,Positive,aussie would happy series happen midst olympic...
1,1,#olympics,The worst thing about the #Olympics finishing ...,-0.133333,Negative,worst thing olympics finish whole week availab...
2,2,#olympics,#Olympics\n\nWe play for India: #Hockey captai...,0.000000,Neutral,olympics play india hockey captain ranirampal ...
3,3,#olympics,See the best moments from the #Tokyo2020 closi...,1.000000,Positive,see best moment tokyo close ceremony videoelep...
4,4,#olympics,Fabulous! #Olympics \n#LoveTheBBC \n\nTokyo Ol...,0.500000,Positive,fabulous olympics lovethebbc tokyo olympics bb...
...,...,...,...,...,...,...
103929,103929,Athletes,Has NBC broadcast the Paralympics before? Beca...,0.300000,Positive,nbc broadcast paralympics remember hope brush ...
103930,103930,Athletes,@jonkay @ShreeParadkar Male-to-female trans at...,0.000000,Neutral,male female trans athlete front line war woman
103931,103931,Athletes,New Olympic motto adding TOGETHER.\nWe saw it ...,0.068182,Positive,new olympic motto add together saw repeatedly ...
103932,103932,Athletes,@mcuban would you be willing to give me a chan...,0.250000,Positive,would will give chance work athlete box train ...


In [3]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2021-08-17 17:54:08--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-08-17 17:54:08--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-08-17 17:54:08--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [4]:
sentiment = df['class']
sentiment

0         Positive
1         Negative
2          Neutral
3         Positive
4         Positive
            ...   
103929    Positive
103930     Neutral
103931    Positive
103932    Positive
103933         NaN
Name: class, Length: 103934, dtype: object

In [5]:
y = np.array(list(map(lambda x: 1 if x=="Positive" else 0, sentiment)))
y

array([1, 0, 0, ..., 1, 1, 0])

In [6]:
df.processed_tweets = df.processed_tweets.astype(str)

In [7]:
reviews = df['processed_tweets']
reviews

0         aussie would happy series happen midst olympic...
1         worst thing olympics finish whole week availab...
2         olympics play india hockey captain ranirampal ...
3         see best moment tokyo close ceremony videoelep...
4         fabulous olympics lovethebbc tokyo olympics bb...
                                ...                        
103929    nbc broadcast paralympics remember hope brush ...
103930       male female trans athlete front line war woman
103931    new olympic motto add together saw repeatedly ...
103932    would will give chance work athlete box train ...
103933                                                  nan
Name: processed_tweets, Length: 103934, dtype: object

In [8]:
reviews_list = []
for i in range(len(reviews)):
  reviews_list.append(reviews[i])

In [9]:
X_train, X_test,Y_train, Y_test = train_test_split(reviews_list, y, test_size=0.2, random_state = 45)

In [10]:
len(Y_train)

83147

In [11]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

In [12]:
words_to_index = tokenizer.word_index
len(words_to_index)

33526

In [13]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)



  return word_to_vec_map

In [14]:
word_to_vec_map = read_glove_vector('glove.6B.50d.txt')

In [15]:
maxLen = 150

In [16]:
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index-1, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [17]:
embedding_layer

<keras.layers.embeddings.Embedding at 0x7f906e54a310>

CNN training 


In [26]:
def conv1d_model(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = Conv1D(512,3,activation='relu')(embeddings)
  
  X = MaxPooling1D(3)(X)

  X = Conv1D(256,3,activation='relu')(X)
  
  X = MaxPooling1D(3)(X)

  X = Conv1D(256,3,activation='relu')(X)
  X = Dropout(0.8)(X)
  X = MaxPooling1D(3)(X)

  X = GlobalMaxPooling1D()(X)

  X = Dense(256, activation='relu')(X)
  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model

cnn summary 

In [27]:
model_1d = conv1d_model((maxLen,))
model_1d.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 50)           1676300   
_________________________________________________________________
conv1d (Conv1D)              (None, 148, 512)          77312     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 49, 512)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 47, 256)           393472    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 15, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 13, 256)           1968

In [20]:
X_train_indices = tokenizer.texts_to_sequences(X_train)

In [21]:
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
X_train_indices.shape

(83147, 150)

In [None]:
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model_1d.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

cnn fit


In [None]:
model_1d.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f54cea81ed0>

In [22]:
X_test_indices = tokenizer.texts_to_sequences(X_test)
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

In [None]:
model_1d.evaluate(X_test_indices, Y_test)



[0.5399328470230103, 0.8062388300895691]