### Load libraries

In [2]:
import os
import pandas as pd
import numpy as np

from collections import Counter
vocab = Counter()

from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE

import gensim
from gensim.models import FastText
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence 

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import random

# Seed every global RNG the notebook draws from so that a fresh
# "Restart & Run All" reproduces the same numbers.
SEED = 10
random.seed(SEED)
# The embedding matrix is initialised with np.random.rand further down,
# so NumPy's global generator has to be seeded as well.
np.random.seed(SEED)
print(random.random())

### Deep learning library
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout,SpatialDropout1D, Bidirectional, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences

  from pandas import Panel


0.5714025946899135


### Load dataset

In [120]:
# Location of the labelled tweet dataset, relative to the notebook.
DATASET_CSV = "../data/sentiment_twitter_data.csv"

# Read the whole file into a DataFrame and preview the first two rows.
data = pd.read_csv(DATASET_CSV)
data.head(2)

Unnamed: 0,Tweet_date,Tweet_time,Tweet_City,Tweet_Country,Tweet_account,Retweet_count,Tweet_Text,Created Date,tweet_without_stopwords,neg,neu,pos,vader_polarity,sentiment
0,4/1/2020,0:08:00,,Australia,GSK_AU,0,ask award research excellence open nomination ...,2020-04-01 00:08:00,ask award research excellence open nomination ...,0.0,0.297,0.703,0.9349,positive
1,4/1/2020,0:35:00,,Australia,GSK_AU,3,award research excellence open nomination awar...,2020-04-01 00:35:00,award research excellence open nomination awar...,0.0,0.419,0.581,0.9022,positive


### Data preparation

In [121]:
# Integer class id for each sentiment string; a label missing from this
# mapping is turned into None by dict.get.
SENTIMENT_TO_ID = {'positive': 2, 'negative': 0, 'neutral': 1}

# Stop-word-free tweet text.
X = data['tweet_without_stopwords']
# Sentiment labels encoded as integers.
y = data['sentiment'].apply(SENTIMENT_TO_ID.get)

In [124]:
def tokenize(tweet):
    """Split one tweet into tokens with the module-level ``tokenizer``.

    Parameters
    ----------
    tweet : str
        Text of a single tweet.

    Returns
    -------
    list or str
        Whatever ``tokenizer.tokenize(tweet)`` returns, or the sentinel
        string ``'NC'`` when tokenisation raises an ``Exception``.
    """
    try:
        return tokenizer.tokenize(tweet)
    except Exception:
        # Best effort: a value that cannot be tokenised is marked with the
        # 'NC' sentinel instead of aborting the whole map. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be stopped.
        return 'NC'

def postprocess(data, n=300):
    """Add a ``tokens`` column to ``data`` and renumber its rows from 0.

    The frame is modified in place and the same object is returned, so
    callers may use either the argument or the return value afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Tweet_Text`` column.
    n : int, optional
        Not used; kept so existing calls that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the extra ``tokens`` column and a fresh
        0..len-1 index. Rows for which ``tokenize`` returned its ``'NC'``
        sentinel are kept, not filtered out.
    """
    # progress_map is tqdm's variant of Series.map that also shows a
    # progress bar (registered through tqdm.pandas()).
    data['tokens'] = data['Tweet_Text'].progress_map(tokenize)
    # drop=True discards the old index directly. The previous
    # reset_index() + drop('index') pair raised KeyError for a named index
    # and deleted a genuine column called 'index' if one existed.
    data.reset_index(drop=True, inplace=True)
    return data

# Tokenise every tweet. postprocess() edits `data` in place and returns it,
# so `tokenData` and `data` are the same DataFrame; later cells read the new
# column through `data['tokens']`.
tokenData = postprocess(data)


progress-bar:   0%|                                                                          | 0/13724 [00:00<?, ?it/s]
progress-bar:  12%|███████                                                      | 1580/13724 [00:00<00:05, 2330.72it/s]
progress-bar:  21%|████████████▌                                                | 2837/13724 [00:00<00:03, 3082.83it/s]
progress-bar:  32%|███████████████████▎                                         | 4345/13724 [00:00<00:02, 4046.90it/s]
progress-bar:  44%|██████████████████████████▉                                  | 6049/13724 [00:00<00:01, 5245.78it/s]
progress-bar:  55%|█████████████████████████████████▎                           | 7498/13724 [00:01<00:00, 6484.80it/s]
progress-bar:  64%|███████████████████████████████████████▎                     | 8844/13724 [00:01<00:00, 7671.29it/s]
progress-bar:  73%|███████████████████████████████████████████▉                | 10055/13724 [00:01<00:00, 8568.51it/s]
progress-bar:  82%|████████████████████

In [125]:
# Show the first row to check the newly added `tokens` column.
tokenData.head(1)

Unnamed: 0,Tweet_date,Tweet_time,Tweet_City,Tweet_Country,Tweet_account,Retweet_count,Tweet_Text,Created Date,tweet_without_stopwords,neg,neu,pos,vader_polarity,sentiment,tokens
0,4/1/2020,0:08:00,,Australia,GSK_AU,0,ask award research excellence open nomination ...,2020-04-01 00:08:00,ask award research excellence open nomination ...,0.0,0.297,0.703,0.9349,positive,"[ask, award, research, excellence, open, nomin..."


### Prepare custom FastText embeddings

In [126]:
# Tokenised corpus, materialised once and shared by build_vocab() and train().
sentences = list(tqdm(data['tokens']))

# Skip-gram (sg=1) FastText with 300-dimensional vectors; words seen fewer
# than 3 times are left out of the vocabulary.
# NOTE(review): `size=` is the gensim 3.x spelling of this argument.
f2vec = FastText(size=300, window=5, min_count=3, workers=4, sg=1)
f2vec.build_vocab(sentences)
f2vec.train(sentences, total_examples=f2vec.corpus_count, epochs=100)

# The cells below look words up through `x_vectors`, which was never
# assigned in the notebook. Presumably it is meant to be the keyed vectors
# of the model trained here - confirm.
x_vectors = f2vec.wv


100%|███████████████████████████████████████████████████████████████████████| 13724/13724 [00:00<00:00, 1058603.58it/s]

100%|████████████████████████████████████████████████████████████████████████| 13724/13724 [00:00<00:00, 611074.73it/s]


### Split the data

In [133]:
# One 80/20 split applied to three aligned columns: the tweet text, the
# integer labels `y`, and the original string labels.
splits = train_test_split(
    data['Tweet_Text'],
    y,
    data['sentiment'],
    test_size=0.2,
    random_state=1,
)
train_x, test_x, train_y, test_y, y1, y2 = splits

# Display the shape of every part as a sanity check.
tuple(part.shape for part in (train_x, train_y, test_x, test_y, y1, y2))

((10979,), (10979,), (2745,), (2745,), (10979,), (2745,))

### Prepare data for the deep learning architecture

In [135]:
# Vocabulary cap: the number of entries in the embedding model's vocabulary.
MAX_NB_WORDS = len(x_vectors.vocab)
# Length every id sequence is padded or truncated to (passed to
# pad_sequences as `maxlen`).
MAX_SEQUENCE_LENGTH = 200

def text_to_wordlist(text, lower=False):
    """Count the items of ``text`` in the global ``vocab`` and return ``text``.

    ``text`` is returned unchanged and ``lower`` is accepted but not used.

    NOTE(review): ``Counter.update`` counts the elements of the iterable it
    receives, so a plain string is counted character by character. Pass a
    list of tokens if word counts are intended - confirm what callers pass.
    """
    vocab.update(text)
    return text

def process_comments(list_sentences, lower=False):
    """Pass every sentence through ``text_to_wordlist`` and collect the results.

    Parameters
    ----------
    list_sentences : iterable
        Sentences to process; iterated once, wrapped in a tqdm progress bar.
    lower : bool, optional
        Forwarded unchanged to ``text_to_wordlist``.

    Returns
    -------
    list
        One ``text_to_wordlist`` result per input sentence, in input order.
    """
    return [
        text_to_wordlist(sentence, lower=lower)
        for sentence in tqdm(list_sentences)
    ]

# Plain Python lists of the train / test tweet texts.
list_sentences_train = [*train_x.values]
list_sentences_test = [*test_x.values]

# Train sentences first, test sentences after, so the result can later be
# split again at len(list_sentences_train).
all_sentences = list_sentences_train + list_sentences_test
comments = process_comments(all_sentences, lower=True)


  0%|                                                                                        | 0/13724 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████| 13724/13724 [00:00<00:00, 72873.86it/s]


In [137]:
# `comments` may hold raw strings, and iterating a string yields characters.
# Tokenise first so that words are indexed, using the same tokenizer that
# produced the FastText training tokens. Entries that are already token
# lists are used as they are.
tokenized_comments = [
    tokenizer.tokenize(comment) if isinstance(comment, str) else list(comment)
    for comment in comments
]
word_counts = Counter(token for tokens in tokenized_comments for token in tokens)

# Ids start at 1; 0 is shared by padding and unknown words. Only
# MAX_NB_WORDS - 1 words are kept so that the largest id stays below
# MAX_NB_WORDS and therefore inside the embedding table.
kept_words = word_counts.most_common(max(MAX_NB_WORDS - 1, 0))
word_index = {word: rank + 1 for rank, (word, _) in enumerate(kept_words)}

# Encode every comment, then split back into the train and test parts
# (train sentences come first in `comments`).
encoded = [[word_index.get(token, 0) for token in tokens]
           for tokens in tokenized_comments]
n_train = len(list_sentences_train)
sequences = encoded[:n_train]
test_sequences = encoded[n_train:]

# Pad on the left / truncate on the right to a fixed length.
# NOTE: this rebinds `data` (the DataFrame until now) to the padded training
# matrix; the training cell reads it under this name.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")
list_classes = ["positive", "negative", "neutral"]
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y1.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")
print('Shape of test_data tensor:', test_data.shape)

Shape of data tensor: (10979, 200)
Shape of label tensor: (10979,)
Shape of test_data tensor: (2745, 200)


In [138]:
# Embedding width; has to match the dimensionality of the vectors in `x_vectors`.
WV_DIM = 300
nb_words = min(MAX_NB_WORDS, len(x_vectors.vocab))

# Start from small random values in [-0.1, 0.1). Rows that are not
# overwritten below (row 0 and words without a vector) keep this random
# initialisation - they are NOT all-zeros.
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    # Compare against the real number of rows so an id can never fall
    # outside the matrix.
    if i >= nb_words:
        continue
    try:
        wv_matrix[i] = x_vectors[word]
    except KeyError:
        # No vector available for this word: keep its random row. Only the
        # lookup failure is ignored; any other error (for example a
        # dimension mismatch) now surfaces instead of being hidden.
        continue

In [170]:
# Frozen embedding layer initialised from the pre-computed matrix `wv_matrix`.
wv_layer = Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)

# Input: integer id sequences of length MAX_SEQUENCE_LENGTH.
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = wv_layer(comment_input)

# Bidirectional LSTM encoder; return_sequences=False keeps only the last
# output of each direction.
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(LSTM(64, return_sequences=False))(embedded_sequences)

# Output head. The three sentiment classes are mutually exclusive and the
# loss is sparse categorical cross-entropy, so the layer must emit a
# probability distribution: softmax, not independent sigmoids.
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
preds = Dense(3, activation='softmax')(x)

# Build and compile the model. `learning_rate` replaces the deprecated `lr`
# argument name.
model = Model(inputs=[comment_input], outputs=preds)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=['accuracy'])


In [171]:
# Training hyper-parameters.
epochs = 5
batch_size = 64

# Integer class labels as a NumPy array.
train_labels = np.array(train_y)

# Fit on the padded training sequences; 10% of them are held out for
# validation, and the data is shuffled before each epoch.
hist = model.fit(
    [data],
    train_labels,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
)

Train on 9881 samples, validate on 1098 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10


In [3]:
# Evaluate on the padded id matrix `test_data`. `test_x` holds the raw tweet
# text, which the model cannot consume.
accr = model.evaluate(test_data, np.array(test_y))
# accr is [loss, accuracy], matching the loss and metrics given to compile().
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [154]:
# Per-epoch metrics recorded by model.fit(), one column per metric.
history = pd.DataFrame(hist.history)

# Training vs. validation loss. The curves are labelled so they can be told
# apart, and the axes are named so the figure stands on its own.
fig, ax = plt.subplots(figsize=(12, 12))
ax.plot(history["loss"], label="train")
ax.plot(history["val_loss"], label="validation")
ax.set_title("Loss with pretrained word vectors")
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.legend()
plt.show();

array(['positive', 'positive', 'positive', ..., 'neutral', 'positive',
       'positive'], dtype=object)

In [None]:
# `history` is a DataFrame by this point, so the metrics are read from the
# Keras History object `hist` instead of `history.history`.
metrics = hist.history
# The accuracy key is 'accuracy' in recent Keras versions and 'acc' in older
# ones; use whichever was recorded.
acc_key = 'accuracy' if 'accuracy' in metrics else 'acc'

plt.title('Accuracy')
plt.plot(metrics[acc_key], label='train')
# The val_* series comes from validation_split, not from the test set.
plt.plot(metrics['val_' + acc_key], label='validation')
plt.legend()
plt.show();

In [None]:
!pip install chart_studio        #Install chart_studio for plotly plot

In [None]:
# Plotly tooling for the metric bar charts below.
import chart_studio.plotly as py                      # chart_studio's plotly interface
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot
import plotly.io as pio
# Select plotly's 'colab' renderer as the default for displaying figures.
pio.renderers.default = 'colab'

In [None]:
# Bar chart of the Bi-LSTM evaluation metrics for the negative sentiment label.
# NOTE(review): the scores are hard-coded (presumably percentages) rather
# than computed from `model`; update them by hand after retraining.
negative_scores = {"Accuracy": 57, "Precision": 66, "Recall": 16, "F1-score": 25}

# One bar trace per metric, in the order listed above.
data = [
    {"name": metric, "type": "bar", "x": ["Bi-LSTM"], "y": [score]}
    for metric, score in negative_scores.items()
]
# Individual trace names kept so code that refers to them keeps working.
trace1, trace2, trace3, trace4 = data

layout = go.Layout(barmode = "group",title= 'BI-LSTM ML Model Evaluation Metrics Comparison on Negative Tweet sentiment')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# Bar chart of the Bi-LSTM evaluation metrics for the neutral sentiment label.
# NOTE(review): the scores are hard-coded (presumably percentages) rather
# than computed from `model`; update them by hand after retraining.
neutral_scores = {"Accuracy": 57, "Precision": 44, "Recall": 86, "F1-score": 58}

# One bar trace per metric, in the order listed above.
data = [
    {"name": metric, "type": "bar", "x": ["Bi-LSTM"], "y": [score]}
    for metric, score in neutral_scores.items()
]
# Individual trace names kept so code that refers to them keeps working.
trace1, trace2, trace3, trace4 = data

layout = go.Layout(barmode = "group",title= 'BI-LSTM ML Model Evaluation Metrics Comparison on Neutral Tweet sentiment')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# Bar chart of the Bi-LSTM evaluation metrics for the positive sentiment label.
# NOTE(review): the scores are hard-coded (presumably percentages) rather
# than computed from `model`; update them by hand after retraining.
positive_scores = {"Accuracy": 57, "Precision": 56, "Recall": 33, "F1-score": 42}

# One bar trace per metric, in the order listed above.
data = [
    {"name": metric, "type": "bar", "x": ["Bi-LSTM"], "y": [score]}
    for metric, score in positive_scores.items()
]
# Individual trace names kept so code that refers to them keeps working.
trace1, trace2, trace3, trace4 = data

layout = go.Layout(barmode = "group",title= 'BI-LSTM ML Model Evaluation Metrics Comparison on Positive Tweet sentiment')
fig = go.Figure(data=data, layout=layout)
iplot(fig)