### load libraries

In [1]:
import os
import random
random.seed(10)
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE

import gensim
from gensim.models import FastText
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence 

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score,precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

  from pandas import Panel


### load Data

In [2]:
data = pd.read_csv("../data/sentiment_twitter_data.csv")
data.head(2)

Unnamed: 0,Tweet_date,Tweet_time,Tweet_City,Tweet_Country,Tweet_account,Retweet_count,Tweet_Text,Created Date,tweet_without_stopwords,neg,neu,pos,vader_polarity,sentiment
0,4/1/2020,0:08:00,,Australia,GSK_AU,0,ask award research excellence open nomination ...,2020-04-01 00:08:00,ask award research excellence open nomination ...,0.0,0.297,0.703,0.9349,positive
1,4/1/2020,0:35:00,,Australia,GSK_AU,3,award research excellence open nomination awar...,2020-04-01 00:35:00,award research excellence open nomination awar...,0.0,0.419,0.581,0.9022,positive


### prepare data

In [3]:
X = data['tweet_without_stopwords']
y = data['sentiment'].apply({'positive':2,'negative':0,'neutral':1}.get)

In [4]:
def tokenize(tweet):
    try:
        tokens = tokenizer.tokenize(tweet)
        return tokens
    except:
        return 'NC'

def postprocess(data, n=300):
    data['tokens'] = data['Tweet_Text'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    # data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

tokenData = postprocess(data)

progress-bar: 100%|███████████████████████████████████████████████████████████| 13724/13724 [00:00<00:00, 17520.01it/s]


### Build Gensim fasttext custom embeddings

In [5]:
f2vec = FastText(size=300, window=5, min_count=3, workers=4,sg=1)
f2vec.build_vocab([x for x in tqdm(data['tokens'])])
f2vec.train([x for x in tqdm(data['tokens'])],total_examples=f2vec.corpus_count,epochs=100)

100%|████████████████████████████████████████████████████████████████████████| 13724/13724 [00:00<00:00, 756984.67it/s]
100%|████████████████████████████████████████████████████████████████████████| 13724/13724 [00:00<00:00, 489103.82it/s]


In [6]:
x_vectors = f2vec.wv
print("Number of word vectors: {}".format(len(x_vectors.vocab)))

Number of word vectors: 4444


### split the dataset

In [7]:
train_x, test_x, train_y,test_y = train_test_split(data['Tweet_Text'], y, test_size=0.2, random_state=1)
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((10979,), (10979,), (2745,), (2745,))

### sum up the each sentence to 300d vector

In [8]:
x_values = np.zeros((1, 300))
train_x_lst = []
test_x_lst = []
for val in range(len(train_x)):
    x_values = np.zeros((1, 300))
    for tok in data['tokens'][val]:
        x_values = x_values + f2vec[tok]
    train_x_lst.append(x_values)
    
for val in range(len(test_x)):
    x_values = np.zeros((1, 300))
    for tok in data['tokens'][val]:
        x_values = x_values + f2vec[tok]
    test_x_lst.append(x_values)

  import sys
  del sys.path[0]


In [9]:
new_train_x = np.reshape(train_x_lst, (10979*1, 300))
new_test_x = np.reshape(test_x_lst, (2745*1, 300))

In [10]:
len(train_x_lst), len(test_x_lst), len(new_train_x), len(new_test_x)

(10979, 2745, 10979, 2745)

### Random Forest Classifier 

In [11]:
rf = RandomForestClassifier(n_estimators = 20, criterion = 'entropy',random_state = 42)
rf.fit(new_train_x, train_y)


rf_pred_train = rf.predict(new_train_x)
print('random forest train accuracy')
print(classification_report(train_y, rf_pred_train, target_names=['negative','neutral', 'positive']))

print('***************')
rf_pred_test = rf.predict(new_test_x)
print('random forest test accuracy')
print(classification_report(test_y, rf_pred_test, target_names=['negative','neutral', 'positive']))

random forest train accuracy
              precision    recall  f1-score   support

    negative       0.85      0.77      0.81      3168
     neutral       0.79      0.85      0.82      4113
    positive       0.81      0.81      0.81      3698

    accuracy                           0.81     10979
   macro avg       0.82      0.81      0.81     10979
weighted avg       0.81      0.81      0.81     10979

***************
random forest test accuracy
              precision    recall  f1-score   support

    negative       0.28      0.27      0.27       775
     neutral       0.37      0.38      0.37      1012
    positive       0.35      0.34      0.35       958

    accuracy                           0.34      2745
   macro avg       0.33      0.33      0.33      2745
weighted avg       0.34      0.34      0.34      2745



### Naive Bayes Classifier

In [None]:
nb = MultinomialNB().fit(new_train_x, train_y)

nb_pred_test = nb.predict(new_test_x)
nb_pred_train = nb.predict(new_train_x)
print('Naive Bayes train accuracy')
print(classification_report(train_y, nb_pred_train, target_names=['negative','neutral', 'positive']))

print('***************')

print('Naive Bayes test accuracy')
print(classification_report(test_y, nb_pred_test, target_names=['negative','neutral', 'positive']))

### Decision Tree Classifier

In [13]:
dt = DecisionTreeClassifier(criterion='gini',max_depth=None, min_samples_split=5, min_samples_leaf=1,
                            min_weight_fraction_leaf=0.0)
print(dt)
dt = dt.fit(new_train_x, train_y)

dttrain = dt.predict(new_train_x)
print('Decision Tree train accuracy')
print(classification_report(train_y, dttrain, target_names=['negative','neutral', 'positive']))

print('***************')
dttest = dt.predict(new_test_x)
print('Decision Tree test accuracy')
print(classification_report(test_y, dttest, target_names=['negative','neutral', 'positive']))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
Decision Tree train accuracy
              precision    recall  f1-score   support

    negative       0.80      0.80      0.80      3168
     neutral       0.79      0.86      0.82      4113
    positive       0.84      0.76      0.80      3698

    accuracy                           0.81     10979
   macro avg       0.81      0.80      0.81     10979
weighted avg       0.81      0.81      0.81     10979

***************
Decision Tree test accuracy
              precision    recall  f1-score   support

    negative       0.27      0.29      0.28       775
     neutral       0.37      0.3

### SVM

In [11]:
params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3],  'C': [1, 10]}]

In [12]:
svm_model = GridSearchCV(SVC(), params_grid)
svm_model.fit(new_train_x, train_y)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10], 'gamma': [0.001], 'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [14]:
# View the accuracy score
print('Best score for training data:', svm_model.best_score_,"\n") 
# View the best parameters for the model found using grid search
print('Best C:',svm_model.best_estimator_.C,"\n") 
print('Best Kernel:',svm_model.best_estimator_.kernel,"\n")
print('Best Gamma:',svm_model.best_estimator_.gamma,"\n")

final_model = svm_model.best_estimator_
Y_pred = final_model.predict(new_test_x)

Best score for training data: 0.36323891064759994 

Best C: 1 

Best Kernel: rbf 

Best Gamma: 0.001 



In [16]:
svmtrain = final_model.predict(new_train_x)
print('SVM train accuracy')
print(classification_report(train_y, svmtrain, target_names=['negative','neutral', 'positive']))

print('***************')
svmtest = final_model.predict(new_test_x)
print('SVM test accuracy')
print(classification_report(test_y, svmtest, target_names=['negative','neutral', 'positive']))

random forest train accuracy
              precision    recall  f1-score   support

    negative       0.66      0.16      0.25      3168
     neutral       0.44      0.86      0.58      4113
    positive       0.56      0.33      0.42      3698

    accuracy                           0.48     10979
   macro avg       0.55      0.45      0.42     10979
weighted avg       0.54      0.48      0.43     10979

***************
Decision Tree test accuracy
              precision    recall  f1-score   support

    negative       0.24      0.04      0.07       775
     neutral       0.37      0.79      0.50      1012
    positive       0.36      0.17      0.23       958

    accuracy                           0.36      2745
   macro avg       0.32      0.33      0.27      2745
weighted avg       0.33      0.36      0.28      2745



In [None]:
!pip install chart_studio        #Install chart_studio for plotly plot

In [None]:
import chart_studio.plotly as py                            #Import chart_studio for various plotly plot
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot
import plotly.io as pio
pio.renderers.default = 'colab'

In [None]:
#Plot shows the Comparison of Different ML Classification model for Evaluation metrics of Negative sentiment label
trace1 = {
  "name": "Accuracy", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [81,81,48]
}

trace2 = {
  "name": "Precision", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [85,80,66]
}

trace3 = {
  "name": "Recall", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [77,82,16]
}

trace4 = {
  "name": "F1-score", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [81,81,25]
}

data = [trace1,trace2,trace3,trace4]
layout = go.Layout(barmode = "group",title= 'ML Model Evaluation Metrics Comparision on Negative Tweet sentiment ')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
#Plot shows the Comparison of Different ML Classification model for Evaluation metrics of Neutral sentiment label
trace1 = {
  "name": "Accuracy", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [81,81,48]
}

trace2 = {
  "name": "Precision", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [79,79,44]
}

trace3 = {
  "name": "Recall", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [85,86,86]
}

trace4 = {
  "name": "F1-score", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [82,82,58]
}

data = [trace1,trace2,trace3,trace4]
layout = go.Layout(barmode = "group",title= 'ML Model Evaluation Metrics Comparision on Neutral Tweet sentiment ')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
#Plot shows the Comparison of Different ML Classification model for Evaluation metrics of Positive sentiment label
trace1 = {
  "name": "Accuracy", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [81,81,48]
}

trace2 = {
  "name": "Precision", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [81,86,56]
}

trace3 = {
  "name": "Recall", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [81,76,33]
}

trace4 = {
  "name": "F1-score", 
  "type": "bar", 
  "x": ["Random Forest", "Decision Tree", "SVM_rgf"], 
  "y": [81,80,42]
}

data = [trace1,trace2,trace3,trace4]
layout = go.Layout(barmode = "group",title= 'ML Model Evaluation Metrics Comparision on Positive Tweet sentiment ')
fig = go.Figure(data=data, layout=layout)
iplot(fig)