In [11]:
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 16:12:38 2019

@author: Mohamed Zeitoun
"""


# load basic libraries 
import nltk
import numpy as np
import pandas as pd 
import glob
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Load train dataset into dataframe

tweet_files = glob.glob("./twitter-201?train.txt")

# Load train dataset into dataframe

li = []

for filename in tweet_files:
    df = pd.read_csv(filename, index_col=None, names=['Timestamp', 'Sentiment', 'Tweet'], sep='\t')
    li.append(df)

tweets = pd.concat(li, axis=0, ignore_index=True)

from sklearn.preprocessing import LabelEncoder

# Label encoding test & training labels
le = LabelEncoder()
Y_train = le.fit_transform(tweets['Sentiment'])

X_train = tweets['Tweet']

tweets.head(10)

Unnamed: 0,Timestamp,Sentiment,Tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...
7,264094586689953794,negative,Talking about ACT\u2019s && SAT\u2019s\u002c d...
8,212392538055778304,neutral,"Why is \""""Happy Valentines Day\"""" trending? It..."
9,254941790757601280,negative,They may have a SuperBowl in Dallas\u002c but ...


In [13]:
#List of different models
model = [
            "CountVectorizer + Naïve Bayes Multinomial", 
            "TFIDFVectorizer + Naïve Bayes Multinomial", 
            "CountVectorizer with uni-grams and bi-grams + Naïve Bayes Multinomial", 
            "CountVectorizer + Logistic Regression", 
            "TFIDFVectorizer + Logistic Regression", 
            "CountVectorizer with uni-grams and bi-grams + Logistic Regression",
            "TFIDFVectorizer + SVM (Linear Kernel)",
            "CountVectorizer + SVM (Linear Kernel)",
            "CountVectorizer with uni-grams and bi-grams + SVM (Linear Kernel)",
            "TFIDFVectorizer + SVM (RBF)",
            "CountVectorizer + SVM (RBF)",
            "CountVectorizer with uni-grams and bi-grams + SVM (RBF)",
            "CountVectorizer + Random Forest"
        ]

#intialize the output matrix
result = pd.DataFrame(columns=['Accuracy', 'FScore'])

#Load libraries needed for classification 
from sklearn.pipeline import Pipeline
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


#define the 10-fold
kfold = KFold(n_splits=10, shuffle=True, random_state=1234)


In [14]:
#loop on each model
for i in model:
    if i == 'CountVectorizer + Naïve Bayes Multinomial':
        #CountVectorizer + Naïve Bayes Multinomial pipeline
        pipeline = Pipeline([
        ('CountVectprizer', CountVectorizer()),
        ('naive_bayes_Multinomial', naive_bayes.MultinomialNB())
        ])
    elif i == 'TFIDFVectorizer + Naïve Bayes Multinomial':
        #TFIDFVectorizer + Naïve Bayes Multinomial pipeline
        pipeline = Pipeline([
        ('TFIDFVectprizer', TfidfVectorizer()),
        ('naive_bayes_Multinomial', naive_bayes.MultinomialNB())
        ])
    elif i == 'CountVectorizer with uni-grams and bi-grams + Naïve Bayes Multinomial':
        #CountVectorizer with uni-grams and bi-grams + Naïve Bayes Multinomial pipeline
        pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer(ngram_range=(1,2))),
        ('naive_bayes_Multinomial', naive_bayes.MultinomialNB())
        ])
    elif i == 'CountVectorizer + Logistic Regression':
        #CountVectorizer + Logistic Regression pipeline
        pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer()),
        ('LogisticRegression', LogisticRegression())
        ])
    elif i == 'TFIDFVectorizer + Logistic Regression':
        #TFIDFVectorizer + Logistic Regression pipeline
        pipeline = Pipeline([
        ('TFIDFVectorizer', TfidfVectorizer()),
        ('LogisticRegression', LogisticRegression())
        ])
    elif i == 'CountVectorizer with uni-grams and bi-grams + Logistic Regression':
        #CountVectorizer with uni-grams and bi-grams + Logistic Regression pipeline
        pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer(ngram_range=(1,2))),
        ('LogisticRegression', LogisticRegression())
        ])
    elif i == 'CountVectorizer + SVM (Linear Kernel)':
        #CountVectorizer + SVM (Linear Kernel) pipeline
        pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer()),
        ('SVM_linear_kernel', svm.LinearSVC())
        ])
    elif i == 'TFIDFVectorizer + SVM (Linear Kernel)':
        #TFIDFVectorizer + SVM (Linear Kernel) pipeline
        pipeline = Pipeline([
        ('TFIDFVectorizer', TfidfVectorizer()),
        ('SVM_linear_kernel', svm.LinearSVC())
        ])
    elif i == 'CountVectorizer with uni-grams and bi-grams + SVM (Linear Kernel)':
        #CountVectorizer with uni-grams and bi-grams + SVM (Linear Kernel) pipeline
        pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer(ngram_range=(1,2))),
        ('SVM_linear_kernel', svm.LinearSVC())
        ])
    elif i == 'CountVectorizer + SVM (RBF)':
        #CountVectorizer + SVM (RBF) pipeline
        pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer()),
        ('SVM_RBF', svm.SVR(kernel='rbf'))
        ])
    elif i == 'TFIDFVectorizer + SVM (RBF)':
        #TFIDFVectorizer + SVM (RBF) pipeline
        pipeline = Pipeline([
        ('TFIDFVectorizer', TfidfVectorizer()),
        ('SVM_RBF', svm.SVR(kernel='rbf'))
        ])
    elif i == 'CountVectorizer with uni-grams and bi-grams + SVM (RBF)':
        #CountVectorizer with uni-grams and bi-grams + SVM (RBF) pipeline
        pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer(ngram_range=(1,2))),
        ('SVM_RBF', svm.SVR(kernel='rbf'))
        ])
    elif i == 'CountVectorizer + Random Forest':
        #CountVectorizer + Random Forest pipeline
        pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer()),
        ('RandomForest', RandomForestClassifier())
        ])
    else:
        print("ERROR: Model Not Supported!")
        continue
           
    #intialize the mean absolute error counter
    accuracy = 0.0
    Fscore = 0.0
    
    #K-fold cross validation
    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, Y_train)):
        train_x, train_y = X_train.iloc[train_index], Y_train[train_index]
        val_x, val_y = X_train.iloc[val_index], Y_train[val_index]
        
        #Model fit & Prediction
        pipeline.fit(train_x, train_y)
        predictions = pipeline.predict(val_x)
        
        #Calculate the Accuracy & F-Score 
        accuracy += metrics.accuracy_score(val_y, predictions.round())
        Fscore += metrics.f1_score(val_y, predictions.round(), average='macro')
        
    accuracy /= kfold.get_n_splits()
    Fscore /= kfold.get_n_splits()
    
    print(i + ":")
    print("Accuracy = {}".format(accuracy.round(2)))
    print("F-Score = {}".format(Fscore.round(2)))
    
#    scores = cross_val_score(pipeline, X_train, Y_train, cv=10 )
#    fscores = cross_val_score(pipeline, X_train, Y_train, cv=10, scoring='f1_macro')
#    print(i + ":")
#    print("Accuracy = {}".format(scores.mean()))
#    print("F-Score = {}".format(fscores.mean()))
    
    result.loc[i,'Accuracy']=accuracy.round(2)
    result.loc[i,'FScore']=Fscore.round(2)
        


CountVectorizer + Naïve Bayes Multinomial:
Accuracy = 0.62
F-Score = 0.52
TFIDFVectorizer + Naïve Bayes Multinomial:
Accuracy = 0.61
F-Score = 0.44
CountVectorizer with uni-grams and bi-grams + Naïve Bayes Multinomial:
Accuracy = 0.63
F-Score = 0.5
CountVectorizer + Logistic Regression:
Accuracy = 0.66
F-Score = 0.61
TFIDFVectorizer + Logistic Regression:
Accuracy = 0.65
F-Score = 0.57
CountVectorizer with uni-grams and bi-grams + Logistic Regression:
Accuracy = 0.67
F-Score = 0.62
TFIDFVectorizer + SVM (Linear Kernel):
Accuracy = 0.66
F-Score = 0.61
CountVectorizer + SVM (Linear Kernel):
Accuracy = 0.63
F-Score = 0.59
CountVectorizer with uni-grams and bi-grams + SVM (Linear Kernel):
Accuracy = 0.66
F-Score = 0.62
TFIDFVectorizer + SVM (RBF):
Accuracy = 0.43
F-Score = 0.2
CountVectorizer + SVM (RBF):
Accuracy = 0.43
F-Score = 0.2
CountVectorizer with uni-grams and bi-grams + SVM (RBF):
Accuracy = 0.43
F-Score = 0.2
CountVectorizer + Random Forest:
Accuracy = 0.6
F-Score = 0.52


In [15]:
display(result.sort_values(by='FScore', ascending=False))

Unnamed: 0,Accuracy,FScore
CountVectorizer with uni-grams and bi-grams + Logistic Regression,0.67,0.62
CountVectorizer with uni-grams and bi-grams + SVM (Linear Kernel),0.66,0.62
CountVectorizer + Logistic Regression,0.66,0.61
TFIDFVectorizer + SVM (Linear Kernel),0.66,0.61
CountVectorizer + SVM (Linear Kernel),0.63,0.59
TFIDFVectorizer + Logistic Regression,0.65,0.57
CountVectorizer + Naïve Bayes Multinomial,0.62,0.52
CountVectorizer + Random Forest,0.6,0.52
CountVectorizer with uni-grams and bi-grams + Naïve Bayes Multinomial,0.63,0.5
TFIDFVectorizer + Naïve Bayes Multinomial,0.61,0.44
