In [2]:
#Basic Necessities
import numpy as np
import pandas as pd

# Warnings Handling
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Word Embeddings Packages
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim

#ML Models Packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score


In [3]:
# Read the cleaned train and test CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_clean_data.csv', 'data/test_clean_data.csv')
)

# Word Embeddings

## 1) Bag-Of-Words

In [4]:
# Count unigrams and bigrams (ngram_range=(1, 2)) in every row of train['clean_text'].
# NOTE(review): the vocabulary is learned from all of `train`, including the rows
# that are later held out for validation.
BOWvectorize = CountVectorizer(ngram_range=(1, 2))
BOW = BOWvectorize.fit_transform(raw_documents=train['clean_text'])

# (number of documents, number of distinct n-grams)
BOW.shape

(21465, 170317)

In [5]:
# Show the stored entries of the Bag-of-Words matrix as "(row, column)  count" lines.
print(BOW)

  (0, 54106)	1
  (0, 67550)	1
  (0, 65649)	1
  (0, 56761)	1
  (0, 23227)	1
  (0, 65329)	1
  (0, 127936)	1
  (0, 62341)	1
  (0, 54113)	1
  (0, 67604)	1
  (0, 65693)	1
  (0, 56949)	1
  (0, 23228)	1
  (0, 65369)	1
  (0, 127994)	1
  (1, 148325)	1
  (1, 160833)	1
  (1, 141313)	1
  (1, 133606)	1
  (1, 161747)	1
  (1, 119355)	1
  (1, 74202)	1
  (1, 34202)	1
  (1, 148328)	1
  (1, 160836)	1
  :	:
  (21463, 116380)	1
  (21463, 5483)	1
  (21463, 118861)	1
  (21463, 26507)	1
  (21463, 5518)	1
  (21463, 147700)	1
  (21463, 116419)	1
  (21463, 26533)	1
  (21463, 95501)	1
  (21463, 118915)	1
  (21464, 28520)	1
  (21464, 122123)	1
  (21464, 99995)	1
  (21464, 28998)	1
  (21464, 136306)	1
  (21464, 3367)	1
  (21464, 100906)	1
  (21464, 79099)	1
  (21464, 100907)	1
  (21464, 79105)	1
  (21464, 28554)	1
  (21464, 136307)	1
  (21464, 3384)	1
  (21464, 122141)	1
  (21464, 29018)	1


## 2) Tf-Idf Vectorization


In [6]:
# Tf-idf weights over the same unigram + bigram features as the Bag-of-Words matrix.
# NOTE(review): fitted on all of `train`, i.e. before the train/validation split.
TfidfVect = TfidfVectorizer(ngram_range=(1, 2))
Tfidf = TfidfVect.fit_transform(raw_documents=train['clean_text'])

# (number of documents, number of distinct n-grams)
Tfidf.shape

(21465, 170317)

In [7]:
# Show the stored entries of the Tf-idf matrix as "(row, column)  weight" lines.
print(Tfidf)

  (0, 127994)	0.29842822463566954
  (0, 65369)	0.327627713876495
  (0, 23228)	0.2733024087674719
  (0, 56949)	0.29842822463566954
  (0, 65693)	0.327627713876495
  (0, 67604)	0.327627713876495
  (0, 54113)	0.327627713876495
  (0, 62341)	0.12348560615740357
  (0, 127936)	0.18087449176009213
  (0, 65329)	0.22505161357746817
  (0, 23227)	0.26797888668209513
  (0, 56761)	0.09866264738979658
  (0, 65649)	0.18879222412955343
  (0, 67550)	0.17740562220321512
  (0, 54106)	0.2656172836883205
  (1, 74205)	0.3018339344008399
  (1, 119360)	0.3018339344008399
  (1, 162185)	0.3018339344008399
  (1, 133717)	0.3018339344008399
  (1, 141546)	0.3018339344008399
  (1, 160836)	0.3018339344008399
  (1, 148328)	0.2899302094955352
  (1, 34202)	0.17449889426855378
  (1, 74202)	0.24923110522601574
  (1, 119355)	0.24688119556975172
  :	:
  (21463, 95501)	0.37879722156641804
  (21463, 26533)	0.36385821900074816
  (21463, 116419)	0.3098323292391115
  (21463, 147700)	0.3326402802506787
  (21463, 5518)	0.30983232923

## 3) Word2Vec

In [8]:
# Split every cleaned tweet on whitespace: one list of tokens per document.
Tokenize_tweet = train['clean_text'].apply(lambda x: x.split())

# 200-dimensional skip-gram embeddings trained with negative sampling.
# sg must be 0 (CBOW) or 1 (skip-gram); the previous value 2 was outside the
# documented domain. hs=0 together with negative=10 selects negative sampling
# with 10 noise words per positive example.
# NOTE(review): with workers=2 the seed alone may not make training fully
# deterministic - confirm if exact reproducibility matters.
Model_W2V = gensim.models.Word2Vec(Tokenize_tweet, size=200,
                                   window=5,
                                   sg=1,
                                   hs=0,
                                   negative=10,
                                   workers=2,
                                   seed=34 )

# NOTE(review): gensim already trains when the corpus is passed to the
# constructor, so this presumably continues training for 20 more epochs -
# confirm that this is intended.
Model_W2V.train(Tokenize_tweet, total_examples=len(train['clean_text']), epochs=20)

(3911649, 4693080)

In [14]:
# Words closest to 'sad' in the embedding space, as (word, similarity) pairs.
# The lookup goes through the model's KeyedVectors (`.wv`); calling
# most_similar directly on the Word2Vec model is deprecated.
print(Model_W2V.wv.most_similar('sad'))

[('bitter', 0.458268940448761), ('endure', 0.4481304883956909), ('clever', 0.4398949444293976), ('esp', 0.43654775619506836), ('harrys', 0.4353716969490051), ('sore', 0.4351949691772461), ('gutted', 0.4263296127319336), ('babe', 0.42422252893447876), ('lool', 0.40090909600257874), ('incredibly', 0.39946436882019043)]


In [10]:
def word_vec(tokens, size, model=None):
    """Average the word vectors of ``tokens`` into one fixed-length feature row.

    Parameters
    ----------
    tokens : iterable of str
        The words of a single document.
    size : int
        Dimensionality of the word vectors; each vector is reshaped to
        ``(1, size)`` before being accumulated.
    model : object, optional
        Trained embedding model whose vectors are read via ``model.wv[word]``.
        When omitted, the notebook-level ``Model_W2V`` is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``: the mean vector of all tokens found in
        the vocabulary, or all zeros when no token is found.
    """
    if model is None:
        model = Model_W2V
    # Index the KeyedVectors directly; indexing the model object is deprecated.
    keyed_vectors = model.wv

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            vec += keyed_vectors[word].reshape((1, size))
        except KeyError:
            # Out-of-vocabulary token: it contributes nothing to the average.
            continue
        count += 1

    if count != 0:
        vec = vec / count
    return vec

In [15]:
# One 200-dimensional averaged Word2Vec vector per tweet.
Arrays = np.zeros((len(Tokenize_tweet), 200))

# Iterate positionally so the loop does not rely on the Series having a
# default 0..n-1 index (Tokenize_tweet[i] is a label-based lookup).
for row, tokens in enumerate(Tokenize_tweet):
    Arrays[row, :] = word_vec(tokens, Arrays.shape[1])

Arr_df = pd.DataFrame(Arrays)
Arr_df.shape

(21465, 200)

# Model Selection

## Split data into train and validation set

In [18]:
# Bag-of-Words: hold out 20% of the rows for validation.
# The Tf-idf and Word2Vec splits use the same test_size and random_state.
XTrain_Bow, XValid_Bow, YTrain, YValid = train_test_split(
    BOW,
    train['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [19]:
# Tf-idf: hold out 20% of the rows for validation (seed 42).
XTrain_tfidf, XValid_tfidf, YTrain1, YValid1 = train_test_split(
    Tfidf,
    train['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [20]:
# Word2Vec document vectors: hold out 20% of the rows for validation (seed 42).
XTrain_w2v, XValid_w2v, YTrain2, YValid2 = train_test_split(
    Arr_df,
    train['sentiment'],
    test_size=0.2,
    random_state=42,
)

## 1) Logistic Regression

In [48]:
# Bag-of-Words features
LR = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)
Prediction = LR.fit(XTrain_Bow, YTrain).predict(XValid_Bow)

# average=None reports one F1 value per class rather than a single aggregate.
f1_score(y_true=YValid, y_pred=Prediction, average=None)

array([0.45340502, 0.66204773, 0.70528967])

In [51]:
# Tf-idf
LR = LogisticRegression(solver='lbfgs', max_iter=500, multi_class='multinomial')
LR.fit(XTrain_tfidf, YTrain1)

Prediction = LR.predict(XValid_tfidf)

# Score against the validation labels produced by the Tf-idf split (YValid1),
# not the ones from the Bag-of-Words split.
f1_score(YValid1, Prediction, average=None)

array([0.32079646, 0.67004097, 0.70648174])

In [52]:
# Word2Vec
LR = LogisticRegression(solver='lbfgs', max_iter=500, multi_class='multinomial')
LR.fit(XTrain_w2v, YTrain2)

Prediction = LR.predict(XValid_w2v)

# Score against the validation labels produced by the Word2Vec split (YValid2),
# not the ones from the Bag-of-Words split.
f1_score(YValid2, Prediction, average=None)

array([0.40687161, 0.60505529, 0.64258555])

## 2) Support Vector Machine (SVM)


In [37]:
# BOW
# Linear-kernel SVM with C=1. probability=True is intentionally not set: the
# cells shown here only call predict(), and enabling probability estimates
# makes fit() run an extra internal cross-validation that is never used.
svc = svm.SVC(kernel='linear', C=1)
svc.fit(XTrain_Bow, YTrain)

Prediction = svc.predict(XValid_Bow)

# Per-class F1 on the held-out Bag-of-Words rows.
f1_score(YValid, Prediction, average=None)

array([0.4730832 , 0.65224192, 0.7014756 ])

In [38]:
# Tf-idf features: refit the existing `svc` estimator, then predict the validation rows.
Prediction = svc.fit(XTrain_tfidf, YTrain1).predict(XValid_tfidf)

# Per-class F1 on the Tf-idf validation split.
f1_score(y_true=YValid1, y_pred=Prediction, average=None)

array([0.39324727, 0.66618182, 0.69484655])

In [39]:
# Word2Vec features: refit the existing `svc` estimator, then predict the validation rows.
Prediction = svc.fit(XTrain_w2v, YTrain2).predict(XValid_w2v)

# Per-class F1 on the Word2Vec validation split.
f1_score(y_true=YValid2, y_pred=Prediction, average=None)

array([0.35568513, 0.6086736 , 0.63586054])

## 3) Decision Tree

In [95]:
# Bag-of-Words features: depth-limited decision tree with a fixed seed.
dt = DecisionTreeClassifier(max_depth=6, random_state=0).fit(XTrain_Bow, YTrain)
Prediction = dt.predict(XValid_Bow)

# Per-class F1 on the Bag-of-Words validation split.
f1_score(y_true=YValid, y_pred=Prediction, average=None)

array([0.32180209, 0.62405554, 0.43172527])

In [96]:
# Tf-idf
dt = DecisionTreeClassifier(random_state=0, max_depth=6)

dt.fit(XTrain_tfidf, YTrain1)

Prediction = dt.predict(XValid_tfidf)
# Score against the validation labels produced by the Tf-idf split (YValid1).
f1_score(YValid1, Prediction, average=None)

array([0.32045089, 0.62311762, 0.42304527])

In [97]:
# Word2vec
dt = DecisionTreeClassifier(random_state=0, max_depth=6)

dt.fit(XTrain_w2v, YTrain2)

Prediction = dt.predict(XValid_w2v)
# Score against the validation labels produced by the Word2Vec split (YValid2).
f1_score(YValid2, Prediction, average=None)

array([0.22494888, 0.52233316, 0.54076802])

## 4) Random Forest

In [41]:
# Bag-of-Words features: forest of 400 trees with a fixed seed.
rf = RandomForestClassifier(random_state=11, n_estimators=400).fit(XTrain_Bow, YTrain)
Prediction = rf.predict(XValid_Bow)

# Per-class F1 on the Bag-of-Words validation split.
f1_score(y_true=YValid, y_pred=Prediction, average=None)

array([0.32      , 0.67305389, 0.69438906])

In [42]:
# Tf-idf features: refit the existing `rf` forest, then predict the validation rows.
Prediction = rf.fit(XTrain_tfidf, YTrain1).predict(XValid_tfidf)

# Per-class F1 on the Tf-idf validation split.
f1_score(y_true=YValid1, y_pred=Prediction, average=None)

array([0.31116122, 0.66951567, 0.68769716])

In [43]:
# Word2Vec features: refit the existing `rf` forest, then predict the validation rows.
Prediction = rf.fit(XTrain_w2v, YTrain2).predict(XValid_w2v)

# Per-class F1 on the Word2Vec validation split.
f1_score(y_true=YValid2, y_pred=Prediction, average=None)

array([0.12516644, 0.60774578, 0.63829787])

## 5) Bernoulli Naive Bayes 

In [79]:
# Bag-of-Words features: Bernoulli Naive Bayes with its default settings.
BNB = BernoulliNB().fit(XTrain_Bow, YTrain)
Prediction = BNB.predict(XValid_Bow)

# Per-class F1 on the Bag-of-Words validation split.
f1_score(y_true=YValid, y_pred=Prediction, average=None)

array([0.        , 0.62094434, 0.68157511])

In [80]:
# Tf-idf features: Bernoulli Naive Bayes with its default settings.
BNB = BernoulliNB().fit(XTrain_tfidf, YTrain1)
Prediction = BNB.predict(XValid_tfidf)

# Per-class F1 on the Tf-idf validation split.
f1_score(y_true=YValid1, y_pred=Prediction, average=None)

array([0.        , 0.62094434, 0.68157511])

In [81]:
# Word2Vec features: Bernoulli Naive Bayes with its default settings.
BNB = BernoulliNB().fit(XTrain_w2v, YTrain2)
Prediction = BNB.predict(XValid_w2v)

# Per-class F1 on the Word2Vec validation split.
f1_score(y_true=YValid2, y_pred=Prediction, average=None)

array([0.41035857, 0.54828481, 0.5959568 ])

## 6) Multinomial Naive Bayes

In [83]:
# Bag-of-Words features: Multinomial Naive Bayes with its default settings.
MNB = MultinomialNB().fit(XTrain_Bow, YTrain)
Prediction = MNB.predict(XValid_Bow)

# Per-class F1 on the Bag-of-Words validation split.
f1_score(y_true=YValid, y_pred=Prediction, average=None)

array([0.367428 , 0.6105919, 0.6916996])

In [84]:
# Tf-idf features: Multinomial Naive Bayes with its default settings.
MNB = MultinomialNB().fit(XTrain_tfidf, YTrain1)
Prediction = MNB.predict(XValid_tfidf)

# Per-class F1 on the Tf-idf validation split.
f1_score(y_true=YValid1, y_pred=Prediction, average=None)

array([0.        , 0.61347038, 0.68344811])

In [21]:
# Word2Vec
# Multinomial Naive Bayes cannot be trained on the Word2Vec features:
# it does not accept negative input values, and the Word2Vec document vectors contain negative values.

## 7) SGD Classification

In [99]:
# BOW
# SGD training is stochastic; fixing random_state makes the reported scores
# reproducible from one run to the next.
SGD = SGDClassifier(random_state=42)
SGD.fit(XTrain_Bow, YTrain)
Prediction = SGD.predict(XValid_Bow)

# Per-class F1 on the Bag-of-Words validation split.
f1_score(YValid, Prediction, average=None)

array([0.4461671 , 0.65331599, 0.70167598])

In [100]:
# Tf-idf
# SGD training is stochastic; fixing random_state makes the reported scores
# reproducible from one run to the next.
SGD = SGDClassifier(random_state=42)
SGD.fit(XTrain_tfidf, YTrain1)
Prediction = SGD.predict(XValid_tfidf)

# Per-class F1 on the Tf-idf validation split.
f1_score(YValid1, Prediction, average=None)

array([0.35791757, 0.67081936, 0.69226361])

In [101]:
# Word2Vec
# SGD training is stochastic; fixing random_state makes the reported scores
# reproducible from one run to the next.
SGD = SGDClassifier(random_state=42)
SGD.fit(XTrain_w2v, YTrain2)
Prediction = SGD.predict(XValid_w2v)

# Per-class F1 on the Word2Vec validation split.
f1_score(YValid2, Prediction, average=None)

array([0.20754717, 0.60731645, 0.63985375])

## 8) Extreme Gradient Boosting (XGBoost)

In [53]:
# BOW
# The keyword is `max_depth` (lower case); the previous spelling `Max_depth`
# is not an XGBClassifier parameter, so the tree depth was never actually set.
xgbModel = XGBClassifier(max_depth=6, n_estimators=1000)

xgbModel.fit(XTrain_Bow, YTrain)

Prediction = xgbModel.predict(XValid_Bow)
# Per-class F1 on the Bag-of-Words validation split.
f1_score(YValid, Prediction, average=None)


array([0.44026341, 0.68325041, 0.69351908])

In [54]:
# Tf-idf
# Build the classifier in this cell with the correctly spelled `max_depth`,
# instead of refitting a model object left over from another cell.
xgbModel = XGBClassifier(max_depth=6, n_estimators=1000)
xgbModel.fit(XTrain_tfidf, YTrain1)

Prediction = xgbModel.predict(XValid_tfidf)
# Per-class F1 on the Tf-idf validation split.
f1_score(YValid1, Prediction, average=None)


array([0.41570881, 0.67750294, 0.68789809])

In [55]:
# Word2Vec
# The keyword is `max_depth` (lower case); the previous spelling `Max_depth`
# is not an XGBClassifier parameter, so the tree depth was never actually set.
xgbModel = XGBClassifier(max_depth=6, n_estimators=1000, nthread=3)

xgbModel.fit(XTrain_w2v, YTrain2)

Prediction = xgbModel.predict(XValid_w2v)
# Per-class F1 on the Word2Vec validation split.
f1_score(YValid2, Prediction, average=None)


array([0.37981651, 0.60294892, 0.64250946])

# Up Sampling - Logistic Regression

In [22]:
# BOW
# Oversample the training portion only; the validation rows stay untouched.
# SMOTE generates synthetic samples at random, so seed it to make the
# resampled set (and therefore the scores) reproducible.
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(XTrain_Bow, YTrain)
LR = LogisticRegression(solver='lbfgs', max_iter=500, multi_class='multinomial')
LR.fit(X, y)

Prediction = LR.predict(XValid_Bow)

# Per-class F1 on the Bag-of-Words validation split.
f1_score(YValid, Prediction, average=None)

array([0.44369521, 0.61478815, 0.69931271])

In [23]:
# Tf-idf
# SMOTE generates synthetic samples at random, so seed it to make the
# resampled set (and therefore the scores) reproducible.
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(XTrain_tfidf, YTrain1)

# Build the classifier here so the cell does not depend on an `LR` object
# left over from whichever cell happened to run last.
LR = LogisticRegression(solver='lbfgs', max_iter=500, multi_class='multinomial')
LR.fit(X, y)

Prediction = LR.predict(XValid_tfidf)

# Per-class F1 on the Tf-idf validation split.
f1_score(YValid1, Prediction, average=None)

array([0.48173005, 0.66061706, 0.69716647])

In [24]:
# Word2vec
# SMOTE generates synthetic samples at random, so seed it to make the
# resampled set (and therefore the scores) reproducible.
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(XTrain_w2v, YTrain2)

# Build the classifier here so the cell does not depend on an `LR` object
# left over from whichever cell happened to run last.
LR = LogisticRegression(solver='lbfgs', max_iter=500, multi_class='multinomial')
LR.fit(X, y)

Prediction = LR.predict(XValid_w2v)

# Per-class F1 on the Word2Vec validation split.
f1_score(YValid2, Prediction, average=None)

array([0.47065217, 0.54589372, 0.62609202])