In [1]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('E-c-En-train.csv')
df_test = pd.read_csv('E-c-En-dev.csv')

Number of Tweets per emotion

In [3]:
df_new = df.drop(['ID', 'Tweet'], axis=1)
counts = []
categories = list(df_new.columns.values)
for i in categories:
    counts.append((i, df[i].sum()))
df_stats = pd.DataFrame(counts, columns=['Emotion', 'no. of tweets'])
df_stats

Unnamed: 0,Emotion,no. of tweets
0,anger,2544
1,anticipation,978
2,disgust,2602
3,fear,1242
4,joy,2477
5,love,700
6,optimism,1984
7,pessimism,795
8,sadness,2008
9,surprise,361


In [4]:
print('Percentage of comments that are not labelled:')
print(len(df[(df['anger']==0) & (df['anticipation']==0) & (df['disgust']==0) & (df['fear']== 0) & (df['joy']==0) & (df['love']==0) & (df['optimism']==0) & (df['pessimism']==0) & (df['sadness']==0) & (df['surprise']==0) & (df['trust']==0)]) / len(df))

Percentage of comments that are not labelled:
0.029833284586136297


In [5]:
print('Percentage of comments that are not labelled:')
print(len(df_test[(df_test['anger']==0) & (df_test['anticipation']==0) & (df_test['disgust']==0) & (df_test['fear']== 0) & (df_test['joy']==0) & (df_test['love']==0) & (df_test['optimism']==0) & (df_test['pessimism']==0) & (df_test['sadness']==0) & (df_test['surprise']==0) & (df_test['trust']==0)]) / len(df_test))

Percentage of comments that are not labelled:
0.01580135440180587


In [6]:
pd.set_option('display.max_colwidth', -1)
df['Tweet'].head()

0    “Worry is a down payment on a problem you may never have'.  Joyce Meyer.  #motivation #leadership #worry                        
1    Whatever you decide to do make sure it makes you #happy.                                                                        
2    @Max_Kellerman  it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS
3    Accept the challenges so that you can literally even feel the exhilaration of victory.' -- George S. Patton 🐶                   
4    My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs                              
Name: Tweet, dtype: object

In [7]:
#pip install emoji

In [8]:
import emoji

# Function for converting emojis into words
def convert_emojis(text):
    text = emoji.demojize(text)
    return text

In [9]:
df['Tweet'] = df['Tweet'].map(lambda com : convert_emojis(com))
df_test['Tweet'] = df_test['Tweet'].map(lambda com : convert_emojis(com))

In [10]:
def clean_text(text):
    #converts to lower case
    text = text.lower()
    # Remove all the special characters
    text = re.sub('\W', ' ', text)
    # removes numbers
    text = re.sub('\w*\d\w*', ' ', text) 
    # remove all single characters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # substitutes _ with a space
    text = re.sub('_', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub('\s+', ' ', text)
    #removes blank space at start of text
    text = text.strip(' ')
    # lemmatizer
    text = lemmatizer.lemmatize(text)
    return text

In [11]:
df['Tweet'] = df['Tweet'].map(lambda com : clean_text(com))
df_test['Tweet'] = df_test['Tweet'].map(lambda com : clean_text(com))

In [12]:
df['Tweet'][10:20]

10    making that yearly transition from excited and hopeful college returner to sick and exhausted pessimist college                                    
11    and it hard to dance with devil on your back nso shake him off                                                                                     
12    tiller and breezy should do collab album rapping and singing prolly be fire                                                                        
13    to the girl that just hit my car not only did she get lucky no scratch but also from being spared the wrath of sleep deprived kait upside down face
14    bt uk broadband is shocking regretting signing up now angry shouldofgonewithvirgin                                                                 
15    people you need to look up the definition of protest what you are doing is not protesting is called vandalism angry stop                           
16    bitchesthecat look at those teef growl                                

In [13]:
# training and test data frames were used as is

train = df
test = df_test

In [14]:
X_train = train.Tweet
X_test = test.Tweet
print(X_train.shape)
print(X_test.shape)

(6838,)
(886,)


In [15]:
# create categories to iterate over
categories = ['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']

In [16]:
#created test DF with no values to save predicted probabilities for each emotion

test_df_no_values = pd.read_csv('Test_English_no_values.csv')

In [17]:
test_df_no_values[0:10]

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,"@RanaAyyub @rajnathsingh Oh, hidden revenge and anger...I rememberthe time,she rebutted you.",,,,,,,,,,,
1,I'm doing all this to make sure you smiling down on me bro,,,,,,,,,,,
2,if not then #teamchristine bc all tana has done is provoke her by tweeting shady shit and trying to be a hard bitch begging for a fight,,,,,,,,,,,
3,"It is a #great start for #beginners to jump into auto #trading. PROFITABLE FX EA will give you full support, manuals &amp; Team Viewer support.",,,,,,,,,,,
4,My best friends driving for the first time with me in the car #terrifying,,,,,,,,,,,
5,Hey @SuperValuIRL #Fields in #skibbereen give your online delivery service a horrible name. 1.5 hours late on the 1 hour delivery window.,,,,,,,,,,,
6,Why have #Emmerdale had to rob #robron of having their first child together for that vile woman/cheating sl smh #bitter,,,,,,,,,,,
7,@ThomasEWoods I would like to hear a podcast of you going off refuting her entire article. Extra indignation please.,,,,,,,,,,,
8,If I have to hear one more time how I am intimidate men... I'm going to explode! Why are guys these days so pussified?,,,,,,,,,,,
9,depression sucks😔,,,,,,,,,,,


In [18]:
#C-Support Vector Classification.
SVC_pipeline = Pipeline([
               ('tfidf', TfidfVectorizer(stop_words=stop_words, max_df = 0.8)),
    #integrated a calibrated classifier to enable predicted_proba
               ('clf', CalibratedClassifierCV(LinearSVC())),
           ])


for category in categories:
    print('Emotion: {}'.format(category))
    # train the model
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy: {}'.format(accuracy_score(test[category], prediction)))
    ### used predicted proba to save to no value data frame with same tweets and column headers
    y_proba = SVC_pipeline.predict_proba(X_test)[:,1]
    test_df_no_values[category] = y_proba

Emotion: anger
Test accuracy: 0.7957110609480813
Emotion: anticipation
Test accuracy: 0.8577878103837472
Emotion: disgust
Test accuracy: 0.7584650112866818
Emotion: fear
Test accuracy: 0.9209932279909706
Emotion: joy
Test accuracy: 0.7979683972911964
Emotion: love
Test accuracy: 0.8893905191873589
Emotion: optimism
Test accuracy: 0.7663656884875847
Emotion: pessimism
Test accuracy: 0.8860045146726863
Emotion: sadness
Test accuracy: 0.7900677200902935
Emotion: surprise
Test accuracy: 0.9627539503386005
Emotion: trust
Test accuracy: 0.9514672686230248


In [31]:
#data frame with added probability values for each emotion
test_df_no_values.head(10)

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,"@RanaAyyub @rajnathsingh Oh, hidden revenge and anger...I rememberthe time,she rebutted you.",0.999821,0.045523,0.888137,0.017862,0.023099,0.01503,0.212767,0.049603,0.078998,0.018255,0.020512
1,I'm doing all this to make sure you smiling down on me bro,0.069404,0.133957,0.186289,0.079062,0.924498,0.2969,0.742139,0.066338,0.155026,0.018939,0.062199
2,if not then #teamchristine bc all tana has done is provoke her by tweeting shady shit and trying to be a hard bitch begging for a fight,0.94242,0.06886,0.829731,0.083766,0.038364,0.025993,0.044811,0.081486,0.317104,0.0239,0.02455
3,"It is a #great start for #beginners to jump into auto #trading. PROFITABLE FX EA will give you full support, manuals &amp; Team Viewer support.",0.22904,0.209044,0.372217,0.090401,0.446502,0.0499,0.432084,0.081376,0.171432,0.087032,0.094813
4,My best friends driving for the first time with me in the car #terrifying,0.190758,0.098758,0.3283,0.536408,0.498999,0.08863,0.526639,0.059442,0.400151,0.043139,0.044164
5,Hey @SuperValuIRL #Fields in #skibbereen give your online delivery service a horrible name. 1.5 hours late on the 1 hour delivery window.,0.886056,0.185273,0.878423,0.334658,0.141843,0.017722,0.101465,0.034676,0.295737,0.039641,0.04169
6,Why have #Emmerdale had to rob #robron of having their first child together for that vile woman/cheating sl smh #bitter,0.856661,0.132491,0.829636,0.090743,0.123824,0.048926,0.107893,0.139817,0.464447,0.024838,0.042761
7,@ThomasEWoods I would like to hear a podcast of you going off refuting her entire article. Extra indignation please.,0.356396,0.328359,0.213695,0.059089,0.124786,0.027806,0.387332,0.052078,0.178995,0.078454,0.039317
8,If I have to hear one more time how I am intimidate men... I'm going to explode! Why are guys these days so pussified?,0.723773,0.190538,0.327256,0.012396,0.258969,0.031416,0.114956,0.152359,0.363802,0.03406,0.033872
9,depression sucks😔,0.22951,0.056486,0.221747,0.157429,0.045045,0.031947,0.073581,0.208979,0.961885,0.028509,0.027324


In [20]:
from sklearn.naive_bayes import MultinomialNB

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', MultinomialNB(fit_prior=True, class_prior=None)),
            ])

for category in categories:
    print('Emotion {}'.format(category))
    NB_pipeline.fit(X_train, train[category])    
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

Emotion anger
Test accuracy is 0.7437923250564334
Emotion anticipation
Test accuracy is 0.8600451467268623
Emotion disgust
Test accuracy is 0.7110609480812641
Emotion fear
Test accuracy is 0.873589164785553
Emotion joy
Test accuracy is 0.7291196388261851
Emotion love
Test accuracy is 0.8510158013544018
Emotion optimism
Test accuracy is 0.6805869074492099
Emotion pessimism
Test accuracy is 0.8882618510158014
Emotion sadness
Test accuracy is 0.7313769751693002
Emotion surprise
Test accuracy is 0.9604966139954854
Emotion trust
Test accuracy is 0.9514672686230248


In [21]:
from sklearn.linear_model import LogisticRegression

LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', LogisticRegression(solver='sag')),
            ])

for category in categories:
    print('Emotion {}'.format(category))    
    LogReg_pipeline.fit(X_train, train[category])    
    prediction = LogReg_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

Emotion anger
Test accuracy is 0.7799097065462754
Emotion anticipation
Test accuracy is 0.8589164785553047
Emotion disgust
Test accuracy is 0.7426636568848759
Emotion fear
Test accuracy is 0.8939051918735892
Emotion joy
Test accuracy is 0.7799097065462754
Emotion love
Test accuracy is 0.8781038374717833
Emotion optimism
Test accuracy is 0.7528216704288939
Emotion pessimism
Test accuracy is 0.8848758465011287
Emotion sadness
Test accuracy is 0.7753950338600452
Emotion surprise
Test accuracy is 0.9627539503386005
Emotion trust
Test accuracy is 0.9514672686230248


In [22]:
from xgboost import XGBClassifier

xgboost_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', XGBClassifier()),
            ])

for category in categories:
    print('Emotion {}'.format(category))    
    xgboost_pipeline.fit(X_train, train[category])    
    prediction = xgboost_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

Emotion anger
Test accuracy is 0.7325056433408578
Emotion anticipation
Test accuracy is 0.8566591422121896
Emotion disgust
Test accuracy is 0.6952595936794582
Emotion fear
Test accuracy is 0.9063205417607223
Emotion joy
Test accuracy is 0.6941309255079007
Emotion love
Test accuracy is 0.8837471783295711
Emotion optimism
Test accuracy is 0.7234762979683973
Emotion pessimism
Test accuracy is 0.8871331828442438
Emotion sadness
Test accuracy is 0.7799097065462754
Emotion surprise
Test accuracy is 0.9627539503386005
Emotion trust
Test accuracy is 0.9514672686230248


In [23]:
from sklearn.svm import SVC

SVC_Nonlinear_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', SVC()),
            ])

for category in categories:
    print('Emotion {}'.format(category))    
    SVC_Nonlinear_pipeline.fit(X_train, train[category])    
    prediction = SVC_Nonlinear_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

Emotion anger
Test accuracy is 0.7708803611738149
Emotion anticipation
Test accuracy is 0.8589164785553047
Emotion disgust
Test accuracy is 0.7302483069977427
Emotion fear
Test accuracy is 0.9006772009029346
Emotion joy
Test accuracy is 0.7674943566591422
Emotion love
Test accuracy is 0.881489841986456
Emotion optimism
Test accuracy is 0.7483069977426636
Emotion pessimism
Test accuracy is 0.8871331828442438
Emotion sadness
Test accuracy is 0.7742663656884876
Emotion surprise
Test accuracy is 0.9627539503386005
Emotion trust
Test accuracy is 0.9514672686230248


In [24]:
from sklearn.neural_network import MLPClassifier

#Multi-layer Perceptron classifier
#This model optimizes the log-loss function using LBFGS or stochastic gradient descent.
#default solver is ADAM 

NN_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', MLPClassifier(hidden_layer_sizes = (100), learning_rate_init= 0.005, 
                                      learning_rate = 'adaptive', n_iter_no_change= 4, 
                                      verbose=True, early_stopping=True)),
            ])

for category in categories:
    print('Emotion {}'.format(category))    
    NN_pipeline.fit(X_train, train[category])    
    prediction = NN_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

Emotion anger
Iteration 1, loss = 0.59716831
Validation score: 0.786550
Iteration 2, loss = 0.28167502
Validation score: 0.780702
Iteration 3, loss = 0.12490617
Validation score: 0.770468
Iteration 4, loss = 0.08120362
Validation score: 0.769006
Iteration 5, loss = 0.06364976
Validation score: 0.763158
Iteration 6, loss = 0.05650738
Validation score: 0.769006
Validation score did not improve more than tol=0.000100 for 4 consecutive epochs. Stopping.
Test accuracy is 0.7629796839729119
Emotion anticipation
Iteration 1, loss = 0.48153683
Validation score: 0.856725
Iteration 2, loss = 0.29788813
Validation score: 0.856725
Iteration 3, loss = 0.20282777
Validation score: 0.842105
Iteration 4, loss = 0.13848714
Validation score: 0.830409
Iteration 5, loss = 0.08902768
Validation score: 0.817251
Iteration 6, loss = 0.06654522
Validation score: 0.817251
Validation score did not improve more than tol=0.000100 for 4 consecutive epochs. Stopping.
Test accuracy is 0.8600451467268623
Emotion disgu

In [25]:
from sklearn.neighbors import KNeighborsClassifier

KNN_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', KNeighborsClassifier()),
            ])

for category in categories:
    print('Emotion {}'.format(category))    
    KNN_pipeline.fit(X_train, train[category])    
    prediction = KNN_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

Emotion anger
Test accuracy is 0.7234762979683973
Emotion anticipation
Test accuracy is 0.8498871331828443
Emotion disgust
Test accuracy is 0.698645598194131
Emotion fear
Test accuracy is 0.8566591422121896
Emotion joy
Test accuracy is 0.718961625282167
Emotion love
Test accuracy is 0.8724604966139955
Emotion optimism
Test accuracy is 0.6896162528216704
Emotion pessimism
Test accuracy is 0.8724604966139955
Emotion sadness
Test accuracy is 0.7302483069977427
Emotion surprise
Test accuracy is 0.9616252821670429
Emotion trust
Test accuracy is 0.9503386004514672


In [26]:
from sklearn.ensemble import RandomForestClassifier

RF_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', RandomForestClassifier()),
            ])

for category in categories:
    print('Emotion {}'.format(category))    
    RF_pipeline.fit(X_train, train[category])    
    prediction = RF_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

Emotion anger
Test accuracy is 0.7799097065462754
Emotion anticipation
Test accuracy is 0.8611738148984198
Emotion disgust
Test accuracy is 0.7505643340857788
Emotion fear
Test accuracy is 0.9209932279909706
Emotion joy
Test accuracy is 0.7821670428893905
Emotion love
Test accuracy is 0.8792325056433409
Emotion optimism
Test accuracy is 0.7279909706546276
Emotion pessimism
Test accuracy is 0.8871331828442438
Emotion sadness
Test accuracy is 0.7821670428893905
Emotion surprise
Test accuracy is 0.9627539503386005
Emotion trust
Test accuracy is 0.9514672686230248


In [27]:
# created smaller unseen data set with no predicted values, only label columns included

df_unseen = pd.read_csv('unseen.csv')
df_unseen['Tweet'] = df_unseen['Tweet'].map(lambda com : convert_emojis(com))
df_unseen['Tweet'] = df_unseen['Tweet'].map(lambda com : clean_text(com))
df_unseen.head()

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,so ryanair site crashes everytime try to book how do they help tell me there nothing wrong amp hang up furious helpless simoncalder,,,,,,,,,,,
1,theme of week ask the lord for strength amp perspective to persevere in integrity and effort despite being disheartened amp disappointed,,,,,,,,,,,
2,why announcing so late it will be hard to make it from manchester and organising day off sad,,,,,,,,,,,
3,the greatest happiness is seeing someone you like stay happy daidouji tomoyo cardcaptor sakura,,,,,,,,,,,
4,omg so grateful to have an education but ive been back at school for two days and my back hurts im exhausted and breaking out already smiling face with heart eyes,,,,,,,,,,,


In [28]:
X_unseen = df_unseen.Tweet
print(X_unseen.shape)

(10,)


In [29]:
#svc pipeline training needed to be included to get actual predicted outcome from model

for category in categories:
    SVC_pipeline.fit(X_train, train[category])
    new_proba = SVC_pipeline.predict_proba(X_unseen)[:,1]
    df_unseen[category] = new_proba

In [30]:
df_unseen.head(10)

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,so ryanair site crashes everytime try to book how do they help tell me there nothing wrong amp hang up furious helpless simoncalder,0.687843,0.160639,0.541247,0.076307,0.026194,0.02283,0.136074,0.067019,0.237131,0.040238,0.045809
1,theme of week ask the lord for strength amp perspective to persevere in integrity and effort despite being disheartened amp disappointed,0.060513,0.126585,0.158127,0.39412,0.210805,0.019894,0.207492,0.407167,0.590876,0.027548,0.045527
2,why announcing so late it will be hard to make it from manchester and organising day off sad,0.042662,0.070726,0.148653,0.016435,0.015671,0.014715,0.075524,0.107883,0.915479,0.012496,0.020998
3,the greatest happiness is seeing someone you like stay happy daidouji tomoyo cardcaptor sakura,0.038587,0.07673,0.079866,0.010128,0.992182,0.40995,0.888868,0.0475,0.02909,0.03391,0.071317
4,omg so grateful to have an education but ive been back at school for two days and my back hurts im exhausted and breaking out already smiling face with heart eyes,0.07481,0.062798,0.100174,0.323614,0.763688,0.166712,0.486084,0.192444,0.383971,0.037798,0.036075
5,because of your smile you make the life more beautiful,0.010375,0.093586,0.034155,0.009071,0.992907,0.749881,0.962362,0.041675,0.137449,0.030407,0.081258
6,mashable for some reason this has filled me with delight see auntie laugh,0.061592,0.160958,0.031709,0.016611,0.992198,0.33325,0.572994,0.114474,0.078024,0.101171,0.049732
7,lovemyffajacket facetime we can still annoy you face with tears of joy,0.210157,0.093034,0.169168,0.169506,0.726335,0.142944,0.218715,0.092466,0.212134,0.041595,0.041358
8,and shouldve cut them off the moment started hurting myself over them,0.54162,0.094003,0.149242,0.226306,0.126204,0.086353,0.119188,0.254522,0.438933,0.026243,0.027519
9,vesciodiana you forgot laughter as well red heart red heart red heart,0.052277,0.152533,0.02264,0.059902,0.977271,0.87941,0.392351,0.037617,0.167348,0.034842,0.053683
