In [1]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('E-c-En-train.csv')
df_test = pd.read_csv('E-c-En-dev.csv')

Number of Tweets per emotion

In [3]:
df_new = df.drop(['ID', 'Tweet'], axis=1)
counts = []
categories = list(df_new.columns.values)
for i in categories:
    counts.append((i, df[i].sum()))
df_stats = pd.DataFrame(counts, columns=['Emotion', 'no. of tweets'])
df_stats

Unnamed: 0,Emotion,no. of tweets
0,anger,2544
1,anticipation,978
2,disgust,2602
3,fear,1242
4,joy,2477
5,love,700
6,optimism,1984
7,pessimism,795
8,sadness,2008
9,surprise,361


In [4]:
print('Percentage of comments that are not labelled:')
print(len(df[(df['anger']==0) & (df['anticipation']==0) & (df['disgust']==0) & (df['fear']== 0) & (df['joy']==0) & (df['love']==0) & (df['optimism']==0) & (df['pessimism']==0) & (df['sadness']==0) & (df['surprise']==0) & (df['trust']==0)]) / len(df))

Percentage of comments that are not labelled:
0.029833284586136297


In [5]:
print('Percentage of comments that are not labelled:')
print(len(df_test[(df['anger']==0) & (df['anticipation']==0) & (df['disgust']==0) & (df['fear']== 0) & (df['joy']==0) & (df['love']==0) & (df['optimism']==0) & (df['pessimism']==0) & (df['sadness']==0) & (df['surprise']==0) & (df['trust']==0)]) / len(df))

Percentage of comments that are not labelled:
0.0038022813688212928


  


In [6]:
pd.set_option('display.max_colwidth', -1)
df['Tweet'].head()

0    “Worry is a down payment on a problem you may never have'.  Joyce Meyer.  #motivation #leadership #worry                        
1    Whatever you decide to do make sure it makes you #happy.                                                                        
2    @Max_Kellerman  it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS
3    Accept the challenges so that you can literally even feel the exhilaration of victory.' -- George S. Patton 🐶                   
4    My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs                              
Name: Tweet, dtype: object

In [7]:
#pip install emoji

In [8]:
import emoji

# Function for converting emojis into words
def convert_emojis(text):
    text = emoji.demojize(text)
    return text

In [9]:
df['Tweet'] = df['Tweet'].map(lambda com : convert_emojis(com))
df_test['Tweet'] = df_test['Tweet'].map(lambda com : convert_emojis(com))

In [10]:
def clean_text(text):
    #converts to lower case
    text = text.lower()
    # Remove all the special characters
    text = re.sub('\W', ' ', text)
    # removes numbers
    text = re.sub('\w*\d\w*', ' ', text) 
    # remove all single characters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # substitutes _ with a space
    text = re.sub('_', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub('\s+', ' ', text)
    # lemmatizer
    text = lemmatizer.lemmatize(text)
    return text

In [11]:
df['Tweet'] = df['Tweet'].map(lambda com : clean_text(com))
df_test['Tweet'] = df_test['Tweet'].map(lambda com : clean_text(com))

In [12]:
df['Tweet'][10:20]

10    making that yearly transition from excited and hopeful college returner to sick and exhausted pessimist college                                      
11    and it hard to dance with devil on your back nso shake him off                                                                                       
12    tiller and breezy should do collab album rapping and singing prolly be fire                                                                          
13     to the girl that just hit my car not only did she get lucky no scratch but also from being spared the wrath of sleep deprived kait upside down face 
14     bt uk broadband is shocking regretting signing up now angry shouldofgonewithvirgin                                                                  
15    people you need to look up the definition of protest what you are doing is not protesting is called vandalism angry stop                             
16     bitchesthecat look at those teef growl                   

In [13]:
# training and test data frames were used as is

train = df
test = df_test

In [14]:
X_train = train.Tweet
X_test = test.Tweet
print(X_train.shape)
print(X_test.shape)

(6838,)
(886,)


In [15]:
# create categories to iterate over
categories = ['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']

In [16]:
#created test DF with no values to save predicted probabilities for each emotion

test_df_no_values = pd.read_csv('book1.csv')

In [17]:
test_df_no_values[0:10]

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,"@RanaAyyub @rajnathsingh Oh, hidden revenge and anger...I rememberthe time,she rebutted you.",,,,,,,,,,,
1,I'm doing all this to make sure you smiling down on me bro,,,,,,,,,,,
2,if not then #teamchristine bc all tana has done is provoke her by tweeting shady shit and trying to be a hard bitch begging for a fight,,,,,,,,,,,
3,"It is a #great start for #beginners to jump into auto #trading. PROFITABLE FX EA will give you full support, manuals &amp; Team Viewer support.",,,,,,,,,,,
4,My best friends driving for the first time with me in the car #terrifying,,,,,,,,,,,
5,Hey @SuperValuIRL #Fields in #skibbereen give your online delivery service a horrible name. 1.5 hours late on the 1 hour delivery window.,,,,,,,,,,,
6,Why have #Emmerdale had to rob #robron of having their first child together for that vile woman/cheating sl smh #bitter,,,,,,,,,,,
7,@ThomasEWoods I would like to hear a podcast of you going off refuting her entire article. Extra indignation please.,,,,,,,,,,,
8,If I have to hear one more time how I am intimidate men... I'm going to explode! Why are guys these days so pussified?,,,,,,,,,,,
9,depression sucks😔,,,,,,,,,,,


In [18]:
#C-Support Vector Classification.
SVC_pipeline = Pipeline([
               ('tfidf', TfidfVectorizer(stop_words=stop_words, max_df = 0.8, max_features= 8000)),
    #integrated a calibrated classifier to enable predicted_proba
               ('clf', CalibratedClassifierCV(OneVsRestClassifier(LinearSVC(), n_jobs=1))),
           ])


for category in categories:
    print('Emotion: {}'.format(category))
    # train the model
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy: {}'.format(accuracy_score(test[category], prediction)))
    ### used predicted proba to save to no value data frame with same tweets and column headers
    y_proba = SVC_pipeline.predict_proba(X_test)[:,1]
    test_df_no_values[category] = y_proba

Emotion: anger
Test accuracy: 0.7945823927765236
Emotion: anticipation
Test accuracy: 0.8600451467268623
Emotion: disgust
Test accuracy: 0.7528216704288939
Emotion: fear
Test accuracy: 0.917607223476298
Emotion: joy
Test accuracy: 0.8002257336343115
Emotion: love
Test accuracy: 0.8826185101580135
Emotion: optimism
Test accuracy: 0.7629796839729119
Emotion: pessimism
Test accuracy: 0.8871331828442438
Emotion: sadness
Test accuracy: 0.7844243792325056
Emotion: surprise
Test accuracy: 0.9627539503386005
Emotion: trust
Test accuracy: 0.9514672686230248


In [19]:
#data frame with added probability values for each emotion
test_df_no_values.head()

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,"@RanaAyyub @rajnathsingh Oh, hidden revenge and anger...I rememberthe time,she rebutted you.",0.999561,0.053132,0.849845,0.025971,0.02644,0.016349,0.199909,0.044862,0.097224,0.015346,0.022889
1,I'm doing all this to make sure you smiling down on me bro,0.096949,0.140255,0.232004,0.122078,0.898798,0.265558,0.666418,0.071882,0.13789,0.016464,0.058567
2,if not then #teamchristine bc all tana has done is provoke her by tweeting shady shit and trying to be a hard bitch begging for a fight,0.931619,0.070235,0.829578,0.081811,0.034085,0.023745,0.070956,0.071858,0.323149,0.023503,0.021792
3,"It is a #great start for #beginners to jump into auto #trading. PROFITABLE FX EA will give you full support, manuals &amp; Team Viewer support.",0.189703,0.21155,0.35317,0.107946,0.447373,0.051785,0.425916,0.095243,0.196826,0.101851,0.110078
4,My best friends driving for the first time with me in the car #terrifying,0.246535,0.102123,0.339008,0.516463,0.485311,0.122349,0.526263,0.061957,0.358571,0.043337,0.044965


In [20]:
# created smaller unseen data set with no predicted values, only label columns included

df_unseen = pd.read_csv('unseen.csv')
df_unseen['Tweet'] = df_unseen['Tweet'].map(lambda com : convert_emojis(com))
df_unseen['Tweet'] = df_unseen['Tweet'].map(lambda com : clean_text(com))
df_unseen.head()

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,so ryanair site crashes everytime try to book how do they help tell me there nothing wrong amp hang up furious helpless simoncalder,,,,,,,,,,,
1,theme of week ask the lord for strength amp perspective to persevere in integrity and effort despite being disheartened amp disappointed,,,,,,,,,,,
2,why announcing so late it will be hard to make it from manchester and organising day off sad,,,,,,,,,,,
3,the greatest happiness is seeing someone you like stay happy daidouji tomoyo cardcaptor sakura,,,,,,,,,,,
4,omg so grateful to have an education but ive been back at school for two days and my back hurts im exhausted and breaking out already smiling face with heart eyes,,,,,,,,,,,


In [21]:
X_unseen = df_unseen.Tweet
print(X_unseen.shape)

(10,)


In [22]:
#svc pipeline training needed to be included to get actual predicted outcome from model

for category in categories:
    SVC_pipeline.fit(X_train, train[category])
    new_proba = SVC_pipeline.predict_proba(X_unseen)[:,1]
    df_unseen[category] = new_proba

In [23]:
df_unseen.head()

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,so ryanair site crashes everytime try to book how do they help tell me there nothing wrong amp hang up furious helpless simoncalder,0.713958,0.156236,0.520253,0.084982,0.0247,0.012279,0.145985,0.067478,0.272576,0.037676,0.048138
1,theme of week ask the lord for strength amp perspective to persevere in integrity and effort despite being disheartened amp disappointed,0.070732,0.153503,0.182371,0.299237,0.211023,0.015981,0.221913,0.32989,0.573303,0.026024,0.04106
2,why announcing so late it will be hard to make it from manchester and organising day off sad,0.059826,0.07331,0.183249,0.024354,0.028878,0.015422,0.082591,0.095248,0.883215,0.009733,0.020536
3,the greatest happiness is seeing someone you like stay happy daidouji tomoyo cardcaptor sakura,0.038929,0.078643,0.078637,0.012749,0.989245,0.451921,0.863532,0.048478,0.02955,0.033107,0.077852
4,omg so grateful to have an education but ive been back at school for two days and my back hurts im exhausted and breaking out already smiling face with heart eyes,0.088043,0.062213,0.12679,0.27689,0.721374,0.159173,0.447574,0.174978,0.440675,0.038531,0.034005
