In [1]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('E-c-En-train.csv')
df_test = pd.read_csv('E-c-En-dev.csv')

Number of Tweets per emotion

In [3]:
df_new = df.drop(['ID', 'Tweet'], axis=1)
counts = []
categories = list(df_new.columns.values)
for i in categories:
    counts.append((i, df[i].sum()))
df_stats = pd.DataFrame(counts, columns=['Emotion', 'no. of tweets'])
df_stats

Unnamed: 0,Emotion,no. of tweets
0,anger,2544
1,anticipation,978
2,disgust,2602
3,fear,1242
4,joy,2477
5,love,700
6,optimism,1984
7,pessimism,795
8,sadness,2008
9,surprise,361


In [4]:
print('Percentage of comments that are not labelled:')
print(len(df[(df['anger']==0) & (df['anticipation']==0) & (df['disgust']==0) & (df['fear']== 0) & (df['joy']==0) & (df['love']==0) & (df['optimism']==0) & (df['pessimism']==0) & (df['sadness']==0) & (df['surprise']==0) & (df['trust']==0)]) / len(df))

Percentage of comments that are not labelled:
0.029833284586136297


In [5]:
print('Percentage of comments that are not labelled:')
print(len(df_test[(df['anger']==0) & (df['anticipation']==0) & (df['disgust']==0) & (df['fear']== 0) & (df['joy']==0) & (df['love']==0) & (df['optimism']==0) & (df['pessimism']==0) & (df['sadness']==0) & (df['surprise']==0) & (df['trust']==0)]) / len(df))

Percentage of comments that are not labelled:
0.0038022813688212928


  


In [6]:
pd.set_option('display.max_colwidth', -1)
df['Tweet'].head()

0    “Worry is a down payment on a problem you may never have'.  Joyce Meyer.  #motivation #leadership #worry                        
1    Whatever you decide to do make sure it makes you #happy.                                                                        
2    @Max_Kellerman  it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS
3    Accept the challenges so that you can literally even feel the exhilaration of victory.' -- George S. Patton 🐶                   
4    My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs                              
Name: Tweet, dtype: object

In [7]:
#pip install emoji

In [8]:
import emoji

# Function for converting emojis into words
def convert_emojis(text):
    text = emoji.demojize(text)
    return text

In [9]:
df['Tweet'] = df['Tweet'].map(lambda com : convert_emojis(com))
df_test['Tweet'] = df_test['Tweet'].map(lambda com : convert_emojis(com))

In [10]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\W', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text) 
    text = re.sub('\s+', ' ', text)
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = re.sub('_', ' ', text)
    text = text.strip(' ')
    text = lemmatizer.lemmatize(text)
    return text

In [11]:
df['Tweet'] = df['Tweet'].map(lambda com : clean_text(com))
df_test['Tweet'] = df_test['Tweet'].map(lambda com : clean_text(com))

In [12]:
df['Tweet'][10:20]

10    making that yearly transition from excited and hopeful college returner to sick and exhausted pessimist college                                     
11    and it  hard to dance with  devil on your back nso shake him off                                                                                    
12    tiller and breezy should do  collab album rapping and singing prolly be fire                                                                        
13    to the girl that just hit my car not only did she get lucky  no scratch but also from being spared the wrath of sleep deprived kait upside down face
14    bt uk broadband is shocking regretting signing up now angry shouldofgonewithvirgin                                                                  
15    people you need to look up the definition of protest what you are doing is not protesting is called vandalism angry stop                            
16    bitchesthecat look at those teef growl                          

In [13]:
train = df
test = df_test

In [14]:
X_train = train.Tweet
X_test = test.Tweet
print(X_train.shape)
print(X_test.shape)

(6838,)
(886,)


In [15]:
categories = ['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']

In [16]:
#C-Support Vector Classification.
SVC_pipeline = Pipeline([
               ('tfidf', TfidfVectorizer(stop_words=stop_words, ngram_range = (1, 2))),
               ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

for category in categories:
    print('Emotion: {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy: {}'.format(accuracy_score(test[category], prediction)))

Emotion: anger
Test accuracy: 0.791196388261851
Emotion: anticipation
Test accuracy: 0.8498871331828443
Emotion: disgust
Test accuracy: 0.7641083521444695
Emotion: fear
Test accuracy: 0.9187358916478555
Emotion: joy
Test accuracy: 0.801354401805869
Emotion: love
Test accuracy: 0.8961625282167043
Emotion: optimism
Test accuracy: 0.781038374717833
Emotion: pessimism
Test accuracy: 0.8860045146726863
Emotion: sadness
Test accuracy: 0.7844243792325056
Emotion: surprise
Test accuracy: 0.963882618510158
Emotion: trust
Test accuracy: 0.9492099322799097


In [17]:
#import pickle
#model = SVC_pipeline
#filename = 'english_model.sav'
#pickle.dump(model, open(filename, 'wb'))