# Import Libraries

In [1]:
import numpy as np
import pandas as pd


In [2]:
import nltk

In [3]:

# Read the tweet-emotion CSV into a DataFrame, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../desktop/dataset/tweet_emotions.csv')
data.head(n=5)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
# Drop the tweet identifier; only 'content' and 'sentiment' are used below.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data = data.drop(columns=['tweet_id'], errors='ignore')

In [5]:
# Peek at five random rows; the fixed seed (the same one used for the split and
# the model) makes the preview reproducible across kernel restarts.
data.sample(5, random_state=666)

Unnamed: 0,sentiment,content
4629,worry,I wish I was in dallas with the kiddnation family
1801,sadness,really annoyed that work appear to have blocke...
32348,relief,@melgreco thanks for coming in tonight it mad...
6392,worry,@tommcfly I don't know what to write anymore! ...
38402,hate,http://twitpic.com/4wsjr - Leisure Bay beach ...


# Splitting the data

In [6]:
# Features are the raw tweet text; targets are the emotion labels.
X, y = data['content'], data['sentiment']

In [7]:
# Spot-check a single tweet by its index label.
X.loc[150]

'Last day working for the Uni today, sad times'

In [8]:
# Spot-check a single emotion label by its index label.
y.loc[105]

'hate'

In [9]:
#for train_test split
from sklearn.model_selection import train_test_split

In [10]:
from collections import Counter

# Tally how many tweets carry each emotion label to inspect the class balance.
Counter(y.tolist())

Counter({'empty': 827,
         'sadness': 5165,
         'enthusiasm': 759,
         'neutral': 8638,
         'worry': 8459,
         'surprise': 2187,
         'love': 3842,
         'fun': 1776,
         'hate': 1323,
         'happiness': 5209,
         'boredom': 179,
         'relief': 1526,
         'anger': 110})

In [11]:
# Hold out 20% of the tweets for evaluation; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified although the label counts are very uneven
# (e.g. 110 'anger' vs 8638 'neutral') — consider stratify=y; confirm first, since it changes every result below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [13]:
# TF-IDF weighting as computed by scikit-learn with its default settings:
#   tf(t, d) : number of times term t occurs in document d
#   idf(t)   : ln((1 + n_documents) / (1 + n_documents_containing_t)) + 1
#   tf-idf   : tf * idf, after which each document vector is L2-normalised
#
# NOTE(review): nltk.word_tokenize relies on NLTK tokenizer data being installed
# locally — confirm it is available before running on a fresh machine.
# NOTE(review): scikit-learn may warn that the built-in 'english' stop-word list is
# inconsistent with a custom tokenizer — verify the resulting vocabulary is as intended.
tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize,
                        # token_pattern is ignored once a tokenizer is supplied;
                        # None states that explicitly and avoids the "will not be used" warning.
                        token_pattern=None,
                        stop_words='english',
                        ngram_range=(1, 2),  # unigrams and bigrams
                        lowercase=True,
                        # max_features=1024,  # optional vocabulary cap, currently disabled
                        min_df=3)  # skip terms found in fewer than 3 documents

In [14]:
# Fit the vectorizer on the training tweets only and vectorise them in the same step,
# so nothing from the held-out tweets influences the learned vocabulary.
X_tfidf = tfidf.fit_transform(X_train)

In [15]:
import joblib

# Persist the fitted vectorizer so the same vocabulary and weights can be reloaded later.
joblib.dump(value=tfidf, filename='TF-IDF vectorizer.pkl')

['TF-IDF vectorizer.pkl']

In [16]:
# (number of training tweets, number of vocabulary features kept by the vectorizer)
X_tfidf.shape

(32000, 19122)

In [17]:
# Total number of tweets before the train/test split, for comparison.
X.shape

(40000,)

# Model Selection, Training and Testing

In [18]:

from sklearn.svm import SVC

In [19]:
# Support-vector classifier with no hyperparameters overridden; only the seed is set.
# NOTE(review): the report further down shows 0.00 recall for several small classes —
# class weighting or tuning may be worth trying; confirm with an experiment.
support_vector_machine = SVC(random_state=666)

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
# Map emotion names to integer codes.
# The encoder is fitted on the full label column `y`, so every emotion present in
# the dataset has a code, and the training codes are derived from the original string
# labels selected via X_train's index rather than from `y_train` itself.
# This keeps the cell idempotent: re-running it can no longer refit the encoder
# on already-encoded integers, which would silently break inverse_transform later.
encoder = LabelEncoder().fit(y)
y_train = encoder.transform(y.loc[X_train.index])

In [22]:
# Train the classifier on the TF-IDF training matrix and the training labels.
support_vector_machine.fit(X_tfidf, y_train)

In [23]:
# The classifier is trained on integer codes, so its predictions can only be turned
# back into emotion names with the fitted encoder — persist the encoder alongside it.
joblib.dump(encoder, 'Emotion_Analysis_label_encoder.pkl')
# Kept as the last expression so the cell still displays the model's file name.
joblib.dump(support_vector_machine, 'Emotion_Analysis_model.pkl')

['Emotion_Analysis_model.pkl']

In [24]:
# Vectorise the held-out tweets with the already-fitted vectorizer (transform only, no refit).
X_tfidf_test = tfidf.transform(X_test)

In [25]:
# Predict a class for every held-out tweet.
y_pred = support_vector_machine.predict(X_tfidf_test)

In [26]:
# Spot-check one raw prediction as returned by the classifier.
y_pred[89]

7

In [27]:
# Convert the predicted codes back to emotion names.
# NOTE(review): this rebinds `y_pred`, so the cell is not idempotent — running it a
# second time would feed already-decoded labels to inverse_transform (expected to fail);
# re-run the prediction cell first.
y_pred = encoder.inverse_transform(y_pred)

In [28]:
# Spot-check the same position after the predictions were mapped through the encoder.
y_pred[89]

'love'

In [29]:
from sklearn.metrics import  classification_report

In [30]:
# Per-class precision / recall / F1 on the held-out tweets.
# zero_division=0 reports 0.00 for classes that were never predicted (the same value
# as before) without emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        31
     boredom       0.00      0.00      0.00        35
       empty       0.33      0.01      0.01       191
  enthusiasm       0.00      0.00      0.00       130
         fun       0.00      0.00      0.00       372
   happiness       0.36      0.41      0.39      1060
        hate       0.31      0.08      0.13       258
        love       0.50      0.35      0.41       775
     neutral       0.33      0.59      0.42      1697
      relief       0.40      0.01      0.02       319
     sadness       0.42      0.16      0.23      1050
    surprise       0.21      0.01      0.02       413
       worry       0.33      0.53      0.41      1669

    accuracy                           0.35      8000
   macro avg       0.25      0.17      0.16      8000
weighted avg       0.33      0.35      0.30      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# A handful of hand-written sentences for a quick sanity check of the trained pipeline.
sentences = [
    'Something cool about this is that , I have nothing to say',
    'Give me your money',
    'I think you are right',
    ' Thank you',
    'I totally hate this',
    'I hate you',
    ' I hate dog',
]

# Vectorise them with the already-fitted TF-IDF vectorizer.
sentences_tfidf = tfidf.transform(sentences)

In [32]:
# Predict class codes for the sample sentences and map them back to emotion names.
encoder.inverse_transform(support_vector_machine.predict(sentences_tfidf))


array(['happiness', 'worry', 'neutral', 'happiness', 'hate', 'hate',
       'hate'], dtype=object)