In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Load the training split: the file is parsed as ';'-separated with no header
# row, so the two column names are supplied here.
df = pd.read_csv(
    'archive/train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

text       0
emotion    0
dtype: int64

In [5]:
# Encode each emotion label as an integer id, numbered in the order the labels
# are returned by Series.unique().
unique_emotions = df['emotion'].unique()
emotion_number = {emo: idx for idx, emo in enumerate(unique_emotions)}

# Replace the string labels with their integer ids.
# NOTE: this cell is not idempotent. Re-running it after the column is already
# encoded rebuilds emotion_number from the integer ids instead of the names.
df['emotion'] = df['emotion'].map(emotion_number)

In [6]:
# Show the first five rows of the frame.
df.head(n=5)

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [7]:
# Lower-case every text with the vectorised string accessor rather than a
# per-row Python lambda. For string values the result is the same; a missing
# value would pass through as missing instead of raising.
df['text'] = df['text'].str.lower()

In [8]:
import string 
def remove_punc(txt):
    """Return ``txt`` with every character of ``string.punctuation`` deleted.

    All other characters, including whitespace, are kept in their original order.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return txt.translate(deletion_table)

In [9]:
# Strip punctuation from every text, one element at a time.
df['text'] = df['text'].map(remove_punc)

In [10]:
def remove_numbers(txt):
    """Return ``txt`` with every digit character dropped.

    A character is dropped when ``str.isdigit`` reports it as a digit; all
    other characters are kept in their original order.
    """
    return "".join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every text, one element at a time.
df['text'] = df['text'].map(remove_numbers)

In [11]:
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Despite the name, this drops any character outside the ASCII range
    (emoji, accented letters, and so on), not only emoji.
    """
    # Encoding to ASCII with errors='ignore' discards exactly the characters
    # that are not ASCII; decoding turns the surviving bytes back into str.
    return txt.encode("ascii", "ignore").decode("ascii")

# Drop non-ASCII characters from every text, one element at a time.
df['text'] = df['text'].map(remove_emojis)

In [12]:
!pip install nltk



In [13]:
import nltk 

In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
# Fetch the NLTK data packages this notebook relies on: 'punkt' (presumably for
# word_tokenize; confirm) and the 'stopwords' corpus read via stopwords.words.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Also fetch 'punkt_tab' (presumably needed by word_tokenize on newer NLTK releases; confirm).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [17]:
# English stop words held in a set for fast membership tests in remove().
# NOTE(review): punctuation is stripped from the texts before stop-word removal,
# so any stop word that contains an apostrophe presumably never matches a token.
# Confirm whether that is intended.
stop_words = set(stopwords.words('english'))

In [18]:
# Number of distinct stop words loaded.
len(stop_words)

198

In [19]:
# Inspect one example text before stop-word removal, using a single
# row/column lookup instead of chained indexing.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [20]:
def remove(txt):
    """Tokenize ``txt`` and drop every token that is in ``stop_words``.

    Tokens are compared against ``stop_words`` exactly as produced by
    ``word_tokenize``. The surviving tokens are joined back into one string
    separated by single spaces.
    """
    kept_tokens = [token for token in word_tokenize(txt) if token not in stop_words]
    return ' '.join(kept_tokens)

In [21]:
# Remove stop words from every text, one element at a time.
df['text'] = df['text'].map(remove)

In [22]:
# Inspect the same example text after stop-word removal, using a single
# row/column lookup instead of chained indexing.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.2,
    random_state=42,
)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer

In [25]:
# Bag-of-words (token count) featurizer with default settings.
bow_vectorizer = CountVectorizer()

In [26]:
# Fit the vocabulary on the training texts only, then reuse that fitted
# vocabulary to encode the test texts.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [27]:
# Display the sparse training matrix summary (dtype, stored elements, shape).
X_train_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 116049 stored elements and shape (12800, 13359)>

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [29]:
# Multinomial Naive Bayes with default hyperparameters, for the bag-of-words features.
nb_model = MultinomialNB()

In [30]:
# Train on the bag-of-words counts; the fitted estimator is the cell's displayed value.
nb_model.fit(X_train_bow,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [31]:
# Predict encoded emotion ids for the held-out bag-of-words features.
pred_nb = nb_model.predict(X_test_bow)

In [32]:
# Held-out accuracy of Naive Bayes on bag-of-words features.
print(accuracy_score(y_test,pred_nb))

0.7678125


In [33]:
# TF-IDF featurizer with default settings.
tfidf_vectorizer = TfidfVectorizer()

In [34]:
# Fit the TF-IDF vocabulary and weights on the training texts only, then reuse
# the fitted vectorizer to encode the test texts.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [35]:
# Second Naive Bayes model with default hyperparameters, trained on TF-IDF features.
nb2_model = MultinomialNB()

In [36]:
# Train on the TF-IDF features; the fitted estimator is the cell's displayed value.
nb2_model.fit(X_train_tfidf,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [37]:
# Predict encoded emotion ids for the held-out TF-IDF features.
pred_nb2 = nb2_model.predict(X_test_tfidf)

In [38]:
# Held-out accuracy of Naive Bayes on TF-IDF features.
print(accuracy_score(y_test,pred_nb2))

0.6609375


In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Logistic regression classifier; max_iter is set explicitly to 1000, all
# other hyperparameters are left at their defaults.
logistic_model = LogisticRegression(max_iter = 1000)

In [41]:
# Train on the TF-IDF features; the fitted estimator is the cell's displayed value.
logistic_model.fit(X_train_tfidf,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [42]:
# Predict encoded emotion ids for the held-out TF-IDF features.
pred_log = logistic_model.predict(X_test_tfidf)

In [43]:
# Held-out accuracy of logistic regression on TF-IDF features.
print(accuracy_score(y_test,pred_log))

0.8615625


In [44]:
# Per-class precision/recall/F1 for logistic regression; rows are labelled by
# the integer emotion ids because y_test holds the encoded labels.
print(classification_report(y_test, pred_log))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       946
           1       0.90      0.81      0.86       427
           2       0.90      0.61      0.72       296
           3       0.88      0.47      0.61       113
           4       0.86      0.76      0.80       397
           5       0.81      0.96      0.88      1021

    accuracy                           0.86      3200
   macro avg       0.88      0.76      0.80      3200
weighted avg       0.87      0.86      0.86      3200



In [45]:
import pickle

# Persist the artifacts needed to serve predictions outside this notebook.
# Each file is written inside a `with` block so the handle is flushed and
# closed even if pickling fails.
# NOTE: pickle files can run arbitrary code when loaded; only unpickle these
# artifacts from a trusted location.

# Save model
with open("model.pkl", "wb") as model_file:
    pickle.dump(logistic_model, model_file)

# Save TF-IDF vectorizer
with open("tfidf.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Save label map
# The ids must match the encoding applied to df['emotion'] before training.
# The key for id 3 was previously misspelled "suprise".
emotion_number = {
    "sadness": 0,
    "anger": 1,
    "love": 2,
    "surprise": 3,
    "fear": 4,
    "joy": 5
}
with open("label_map.pkl", "wb") as label_map_file:
    pickle.dump(emotion_number, label_map_file)
