In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the ';'-separated file; it has no header row, so the column names are supplied here.
df = pd.read_csv("train.txt",sep=';',header=None,names=['Text','Emotions'])

In [3]:
df.head(10)

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
5,ive been feeling a little burdened lately wasn...,sadness
6,ive been taking or milligrams or times recomme...,surprise
7,i feel as confused about life as a teenager or...,fear
8,i have been with petronas for years i feel tha...,joy
9,i feel romantic too,love


In [4]:
df.isnull().sum()

Text        0
Emotions    0
dtype: int64

In [5]:
# Encode each emotion label as an integer, numbered by order of first appearance.
unique_emotions = df["Emotions"].unique()
emotion_numbers = {label: code for code, label in enumerate(unique_emotions)}
df["Emotions"] = df["Emotions"].map(emotion_numbers)

In [6]:
df

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [7]:
# Lower-case every text with the vectorized string accessor; unlike a per-row
# lambda it passes missing values through instead of raising AttributeError.
df["Text"] = df["Text"].str.lower()

In [8]:
import string
# Translation table that deletes every character in string.punctuation;
# built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with all ASCII punctuation characters removed.

    Args:
        txt: The string to clean.

    Returns:
        A copy of ``txt`` without any character listed in ``string.punctuation``.
    """
    return txt.translate(_PUNCTUATION_TABLE)

In [9]:
# Strip punctuation from every row of the Text column.
df["Text"] = df["Text"].apply(remove_punc)

In [10]:
def remove_num(txt):
    """Return ``txt`` with every digit character removed.

    Args:
        txt: The string to clean.

    Returns:
        A copy of ``txt`` containing only the characters for which
        ``str.isdigit`` is false, in their original order.
    """
    # A single join avoids building a new string for every kept character.
    return "".join(ch for ch in txt if not ch.isdigit())

In [11]:
# Strip digits from every row of the Text column.
df["Text"] = df["Text"].apply(remove_num)

In [12]:
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character dropped.

    Emojis are removed as a consequence, but so is any other character
    outside the ASCII range (for example accented letters).
    """
    return "".join(filter(str.isascii, txt))

In [13]:
# Drop non-ASCII characters from every row of the Text column.
df["Text"] = df["Text"].apply(remove_emojis)

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
# Tokenizer models and stop-word list used by the cells below. NLTK 3.8.2+ loads
# the tokenizer from 'punkt_tab' while older releases use 'punkt', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [16]:
# Hold the English stop words in a set so membership checks are fast.
stop_words = set(stopwords.words('english'))
len(stop_words)

198

In [17]:
# Sample row before stop-word removal; one .loc call with (row, column) avoids chained indexing.
df.loc[1, 'Text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [18]:
def remove(txt):
    """Tokenize ``txt`` and return it with stop words removed.

    The text is split with ``word_tokenize``; tokens found in the module-level
    ``stop_words`` set are discarded and the rest are joined with single spaces.
    """
    kept = [token for token in word_tokenize(txt) if token not in stop_words]
    return " ".join(kept)

In [19]:
# Remove stop words from every row of the Text column.
df["Text"] = df["Text"].apply(remove)

In [20]:
df.loc[1,'Text']

'go feeling hopeless damned hopeful around someone cares awake'

In [21]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
# NOTE(review): no stratify= is passed, so class proportions may differ between the two splits.
X_train, X_test, y_train, y_test = train_test_split(df['Text'],df['Emotions'], test_size=0.20, random_state=42)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer

In [23]:
# Bag-of-words features: token counts with the vectorizer's default settings.
bow_vector = CountVectorizer()

In [24]:
# Learn the vocabulary from the training split only, then reuse it to encode the test split.
X_train_bow = bow_vector.fit_transform(X_train)
X_test_bow = bow_vector.transform(X_test)

In [25]:
X_test_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 26934 stored elements and shape (3200, 13359)>

In [26]:
# Multinomial Naive Bayes trained on the bag-of-words counts.
nb_model = MultinomialNB()
nb_model.fit(X_train_bow,y_train)

In [27]:
# Accuracy of the count-based Naive Bayes model on the held-out split.
pred_nb = nb_model.predict(X_test_bow)
accuracy_score(y_test,pred_nb)

0.7678125

In [28]:
# TF-IDF features with the vectorizer's default settings.
tfidf_vector = TfidfVectorizer()

In [29]:
# Fit the TF-IDF vocabulary and weights on the training split only, then encode the test split with them.
X_train_tfidf = tfidf_vector.fit_transform(X_train)
X_test_tfidf = tfidf_vector.transform(X_test)

In [30]:
# Multinomial Naive Bayes trained on the TF-IDF features.
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)

In [31]:
# Accuracy of the TF-IDF Naive Bayes model on the held-out split.
pred_nb2 = nb2_model.predict(X_test_tfidf)
accuracy_score(y_test,pred_nb2)

0.6609375

In [32]:
# Logistic regression trained on the TF-IDF features; max_iter=1000 sets the solver's iteration limit.
lg_model = LogisticRegression(max_iter=1000)
lg_model.fit(X_train_tfidf,y_train)

In [33]:
# Accuracy of the TF-IDF logistic regression model on the held-out split.
pred_lg = lg_model.predict(X_test_tfidf)
accuracy_score(y_test,pred_lg)

0.8615625

In [37]:
# Logistic regression trained on the bag-of-words counts; max_iter=1000 sets the solver's iteration limit.
lg_model2 = LogisticRegression(max_iter=1000)
lg_model2.fit(X_train_bow,y_train)

In [38]:
# Accuracy of the count-based logistic regression model on the held-out split.
pred_lg2 = lg_model2.predict(X_test_bow)
accuracy_score(y_test,pred_lg2)

0.88875