In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Load the dataset: ';'-separated file without a header row, so the two
# column names are supplied explicitly.
df = pd.read_csv(
    "train.txt",
    sep=";",
    header=None,
    names=["text", "emotions"],
)

In [4]:
# Preview the first rows of the freshly loaded frame (raw text and string labels)
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
# Count the missing values in each column (bare expression, shown as the cell output)
df.isnull().sum()

Unnamed: 0,0
text,0
emotions,0


In [6]:
# Number of samples per emotion label (class distribution)
df["emotions"].value_counts()

Unnamed: 0_level_0,count
emotions,Unnamed: 1_level_1
joy,5362
sadness,4666
anger,2159
fear,1937
love,1304
surprise,572


In [7]:
# Label-encode emotions: every distinct label gets an integer id, numbered in
# the order the labels first appear in the column (not alphabetically).
unique_emotions = df["emotions"].unique()
emotion_number = {emo: idx for idx, emo in enumerate(unique_emotions)}

# Replace the string labels with their integer ids.
df['emotions'] = df['emotions'].map(emotion_number)

In [8]:
# Check the result of the label encoding: 'emotions' now holds integer ids
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [10]:
# Convert all text to lower case.
# The vectorised .str accessor replaces a per-row Python lambda and passes
# missing values through instead of raising AttributeError on them.
df['text'] = df['text'].str.lower()

In [12]:
# Remove Punctuation From the Text
import string
def remove_punc(text):
  """Return ``text`` with every character of ``string.punctuation`` removed.

  All other characters (letters, digits, whitespace, non-ASCII symbols)
  are passed through unchanged.
  """
  # Mapping a character to None makes str.translate() delete it.
  drop_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(drop_table)

# Strip punctuation from every row of the text column
df["text"] = df["text"].map(remove_punc)

In [13]:
# Remove number from the text
def remove_num(txt):
  """Return ``txt`` with every digit character removed.

  A character counts as a digit when ``str.isdigit()`` is true for it, so
  non-ASCII digits (e.g. superscript two) are dropped as well. All other
  characters, including whitespace, are kept in their original order.
  """
  # Join once instead of growing a string one character at a time.
  return "".join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every row of the text column
df["text"] = df["text"].map(remove_num)

In [14]:
# Remove Emojis from the text
def remove_emojis(txt):
  """Return ``txt`` with every non-ASCII character removed.

  Note that this is broader than the name suggests: besides emojis it also
  drops accented letters and any other character outside the ASCII range.
  ASCII characters are kept in their original order.
  """
  # Join once instead of growing a string one character at a time.
  return "".join(ch for ch in txt if ch.isascii())

# Drop non-ASCII characters (emojis included) from every row of the text column
df["text"] = df["text"].map(remove_emojis)

In [15]:
import nltk

In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [22]:
# Fetch the NLTK data packages used by this notebook. The second call is the
# cell's last expression, so its return value is what the cell displays.
# "punkt": presumably for word_tokenize imported above — TODO confirm it is still needed.
nltk.download("punkt")
# "stopwords": the corpus read through stopwords.words(...)
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
# Collect the English stop words into a set; it is consulted with `in` for
# every word when stop words are filtered out of the text.
stop_words = set(stopwords.words('english'))
# Bare expression: displays the whole set as the cell output.
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [24]:
# Inspect one cleaned text. A single .loc lookup with (row label, column label)
# avoids chained indexing through an intermediate row Series.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [49]:
# Remove Stop Words from the text

def remove(txt):
  """Return ``txt`` without the words found in the global ``stop_words``.

  The text is split on whitespace, words contained in ``stop_words`` are
  discarded, and the remaining words are re-joined with single spaces.
  """
  kept = [token for token in txt.split() if token not in stop_words]
  return " ".join(kept)

# Filter stop words out of every row of the text column
df["text"] = df["text"].map(remove)

## Bag of Words and TF-IDF encoding examples for building intuition

In [27]:
# from sklearn.feature_extraction.text import CountVectorizer


# documents = [
#     "I love pizza",
#     "Pizza is the best",
#     "Pasta is great",
#     "Pasta tastes good"
# ]

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(documents)


# print("Vocabulary : ", vectorizer.get_feature_names_out())
# print("\nBOW Matrix:\n ", X.toarray())

Vocabulary :  ['best' 'good' 'great' 'is' 'love' 'pasta' 'pizza' 'tastes' 'the']

BOW Matrix:
  [[0 0 0 0 1 0 1 0 0]
 [1 0 0 1 0 0 1 0 1]
 [0 0 1 1 0 1 0 0 0]
 [0 1 0 0 0 1 0 1 0]]


In [29]:
# from sklearn.feature_extraction.text import TfidfVectorizer


# documents = [
#     "I love pizza",
#     "Pizza is the best",
#     "Pasta is great",
#     "Pasta tastes good"
# ]

# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(documents)


# print("Vocabulary : ", vectorizer.get_feature_names_out())
# print("\nTF-IDF Matrix:\n ", X.toarray())

In [30]:
# Split the data into training and test sets: 20% of the rows are held out,
# and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

# NOTE(review): the split is not stratified although the class counts are
# uneven; consider stratify=df['emotions'] (this would change the scores below).
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotions'],
    test_size=0.2,
    random_state=42,
)


In [42]:
# Train a Naive Bayes model on the Bag-of-Words encoding
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# The vocabulary is learned from the training texts only and then reused,
# unchanged, to encode the test texts.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# fit() returns the estimator itself, so creation and training are chained.
nb_model = MultinomialNB().fit(X_train_bow, y_train)

# Score the held-out texts and report plain accuracy.
pred_bow = nb_model.predict(X_test_bow)
bow_accuracy = accuracy_score(y_test, pred_bow)
print(bow_accuracy)


0.768125


In [43]:
# Peek at the predictions: integer-encoded emotion labels for the test texts
pred_bow

array([0, 5, 0, ..., 5, 5, 0])

In [47]:
# Train a Naive Bayes model on the TF-IDF encoding
# (TfidfVectorizer, MultinomialNB and accuracy_score come from the Bag-of-Words cell)

# As before, fit the vectorizer on the training texts only.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# fit() returns the estimator itself, so creation and training are chained.
nb2_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = nb2_model.predict(X_test_tfidf)

tfidf_accuracy = accuracy_score(y_test, y_pred)
print(tfidf_accuracy)

0.6609375


In [48]:
# The Naive Bayes accuracy is modest with both encodings, so fit a logistic
# regression on the TF-IDF features instead.
from sklearn.linear_model import LogisticRegression

# max_iter=1000 gives the solver a generous iteration budget to converge.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_tfidf, y_train)
log_pred = logistic_model.predict(X_test_tfidf)

print(accuracy_score(y_test, log_pred))

0.8628125
