In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
df = pd.read_csv('train.txt',sep = ';', header = None,names = ['text','emotion'])

In [4]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
df.isnull().sum()

Unnamed: 0,0
text,0
emotion,1


In [6]:
unique_emotions = df['emotion'].unique() # uss column mai kitne unique naming hai

In [7]:
## i will give numbering to all emotion

# Converting Text to Number using Map
emotion_numbers = {}   # dictionary
i =0

for emo in unique_emotions:
    emotion_numbers[emo]= i
    i=i+1

df['emotion'] = df['emotion'].map(emotion_numbers)

In [8]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [9]:
# Text cleaning

# 1) To Lower Case

df['text'] = df['text'].apply(lambda x : x.lower())

In [10]:
# 2) Removing Punctuation

import string
def remove_punc(txt):
  return txt.translate(str.maketrans('','',string.punctuation))

In [11]:
df['text'].apply(remove_punc)

Unnamed: 0,text
0,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy
...,...
10048,i don t feel insulted because it doesn t sound...
10049,i feel pissed my friend didnt offer me a soda
10050,im feeling really festive this year usually i ...
10051,i am no i feel melancholy despondent often angry


In [12]:
# Remove Numberss



# Removing Emoji Ans Symbols


def remove_emoji(txt):
  new = ""
  for i in txt:
      if i.isascii():
           new +=i
  return new

df['text'] = df['text'].apply(remove_emoji)

In [13]:
# Remove Stop wordss Very import in ML to remove Noise


## NLTK :  TExt Preprocessing


import nltk


In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
stop_words = set(stopwords.words('english'))

len(stop_words)     #  These need to be removed

198

In [17]:
df.loc[1]['text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [18]:

# phele mai ek ek karke Text lunga , Usee tokens ke convert karunga , Phir har token ko FInd karunga Stopword MAp mai ki woh map mai hai ki nahi , Agar stopword map mai exist karta hai toh remove kar dunga


In [19]:

def remove(txt):
  words = txt.split()  ## converting into text to toekn using word tokenize
  cleaned = []
  for i in words:
    if not i in stop_words:
       cleaned.append(i)
  return ' '.join(cleaned)

In [20]:
df['text'] = df['text'].apply(remove)

In [21]:
df.loc[1]['text']

'go feeling hopeless damned hopeful around someone cares awake'

In [22]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [23]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotion'], test_size=0.20, random_state=42)

In [24]:
## Now vectorizing the Emotion to split into vector

## Using Bag of Words first

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [31]:

bow_vectorizer = CountVectorizer()

X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)


In [32]:
##  Using Model of Navive bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


nb_model = MultinomialNB()

nb_model.fit(X_train_bow,y_train)


In [33]:
pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test,pred_bow))

0.739433117851815


In [35]:
## NOW COMING UP TO TFIDF method




tfidf_vectorizer = TfidfVectorizer()


X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


nb2_model = MultinomialNB()

nb2_model.fit(X_train_tfidf,y_train)

In [36]:
y_pred = nb2_model.predict(X_test_tfidf)

print(accuracy_score(y_test,y_pred))

0.6429636996519145


In [38]:
## Now lets use logistic regression to improve the accuracy because of probabilites range in 0-1



from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(max_iter=1000)

logistic_model.fit(X_train_tfidf,y_train)



In [39]:
log_pred = logistic_model.predict(X_test_tfidf)

print(accuracy_score(y_test,log_pred))

0.8140228741919443


And Hence the Logistic regression model has given the highest accuracy score using TF-Idf vectorization