## Importing libraries for model creation and data preprocessing

In [None]:
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fetch the NLTK resources used further down; packages that are already present
# are reported as up-to-date by the downloader.
nltk.download('punkt')  # presumably the tokenizer model behind word_tokenize — verify
nltk.download('punkt_tab')
nltk.download('stopwords')  # corpus read via stopwords.words('english') in clean_text
nltk.download("wordnet")  # presumably the data behind WordNetLemmatizer — verify

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Preprocessing the Data

In [None]:
# Load the tweet-emotion dataset into a DataFrame and display it.
# NOTE(review): the absolute "/content/..." path looks environment-specific
# (presumably Colab) — adjust it when running elsewhere.
data=pd.read_csv("/content/tweet_emotions.csv")
data

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [None]:
# finding the shape of the data
data.shape

(40000, 3)

In [None]:
# the info of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [None]:
# drop the tweet id col because its irrelevant for the model creation
data1=data.drop('tweet_id', axis=1)
data1.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [None]:
#type of emotion present in the dataset
data1['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [None]:
# Emotion names in the order returned by unique() above (order of first appearance).
# NOTE(review): this is NOT the integer order assigned by LabelEncoder later on
# (the displayed tables show 'empty' -> 2 and 'sadness' -> 10), so this list must
# not be indexed with the values of the 'emotion' column.
labels=['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger']

In [None]:
# The total number of emotion
data1['sentiment'].nunique()

13

In [None]:
# groupby the count of the emotion in the dataset
emotion_count=data1.groupby(['sentiment'])['sentiment'].count()
emotion_count

Unnamed: 0_level_0,sentiment
sentiment,Unnamed: 1_level_1
anger,110
boredom,179
empty,827
enthusiasm,759
fun,1776
happiness,5209
hate,1323
love,3842
neutral,8638
relief,1526


In [None]:
# sorting the groupby methods
emotion_count.sort_values(ascending=False)

Unnamed: 0_level_0,sentiment
sentiment,Unnamed: 1_level_1
neutral,8638
worry,8459
happiness,5209
sadness,5165
love,3842
surprise,2187
fun,1776
relief,1526
hate,1323
empty,827


In [None]:
# Check whether any column contains null values (count of nulls per column)
data1.isnull().sum()

Unnamed: 0,0
sentiment,0
content,0


## Cleaning the text

In [None]:
data1['content']

Unnamed: 0,content
0,@tiffanylue i know i was listenin to bad habi...
1,Layin n bed with a headache ughhhh...waitin o...
2,Funeral ceremony...gloomy friday...
3,wants to hang out with friends SOON!
4,@dannycastillo We want to trade with someone w...
...,...
39995,@JohnLloydTaylor
39996,Happy Mothers Day All my love
39997,Happy Mother's Day to all the mommies out ther...
39998,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [None]:
# Remove the number and special char from the content and making it lowercase

In [None]:
data1

Unnamed: 0,sentiment,content,Cleaned_text,emotion
0,empty,@tiffanylue i know i was listenin to bad habi...,"[iffanylue, i, know, i, was, listenin, to, bad...",2
1,sadness,Layin n bed with a headache ughhhh...waitin o...,"[layin, n, bed, with, a, headache, ughhhh, ......",10
2,sadness,Funeral ceremony...gloomy friday...,"[funeral, ceremony, ..., gloomy, friday, ...]",10
3,enthusiasm,wants to hang out with friends SOON!,"[wants, to, hang, out, with, friends, soon, !]",3
4,neutral,@dannycastillo We want to trade with someone w...,"[annycastillo, we, want, to, trade, with, some...",8
...,...,...,...,...
39995,neutral,@JohnLloydTaylor,[ohnlloydtaylor],8
39996,love,Happy Mothers Day All my love,"[happy, mothers, day, all, my, love]",7
39997,love,Happy Mother's Day to all the mommies out ther...,"[happy, mother, 's, day, to, all, the, mommies...",7
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,"[iariley, wassup, beautiful, !, !, !, follow, ...",5


In [None]:
# Lazily-built caches: the English stopword set and the lemmatizer are created
# once on first use instead of once per token / once per tweet.
_STOP_WORDS = None
_LEMMATIZER = None


def _text_resources():
    """Return the cached ``(stopword set, lemmatizer)`` pair, building it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def clean_text(text):
    """Normalise one raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order: strip ``@mentions``, strip ``http(s)://`` URLs, replace every
    non-letter character with a space, lowercase, tokenize, drop English
    stopwords, lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    text = re.sub('@[A-Za-z0-9_]+', '', text)  # Remove the mentions
    text = re.sub('https?://[A-Za-z0-9./]+', '', text)  # Remove URLs
    text = re.sub('[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
    tokens = word_tokenize(text.lower())  # Lowercase, then tokenize

    # Fetch the shared resources once; the previous version rebuilt the stopword
    # set inside the comprehension, i.e. once for every single token.
    stop_words, lemmatizer = _text_resources()
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(lemmas)

In [None]:
# Build the cleaned-text column from the raw tweets. Reading from 'content'
# (instead of from 'Cleaned_text' itself) means the cell works on a fresh kernel,
# always feeds clean_text a plain string, and gives the same result when re-run.
data1['Cleaned_text']=data1['content'].apply(clean_text)

In [None]:
data1

Unnamed: 0,sentiment,content,Cleaned_text,emotion
0,empty,@tiffanylue i know i was listenin to bad habi...,iffanylu know listenin bad habit earlier start...,2
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headach ughhhh waitin call,10
2,sadness,Funeral ceremony...gloomy friday...,funer ceremoni gloomi friday,10
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon,3
4,neutral,@dannycastillo We want to trade with someone w...,annycastillo want trade someon houston ticket one,8
...,...,...,...,...
39995,neutral,@JohnLloydTaylor,ohnlloydtaylor,8
39996,love,Happy Mothers Day All my love,happi mother day love,7
39997,love,Happy Mother's Day to all the mommies out ther...,happi mother day mommi woman man long momma so...,7
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,iariley wassup beauti follow peep new hit sing...,5


## Trying a CountVectorizer model

In [None]:
# Turn the cleaned tweets into a feature matrix with CountVectorizer.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so the vocabulary also covers the test tweets — confirm this is
# acceptable for the reported accuracy.
vectorizer=CountVectorizer()
x=vectorizer.fit_transform(data1['Cleaned_text'])

In [None]:
# Encode the sentiment names as integer class ids in a new 'emotion' column.
# The labels are read from data1 (the frame being written to) rather than from
# the raw `data` frame, so source and target can never drift apart if rows of
# data1 are filtered or reordered.
label_encode=LabelEncoder()
data1['emotion']=label_encode.fit_transform(data1['sentiment'])

In [None]:
labels

['empty',
 'sadness',
 'enthusiasm',
 'neutral',
 'worry',
 'surprise',
 'love',
 'fun',
 'hate',
 'happiness',
 'boredom',
 'relief',
 'anger']

In [None]:
data1

Unnamed: 0,sentiment,content,Cleaned_text,emotion
0,empty,@tiffanylue i know i was listenin to bad habi...,iffanylu know listenin bad habit earlier start...,2
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headach ughhhh waitin call,10
2,sadness,Funeral ceremony...gloomy friday...,funer ceremoni gloomi friday,10
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon,3
4,neutral,@dannycastillo We want to trade with someone w...,annycastillo want trade someon houston ticket one,8
...,...,...,...,...
39995,neutral,@JohnLloydTaylor,ohnlloydtaylor,8
39996,love,Happy Mothers Day All my love,happi mother day love,7
39997,love,Happy Mother's Day to all the mommies out ther...,happi mother day mommi woman man long momma so...,7
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,iariley wassup beauti follow peep new hit sing...,5


In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the split.
# NOTE(review): no stratify= is passed here, unlike the later spaCy split —
# confirm whether that difference is intended given the uneven class counts.
x_train, x_test, y_train, y_test=train_test_split(x , data1['emotion'], test_size=0.2, random_state=13)

In [None]:
model = MultinomialNB()
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.314375


## SVM Model

In [None]:
svm=SVC()
svm.fit(x_train,y_train)

In [None]:
predictions_svm =svm.predict(x_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, predictions_svm))

Accuracy: 0.356875


## Trying spacy library

- Because of the lower accuracy with NLTK, I tried spaCy

In [None]:
import spacy

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
def preprocess(text):
  """Lemmatize ``text`` with the loaded spaCy pipeline and drop stop words.

  Parameters
  ----------
  text : str
      Text to process; it is passed to the module-level ``nlp`` object.

  Returns
  -------
  str
      Lemmas of all non-stop-word tokens, separated by single spaces
      (an empty string when every token is a stop word).
  """
  doc = nlp(text)
  lemmas = [token.lemma_ for token in doc if not token.is_stop]
  # Join with a space: joining with '' glued every lemma of a tweet into one
  # giant pseudo-word, leaving the downstream vectorizer a single token per tweet.
  return ' '.join(lemmas)

In [None]:
data1['Preprocssed_text']=data1['Cleaned_text'].apply(preprocess)

In [None]:
data1

Unnamed: 0,sentiment,content,Cleaned_text,emotion,Preprocssed_text,Label
0,empty,@tiffanylue i know i was listenin to bad habi...,iffanylu know listenin bad habit earlier start...,2,iffanyluknowlisteninbadhabitearlystartfreakin,2
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headach ughhhh waitin call,10,layinnbedheadachughhhhwaitin,10
2,sadness,Funeral ceremony...gloomy friday...,funer ceremoni gloomi friday,10,funerceremonigloomifriday,10
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon,3,wanthangfriendsoon,3
4,neutral,@dannycastillo We want to trade with someone w...,annycastillo want trade someon houston ticket one,8,annycastillowanttradesomeonhoustonticket,8
...,...,...,...,...,...,...
39995,neutral,@JohnLloydTaylor,ohnlloydtaylor,8,ohnlloydtaylor,8
39996,love,Happy Mothers Day All my love,happi mother day love,7,happimotherdaylove,7
39997,love,Happy Mother's Day to all the mommies out ther...,happi mother day mommi woman man long momma so...,7,happimotherdaymommiwomanmanlongmommasomeonday,7
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,iariley wassup beauti follow peep new hit sing...,5,iarileywassupbeautifollowpeepnewhitsinglwwwmys...,5


In [None]:
data1["Label"]=data1['emotion']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(data1['Preprocssed_text'], data1['Label'],
                                                    test_size=0.2, random_state=42, stratify=data1['Label'])

In [None]:
# Two-step pipeline: TfidfVectorizer followed by a MultinomialNB classifier.
# NOTE(review): the first step is named 'vectorizer_tri_grams', but
# TfidfVectorizer() is constructed without an ngram_range argument — confirm
# whether tri-grams were actually intended.
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (MultinomialNB()))
])

In [None]:
clf.fit(x_train, y_train)

In [None]:
# Score the pipeline on the held-out split.
y_pred = clf.predict(x_test)
print(accuracy_score(y_test, y_pred))
# Several classes are never predicted, so their precision is undefined.
# zero_division=0 still reports them as 0.00 but states that choice explicitly
# instead of emitting the same undefined-metric warning for every average.
print(classification_report(y_test, y_pred, zero_division=0))

0.21925
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.00      0.00      0.00        36
           2       0.00      0.00      0.00       165
           3       0.00      0.00      0.00       152
           4       0.00      0.00      0.00       355
           5       0.40      0.01      0.01      1042
           6       0.00      0.00      0.00       265
           7       0.70      0.02      0.05       768
           8       0.22      0.99      0.36      1728
           9       0.00      0.00      0.00       305
          10       0.40      0.01      0.02      1033
          11       0.00      0.00      0.00       437
          12       0.25      0.01      0.01      1692

    accuracy                           0.22      8000
   macro avg       0.15      0.08      0.03      8000
weighted avg       0.27      0.22      0.09      8000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
