In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB,ComplementNB,MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score

In [23]:
np.random.seed(42)

In [24]:
dataset=pd.read_csv('tweet_emotions.csv')

In [25]:
dataset.isna().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

In [26]:
dataset['sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [27]:
labelled_dataset=dataset[dataset['sentiment'] != 'empty']

In [None]:
# extracting 1000 records of each dataset
neutral_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'neutral'][:1000]
worry_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'worry'][:1000]
happiness_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'happiness'][:1000]
sadness_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'sadness'][:1000]
love_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'love'][:1000]
surprise_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'surprise'][:1000]
fun_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'fun'][:1000]
relief_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'relief'][:1000]
hate_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'hate'][:1000]
enthusiasm_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'enthusiasm'][:]
boredom_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'boredom'][:]
anger_dataset = labelled_dataset[labelled_dataset['sentiment'] == 'anger'][:]

# combining these datasets together
final_trainings_dataset = pd.concat([neutral_dataset , worry_dataset , happiness_dataset , sadness_dataset , love_dataset , surprise_dataset , 
                                     fun_dataset , relief_dataset , hate_dataset , enthusiasm_dataset , boredom_dataset , anger_dataset],axis = 0)

In [28]:
labelled_dataset

Unnamed: 0,tweet_id,sentiment,content
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
5,1956968477,worry,Re-pinging @ghostridah14: why didn't you go to...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [29]:
corpus=[]
lemmatizer=WordNetLemmatizer()
content=labelled_dataset['content']
for document in content:
    tokenized_document = word_tokenize(document)
    filtered_document = [word for word in tokenized_document if word.lower() not in stopwords.words('english')]
    lemmatized_document = [lemmatizer.lemmatize(document) for document in filtered_document]
    corpus.append(lemmatized_document)

In [30]:
labelled_dataset['processed_corpus'] = corpus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labelled_dataset['processed_corpus'] = corpus


In [31]:
dataset.to_csv('processed_dataset.csv')

In [32]:
corpus = [' '.join(document) for document in corpus]

In [33]:
len(corpus)

39173

In [34]:
dataset

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [50]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus[:10000])
# x = x.astype('float16')
x = x.toarray()

In [53]:

y = labelled_dataset['sentiment'][:10000]
xtrain,xtest,ytrain,ytest = train_test_split(x, y, test_size=0.2)
gnb=GaussianNB()
# mnb=MultinomialNB( force_alpha=True)
# mnb.fit(xtrain, ytrain)
gnb.fit(xtrain, ytrain)
ypred = gnb.predict(xtest)
accuracy=accuracy_score(ytest,ypred)
print(accuracy)

0.1755
