In [24]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from textblob import Word
import re

In [25]:
df = pd.read_csv('text_emotion.csv')
df

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@JohnLloydTaylor
39996,1753919001,love,drapeaux,Happy Mothers Day All my love
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [26]:
#Drop Author(Not Required)
df = df.drop('author', axis=1)
df

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [27]:
# Dropping rows with other emotion labels
df = df.drop(df[df.sentiment == 'anger'].index)
df = df.drop(df[df.sentiment == 'boredom'].index)
df = df.drop(df[df.sentiment == 'enthusiasm'].index)
df = df.drop(df[df.sentiment == 'empty'].index)
df = df.drop(df[df.sentiment == 'fun'].index)
df = df.drop(df[df.sentiment == 'relief'].index)
df = df.drop(df[df.sentiment == 'surprise'].index)
df = df.drop(df[df.sentiment == 'love'].index)
df = df.drop(df[df.sentiment == 'hate'].index)
df = df.drop(df[df.sentiment == 'neutral'].index)
df = df.drop(df[df.sentiment == 'worry'].index)

In [28]:
#Data Preprocessing
#Removing...
#Letter lowercases
df['content'] = df['content'].apply(lambda x:" ".join(x.lower() for x in x.split()))

#Punctuation, Symbols
df['content'] = df['content'].str.replace('[^\w\s]',' ')

#Stop Words(NLTK)
stop = stopwords.words('english')
df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [29]:
df

Unnamed: 0,tweet_id,sentiment,content
1,1956967666,sadness,layin n bed headache ughhhh waitin call
2,1956967696,sadness,funeral ceremony gloomy friday
6,1956968487,sadness,sleep im thinking old friend want married damn...
8,1956969035,sadness,charviray charlene love miss
9,1956969172,sadness,kelcouch sorry least friday
...,...,...,...
39986,1753905153,happiness,going watch boy striped pj hope cry
39987,1753918809,happiness,gave bikes thorough wash degrease grease think...
39988,1753918818,happiness,amazing time last night mcfly incredible
39994,1753918900,happiness,succesfully following tayla


In [30]:
#Lemmatisation
df['content'] = df['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

#Letter Repeitions
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

df['content'] = df['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

In [31]:
df

Unnamed: 0,tweet_id,sentiment,content
1,1956967666,sadness,layin n bed headache ughh waitin call
2,1956967696,sadness,funeral ceremony gloomy friday
6,1956968487,sadness,sleep im thinking old friend want married damn...
8,1956969035,sadness,charviray charlene love miss
9,1956969172,sadness,kelcouch sorry least friday
...,...,...,...
39986,1753905153,happiness,going watch boy striped pj hope cry
39987,1753918809,happiness,gave bike thorough wash degrease grease think ...
39988,1753918818,happiness,amazing time last night mcfly incredible
39994,1753918900,happiness,succesfully following tayla


In [32]:
#Finding top 1000 words rarely appearing in the data
#Removing all the selected words from data

freq = pd.Series(' '.join(df['content']).split()).value_counts()[-1000:]
freq = list(freq.index)
df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [33]:
df

Unnamed: 0,tweet_id,sentiment,content
1,1956967666,sadness,layin n bed headache ughh waitin call
2,1956967696,sadness,funeral ceremony gloomy friday
6,1956968487,sadness,sleep im thinking old friend want married damn...
8,1956969035,sadness,charviray charlene love miss
9,1956969172,sadness,kelcouch sorry least friday
...,...,...,...
39986,1753905153,happiness,going watch boy pj hope cry
39987,1753918809,happiness,gave bike thorough wash degrease grease think ...
39988,1753918818,happiness,amazing time last night mcfly incredible
39994,1753918900,happiness,succesfully following tayla


In [34]:
#Feature Extraction
#Encoding output labels
enc = preprocessing.LabelEncoder()
y = enc.fit_transform(df.sentiment.values)

#Splitting into training and testing data in 90:10
X_train, X_val, y_train, y_val = train_test_split(df.content.values, y, stratify=y, random_state=42, 
test_size=0.1, shuffle=True)

In [35]:
'''
tf–idf (term frequency–inverse document frequency), is a numerical statistic that is intended to reflect how
important a word is to a document in a collection or corpus.

This parameter gives the relative importance of a term in the data and is a measure of how frequently and rarely
it appears in the text. '''

'\ntf–idf (term frequency–inverse document frequency), is a numerical statistic that is intended to reflect how\nimportant a word is to a document in a collection or corpus.\n\nThis parameter gives the relative importance of a term in the data and is a measure of how frequently and rarely\nit appears in the text. '

In [36]:
#Extracting TF-IDF Parameters
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_val)

In [37]:
#Count Vectors
#Extracting Count Vectors Parameters
count_vect = CountVectorizer(analyzer = 'word')
count_vect.fit(df['content'])

X_train_count = count_vect.transform(X_train)
X_val_count = count_vect.transform(X_val)

In [38]:
'''
Training the models for both "tf-idf" and "Count Vectors"
The models used will be 
(1)Multinomial Naive Bayes Classifier
(2)Linear SVM
(3)Logistic Regression
(4)Random Forest Classifier '''

'\nTraining the models for both "tf-idf" and "Count Vectors"\nThe models used will be \n(1)Multinomial Naive Bayes Classifier\n(2)Linear SVM\n(3)Logistic Regression\n(4)Random Forest Classifier '

In [39]:
#Using the TF-IDF features
#Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)

print('naive bayes using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes using tfidf accuracy 0.5414258188824663


In [40]:
#Linear SVM
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)

print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

svm using tfidf accuracy 0.5385356454720617


In [41]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)

print('logistic regression using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

logistic regression using tfidf accuracy 0.5452793834296724




In [42]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_val_tfidf)

print('random forest classifier using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

random forest classifier using tfidf accuracy 0.5394990366088632


In [43]:
#Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)

print('naive bayes using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes using count vectors accuracy 0.779383429672447


In [44]:
#Linear SVM
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)\

print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

lsvm using count vectors accuracy 0.7803468208092486


In [45]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)

print('logestic regression count vectors accuracy %s' % accuracy_score(y_pred, y_val))

logestic regression count vectors accuracy 0.7861271676300579




In [46]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_count, y_train)
y_pred = rf.predict(X_val_count)

print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_val))

random forest with count vectors accuracy 0.7572254335260116
