In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [2]:
emotions = pd.read_csv('../Data/text_emotion.csv')
emotions_copy = emotions.copy()
emotions.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [3]:
drop_rows_lst = ['empty', 'enthusiasm', 'worry', 'surprise', 'fun', 'boredom', 'relief', 'anger']
emotions = emotions[~emotions['sentiment'].isin(drop_rows_lst)]
emotions.head()

Unnamed: 0,tweet_id,sentiment,author,content
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ..."
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you


In [4]:
vals_to_replace = {'sadness': 1, 'neutral': 2, 'love': 4, 'hate': 0, 'happiness': 3}
emotions['sent_num'] = emotions.sentiment.map(vals_to_replace)
emotions['sent_num'].value_counts()

2    8638
3    5209
1    5165
4    3842
0    1323
Name: sent_num, dtype: int64

In [5]:
# auxiliar function to remove a pattern defined by a regular expression 
def remove_by_regex(tweet, regexp):
        return re.sub(regexp, '', tweet)

# 3 specific cleaning functions to remove numbers, url's and special characters
def remove_numbers(tweet):
    return remove_by_regex(tweet, re.compile(r"[1234567890]"))

def remove_url(tweet):
    return remove_by_regex(tweet, re.compile(r"http.?://[^\s]+[\s]?"))

def remove_special_char(tweet):
    return re.sub(r"[^a-zA-Z0-9 ]", "", tweet) #add space placeholder

# general cleaning function to do it all at once
def clean_up(tweet):
    tweet = remove_numbers(tweet)
    tweet = remove_url(tweet)
    tweet = remove_special_char(tweet)
    return tweet.lower().strip()

In [6]:
emotions["content"] = emotions["content"].apply(clean_up)

In [7]:
stemmer = SnowballStemmer("english")

In [8]:
emotions["content_stemmed"] = emotions["content"].apply(stemmer.stem)

In [9]:
emotions["content_stemmed"] = [' '.join(x.split()) for x in emotions["content"]]

In [10]:
vectorizer = TfidfVectorizer()
content_vect = vectorizer.fit_transform(emotions.content_stemmed)
vector_df = pd.DataFrame.sparse.from_spmatrix(content_vect.tocoo(), columns = vectorizer.get_feature_names())

In [11]:
vector_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaafternoon,aaaaaaaahhhhhhhh,aaaaaah,aaaaaalcohol,...,zur,zwriter,zyber,zykloid,zyote,zzerbe,zzz,zzzz,zzzzy,zzzzzzzgoodnight
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
emotions.reset_index(inplace=True)
emotions

Unnamed: 0,index,tweet_id,sentiment,author,content,sent_num,content_stemmed
0,1,1956967666,sadness,wannamama,layin n bed with a headache ughhhhwaitin on y...,1,layin n bed with a headache ughhhhwaitin on yo...
1,2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday,1,funeral ceremonygloomy friday
2,4,1956968416,neutral,xkilljoyx,dannycastillo we want to trade with someone wh...,2,dannycastillo we want to trade with someone wh...
3,6,1956968487,sadness,ShansBee,i should be sleep but im not thinking about an...,1,i should be sleep but im not thinking about an...
4,8,1956969035,sadness,nic0lepaula,charviray charlene my love i miss you,1,charviray charlene my love i miss you
...,...,...,...,...,...,...,...
24172,39995,1753918954,neutral,showMe_Heaven,johnlloydtaylor,2,johnlloydtaylor
24173,39996,1753919001,love,drapeaux,happy mothers day all my love,4,happy mothers day all my love
24174,39997,1753919005,love,JenniRox,happy mothers day to all the mommies out there...,4,happy mothers day to all the mommies out there...
24175,39998,1753919043,happiness,ipdaman1,niariley wassup beautiful follow me peep out ...,3,niariley wassup beautiful follow me peep out m...


In [13]:
vector_df['sent_num'] = emotions['sent_num']
vector_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaafternoon,aaaaaaaahhhhhhhh,aaaaaah,aaaaaalcohol,...,zwriter,zyber,zykloid,zyote,zzerbe,zzz,zzzz,zzzzy,zzzzzzzgoodnight,sent_num
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [14]:
train_set, test_set = train_test_split(vector_df, test_size=0.3)
train_X = train_set.drop(columns='sent_num')
train_y = train_set['sent_num']
test_X = test_set.drop(columns='sent_num')
test_y = test_set['sent_num']

### Random forest classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=1000)
rfc.fit(train_X, train_y)
predictions_rfc = rfc.predict(train_X)

In [16]:
accuracy_score(train_y, predictions_rfc)

0.9961590734503338

In [17]:
predictions_rfc_test = rfc.predict(test_X)
accuracy_score(test_y, predictions_rfc_test)

0.5037220843672456

Test with whole sample

In [18]:
full_X=vector_df.drop(columns='sent_num')
full_y=vector_df['sent_num']

In [19]:
full_rfc = RandomForestClassifier(max_depth=1000)
predictions_full_rfc = rfc.predict(full_X)
accuracy_score(full_y, predictions_full_rfc)

0.8484096455308765

Train with whole sample

In [20]:
full_X_train=vector_df.drop(columns='sent_num')
full_y_train=vector_df['sent_num']

In [21]:
full_rfc_train = RandomForestClassifier(max_depth=1000)
full_rfc_train.fit(full_X_train, full_y_train)
predictions_full_rfc_train = full_rfc_train.predict(full_X_train)
accuracy_score(full_y_train, predictions_full_rfc_train)

0.9953261364106382