### Importing Python Libraries

In [1]:
import os
import re
import warnings

import numpy as np
import pandas as pd
import neattext.functions as nfx
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')
import gensim
from gensim.models import FastText
import joblib

### Loading and Merging Datasets

In [2]:
# Read the three dataset shards; the paths are relative to the notebook's directory.
df1 = pd.read_csv('../../../Datasets/goemotions_1.csv')
df2 = pd.read_csv('../../../Datasets/goemotions_2.csv')
df3 = pd.read_csv('../../../Datasets/goemotions_3.csv')

# Stack the shards row-wise. ignore_index=True gives the merged frame a fresh
# 0..n-1 index; without it every shard keeps its own 0-based labels, so the
# combined index contains duplicates and label-based alignment can fail.
df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
df.sample(10)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
59565,"Police aside, storms and high tides may make t...",eddsaaa,NorthKoreanDetergent,conspiracy,t3_ad3z0f,t3_ad3z0f,1546783000.0,66,False,0,...,0,0,0,0,1,0,0,0,0,0
31399,Oh my [NAME] I almost forgot about the hot dr ...,ed8yud4,da_asparagus,Cardinals,t3_acgsap,t1_ed8y8zc,1546632000.0,9,False,0,...,0,0,0,0,0,0,0,0,1,0
55874,Don’t make fun of the cancer kid,eed8pqu,Spook404,KidsAreFuckingStupid,t3_agstks,t3_agstks,1547831000.0,15,False,0,...,0,0,0,0,0,0,0,0,0,0
59279,That's great!,ef0mj58,Milfy-Way,breakingmom,t3_ak0qwz,t1_ef0g07d,1548515000.0,4,False,0,...,0,0,0,0,0,0,0,0,0,0
7432,Still better than YouTube rewind,ed0glwn,MrAbomidable,cringe,t3_abeksv,t3_abeksv,1546352000.0,76,False,0,...,0,0,0,0,0,0,0,0,0,1
26969,They're playing on their phone and smoking in ...,ee4dyfv,WhereTheHotWaterAt,China,t3_afz56z,t1_ee42e7a,1547563000.0,57,False,0,...,0,0,1,0,0,0,0,0,0,0
55658,"I don't remember the taste, but someone on my ...",ed2u2g2,5quirre1,morbidquestions,t3_abtcmw,t3_abtcmw,1546439000.0,4,False,0,...,0,0,0,0,0,0,0,0,0,0
60827,What. The. Fuck.,ef4mimh,iamyourvilli,ABCDesis,t3_akgaib,t3_akgaib,1548628000.0,10,False,0,...,0,0,0,0,0,0,0,0,0,0
47609,[NAME] were the only reason this episode was r...,ef1os8g,[deleted],rpdrcringe,t3_ajxqmx,t3_ajxqmx,1548540000.0,73,False,0,...,0,0,0,0,1,0,0,0,0,0
49140,[NAME]? For real?,ef88mfc,techgeek6061,gatekeeping,t3_aktfov,t1_ef86vdo,1548730000.0,4,False,0,...,0,0,0,0,0,0,0,0,1,0


### Listing the Emotion Columns by Type

In [3]:
# Coarse sentiment buckets for the fine-grained emotion columns.
# Each list names the columns that count towards that bucket.
positive = [
    "admiration", "amusement", "approval", "caring", "curiosity",
    "desire", "excitement", "gratitude", "joy", "love",
    "optimism", "pride", "relief",
]
negative = [
    "anger", "annoyance", "confusion", "disappointment",
    "disapproval", "disgust", "embarrassment", "fear",
    "grief", "nervousness", "remorse", "sadness",
]
neutral = [
    "realization",
    "surprise",
    "neutral",
]

### Assigning Overall Emotion Labels to the Dataset

In [4]:
def Emotion_Labels(row):
    """Collapse one row's per-emotion flags into a single coarse label.

    The groups are checked in a fixed order -- positive, then negative, then
    neutral -- and the first group whose flags sum to more than zero wins, so
    a row flagged in several groups receives the earliest matching label.

    Parameters
    ----------
    row : pandas.Series
        One row of the dataset, indexable by the column names held in the
        module-level ``positive``, ``negative`` and ``neutral`` lists.

    Returns
    -------
    str or pandas.NA
        "Positive", "Negative" or "Neutral"; ``pd.NA`` when none of the
        listed emotion columns sums to more than zero for the row.
    """
    ordered_groups = (
        ("Positive", positive),
        ("Negative", negative),
        ("Neutral", neutral),
    )
    # Evaluate lazily, in priority order, and stop at the first hit.
    for label, columns in ordered_groups:
        if row[columns].sum() > 0:
            return label
    return pd.NA

# Label every row (axis=1 hands each row to Emotion_Labels), then preview the first five rows.
df['Emotions'] = df.apply(Emotion_Labels, axis=1)
df.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,Emotions
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,1,0,0,Negative
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,1,Neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,0,0,0,0,0,0,0,0,0,Positive
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,1,Neutral


### Checking for Null Values and Dropping Them if Any

In [5]:
# Count the missing values in each column.
df.isna().sum()

text                       0
id                         0
author                     0
subreddit                  0
link_id                    0
parent_id                  0
created_utc                0
rater_id                   0
example_very_unclear       0
admiration                 0
amusement                  0
anger                      0
annoyance                  0
approval                   0
caring                     0
confusion                  0
curiosity                  0
desire                     0
disappointment             0
disapproval                0
disgust                    0
embarrassment              0
excitement                 0
fear                       0
gratitude                  0
grief                      0
joy                        0
love                       0
nervousness                0
optimism                   0
pride                      0
realization                0
relief                     0
remorse                    0
sadness       

In [6]:
# Drop rows that are missing a value in a column the following cells actually
# use (the three text sources and the overall label). Restricting the check
# keeps a gap in an unrelated metadata column from silently discarding an
# otherwise usable example; re-binding instead of inplace=True keeps the cell
# free of in-place mutation.
df = df.dropna(subset=['text', 'author', 'subreddit', 'Emotions'])
df.isna().sum()

text                    0
id                      0
author                  0
subreddit               0
link_id                 0
parent_id               0
created_utc             0
rater_id                0
example_very_unclear    0
admiration              0
amusement               0
anger                   0
annoyance               0
approval                0
caring                  0
confusion               0
curiosity               0
desire                  0
disappointment          0
disapproval             0
disgust                 0
embarrassment           0
excitement              0
fear                    0
gratitude               0
grief                   0
joy                     0
love                    0
nervousness             0
optimism                0
pride                   0
realization             0
relief                  0
remorse                 0
sadness                 0
surprise                0
neutral                 0
Emotions                0
dtype: int64

### Preparing the Dataset for Training and Testing

In [7]:
# Build the modelling frame: one combined text field plus the coarse label.
# The author and subreddit are appended to the comment text, so they become
# part of the string that is cleaned and embedded later.
comment_part = df['text'].astype(str)
author_part = ' | Author: ' + df['author'].astype(str)
subreddit_part = ' | Subreddit: ' + df['subreddit'].astype(str)

dff = pd.DataFrame()
dff['Text'] = comment_part + author_part + subreddit_part
dff['Emotions'] = df['Emotions']
dff.head()

Unnamed: 0,Text,Emotions
0,That game hurt. | Author: Brdd9 | Subreddit: nrl,Negative
2,"You do right, if you don't care then fuck 'em!...",Neutral
3,Man I love reddit. | Author: MrsRobertshaw | S...,Positive
4,"[NAME] was nowhere near them, he was by the Fa...",Neutral
5,Right? Considering it’s such an important docu...,Positive


In [8]:
# Release the raw frames; the cells below only work with dff.
del df1,df2,df3,df

### Cleaning the Text

In [9]:
def clean(text):
    """Normalise one raw text string into lowercase alphanumeric tokens.

    URLs, e-mail addresses, user handles and phone numbers are removed first,
    while the punctuation those removers rely on (':', '/', '(', ')') is still
    present. Stripping that punctuation beforehand would let fragments of such
    tokens survive into the output.

    Parameters
    ----------
    text : object
        The raw text; a missing value yields an empty string and any other
        non-string value is converted with ``str``.

    Returns
    -------
    str
        Space-separated tokens containing only ``a-z`` and ``0-9``, with no
        leading or trailing whitespace.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()

    # Structured tokens go first, while their delimiters are still intact.
    text = nfx.remove_urls(text)
    text = nfx.remove_emails(text)
    text = nfx.remove_userhandles(text)
    text = nfx.remove_phone_numbers(text)

    # Subreddit mentions such as "r/name", then line breaks, brackets and quotes.
    text = re.sub(r'\br/\w+', '', text)
    text = text.replace('\n', ' ')
    text = re.sub(r'[{}\[\]()\|:\"\']', ' ', text)
    text = re.sub(r'[“”‘’]', ' ', text)

    text = nfx.remove_stopwords(text)
    text = nfx.remove_emojis(text)
    text = nfx.remove_puncts(text)
    text = nfx.remove_multiple_spaces(text)

    # Final sweep: anything that is not a-z, 0-9 or a space becomes a space,
    # and the runs of whitespace this creates are collapsed.
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

# Replace every combined text string with its cleaned form, then preview the result.
dff['Text'] = dff['Text'].apply(clean)
dff.head()

Unnamed: 0,Text,Emotions
0,game hurt author brdd9 subreddit nrl,Negative
2,right care fuck em author labalool subreddit c...,Neutral
3,man love reddit author mrsrobertshaw subreddit...,Positive
4,near them falcon author americanfascist713 sub...,Neutral
5,right considering important document know damn...,Positive


### Splitting the Dataset into Training and Testing Sets

In [10]:
# Features are the cleaned strings; targets are the coarse labels.
x = dff['Text']
y = dff['Emotions']

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### FastText Embedding Preparation

In [11]:
# Whitespace-tokenise the training texts only, so the embedding model is
# fitted without seeing any test sentence.
tokenized_train = [document.split() for document in x_train]

ft_model = FastText(
    sentences=tokenized_train,
    vector_size=100,
    window=5,
    min_count=2,
    epochs=10,
    workers=4,
)

### Defining a Function to Get Average FastText Embedding for Each Sentence

In [12]:
def get_sentence_vector(model, sentence, size=100):
    """Return the mean word vector of a whitespace-tokenised sentence.

    Parameters
    ----------
    model : object
        Anything exposing ``wv`` such that ``word in model.wv`` and
        ``model.wv[word]`` work (for example a trained FastText model).
    sentence : str
        The text to embed; it is split on whitespace.
    size : int, default 100
        Length of the zero vector returned when no token has a vector.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the token vectors, or a float32 zero vector
        of length ``size`` when the sentence contributes no vectors.
    """
    word_vectors = model.wv
    vectors = [word_vectors[token] for token in sentence.split() if token in word_vectors]
    if not vectors:
        # float32 rather than NumPy's float64 default: gensim stores its
        # vectors as float32, and a single float64 row would upcast the whole
        # matrix once the sentence vectors are stacked.
        return np.zeros(size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed every sentence of each split and stack the vectors row-wise into one 2-D array per split.
# The fallback size is taken from the trained model so both cases yield vectors of equal length.
X_train_ft = np.vstack([get_sentence_vector(ft_model, sent, size=ft_model.vector_size) for sent in x_train])
X_test_ft = np.vstack([get_sentence_vector(ft_model, sent, size=ft_model.vector_size) for sent in x_test])

### Encoding the Labels

In [13]:
# Fit the encoder on the training labels only, then reuse the fitted mapping for the test labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

### Model Construction and Training

In [14]:
# Logistic Regression
# Fitted on the raw string labels (y_train), not on the integer-encoded ones.
log_clf = LogisticRegression(
    C=1,
    penalty='l2',
    solver='lbfgs',
    class_weight='balanced',
    max_iter=300,
    random_state=42,
)
log_clf.fit(X_train_ft, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,300


In [15]:
# XGBoost
# Trained on the integer-encoded labels (y_train_enc), whereas the other two
# models in this notebook are fitted on the raw string labels.
# `use_label_encoder` is intentionally not passed: the flag is deprecated and
# ignored by current XGBoost releases, and the labels are already encoded.
xgb_clf = XGBClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
)
xgb_clf.fit(X_train_ft, y_train_enc)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
# LightGBM
# Fitted on the raw string labels (y_train), with balanced class weights.
lgbm_clf = LGBMClassifier(n_estimators=200, max_depth=10, learning_rate=0.1, class_weight='balanced', random_state=42)
lgbm_clf.fit(X_train_ft, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 166251, number of used features: 100
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,10
,learning_rate,0.1
,n_estimators,200
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


### Saving Everything

In [18]:
# Make sure the output folders exist, so a missing directory cannot abort the
# save after the expensive training cells have already run.
os.makedirs('../Models', exist_ok=True)
os.makedirs('../Test Datasets', exist_ok=True)

# Saving the Models
joblib.dump(log_clf, '../Models/log_model.joblib')
joblib.dump(xgb_clf, '../Models/xgb_model.joblib')
joblib.dump(lgbm_clf, '../Models/lgbm_model.joblib')
joblib.dump(le, '../Models/label_encoder.joblib')
# NOTE(review): written with the model's own save() method despite the .bin
# suffix -- presumably it has to be reloaded with FastText.load; confirm
# against the code that consumes this file.
ft_model.save('../Models/fasttext_model.bin')

# Saving the Datasets
np.save('../Test Datasets/X_test_ft.npy', X_test_ft)
# Store the labels as a fixed-width unicode array instead of an object array,
# so the file can be read back with np.load without enabling allow_pickle.
np.save('../Test Datasets/y_test.npy', y_test.to_numpy(dtype=str))