### Importing Python Libraries

In [1]:
import numpy as np
import pandas as pd
import neattext.functions as nfx
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib
from gensim.models import FastText
import warnings
warnings.filterwarnings('ignore')

### Loading and Merging the Datasets

In [2]:
# Read the three GoEmotions CSV shards.
df1 = pd.read_csv('../../../Datasets/goemotions_1.csv')
df2 = pd.read_csv('../../../Datasets/goemotions_2.csv')
df3 = pd.read_csv('../../../Datasets/goemotions_3.csv')

# Stack the shards row-wise. ignore_index=True gives the merged frame a fresh
# 0..n-1 index; without it every shard keeps its own default index, leaving
# duplicate row labels that make label-based selection/alignment ambiguous.
df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
df.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


### Listing the Emotion Columns by Type

In [3]:
# Fine-grained emotion columns, grouped by the coarse sentiment they map to.
positive = [
    "admiration", "amusement", "approval", "caring", "curiosity",
    "desire", "excitement", "gratitude", "joy", "love",
    "optimism", "pride", "relief",
]
negative = [
    "anger", "annoyance", "confusion", "disappointment", "disapproval",
    "disgust", "embarrassment", "fear", "grief", "nervousness",
    "remorse", "sadness",
]
neutral = [
    "realization", "surprise", "neutral",
]

### Assigning an Overall Emotion Label to Each Row

In [4]:
def Emotion_Labels(row):
    """Collapse a row's fine-grained emotion flags into one coarse label.

    The groups are tested in a fixed priority order: positive, then negative,
    then neutral. The first group whose flags sum to more than zero wins, so
    a row flagged in several groups receives the highest-priority label.

    Parameters
    ----------
    row : a row indexable by the column lists ``positive``, ``negative`` and
        ``neutral`` (module-level names), e.g. a pandas Series.

    Returns
    -------
    "Positive", "Negative" or "Neutral"; ``pd.NA`` when no flag is set.
    """
    priority = (
        ("Positive", positive),
        ("Negative", negative),
        ("Neutral", neutral),
    )
    for label, columns in priority:
        if row[columns].sum() > 0:
            return label
    # No emotion column is set for this row.
    return pd.NA

# Derive the coarse label row by row and store it in a new 'Emotions' column.
df['Emotions'] = df.apply(Emotion_Labels, axis='columns')
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,Emotions
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,1,0,0,Negative
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,1,Neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,0,0,0,0,0,0,0,0,0,Positive
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,1,Neutral


### Checking for Null Values and Dropping Them if Any

In [5]:
# Per-column count of missing values.
df.isna().sum(axis=0)

text                       0
id                         0
author                     0
subreddit                  0
link_id                    0
parent_id                  0
created_utc                0
rater_id                   0
example_very_unclear       0
admiration                 0
amusement                  0
anger                      0
annoyance                  0
approval                   0
caring                     0
confusion                  0
curiosity                  0
desire                     0
disappointment             0
disapproval                0
disgust                    0
embarrassment              0
excitement                 0
fear                       0
gratitude                  0
grief                      0
joy                        0
love                       0
nervousness                0
optimism                   0
pride                      0
realization                0
relief                     0
remorse                    0
sadness       

In [6]:
# Keep only rows with no missing value in any column, then re-count the
# nulls per column to confirm none remain.
df = df.dropna(axis=0, how='any')
df.isna().sum()

text                    0
id                      0
author                  0
subreddit               0
link_id                 0
parent_id               0
created_utc             0
rater_id                0
example_very_unclear    0
admiration              0
amusement               0
anger                   0
annoyance               0
approval                0
caring                  0
confusion               0
curiosity               0
desire                  0
disappointment          0
disapproval             0
disgust                 0
embarrassment           0
excitement              0
fear                    0
gratitude               0
grief                   0
joy                     0
love                    0
nervousness             0
optimism                0
pride                   0
realization             0
relief                  0
remorse                 0
sadness                 0
surprise                0
neutral                 0
Emotions                0
dtype: int64

### Balancing the Classes

In [7]:
# Resample every class to exactly `target` rows: classes with at least that
# many rows are down-sampled without replacement, smaller ones are
# up-sampled with replacement.
# NOTE(review): rows drawn with replacement are duplicated here, before the
# train/test split performed later, so identical rows may end up on both
# sides of that split.
target = 53000
classes = ['Positive', 'Neutral', 'Negative']
balanced_frames = []
for cls in classes:
    cls_df = df.loc[df['Emotions'] == cls]
    needs_oversampling = len(cls_df) < target
    cls_df_bal = cls_df.sample(n=target, replace=needs_oversampling, random_state=42)
    balanced_frames.append(cls_df_bal)

# Merge the per-class samples, shuffle the rows and rebuild a clean index.
bdf = (
    pd.concat(balanced_frames)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
bdf.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,Emotions
0,Some couples have a “cheat list” of celebritie...,eeief22,MondayMorningEssay,teenagers,t3_ahv6d7,t3_ahv6d7,1547968000.0,6,False,0,...,0,0,0,0,0,0,0,0,1,Neutral
1,"I am 63, D cups, no sag. I must be a unicorn!",eeyn6jd,GetOffMyLawn_,fatlogic,t3_ajsb76,t3_ajsb76,1548451000.0,62,False,0,...,0,0,0,0,0,0,0,0,1,Neutral
2,Unfortunately no one usually fights for their ...,edt8m9r,Chaoticsinner2294,progun,t3_aevpkg,t1_edt6e31,1547220000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,Negative
3,Is everybody forgetting that bills need to be ...,edobw1c,walkthisway34,PoliticalDiscussion,t3_aebn6e,t1_edoayt3,1547080000.0,34,False,0,...,0,0,0,0,0,0,0,0,0,Negative
4,"Nice, congrats!",efejs5t,sarcasmbunny,askcarsales,t3_aleulr,t1_efeb2s7,1548895000.0,51,False,1,...,0,0,0,0,0,0,0,0,0,Positive


### Text Feature Engineering

In [8]:
# Build the modelling frame: the comment text with author and subreddit
# appended as extra tokens, next to the coarse emotion label.
combined_text = (
    bdf['text'].astype(str)
    + ' | Author: ' + bdf['author'].astype(str)
    + ' | Subreddit: ' + bdf['subreddit'].astype(str)
)
dff = pd.DataFrame({
    'Text': combined_text,
    'Emotions': bdf['Emotions'],
})

### Cleaning the Text

In [9]:
def clean(text):
    """Normalise one raw text value into lowercase alphanumeric tokens.

    Steps, in order:
      1. Missing values become the empty string.
      2. Lowercase and flatten newlines.
      3. Remove structured tokens (URLs, e-mails, @handles, phone numbers,
         emojis) and ``r/<name>`` subreddit mentions.
      4. Replace brackets, pipes, colons and quotes with spaces.
      5. Remove stopwords and punctuation, drop anything outside ``[a-z0-9 ]``
         and collapse repeated whitespace.

    Step 3 must run before step 4: once ':' has been replaced, a URL such as
    ``http://host/path`` has lost its scheme separator and is presumably no
    longer recognised by ``nfx.remove_urls``, so its fragments would leak
    into the output as ordinary tokens.

    Parameters
    ----------
    text : the raw text; ``None``/NaN/``pd.NA`` are accepted.

    Returns
    -------
    str : the cleaned, stripped text (possibly empty).
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = text.replace('\n', ' ')

    # Structured tokens first, while their delimiters (':', '(', ')') are intact.
    text = nfx.remove_urls(text)
    text = nfx.remove_emails(text)
    text = nfx.remove_userhandles(text)
    text = nfx.remove_phone_numbers(text)
    text = nfx.remove_emojis(text)
    text = re.sub(r'\br/\w+', '', text)

    # Separator characters (straight and curly quotes included) become spaces.
    text = re.sub(r'[{}\[\]()\|:\"\']', ' ', text)
    text = re.sub(r'[“”‘’]', ' ', text)

    text = nfx.remove_stopwords(text)
    text = nfx.remove_puncts(text)
    text = nfx.remove_multiple_spaces(text)

    # Final safety net: keep only lowercase letters, digits and single spaces.
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

# Run the cleaner over every combined text value.
dff['Text'] = dff['Text'].map(clean)

### Training and Testing Set Split

In [10]:
# Features are the cleaned text, targets the coarse emotion label.
x, y = dff['Text'], dff['Emotions']

# 80/20 split, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### FastText Embedding Training

In [11]:
# Whitespace-tokenise the training texts only; the test split is not used
# when fitting the embeddings.
tokenized_texts = list(map(str.split, x_train))

ft_model = FastText(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=2,
    epochs=10,
    workers=4,
)

### Defining a Function to Get Average FastText Embedding for Each Sentence

In [12]:
def get_sentence_vector(model, sentence, size=100):
    """Return the mean word vector of a whitespace-tokenised sentence.

    Parameters
    ----------
    model : object exposing ``wv``, which must support ``token in wv`` and
        ``wv[token]``.
    sentence : str, split on whitespace into tokens.
    size : int, length of the zero vector returned when no token of the
        sentence is found in ``model.wv``.

    Returns
    -------
    numpy.ndarray : element-wise mean of the found token vectors, or
        ``np.zeros(size)`` if there are none.
    """
    lookup = model.wv
    found = []
    for token in sentence.split():
        if token in lookup:
            found.append(lookup[token])
    if not found:
        # Empty sentence, or none of its tokens are known to the model.
        return np.zeros(size)
    return np.mean(found, axis=0)

# One averaged embedding per text, stacked row-wise into a 2-D matrix
# for each split.
X_train_ft = np.vstack([
    get_sentence_vector(ft_model, doc, size=ft_model.vector_size)
    for doc in x_train
])
X_test_ft = np.vstack([
    get_sentence_vector(ft_model, doc, size=ft_model.vector_size)
    for doc in x_test
])

### Encoding the Labels

In [13]:
# Fit the encoder on the training labels only, then reuse that mapping
# for the test labels. The tuple is evaluated left to right, so the fit
# happens before the test transform.
le = LabelEncoder()
y_train_enc, y_test_enc = le.fit_transform(y_train), le.transform(y_test)

### Model Construction and Training

In [14]:
# Logistic Regression: L2-regularised, class-balanced, fitted on the averaged
# FastText features with the original string labels.
log_clf = LogisticRegression(
    penalty='l2',
    C=1,
    solver='lbfgs',
    max_iter=300,
    class_weight='balanced',
    random_state=42,
)
log_clf.fit(X_train_ft, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,300


In [15]:
# XGBoost: gradient-boosted trees fitted on the integer-encoded labels.
# `use_label_encoder` is deliberately not passed. The labels are already
# encoded (y_train_enc), and the flag is deprecated/removed in recent XGBoost
# releases, where passing it only produces an "unused parameter" warning.
xgb_clf = XGBClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
)
xgb_clf.fit(X_train_ft, y_train_enc)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
# LightGBM: class-balanced gradient boosting, fitted on the averaged FastText
# features with the original string labels.
lgbm_clf = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
lgbm_clf.fit(X_train_ft, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 127200, number of used features: 100
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,10
,learning_rate,0.1
,n_estimators,200
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


### Saving Everything

In [17]:
# Saving the Models: the three classifiers and the label encoder go through
# joblib, in the same order as they were created.
for _artifact, _path in (
    (log_clf, '../Models/log_model_bal.joblib'),
    (xgb_clf, '../Models/xgb_model_bal.joblib'),
    (lgbm_clf, '../Models/lgbm_model_bal.joblib'),
    (le, '../Models/label_encoder_bal.joblib'),
):
    joblib.dump(_artifact, _path)

# The FastText model is written with its own save method.
ft_model.save('../Models/fasttext_model_bal.bin')

# Saving the Datasets: held-out feature matrix and the matching
# (un-encoded) test labels.
np.save('../Test Datasets/X_test_ft_bal.npy', X_test_ft)
np.save('../Test Datasets/y_test_bal.npy', y_test.to_numpy())