In [None]:
import os
import re
import nltk
import sklearn
import pandas as pd
from tqdm.notebook import tqdm, trange
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import xgboost as xgb
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score, auc, roc_curve
from sklearn.metrics import RocCurveDisplay

In [None]:
# The train/test split files were produced beforehand with scikit-learn;
# this cell only reads them back in.
df_train, df_test = (
    pd.read_csv(split_path)
    for split_path in (
        "train_for_Emotion_oversample_copy",
        "test_for_Emotion_oversample_copy",
    )
)

In [None]:
# Column-name -> list mapping, filled in a later cell and used to build an empty frame.
data = dict()

In [None]:
# Alias for the training frame as loaded from disk; the oversampling cells read
# their source rows through this name.
trainData=df_train

In [None]:
# Register every training column in `data` with its own empty list, so a frame
# built from `data` has the training columns and no rows.
data.update({column_name: [] for column_name in trainData.columns.values})

In [None]:
# Destination frame for the oversampled training set, built from the `data` mapping.
dfOUS = pd.DataFrame(data=data)

In [None]:
def sample(sourceDataframe,destDataframe,label,cap):
    """Append up to ``cap`` rows of one emotion class to the destination frame.

    Rows whose ``"Emotion"`` value equals ``label`` are taken from
    ``sourceDataframe`` in their existing order, starting from the top, and
    appended after the rows already present in ``destDataframe``. Neither
    input frame is modified.

    Parameters
    ----------
    sourceDataframe : pandas.DataFrame
        Frame to copy rows from; must contain an ``"Emotion"`` column.
    destDataframe : pandas.DataFrame
        Frame the selected rows are appended to.
    label
        Class value to select in the ``"Emotion"`` column.
    cap : int
        Maximum number of rows to copy in this call.

    Returns
    -------
    pandas.DataFrame
        ``destDataframe`` itself when nothing is copied (``cap <= 0`` or no
        matching rows); otherwise a new frame with a fresh 0..n-1 index.
    """
    if cap <= 0:
        return destDataframe

    # Boolean-mask selection works for any index, unlike positional-looking
    # label lookups such as sourceDataframe["Emotion"][i].
    picked = sourceDataframe.loc[sourceDataframe["Emotion"] == label].head(cap)
    if len(picked) == 0:
        return destDataframe

    if len(destDataframe) == 0:
        # An empty destination contributes only its column order. Concatenating
        # it would let its placeholder dtypes take part in dtype resolution.
        ordered_columns = list(destDataframe.columns) + [
            column for column in picked.columns if column not in destDataframe.columns
        ]
        return picked.reindex(columns=ordered_columns).reset_index(drop=True)

    # One concat per call replaces the row-by-row DataFrame.append, which was
    # removed in pandas 2.0 and copied the whole frame for every added row.
    return pd.concat([destDataframe, picked], ignore_index=True)

In [None]:
# Oversample the training set: for each emotion label, keep copying rows from
# trainData into dfOUS until that label has targetCap rows. sample() copies at
# most the first `cap` matching rows per call, so a class with fewer source rows
# than targetCap is topped up by repeated passes over the same rows.
targetCap=412  # presumably the size of the largest class in the training data - confirm
for labelTarget in [0,1,2,3]:
    # With no source rows sample() can never add anything, so the loop below
    # would never reach targetCap; fail loudly instead of spinning forever.
    if not (trainData['Emotion'] == labelTarget).any():
        raise ValueError(f"trainData contains no rows with Emotion == {labelTarget}")
    while True:
        # A boolean mask counts 0 for a label that is not in dfOUS yet, so the
        # first pass needs no exception handling.
        currentCount = int((dfOUS['Emotion'] == labelTarget).sum())
        if currentCount >= targetCap:
            break
        dfOUS = sample(trainData, dfOUS, labelTarget, targetCap - currentCount)

In [None]:
# From here on df_train refers to the oversampled frame, not the frame read from disk.
# NOTE(review): re-running the earlier `trainData=df_train` cell after this one would
# alias the oversampled data instead of the original - run cells top to bottom.
df_train = dfOUS

In [None]:
# Display names for the emotion classes, used as row/column labels of the confusion matrix.
class_names = [
    "OPTIMISM",
    "NEUTRAL",
    "JOY",
    "UPSET",
]

In [None]:
# Fit the one-hot encoder on the training labels, then encode the test labels
# with it; the encoded test labels feed the per-class ROC curves.
label_binarizer = LabelBinarizer()
label_binarizer.fit(df_train['Emotion'])
y_onehot_test = label_binarizer.transform(df_test['Emotion'])
y_onehot_test.shape  # (n_samples, n_classes)

In [None]:
# Base estimators that are combined into the voting ensemble.
model1 = xgb.XGBClassifier(
    learning_rate=0.01,
    random_state=1,
)
model2 = CatBoostClassifier(
    n_estimators=1550,
    learning_rate=0.1,
    depth=5,
    task_type='CPU',
    random_state=1,
    verbose=False,
)

model5 = LinearDiscriminantAnalysis(
    solver='lsqr',
    shrinkage='auto',
)


In [None]:
# Soft-voting ensemble over the three base estimators.
model = VotingClassifier(
    estimators=[
        ('xgb', model1),
        ('cat', model2),
        ('lin', model5),
    ],
    voting='soft',
)


In [None]:
# The first 768 columns are used as features and 'Emotion' as the target.
train_features = df_train.iloc[:, :768]
train_labels = df_train['Emotion']
model.fit(train_features, train_labels)

In [None]:
# Score the fitted ensemble on the test split (same 768 feature columns).
model.score(
    df_test.iloc[:, :768],
    df_test['Emotion'],
)

In [None]:
# Predicted class labels for the test split.
test_features = df_test.iloc[:, :768]
y_pred = model.predict(test_features)

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw an annotated blue heatmap of a confusion matrix on the current figure.

    The matrix is passed straight to ``sns.heatmap`` with integer cell
    annotations. Y tick labels are kept horizontal, x tick labels are rotated
    by 30 degrees, and the axes are labelled 'True Emotion' / 'Predicted Emotion'.
    """
    heatmap_axes = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    y_axis, x_axis = heatmap_axes.yaxis, heatmap_axes.xaxis
    y_axis.set_ticklabels(y_axis.get_ticklabels(), rotation=0, ha='right')
    x_axis.set_ticklabels(x_axis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True Emotion')
    plt.xlabel('Predicted Emotion')

# Confusion matrix of true vs. predicted test labels, labelled with the class
# names for plotting.
# NOTE(review): assumes the sorted label order matches class_names - confirm.
cm = confusion_matrix(y_true=df_test['Emotion'], y_pred=y_pred)
df_cm = pd.DataFrame(data=cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 summary for the test predictions.
emotion_report = metrics.classification_report(df_test['Emotion'], y_pred)
print(emotion_report)

In [None]:
# Class-probability estimates for the test split, used for the ROC curves.
y_score_proba = model.predict_proba(
    df_test.iloc[:, :768]
)

In [None]:
# One-vs-rest ROC curve per emotion class, computed from the voting ensemble's
# predicted probabilities against the one-hot encoded test labels.
# NOTE(review): assumes column i of y_onehot_test and y_score_proba both
# correspond to category_list[i] - confirm the label order.
category_list = ["OPTIMISM","NEUTRAL","JOY","UPSET"]
fpr = {} # False Positive Rate
tpr = {} # True Positive Rate
thresh ={} # Threshold
roc_auc = dict()

# Iterate over the category list itself so the number of curves follows it.
for i, category in enumerate(category_list):
    fpr[i], tpr[i], thresh[i] = roc_curve(y_onehot_test[:, i], y_score_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

    plt.plot(fpr[i], tpr[i], linestyle='--',
             label='%s vs Rest (AUC=%0.2f)'%(category,roc_auc[i]))


# Diagonal reference line for a chance-level classifier.
plt.plot([0,1],[0,1],'b--')
plt.xlim([0,1])
plt.ylim([0,1.05])
# The scores come from the soft-voting ensemble `model`, not from XGBoost alone.
plt.title('Multiclass ROC curve of Voting Classifier (XGB + CatBoost + LDA)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='lower right')
plt.show()