In [63]:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [3]:
# Load every fitted model from the experiment's output directory.
# NOTE: joblib.load unpickles its input, so only point this at trusted files.
path = "../out/true-final-3500/"

# Subtask 1 models (plain and n-gram variants).
t1 = joblib.load(f"{path}t1.joblib.z")
t1ngram = joblib.load(f"{path}t1ngram.joblib.z")

# Subtask 2 models (plain and n-gram variants).
t2 = joblib.load(f"{path}t2.joblib.z")
t2ngram = joblib.load(f"{path}t2ngram.joblib.z")

# Subtask 3 models; note the n-gram file name uses an underscore.
t3ovr = joblib.load(f"{path}t3ovr.joblib.z")
t3ovrngram = joblib.load(f"{path}t3ovr_ngram.joblib.z")
t3bc = joblib.load(f"{path}t3bc.joblib.z")
t3cc = joblib.load(f"{path}t3cc.joblib.z")
t3lp = joblib.load(f"{path}t3lp.joblib.z")

In [11]:
# Read the validation split and keep its "text" column as the model input.
val_df = pd.read_csv(filepath_or_buffer="../res/val_final_3500.csv")
val_text = val_df.loc[:, "text"]

In [71]:
# Per-subtask bookkeeping: display names, fitted models, the val_df columns
# that hold the ground truth, and the lists that will collect predictions.
# Names and models are index-aligned within each subtask.

# --- Subtask 1 ---
t1_names = ["t1", "t1ngram"]
t1_models = [t1, t1ngram]
t1_targets = ["subtask1"]
t1_targets_proba = ["HATE", "NOT", "OFFN"]  # columns used when scoring probabilities
t1_predictions = []
t1_predictions_proba = []

# --- Subtask 2 ---
t2_names = ["t2", "t2ngram"]
t2_models = [t2, t2ngram]
t2_targets = ["PRFN"]
t2_predictions = []

# --- Subtask 3 ---
t3_names = ["t3ovr", "t3ovrngram", "t3bc", "t3cc", "t3lp"]
t3_models = [t3ovr, t3ovrngram, t3bc, t3cc, t3lp]
t3_targets = ["Race", "Religion", "Gender", "Other", "None"]
t3_predictions = []

In [72]:
# Run every model on the validation text and collect its outputs.
#
# The result lists are rebuilt from scratch here (instead of appending to
# lists created in another cell) so that re-running this cell is idempotent:
# each list always has exactly one entry per model, index-aligned with the
# corresponding *_names list.

# Subtask 1: hard labels, plus class probabilities for the AUROC computation.
t1_predictions = [model.predict(val_text) for model in t1_models]
t1_predictions_proba = [model.predict_proba(val_text) for model in t1_models]

# Subtask 2: hard labels only.
t2_predictions = [model.predict(val_text) for model in t2_models]

# Subtask 3: hard labels only.
t3_predictions = [model.predict(val_text) for model in t3_models]

In [30]:
# Print the validation accuracy (as a percentage, rounded to 4 decimals)
# of every model, grouped by subtask.
#
# accuracy_score's signature is (y_true, y_pred); keyword arguments are used
# so the ground truth and the predictions cannot be swapped by accident.
accuracy_groups = [
    ("----- t1 accuracies: -----", t1_names, t1_predictions, t1_targets),
    ("----- t2 accuracies: -----", t2_names, t2_predictions, t2_targets),
    ("-----t3 accuracies: -----", t3_names, t3_predictions, t3_targets),
]

for header, names, predictions, target_cols in accuracy_groups:
    print(header)
    for name, y_pred in zip(names, predictions):
        fraction = accuracy_score(y_true=val_df[target_cols], y_pred=y_pred)
        acc = round(fraction * 100, 4)
        print(f"{name} - {str(acc)}")

----- t1 accuracies: -----
t1 - 61.1429
t1ngram - 61.4286
----- t2 accuracies: -----
t2 - 86.8571
t2ngram - 86.2857
-----t3 accuracies: -----
t3ovr - 44.8571
t3ovrngram - 44.5714
t3bc - 44.8571
t3cc - 54.0
t3lp - 52.8571


In [38]:
from sklearn.metrics import multilabel_confusion_matrix

In [46]:
# Peek at the first five validation rows to check the available columns.
val_df.head(n=5)

Unnamed: 0,text,HATE,NOT,OFFN,PRFN,Race,Religion,Gender,Other,None,subtask1
0,unapologetically american i like legal immigra...,0,0,1,1,0,0,0,1,0,OFFN
1,what with this racist coon nurse a murderess h...,1,0,0,1,1,0,0,0,0,HATE
2,what the fuck is this nigger announcer on nhl ...,1,0,0,1,1,0,0,0,0,HATE
3,who the fuck is goatfucker hamed halts maul du...,1,0,0,1,1,0,0,0,0,HATE
4,labour negress says prince charles unfit this ...,1,0,0,1,1,0,1,0,0,HATE


In [57]:
# Print a per-class precision/recall/F1 report for every model, grouped by
# subtask. Each entry is (banner, model names, predictions, ground-truth
# columns, extra keyword arguments for classification_report); only the t3
# group passes explicit class names.
report_groups = [
    ("----- t1 report: -----", t1_names, t1_predictions, t1_targets, {}),
    ("----- t2 report: -----", t2_names, t2_predictions, t2_targets, {}),
    ("----- t3 report: -----", t3_names, t3_predictions, t3_targets,
     {"target_names": t3_targets}),
]

for banner, names, predictions, target_cols, extra_kwargs in report_groups:
    print(banner)
    for idx, y_hat in enumerate(predictions):
        # zero_division=1: undefined precision/recall values are reported as 1.
        text_report = classification_report(
            y_true=val_df[target_cols],
            y_pred=y_hat,
            zero_division=1,
            **extra_kwargs,
        )
        print(f"------------------- {names[idx]} -------------------")
        print(text_report)

----- t1 report: -----
------------------- t1 -------------------
              precision    recall  f1-score   support

        HATE       0.65      0.71      0.68       105
         NOT       0.61      0.70      0.65       134
        OFFN       0.57      0.41      0.47       111

    accuracy                           0.61       350
   macro avg       0.61      0.61      0.60       350
weighted avg       0.61      0.61      0.60       350

------------------- t1ngram -------------------
              precision    recall  f1-score   support

        HATE       0.60      0.70      0.65       105
         NOT       0.65      0.71      0.68       134
        OFFN       0.58      0.41      0.48       111

    accuracy                           0.61       350
   macro avg       0.61      0.61      0.60       350
weighted avg       0.61      0.61      0.61       350

----- t2 report: -----
------------------- t2 -------------------
              precision    recall  f1-score   support

   

In [82]:
# Print the weighted ROC AUC on the validation set for every model,
# grouped by subtask.
#
# t1 is scored with the outputs of `predict_proba` against the one-hot
# HATE/NOT/OFFN columns (assumes the probability columns come in that same
# order -- TODO confirm against each model's `classes_`).
# NOTE(review): t2 and t3 are scored with the outputs of `.predict`, i.e. the
# predicted labels rather than probabilities/decision scores, so these values
# are not directly comparable to the t1 figures -- confirm this is intended.
auroc_groups = [
    ("----- t1 AUROC: -----", t1_names, t1_predictions_proba, t1_targets_proba),
    ("----- t2 AUROC: -----", t2_names, t2_predictions, t2_targets),
    ("----- t3 AUROC: -----", t3_names, t3_predictions, t3_targets),
]

for header, names, score_list, target_cols in auroc_groups:
    print(header)
    for name, y_score in zip(names, score_list):
        # Some models return their predictions as a sparse matrix; convert
        # those to a dense array explicitly instead of relying on a bare
        # `except:` that would also hide unrelated scoring errors.
        if hasattr(y_score, "toarray"):
            y_score = y_score.toarray()
        roc = roc_auc_score(
            y_true=val_df[target_cols],
            y_score=y_score,
            average='weighted',
        )
        print(f"{name} - {str(roc)}")

----- t1 AUROC: -----
t1 - 0.7740418048603513
t1ngram - 0.7862729842019825
----- t2 AUROC: -----
t2 - 0.5465908292956956
t2ngram - 0.5432904992626922
----- t3 AUROC: -----
t3ovr - 0.7152642862102714
t3ovrngram - 0.7145836739033432
t3bc - 0.7152642862102714
t3cc - 0.7111818280572407
t3lp - 0.7051525381592801
