In [29]:
import numpy as np 
from dotenv import load_dotenv
import os 
import json
import matplotlib.pyplot as plt

load_dotenv(override=True)

True

In [30]:
fcm_result_txt = f"{os.getenv('ROOT_PATH')}/results/fcm_result.txt"
lfcm_result_txt = f"{os.getenv('ROOT_PATH')}/results/lfcm_result.txt"

In [40]:
def run_evaluation(model, mode="classification"):
    sum_racist = 0
    sum_notRacist = 0

    model = model 
    results = []

    text_file = ""

    if model == 'fcm':
        text_file = fcm_result_txt
    elif model == 'lfcm':
        text_file = lfcm_result_txt

    # FCM score results
    with open(text_file) as f:
        for line in f:
            data = line.split(',')
            label = int(data[0])
            notRacist_score = float(data[1])
            racist_score = float(data[2])
        
            # default for classification
            if mode == 'classification':
                softmax_racist_score = np.exp(racist_score) / (np.exp(racist_score) + np.exp(notRacist_score))
            
            if label == 1:
                sum_racist += softmax_racist_score
            elif label == 0:
                sum_notRacist += softmax_racist_score
        
            results.append([label, softmax_racist_score])

    print(sum_notRacist, sum_racist) 
    return results

In [41]:
fcm_results = run_evaluation('fcm')
lfcm_results = run_evaluation('lfcm')


478.5159525210982 238.7387398604339
464.7220208626825 229.83683553867962


In [33]:
# print(fcm_results)
# print(lfcm_results)

In [42]:
def write_results(data, filepath):
    index = 0
    for x in data:
        print(len(data[x]))
    with open(filepath, 'w', encoding='utf-8') as txt_file:
        for i in range(len(data['thresholds'])):
            txt_file.write(f"{round(float(data['thresholds'][i]), 3)}:\t\t\tf1({data['f1s'][i]})\t\t\tacc({data['accuracies'][i]})\n")
    

In [35]:
def evaluate(results, model_name):
    thresholds = np.arange(0, 1, 0.001)
    # thresholds = np.arange(0.4, 0.44, 0.0000001)

    best_f = 0
    best_th = 0
    best_f_re = 0
    best_f_pr = 0
    best_accuracy = 0

    acc_racist_best_accuracy = 0
    acc_notRacist_best_accuracy = 0
    best_acc_th = 0

    precisions = []
    recalls = []
    accuracies = []
    f1s = []
    fpr = []
    tps = []
    tns = []
    fps = []
    fns = []
    ths = []

    for th in thresholds:
        tp = 0
        fp = 0
        fn = 0
        tn = 0

        for r in results:
            if r[0] == 1 and r[1] >= th:
                tp += 1
            elif r[0] == 1 and r[1] < th:
                fn += 1
            elif r[0] == 0 and r[1] < th:
                tn += 1
            elif r[0] == 0 and r[1] >= th:
                fp += 1

        #  precision and recall
        if tp > 0:
            pr = tp / float((tp + fp))
            re = tp / float((tp + fn))

        # f1 score
        if pr + re > 0:
            f = 2 * (pr * re) / (pr + re)
        else:
            f = 0

        accuracy_racist = re
        if tn + fn > 0:
            accuracy_notRacist = tn / float(tn + fp)
        else:
            accuracy_notRacist = 0
        accuracy = (accuracy_racist + accuracy_notRacist) / 2

        precisions.append(pr)
        recalls.append(re)
        f1s.append(f)
        accuracies.append((tp + tn) / (tp + tn + fp + fn))
        fpr.append(tn / float(tn + fp))

        tps.append(tp)
        tns.append(tn)
        fps.append(fp)
        fns.append(fn)
        ths.append(th)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            acc_racist_best_accuracy = accuracy_racist
            acc_notRacist_best_accuracy = accuracy_notRacist
            best_acc_th = th

        if f > best_f:
            best_f = f
            best_th = th
            best_f_pr = pr
            best_f_re = re

        # print("thr " + str(th) + " --> F1: " + str(f) + " PR: " + str(pr) + " RE: " + str(re) + " ACC Hate: " + str(
        #     accuracy_racist) + " ACC NotHate: " + str(accuracy_notRacist) + " ACC mean: " + str(accuracy))
        
        # print(tp, tn, fp, fn)
    
    x = {
        'precisions': precisions,
        'recalls': recalls,
        'f1s': f1s,
        'accuracies': accuracies,
        'fpr': fpr,
        'tps': tps,
        'tns': tns,
        'fps': fps,
        'fns': fns,
        'thresholds': thresholds 
    }

    # print(x)  
    write_results(x, f"{os.getenv('ROOT_PATH')}/results/thresholds.txt")
    print("Best F1:  thr " + str(best_th) + " --> F1: " + str(best_f) + " PR: " + str(best_f_pr) + " RE: " + str(best_f_re))
    print("Best mean ACC:  thr " + str(best_acc_th) + " --> ACC: " + str(best_accuracy*100) + " Hate ACC: " + str(
        acc_racist_best_accuracy*100) + " Not Hate ACC: " + str(acc_notRacist_best_accuracy*100))    
    

    print('precision:', min(precisions), max(precisions))
    print('recall:', min(recalls), max(recalls))
    print('f1:', min(f1s), max(f1s))
    print('accuracy:', min(accuracies), max(accuracies))

    # # Print ROC curve
    # plt.plot(recalls, fpr)
    # plt.ylabel('True Positive Rate')
    # plt.xlabel('False Positive Rate')
    # plt.title("ROC " + model_name)
    # plt.show()

    # auc = np.trapz(recalls, fpr)
    # print('AUC:' + str(auc))



In [36]:
print(evaluate(fcm_results, 'fcm'))
fcm_threshold = 0.0
# print(fcm_results)
_r = 0
_nr = 0
for r in fcm_results:
    # threshold for fcm
    if r[1] >= fcm_threshold:
        _r += 1
    else:
        _nr += 1

print(_nr, _r)
print((_r) / (_r + _nr))

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
Best F1:  thr 0.435 --> F1: 0.5021929824561403 PR: 0.33775811209439527 RE: 0.9786324786324786
Best mean ACC:  thr 0.519 --> ACC: 53.928648707129724 Hate ACC: 62.39316239316239 Not Hate ACC: 45.46413502109704
precision: 0.330028328611898 0.6666666666666666
recall: 0.002136752136752137 1.0
f1: 0.00425531914893617 0.5021929824561403
accuracy: 0.3298022598870056 0.6701977401129944
None
0 1416
1.0


In [37]:
# threshold for lfcm
evaluate(lfcm_results, 'lfcm')
print(lfcm_results)

_r = 0
_nr = 0
for r in lfcm_results:
    if r[1] >= 0.47700000000000004:
        _r += 1
    else:
        _nr += 1

print(_nr, _r)
print((_r) / (_r + _nr))

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
Best F1:  thr 0.441 --> F1: 0.49786324786324787 PR: 0.3319088319088319 RE: 0.9957264957264957
Best mean ACC:  thr 0.47700000000000004 --> ACC: 53.05907172995781 Hate ACC: 83.33333333333334 Not Hate ACC: 22.78481012658228
precision: 0.13333333333333333 0.36363636363636365
recall: 0.002136752136752137 1.0
f1: 0.004228329809725159 0.49786324786324787
accuracy: 0.3298022598870056 0.6694915254237288
[[1, 0.4971608228665581], [0, 0.4944499854623517], [0, 0.495740563273872], [1, 0.5172420235602779], [1, 0.49139282857582994], [0, 0.5282944003679252], [1, 0.49838731158964267], [0, 0.49716409542868395], [0, 0.4821800250337796], [0, 0.49156056828918615], [0, 0.49274186182879454], [0, 0.48110991303392425], [0, 0.5026539203138589], [1, 0.47855707482990667], [1, 0.5143135223435475], [0, 0.5335798082793655], [0, 0.4624013815954899], [1, 0.4564455738784724], [1, 0.49712895963498605], [0, 0.49493250699129393], [0, 0.4575597944304519], [1, 0.497132010547

In [38]:
def re_evaluate(results, model):
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    threshold = 0
    for _r in results:
        label = _r[0]
        softmax_score = _r[1]

        if (model) == 'fcm':
            threshold = 0.5
        elif (model) == 'lfcm':
            threshold = 0.5
        
        if label == 1 and softmax_score >= threshold:
                tp += 1
        elif label == 1 and softmax_score < threshold:
                fn += 1
        elif label == 0 and softmax_score < threshold:
                tn += 1
        elif label == 0 and softmax_score >= threshold:
                fp += 1
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = (2 * precision * recall) / (precision + recall)
    accuracy = (tp + tn) / (tp + fn + tn + fp) 

    print('tp:',tp, 'tn:',tn, 'fp:',fp, 'fn:',fn)
    print('precision:', precision)
    print('recall:', recall)
    print('f1:', f1)
    print('accuracy:', accuracy)

re_evaluate(fcm_results, 'lfcm')
print()
re_evaluate(fcm_results, 'fcm')


tp: 357 tn: 266 fp: 682 fn: 111
precision: 0.343599615014437
recall: 0.7628205128205128
f1: 0.4737889847378899
accuracy: 0.4399717514124294

tp: 357 tn: 266 fp: 682 fn: 111
precision: 0.343599615014437
recall: 0.7628205128205128
f1: 0.4737889847378899
accuracy: 0.4399717514124294


In [39]:
# re_evaluate(fcm_results, 'fcm')