In [1]:
import os
from pathlib import Path
from importlib.util import find_spec

import numpy as np
import pandas as pd
import fasttext

if find_spec("src") is None:
    import sys

    sys.path.append("..")

from src.evaluate import (
    evaluate_model,
    compute_bias_metrics_for_model,
    get_final_metric,
    calculate_overall_auc,
)


In [2]:
# Locations used throughout the notebook, all relative to the notebook's
# working directory.
data_path = Path("..", "data")
input_path = data_path.joinpath("interim")

# Where the fastText model binary is cached. Kept as a plain string because it
# is handed directly to os.path.isfile and fastText's save/load calls below.
model_save_path = "../models/ft.bin"


## Train Model

In [3]:
# Train the fastText classifier once and cache it on disk; later runs simply
# reload the cached binary instead of repeating the autotune search.
if not os.path.isfile(model_save_path):
    # Make sure the target directory exists *before* the autotune budget is
    # spent: save_model cannot write into a missing directory, and failing at
    # that point would throw away the freshly trained model.
    os.makedirs(os.path.dirname(model_save_path) or ".", exist_ok=True)
    model = fasttext.train_supervised(
        input=str(input_path / "fasttext_train.txt"),
        # Hyper-parameters are searched against this validation file ...
        autotuneValidationFile=str(input_path / "fasttext_valid.txt"),
        minCount=5,
        # ... for at most 300 seconds.
        autotuneDuration=300,
    )
    model.save_model(model_save_path)
else:
    model = fasttext.load_model(model_save_path)



In [4]:
# Safety net: persist the in-memory model only when no artifact exists at
# model_save_path. An unconditional save here would rewrite the cached binary
# on every run even though the model was just loaded from (or saved to) it.
if not os.path.isfile(model_save_path):
    # save_model cannot create missing parent directories itself.
    os.makedirs(os.path.dirname(model_save_path) or ".", exist_ok=True)
    model.save_model(model_save_path)

## Evaluate Model

In [5]:
# Read the held-out test split from the interim data directory and preview
# its first rows.
test_file = input_path / "test.parquet"
df_test = pd.read_parquet(test_file)
df_test.head()

Unnamed: 0,id,comment_text,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,...,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,label
0,7097320,[ Integrity means that you pay your debts.]\n\...,2017-09-13 20:12:01.484121+00,21,5945023.0,376974,approved,0,0,0,...,False,False,,,,,,False,,0
1,7097321,This is malfeasance by the Administrator and t...,2017-05-17 07:01:51.902566+00,55,,335003,approved,0,0,0,...,False,False,0.0,0.0,0.0,0.0,0.0,False,0.0,0
2,7097322,@Rmiller101 - Spoken like a true elitist. But ...,2016-12-02 17:12:12.920957+00,54,649389.0,154126,approved,0,0,0,...,False,False,0.0,0.0,0.0,0.0,0.0,False,0.0,0
3,7097323,"Paul: Thank you for your kind words. I do, in...",2017-04-21 14:58:05.474657+00,13,5158666.0,328376,approved,0,0,0,...,False,False,,,,,,False,,0
4,7097324,Sorry you missed high school. Eisenhower sent ...,2017-10-01 19:43:12.373229+00,102,6061626.0,383983,approved,1,0,0,...,False,False,,,,,,False,,0


In [6]:
# Score every test comment with the fastText model. Newline characters are
# stripped from each comment before it is passed to predict().
# NOTE(review): newlines are removed, not replaced by a space, so the words on
# either side of a line break get fused — confirm the training file was
# prepared the same way before changing this.
comments = df_test["comment_text"].str.replace("\n", "").values
label, prob = model.predict(list(comments))

# Each prediction carries its top label (`top[0]`) and a single probability.
# Turn that into the probability of "__label__1": keep the score when
# "__label__1" is the predicted label, otherwise use its complement
# (this presumes a two-label model — TODO confirm).
y_pred_prob = np.array(
    [
        score.item() if top[0] == "__label__1" else 1 - score.item()
        for top, score in zip(label, prob)
    ]
)

In [7]:
# Store the "__label__1" probability and the hard 0/1 decision on the test
# frame; scores of 0.5 or higher are mapped to 1.
above_threshold = y_pred_prob >= 0.5
df_test["y_pred_proba"] = y_pred_prob
df_test["y_pred"] = above_threshold.astype(int)

In [8]:
# Quick look at the first two rows to confirm the prediction columns were added.
df_test.head(n=2)

Unnamed: 0,id,comment_text,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,...,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,label,y_pred_proba,y_pred
0,7097320,[ Integrity means that you pay your debts.]\n\...,2017-09-13 20:12:01.484121+00,21,5945023.0,376974,approved,0,0,0,...,,,,,,False,,0,0.000879,0
1,7097321,This is malfeasance by the Administrator and t...,2017-05-17 07:01:51.902566+00,55,,335003,approved,0,0,0,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0,0.020275,0


In [9]:
# Overall performance on the test split. evaluate_model is a project helper
# from src.evaluate; the rendered output below reports accuracy, f1 and
# auc_roc. Presumably it reads the `label`, `y_pred` and `y_pred_proba`
# columns of df_test — verify against src/evaluate.py.
evaluate_model(df_test)

Unnamed: 0,metrics,value
0,accuracy,0.941
1,f1,0.528754
2,auc_roc,0.87724


In [10]:
# Identity subgroup columns of df_test for which per-group bias metrics are
# computed below. The order is kept as-is because it determines the row
# labels of the resulting bias table.
identity_columns = [
    "male", "female",
    "homosexual_gay_or_lesbian",
    "christian", "jewish", "muslim",
    "black", "white",
    "psychiatric_or_mental_illness",
]

In [11]:
# Per-subgroup bias metrics, computed by the project helper from the
# predicted probability column ("y_pred_proba") and the ground-truth column
# ("label"). The rendered table has one row per identity column with
# subgroup_size, subgroup_auc, bpsn_auc and bnsp_auc.
# NOTE(review): the rows appear sorted by ascending subgroup_auc in the
# output below — confirm that ordering comes from the helper.
df_bias = compute_bias_metrics_for_model(df_test, identity_columns, "y_pred_proba", "label")
df_bias

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
5,muslim,108,0.666171,0.833442,0.749009
2,homosexual_gay_or_lesbian,56,0.75,0.63162,0.960295
4,jewish,40,0.78355,0.854145,0.819688
6,black,84,0.810031,0.729043,0.94154
8,psychiatric_or_mental_illness,26,0.8125,0.727311,0.957825
3,christian,231,0.831104,0.846547,0.876775
1,female,306,0.850064,0.787451,0.940198
7,white,112,0.851562,0.742717,0.935283
0,male,225,0.882342,0.772786,0.961301


In [12]:
# Combine the overall AUC of the predicted probabilities with the
# per-subgroup bias table into the single final score shown below.
overall_auc = calculate_overall_auc(df_test, "y_pred_proba")
get_final_metric(df_bias, overall_auc)

0.8244654876965803