In [1]:
import os
from pathlib import Path
from importlib.util import find_spec

import numpy as np
import pandas as pd
import fasttext
from sklearn.model_selection import train_test_split

if find_spec("src") is None:
    import sys

    sys.path.append("..")

from src.preprocess import convert_dataframe_to_bool, create_binary_label
from src.evaluate import (
    evaluate_model,
    compute_bias_metrics_for_model,
    get_final_metric,
    calculate_overall_auc,
)


In [2]:
is_kaggle = False
if os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None:
    is_kaggle = True

In [3]:
data_path = Path("..") / "data"
input_path = data_path / "interim"
output_path = data_path / "submissions"
output_file_name = "rnn.csv"

if is_kaggle:
    data_path = (
        Path("/kaggle") / "input" / "jigsaw-unintended-bias-in-toxicity-classification"
    )
    output_path = Path("/kaggle") / "working"
    output_file_name = "submission.csv"


In [4]:
df = pd.read_parquet(input_path / "train.parquet")

In [5]:
df.head(2)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,label
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,rejected,0,0,0,0,0,0.0,0,4,0
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,rejected,0,0,0,0,0,0.0,0,4,0


## Preprocess Data

In [6]:
df['comment_text'] = df['comment_text'].str.lower()

In [7]:
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=32)

In [8]:
def df_to_txt(df: pd.DataFrame, input_col: str, label_col: str, output_file_path: str) -> None:
    with open(output_file_path, "w") as f:
        for label, feature in zip(df[label_col], df[input_col]):
            one_line = (
                "__label__"
                + str(label).replace("\n", "").replace("\r", "")
                + " "
                + str(feature).replace("\n", "").replace("\r", "")
                + "\n"
            )
            f.write(one_line)

In [9]:
if not os.path.isfile(input_path / 'fasttext_train.txt'):
    df_to_txt(df, input_col="comment_text", label_col="label", output_file_path=input_path / 'fasttext_train.txt')

if not os.path.isfile(input_path / 'fasttext_valid.txt'):
    df_to_txt(df, input_col="comment_text", label_col="label", output_file_path=input_path / 'fasttext_valid.txt')

## Train Model

In [10]:
model = fasttext.train_supervised(
    input=str(input_path / "fasttext_train.txt"),
    autotuneValidationFile=str(input_path / "fasttext_valid.txt"),
    minCount=5,
    autotuneDuration=300
)


Progress: 100.0% Trials:    5 Best score:  0.987017 ETA:   0h 0m 0s
Training again with best arguments
Read 95M words
Number of words:  238005
Number of labels: 2
Progress: 100.0% words/sec/thread:  332359 lr:  0.000000 avg.loss:  0.125018 ETA:   0h 0m 0s


## Evaluate Model

In [11]:
df_test = pd.read_csv(data_path / "raw" / "test_private_expanded.csv")
df_test = create_binary_label(df_test, target_col="toxicity")
df_test = convert_dataframe_to_bool(df_test)
df_test.head()

Unnamed: 0,id,comment_text,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,...,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,label
0,7097320,[ Integrity means that you pay your debts.]\n\...,2017-09-13 20:12:01.484121+00,21,5945023.0,376974,approved,0,0,0,...,False,False,,,,,,False,,0
1,7097321,This is malfeasance by the Administrator and t...,2017-05-17 07:01:51.902566+00,55,,335003,approved,0,0,0,...,False,False,0.0,0.0,0.0,0.0,0.0,False,0.0,0
2,7097322,@Rmiller101 - Spoken like a true elitist. But ...,2016-12-02 17:12:12.920957+00,54,649389.0,154126,approved,0,0,0,...,False,False,0.0,0.0,0.0,0.0,0.0,False,0.0,0
3,7097323,"Paul: Thank you for your kind words. I do, in...",2017-04-21 14:58:05.474657+00,13,5158666.0,328376,approved,0,0,0,...,False,False,,,,,,False,,0
4,7097324,Sorry you missed high school. Eisenhower sent ...,2017-10-01 19:43:12.373229+00,102,6061626.0,383983,approved,1,0,0,...,False,False,,,,,,False,,0


In [12]:
label, prob = model.predict(list(df_test["comment_text"].str.replace("\n", "").values))
y_pred_prob = []
for i, j in zip(label, prob):
    if i[0] == "__label__1":
        y_pred_prob.append(j.item())
    else:
        y_pred_prob.append(1 - j.item())
y_pred_prob = np.array(y_pred_prob)

In [13]:
df_test['y_pred_proba'] = y_pred_prob
df_test['y_pred'] = (y_pred_prob >= 0.5).astype(int) 

In [14]:
df_test.head(2)

Unnamed: 0,id,comment_text,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,...,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,label,y_pred_proba,y_pred
0,7097320,[ Integrity means that you pay your debts.]\n\...,2017-09-13 20:12:01.484121+00,21,5945023.0,376974,approved,0,0,0,...,,,,,,False,,0,0.000555,0
1,7097321,This is malfeasance by the Administrator and t...,2017-05-17 07:01:51.902566+00,55,,335003,approved,0,0,0,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0,0.001479,0


In [15]:
evaluate_model(df_test)

Unnamed: 0,metrics,value
0,accuracy,0.941811
1,f1,0.546996
2,auc_roc,0.859903


In [16]:
identity_columns = [
    "male",
    "female",
    "homosexual_gay_or_lesbian",
    "christian",
    "jewish",
    "muslim",
    "black",
    "white",
    "psychiatric_or_mental_illness",
]

In [17]:
df_bias = compute_bias_metrics_for_model(df_test, identity_columns, "y_pred_proba", "label")
df_bias

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
5,muslim,1054,0.710379,0.793948,0.81868
6,black,761,0.730275,0.702588,0.905646
2,homosexual_gay_or_lesbian,538,0.735965,0.699802,0.908056
7,white,1178,0.739459,0.689368,0.917556
4,jewish,411,0.760333,0.785654,0.854867
1,female,2602,0.821323,0.781947,0.89813
3,christian,2109,0.825136,0.811523,0.882115
0,male,2112,0.830248,0.76914,0.918397
8,psychiatric_or_mental_illness,238,0.854972,0.73475,0.952706


In [18]:
get_final_metric(df_bias, calculate_overall_auc(df_test, 'y_pred_proba'))

0.8159016007508819