# 3.3 - Probability threshold otimization

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn import base

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif

from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.impute import KNNImputer

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
import itertools
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.feature_selection import RFE
from sklearn.metrics import precision_recall_curve
from collections import defaultdict 

from sklearn.metrics import f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

import scipy.stats as stats


from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from utils import *
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

from collections import Counter
import inspect
from collections import defaultdict

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split

In [2]:

df = pd.read_csv("train_new_feats.csv")


target = [[f"target_{i}" for i in range(1, 9)] + ["Claim Injury Type"] + ["WCB Decision"] + ["Agreement Reached"] + ["Claim Injury Type_encoded"]]
target = [item for sublist in target for item in sublist]
target

binary_target = [f"target_{i}" for i in range(1, 9)]

original_target  = [col for col in target if col not in binary_target]

ordinal_target = ["Claim Injury Type_encoded"]

features = [feat for feat in df.columns if feat not in target]

features = [feat for feat in features if df[feat].dtype != "datetime64[ns]"]

num_feats = [feat for feat in features if df[feat].dtype != "object"]

cat_feats = [feat for feat in features if df[feat].dtype == "object"]
cat_feats_index = [features.index(feat) for feat in cat_feats]

df.columns

Index(['Age at Injury', 'Alternative Dispute Resolution',
       'Attorney/Representative', 'Carrier Name', 'Carrier Type',
       'Claim Injury Type', 'County of Injury', 'COVID-19 Indicator',
       'District Name', 'Gender', 'IME-4 Count', 'Industry Code Description',
       'Medical Fee Region', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Description',
       'Zip Code', 'Agreement Reached', 'WCB Decision', 'Number of Dependents',
       'Accident Date_year', 'Accident Date_missing', 'Accident_weekend',
       'Accident Date_month_cos', 'Accident Date_month_sin',
       'Accident Date_quarter_cos', 'Accident Date_quarter_sin',
       'Accident Date_assembly_gap_days', 'C3-C2_gap_days', 'C2_missing',
       'C3_missing', 'C2_Accident_gap_weeks', 'C3_Accident_gap_weeks',
       'Hearing Date_missing', 'Hearing_C3 gap_months',
       'Hearing_C2 gap_months', 'Hearing_assembly_gap_months',
       'Days to Assembly', 'Days to First 

In [3]:

def num_imputing(X_train, X_val):
    feats_imput_max = ["C2_Accident_gap_weeks", "C3_Accident_gap_weeks", "Accident Date_assembly_gap_days", "Hearing_C3 gap_months", "Hearing_C2 gap_months", "Hearing_assembly_gap_months", "Days to First Hearing"]

    feat_imput_min = ["C3-C2_gap_days"]
    
    for feat in X_train.columns:
        if X_train[feat].isna().sum() > 0 or X_val[feat].isna().sum() > 0:
            if feat in feats_imput_max:
                X_train[feat] = X_train[feat].fillna(X_train[feat].max())
                X_val[feat] = X_val[feat].fillna(X_train[feat].max())
            elif feat in feat_imput_min:
                X_train[feat] = X_train[feat].fillna(X_train[feat].min())
                X_val[feat] = X_val[feat].fillna(X_train[feat].min())
            else:
                X_train[feat] = X_train[feat].fillna(X_train[feat].mean())
                X_val[feat] = X_val[feat].fillna(X_train[feat].mean())
    return X_train, X_val

def frequency_encoding(train_df, val_df, column):
    """
    Apply frequency encoding on the training set and use the same encoding to impute the validation set.
    
    Parameters:
    train_df (pd.DataFrame): Training dataset.
    val_df (pd.DataFrame): Validation dataset.
    column (str): Column to encode.
    
    Returns:
    train_encoded (pd.DataFrame): Encoded training set.
    val_encoded (pd.DataFrame): Encoded validation set.
    freq_map (dict): Mapping of frequency counts for the column.
    """
    # Compute frequency encoding for the training set
    freq_map = train_df[column].value_counts(normalize=True)  # Relative frequency
    train_df[f"{column}_freq"] = train_df[column].map(freq_map)

    # Impute frequency encoding on the validation set using the same mapping
    val_df[f"{column}_freq"] = val_df[column].map(freq_map)

    # Handle unseen categories in validation by imputing 0 frequency
    val_df[f"{column}_freq"] = val_df[f"{column}_freq"].fillna(0)
    
    train_df = train_df.drop(columns=[column])
    val_df = val_df.drop(columns=[column])

    # Return encoded datasets and frequency map
    return train_df, val_df, freq_map

def target_guided_ordinal_encoding(X_train, X_val, categorical_column, target_column, y_train, i):
    # Combine X_train with y_train temporarily to calculate means
    X_train_encoded = X_train.copy()
    X_val_encoded = X_val.copy()
    X_train_encoded[target_column] = y_train

    means = X_train_encoded.groupby(categorical_column)[target_column].mean()
    #print(means)

    sorted_means = means.sort_values(by=target_column)
    #print(sorted_means)
    # if i == 1:
    #     print(f"Showing sorted means for {categorical_column}")
    #     lst_names = sorted_means.index.tolist()
    #     lst_values = sorted_means.values.tolist()
    #     dict_final = dict(zip(lst_names, lst_values))
    #     print(dict_final)
    
    ordinal_mapping = {category: rank for rank, category in enumerate(sorted_means.index, start=1)}
    # if i == 1:
    #     print(f"Showing ordinal mapping for {categorical_column}")
    #     print(ordinal_mapping)
    #     print("--------------------------------")
        
    X_train_encoded[f"{categorical_column}_encoded"] = X_train_encoded[categorical_column].map(ordinal_mapping)
    X_val_encoded[f"{categorical_column}_encoded"] = X_val_encoded[categorical_column].map(ordinal_mapping)

    #X_train_encoded = X_train_encoded.drop(columns=[categorical_column])
    X_train_encoded = X_train_encoded.drop(columns=[target_column[0]])
    #X_val_encoded = X_val_encoded.drop(columns=[categorical_column])
    X_train_encoded = X_train_encoded.fillna(1)
    X_val_encoded = X_val_encoded.fillna(1)

    return X_train_encoded, X_val_encoded, ordinal_mapping

In [4]:

selected_features=['Attorney/Representative',
 'IME-4 Count',
 'Accident Date_year',
 'Accident Date_assembly_gap_days',
 'C3-C2_gap_days',
 'C2_missing',
 'C3_missing',
 'C3_Accident_gap_weeks',
 'Hearing_C3 gap_months',
 'Hearing_C2 gap_months',
 'Days to Assembly',
 'Days to First Hearing',
 'Average Weekly Wage_log',
 'Carrier Name_encoded',
 'Carrier Type_encoded',
 'Industry Code Description_encoded',
 'WCIO Cause of Injury Description_encoded',
 'WCIO Nature of Injury Description_encoded',
 'WCIO Part Of Body Description_encoded',
 'Carrier Name_freq',
 'Carrier Type_freq',
 'Industry Code Description_freq',
 'WCIO Nature of Injury Description_freq',
 'WCIO Part Of Body Description_freq']

naive_features = [feat.replace("_encoded", "") for feat in selected_features]
naive_features = [feat.replace(f"_freq", "") for feat in naive_features]
naive_features = set(naive_features)
naive_features = list(naive_features)

cat_feats = [feat for feat in naive_features if feat in cat_feats]


In [5]:
X = df[naive_features]
y = df[ordinal_target]

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

X_train_encoded = X_train.copy()
X_val_encoded = X_val.copy()

In [7]:
print(f"Ordinal encoding...")
for cat in cat_feats:
    X_train_encoded, X_val_encoded, ordinal_mapping = target_guided_ordinal_encoding(X_train_encoded, X_val_encoded, cat, ordinal_target, y_train, 0)

print(f"Frequency encoding...")
for cat in cat_feats:
    X_train_encoded, X_val_encoded, freq_map = frequency_encoding(X_train_encoded, X_val_encoded, cat)

X_train_encoded  = X_train_encoded[selected_features]
X_val_encoded = X_val_encoded[selected_features]

X_train_imputed, X_val_imputed = num_imputing(X_train_encoded, X_val_encoded)






clf = CatBoostClassifier(random_state=42, verbose=10, iterations=1000, depth=6, boosting_type='Ordered', auto_class_weights='SqrtBalanced', loss_function="MultiClassOneVsAll")

clf.fit(X_train_imputed, y_train)



Ordinal encoding...
Frequency encoding...
0:	learn: 0.6714469	total: 2.18s	remaining: 36m 15s
10:	learn: 0.5094364	total: 17.6s	remaining: 26m 24s
20:	learn: 0.4100677	total: 34s	remaining: 26m 23s
30:	learn: 0.3451700	total: 49.9s	remaining: 25m 59s
40:	learn: 0.3014844	total: 1m 2s	remaining: 24m 33s
50:	learn: 0.2707459	total: 1m 18s	remaining: 24m 22s
60:	learn: 0.2491847	total: 1m 37s	remaining: 24m 58s
70:	learn: 0.2334113	total: 1m 54s	remaining: 24m 52s
80:	learn: 0.2220211	total: 2m 9s	remaining: 24m 31s
90:	learn: 0.2135865	total: 2m 25s	remaining: 24m 16s
100:	learn: 0.2071378	total: 2m 39s	remaining: 23m 39s
110:	learn: 0.2022865	total: 2m 49s	remaining: 22m 34s
120:	learn: 0.1985600	total: 2m 59s	remaining: 21m 44s
130:	learn: 0.1954808	total: 3m 13s	remaining: 21m 22s
140:	learn: 0.1929047	total: 3m 25s	remaining: 20m 54s
150:	learn: 0.1906462	total: 3m 39s	remaining: 20m 31s
160:	learn: 0.1888497	total: 3m 51s	remaining: 20m 3s
170:	learn: 0.1873172	total: 4m 3s	remainin

<catboost.core.CatBoostClassifier at 0x1b4dac19c40>

In [8]:
train_preds_initial = clf.predict(X_train_imputed)
val_preds_initial = clf.predict(X_val_imputed)

In [9]:
first_model_train_proba = clf.predict_proba(X_train_imputed)
first_model_val_proba = clf.predict_proba(X_val_imputed)

In [10]:
def optimize_thresholds(y_true, probabilities):
    best_thresholds = []
    for i in range(probabilities.shape[1]):  # Loop over each class
        precision, recall, thresholds = precision_recall_curve((y_true == i).astype(int), probabilities[:, i])
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
        best_thresholds.append(thresholds[np.argmax(f1_scores)])  # Store best threshold
    return best_thresholds

def predict_with_thresholds(probabilities, thresholds):
    weighted_probs = probabilities / np.array(thresholds)  
    predictions = np.argmax(weighted_probs, axis=1)  
    return predictions

In [11]:
best_thresholds = optimize_thresholds(y_val, first_model_val_proba)
best_thresholds

[0.5523639280183358,
 0.2706995022796855,
 0.24118589499033843,
 0.2641656406998126,
 0.37531474703930623,
 0.3281035672210727,
 0.5098517158018644,
 0.634415672799047]

In [12]:
train_predictions = predict_with_thresholds(first_model_train_proba, best_thresholds)
val_predictions = predict_with_thresholds(first_model_val_proba, best_thresholds)

f1_score_train = f1_score(y_train, train_predictions, average="macro")
f1_score_val = f1_score(y_val, val_predictions, average="macro")


In [13]:
print(f"Train F1 with model.predict: {f1_score(y_train, train_preds_initial, average='macro')}")
print(f"Validation F1 with model.predict: {f1_score(y_val, val_preds_initial, average='macro')}")
print(f"F1 Score Train with adjusted thesholds: {f1_score_train}")
print(f"F1 Score Validation with adjusted thesholds: {f1_score_val}")

Train F1 with model.predict: 0.5487136534674067
Validation F1 with model.predict: 0.4775971219853013
F1 Score Train with adjusted thesholds: 0.5547015091185956
F1 Score Validation with adjusted thesholds: 0.48638799912268993
