In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn import base

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif

from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.impute import KNNImputer

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import itertools
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier


from sklearn.feature_selection import RFE
from sklearn.metrics import precision_recall_curve
from collections import defaultdict 

from sklearn.metrics import f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

import scipy.stats as stats


from sklearn.model_selection import TimeSeriesSplit
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from utils import *
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

from collections import Counter
import inspect
from collections import defaultdict

from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
# imports dataset

In [2]:
df = pd.read_csv("train_new_feats.csv")


In [3]:

target = [[f"target_{i}" for i in range(1, 9)] + ["Claim Injury Type"] + ["WCB Decision"] + ["Agreement Reached"] + ["Claim Injury Type_encoded"]]
target = [item for sublist in target for item in sublist]
target

binary_target = [f"target_{i}" for i in range(1, 9)]

original_target  = [col for col in target if col not in binary_target]

ordinal_target = ["Claim Injury Type_encoded"]

features = [feat for feat in df.columns if feat not in target]

features = [feat for feat in features if df[feat].dtype != "datetime64[ns]"]

num_feats = [feat for feat in features if df[feat].dtype != "object"]

cat_feats = [feat for feat in features if df[feat].dtype == "object"]
cat_feats_index = [features.index(feat) for feat in cat_feats]




In [4]:

def num_imputing(X_train, X_val):
    feats_imput_max = ["C2_Accident_gap_weeks", "C3_Accident_gap_weeks", "Accident Date_assembly_gap_days", "Hearing_C3 gap_months", "Hearing_C2 gap_months", "Hearing_assembly_gap_months", "Days to First Hearing"]

    feat_imput_min = ["C3-C2_gap_days"]
    
    for feat in X_train.columns:
        if X_train[feat].isna().sum() > 0 or X_val[feat].isna().sum() > 0:
            if feat in feats_imput_max:
                X_train[feat] = X_train[feat].fillna(X_train[feat].max())
                X_val[feat] = X_val[feat].fillna(X_train[feat].max())
            elif feat in feat_imput_min:
                X_train[feat] = X_train[feat].fillna(X_train[feat].min())
                X_val[feat] = X_val[feat].fillna(X_train[feat].min())
            else:
                X_train[feat] = X_train[feat].fillna(X_train[feat].mean())
                X_val[feat] = X_val[feat].fillna(X_train[feat].mean())
    return X_train, X_val

In [5]:

def optimize_thresholds(y_true, probabilities):
    best_thresholds = []
    for i in range(probabilities.shape[1]):  # Loop over each class
        precision, recall, thresholds = precision_recall_curve((y_true == i).astype(int), probabilities[:, i])
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
        best_thresholds.append(thresholds[np.argmax(f1_scores)])  # Store best threshold
    return best_thresholds

def predict_with_thresholds(probabilities, thresholds):
    weighted_probs = probabilities / np.array(thresholds)  
    predictions = np.argmax(weighted_probs, axis=1)  
    return predictions


def model_predictions_global(X_train, y_train, X_val, y_val, model, clf_name, specific_target, cv_i, params):
    
    train_proba = model.predict_proba(X_train)
    val_proba = model.predict_proba(X_val)
    
    best_thresholds = optimize_thresholds(y_val, val_proba)
    
    train_predictions = predict_with_thresholds(train_proba, best_thresholds)
    val_predictions = predict_with_thresholds(val_proba, best_thresholds)
    
    
    f1_score_train = f1_score(y_train, train_predictions, average="macro")
    f1_score_val = f1_score(y_val, val_predictions, average="macro")
    
    print(f"{clf_name}...")
    print(f"Params: {params}")
    print(f"Train F1-score: {round(f1_score_train, 3)}")
    print(f"Thresholds: {best_thresholds}")
    print(f"Validation F1-score: {round(f1_score_val, 3)}")
    print(classification_report(y_val, val_predictions))
    
    return params, f1_score_train, f1_score_val 

In [6]:

def frequency_encoding(train_df, val_df, column):
    """
    Apply frequency encoding on the training set and use the same encoding to impute the validation set.
    
    Parameters:
    train_df (pd.DataFrame): Training dataset.
    val_df (pd.DataFrame): Validation dataset.
    column (str): Column to encode.
    
    Returns:
    train_encoded (pd.DataFrame): Encoded training set.
    val_encoded (pd.DataFrame): Encoded validation set.
    freq_map (dict): Mapping of frequency counts for the column.
    """
    # Compute frequency encoding for the training set
    freq_map = train_df[column].value_counts(normalize=True)  # Relative frequency
    train_df[f"{column}_freq"] = train_df[column].map(freq_map)

    # Impute frequency encoding on the validation set using the same mapping
    val_df[f"{column}_freq"] = val_df[column].map(freq_map)

    # Handle unseen categories in validation by imputing 0 frequency
    val_df[f"{column}_freq"] = val_df[f"{column}_freq"].fillna(0)
    
    train_df = train_df.drop(columns=[column])
    val_df = val_df.drop(columns=[column])

    # Return encoded datasets and frequency map
    return train_df, val_df, freq_map

In [7]:

def target_guided_ordinal_encoding(X_train, X_val, categorical_column, target_column, y_train, i):
    # Combine X_train with y_train temporarily to calculate means
    X_train_encoded = X_train.copy()
    X_val_encoded = X_val.copy()
    X_train_encoded[target_column] = y_train

    means = X_train_encoded.groupby(categorical_column)[target_column].mean()
    #print(means)

    sorted_means = means.sort_values(by=target_column)
    #print(sorted_means)
    # if i == 1:
    #     print(f"Showing sorted means for {categorical_column}")
    #     lst_names = sorted_means.index.tolist()
    #     lst_values = sorted_means.values.tolist()
    #     dict_final = dict(zip(lst_names, lst_values))
    #     print(dict_final)
    
    ordinal_mapping = {category: rank for rank, category in enumerate(sorted_means.index, start=1)}
    # if i == 1:
    #     print(f"Showing ordinal mapping for {categorical_column}")
    #     print(ordinal_mapping)
    #     print("--------------------------------")
        
    X_train_encoded[f"{categorical_column}_encoded"] = X_train_encoded[categorical_column].map(ordinal_mapping)
    X_val_encoded[f"{categorical_column}_encoded"] = X_val_encoded[categorical_column].map(ordinal_mapping)

    #X_train_encoded = X_train_encoded.drop(columns=[categorical_column])
    X_train_encoded = X_train_encoded.drop(columns=[target_column[0]])
    #X_val_encoded = X_val_encoded.drop(columns=[categorical_column])
    X_train_encoded = X_train_encoded.fillna(1)
    X_val_encoded = X_val_encoded.fillna(1)

    return X_train_encoded, X_val_encoded, ordinal_mapping

In [8]:


selected_features=['Attorney/Representative',
 'IME-4 Count',
 'Accident Date_year',
 'Accident Date_assembly_gap_days',
 'C3-C2_gap_days',
 'C2_missing',
 'C3_missing',
 'C3_Accident_gap_weeks',
 'Hearing_C3 gap_months',
 'Hearing_C2 gap_months',
 'Days to Assembly',
 'Days to First Hearing',
 'Average Weekly Wage_log',
 'Carrier Name_encoded',
 'Carrier Type_encoded',
 'Industry Code Description_encoded',
 'WCIO Cause of Injury Description_encoded',
 'WCIO Nature of Injury Description_encoded',
 'WCIO Part Of Body Description_encoded',
 'Carrier Name_freq',
 'Carrier Type_freq',
 'Industry Code Description_freq',
 'WCIO Nature of Injury Description_freq',
 'WCIO Part Of Body Description_freq']

naive_features = [feat.replace("_encoded", "") for feat in selected_features]
naive_features = [feat.replace(f"_freq", "") for feat in naive_features]
naive_features = set(naive_features)
naive_features = list(naive_features)

cat_feats = [feat for feat in naive_features if feat in cat_feats]


In [9]:
X_train, X_val, y_train, y_val = train_test_split(df[naive_features], df[ordinal_target], test_size=0.25, random_state=42, stratify=df[ordinal_target])


In [10]:


stacking_model_params = {"C": [1], "multi_class": ["ovr"], "class_weight": ["balanced"]}
stacking_results = []


X_train_encoded = X_train.copy()
X_val_encoded = X_val.copy()

print(f"Ordinal encoding...")
X_train_encoded = X_train.copy()
X_val_encoded = X_val.copy()
for cat in cat_feats:
    X_train_encoded, X_val_encoded, ordinal_mapping = target_guided_ordinal_encoding(X_train_encoded, X_val_encoded, cat, ordinal_target, y_train, 1)

print(f"Frequency encoding...")
for cat in cat_feats:
    X_train_encoded, X_val_encoded, freq_map = frequency_encoding(X_train_encoded, X_val_encoded, cat)

X_train_encoded  = X_train_encoded[selected_features]
X_val_encoded = X_val_encoded[selected_features]

print(f"Impuiting missing values...")
X_train_imputed, X_val_imputed = num_imputing(X_train_encoded, X_val_encoded)

print(f"Scaling numericals ...")
X_train_imputed, X_val_imputed = num_scaling(X_train_imputed, X_val_imputed)


# ----------------- Stacking single models 
print(f"Pre-fitting models...")
estimators = [("nn", MLPClassifier(random_state=42, verbose=10, hidden_layer_sizes = (25,8), learning_rate_init=0.01).fit(X_train_imputed, y_train)),
                ("catboost", CatBoostClassifier(random_state=42, verbose=100, iterations=1000, depth=6, boosting_type='Ordered', auto_class_weights='SqrtBalanced', loss_function="MultiClassOneVsAll").fit(X_train_imputed, y_train))]


metalearner_best_params = {}
keys, values = zip(*stacking_model_params.items())
for combination in itertools.product(*values):
    params = dict(zip(keys, combination))
    
    print(f"Testing {params} for metalearner...")
    print(f"Stacking {estimators[0][0]} and {estimators[1][0]}...")
    st = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=42, ** params), cv="prefit").fit(X_train_imputed, y_train)

    params, f1_score_train, f1_score_val = model_predictions_global(X_train_imputed, y_train, X_val_imputed, y_val, st, f"stacking_{estimators[0][0]}_{estimators[1][0]}", "", 1, None)
    
    stacking_results.append({"params": params, "f1_score_train": f1_score_train, "f1_score_val": f1_score_val})
    

Ordinal encoding...
Frequency encoding...
Impuiting missing values...
Scaling numericals ...
Pre-fitting models...
Iteration 1, loss = 0.65646500
Iteration 2, loss = 0.62093225
Iteration 3, loss = 0.61367077
Iteration 4, loss = 0.60988526
Iteration 5, loss = 0.60731536
Iteration 6, loss = 0.60543196
Iteration 7, loss = 0.60407227
Iteration 8, loss = 0.60350148
Iteration 9, loss = 0.60279669
Iteration 10, loss = 0.60254505
Iteration 11, loss = 0.60209650
Iteration 12, loss = 0.60168521
Iteration 13, loss = 0.60089948
Iteration 14, loss = 0.60133125
Iteration 15, loss = 0.60030259
Iteration 16, loss = 0.60015537
Iteration 17, loss = 0.60046001
Iteration 18, loss = 0.60015482
Iteration 19, loss = 0.60010909
Iteration 20, loss = 0.59929760
Iteration 21, loss = 0.60011643
Iteration 22, loss = 0.59978173
Iteration 23, loss = 0.59934325
Iteration 24, loss = 0.59931830
Iteration 25, loss = 0.59941536
Iteration 26, loss = 0.59964759
Iteration 27, loss = 0.59927294
Iteration 28, loss = 0.5988434

In [11]:
stacking_results1 = stacking_results

In [16]:
stacking_results1

[{'params': None,
  'f1_score_train': 0.45606447250558896,
  'f1_score_val': 0.43191298089588276}]

In [12]:

stacking_model_params = {"C": [1], "multi_class": ["ovr"], "class_weight": ["balanced"]}
stacking_results = []

X_train_encoded = X_train.copy()
X_val_encoded = X_val.copy()

print(f"Ordinal encoding...")
X_train_encoded = X_train.copy()
X_val_encoded = X_val.copy()
for cat in cat_feats:
    X_train_encoded, X_val_encoded, ordinal_mapping = target_guided_ordinal_encoding(X_train_encoded, X_val_encoded, cat, ordinal_target, y_train, 1)

print(f"Frequency encoding...")
for cat in cat_feats:
    X_train_encoded, X_val_encoded, freq_map = frequency_encoding(X_train_encoded, X_val_encoded, cat)

X_train_encoded  = X_train_encoded[selected_features]
X_val_encoded = X_val_encoded[selected_features]

print(f"Impuiting missing values...")
X_train_imputed, X_val_imputed = num_imputing(X_train_encoded, X_val_encoded)

print(f"Scaling numericals ...")
X_train_imputed, X_val_imputed = num_scaling(X_train_imputed, X_val_imputed)


# ----------------- Stacking single models 
print(f"Pre-fitting models...")
estimators = [("NB", GaussianNB(var_smoothing=0.1).fit(X_train_imputed, y_train)),
                ("catboost", CatBoostClassifier(random_state=42, verbose=100, iterations=1000, depth=6, boosting_type='Ordered', auto_class_weights='SqrtBalanced', loss_function="MultiClassOneVsAll").fit(X_train_imputed, y_train))]

metalearner_best_params = {}
keys, values = zip(*stacking_model_params.items())
for combination in itertools.product(*values):
    params = dict(zip(keys, combination))
    print(f"Testing {params} for metalearner...")
        
    print(f"Stacking {estimators[0][0]} and {estimators[1][0]}...")
    st = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=42, ** params), cv="prefit").fit(X_train_imputed, y_train)

    params, f1_score_train, f1_score_val = model_predictions_global(X_train_imputed, y_train, X_val_imputed, y_val, st, f"stacking_{estimators[0][0]}_{estimators[1][0]}", "", 1, None)
    
    stacking_results.append({"params": params, "f1_score_train": f1_score_train, "f1_score_val": f1_score_val})
    

Ordinal encoding...
Frequency encoding...
Impuiting missing values...
Scaling numericals ...
Pre-fitting models...
0:	learn: 0.6715310	total: 4.58s	remaining: 1h 16m 17s
100:	learn: 0.2075468	total: 2m 50s	remaining: 25m 14s
200:	learn: 0.1834566	total: 5m 29s	remaining: 21m 51s
300:	learn: 0.1771620	total: 8m 13s	remaining: 19m 4s
400:	learn: 0.1736516	total: 10m 34s	remaining: 15m 47s
500:	learn: 0.1711641	total: 12m 49s	remaining: 12m 46s
600:	learn: 0.1693719	total: 15m 12s	remaining: 10m 5s
700:	learn: 0.1679694	total: 17m 27s	remaining: 7m 26s
800:	learn: 0.1667604	total: 19m 40s	remaining: 4m 53s
900:	learn: 0.1658017	total: 22m 5s	remaining: 2m 25s
999:	learn: 0.1649023	total: 24m 17s	remaining: 0us
Testing {'C': 1, 'multi_class': 'ovr', 'class_weight': 'balanced'} for metalearner...
Stacking NB and catboost...
stacking_NB_catboost...
Params: None
Train F1-score: 0.455
Thresholds: [0.526392525527319, 0.23193046277441545, 0.3170130517148906, 0.23098867317845276, 0.31261397547201

In [15]:
stacking_results2 = stacking_results
stacking_results2

[{'params': None,
  'f1_score_train': 0.4551124268119682,
  'f1_score_val': 0.433842239269234}]