# 4 - Model deployment 

In [1]:
!pip install catboost

[0m

In [2]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn import base

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif

from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.impute import KNNImputer

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
import itertools
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier


from sklearn.feature_selection import RFE
from sklearn.metrics import precision_recall_curve
from collections import defaultdict 

from sklearn.metrics import f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

import scipy.stats as stats


from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from utils import *
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

from collections import Counter
import inspect
from collections import defaultdict

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier

In [3]:
df = pd.read_csv("train_new_feats.csv")


In [4]:
target = [[f"target_{i}" for i in range(1, 9)] + ["Claim Injury Type"] + ["WCB Decision"] + ["Agreement Reached"] + ["Claim Injury Type_encoded"]]
target = [item for sublist in target for item in sublist]
target

binary_target = [f"target_{i}" for i in range(1, 9)]

original_target  = [col for col in target if col not in binary_target]

ordinal_target = ["Claim Injury Type_encoded"]

features = [feat for feat in df.columns if feat not in target]

features = [feat for feat in features if df[feat].dtype != "datetime64[ns]"]

num_feats = [feat for feat in features if df[feat].dtype != "object"]

cat_feats = [feat for feat in features if df[feat].dtype == "object"]
cat_feats_index = [features.index(feat) for feat in cat_feats]

In [5]:

def num_imputing(X_train, X_val):
    feats_imput_max = ["C2_Accident_gap_weeks", "C3_Accident_gap_weeks", "Accident Date_assembly_gap_days", "Hearing_C3 gap_months", "Hearing_C2 gap_months", "Hearing_assembly_gap_months", "Days to First Hearing"]

    feat_imput_min = ["C3-C2_gap_days"]
    
    for feat in X_train.columns:
        if X_train[feat].isna().sum() > 0 or X_val[feat].isna().sum() > 0:
            if feat in feats_imput_max:
                X_train[feat] = X_train[feat].fillna(X_train[feat].max())
                X_val[feat] = X_val[feat].fillna(X_train[feat].max())
            elif feat in feat_imput_min:
                X_train[feat] = X_train[feat].fillna(X_train[feat].min())
                X_val[feat] = X_val[feat].fillna(X_train[feat].min())
            else:
                X_train[feat] = X_train[feat].fillna(X_train[feat].mean())
                X_val[feat] = X_val[feat].fillna(X_train[feat].mean())
    return X_train, X_val

def frequency_encoding(train_df, val_df, column):
    """
    Apply frequency encoding on the training set and use the same encoding to impute the validation set.
    
    Parameters:
    train_df (pd.DataFrame): Training dataset.
    val_df (pd.DataFrame): Validation dataset.
    column (str): Column to encode.
    
    Returns:
    train_encoded (pd.DataFrame): Encoded training set.
    val_encoded (pd.DataFrame): Encoded validation set.
    freq_map (dict): Mapping of frequency counts for the column.
    """
    # Compute frequency encoding for the training set
    freq_map = train_df[column].value_counts(normalize=True)  # Relative frequency
    train_df[f"{column}_freq"] = train_df[column].map(freq_map)

    # Impute frequency encoding on the validation set using the same mapping
    val_df[f"{column}_freq"] = val_df[column].map(freq_map)

    # Handle unseen categories in validation by imputing 0 frequency
    val_df[f"{column}_freq"] = val_df[f"{column}_freq"].fillna(0)
    
    train_df = train_df.drop(columns=[column])
    val_df = val_df.drop(columns=[column])

    # Return encoded datasets and frequency map
    return train_df, val_df, freq_map

def target_guided_ordinal_encoding(X_train, X_val, categorical_column, target_column, y_train, i):
    # Combine X_train with y_train temporarily to calculate means
    X_train_encoded = X_train.copy()
    X_val_encoded = X_val.copy()
    X_train_encoded[target_column] = y_train

    means = X_train_encoded.groupby(categorical_column)[target_column].mean()
    #print(means)

    sorted_means = means.sort_values(by=target_column)
    #print(sorted_means)
    # if i == 1:
    #     print(f"Showing sorted means for {categorical_column}")
    #     lst_names = sorted_means.index.tolist()
    #     lst_values = sorted_means.values.tolist()
    #     dict_final = dict(zip(lst_names, lst_values))
    #     print(dict_final)
    
    ordinal_mapping = {category: rank for rank, category in enumerate(sorted_means.index, start=1)}
    # if i == 1:
    #     print(f"Showing ordinal mapping for {categorical_column}")
    #     print(ordinal_mapping)
    #     print("--------------------------------")
        
    X_train_encoded[f"{categorical_column}_encoded"] = X_train_encoded[categorical_column].map(ordinal_mapping)
    X_val_encoded[f"{categorical_column}_encoded"] = X_val_encoded[categorical_column].map(ordinal_mapping)

    #X_train_encoded = X_train_encoded.drop(columns=[categorical_column])
    X_train_encoded = X_train_encoded.drop(columns=[target_column[0]])
    #X_val_encoded = X_val_encoded.drop(columns=[categorical_column])
    X_train_encoded = X_train_encoded.fillna(1)
    X_val_encoded = X_val_encoded.fillna(1)

    return X_train_encoded, X_val_encoded, ordinal_mapping


In [6]:
def optimize_thresholds(y_true, probabilities):
    best_thresholds = []
    for i in range(probabilities.shape[1]):  # Loop over each class
        precision, recall, thresholds = precision_recall_curve((y_true == i).astype(int), probabilities[:, i])
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
        best_thresholds.append(thresholds[np.argmax(f1_scores)])  # Store best threshold
    return best_thresholds

def predict_with_thresholds(probabilities, thresholds):
    weighted_probs = probabilities / np.array(thresholds)  
    predictions = np.argmax(weighted_probs, axis=1)  
    return predictions

In [7]:

selected_features=['Attorney/Representative',
 'IME-4 Count',
 'Accident Date_year',
 'Accident Date_assembly_gap_days',
 'C3-C2_gap_days',
 'C2_missing',
 'C3_missing',
 'C3_Accident_gap_weeks',
 'Hearing_C3 gap_months',
 'Hearing_C2 gap_months',
 'Days to Assembly',
 'Days to First Hearing',
 'Average Weekly Wage_log',
 'Carrier Name_encoded',
 'Carrier Type_encoded',
 'Industry Code Description_encoded',
 'WCIO Cause of Injury Description_encoded',
 'WCIO Nature of Injury Description_encoded',
 'WCIO Part Of Body Description_encoded',
 'Carrier Name_freq',
 'Carrier Type_freq',
 'Industry Code Description_freq',
 'WCIO Nature of Injury Description_freq',
 'WCIO Part Of Body Description_freq']

naive_features = [feat.replace("_encoded", "") for feat in selected_features]
naive_features = [feat.replace(f"_freq", "") for feat in naive_features]
naive_features = set(naive_features)
naive_features = list(naive_features)

cat_feats = [feat for feat in naive_features if feat in cat_feats]


In [8]:

X = df[naive_features]
y = df[ordinal_target]
# # ---------------  ------------------------------------

# from sklearn.model_selection import train_test_split

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# X_train_encoded = X_train.copy()
# X_val_encoded = X_val.copy()

# # --------------- ------------------------------------
X_encoded = X.copy()
X_encoded_ = X.copy()





In [9]:

print(f"Ordinal encoding...")
X_train_encoded = X_encoded.copy()
X_val_encoded = X_encoded_.copy()
for cat in cat_feats:
    X_train_encoded, X_val_encoded, ordinal_mapping = target_guided_ordinal_encoding(X_train_encoded, X_val_encoded, cat, ordinal_target, y, 0)

print(f"Frequency encoding...")
for cat in cat_feats:
    X_train_encoded, X_val_encoded, freq_map = frequency_encoding(X_train_encoded, X_val_encoded, cat)


X_train_encoded  = X_train_encoded[selected_features]
X_val_encoded = X_val_encoded[selected_features]

X_train_imputed, X_val_imputed = num_imputing(X_train_encoded, X_val_encoded)

Ordinal encoding...
Frequency encoding...


In [10]:
clf = CatBoostClassifier(random_state=42, verbose=10, iterations=1000, depth=6, boosting_type='Ordered', auto_class_weights='SqrtBalanced', loss_function="MultiClassOneVsAll")

clf.fit(X_train_imputed, y)

0:	learn: 0.6715820	total: 467ms	remaining: 7m 46s
10:	learn: 0.5091293	total: 5.38s	remaining: 8m 4s
20:	learn: 0.4094734	total: 10.2s	remaining: 7m 53s
30:	learn: 0.3444584	total: 15.1s	remaining: 7m 50s
40:	learn: 0.3007453	total: 19.9s	remaining: 7m 45s
50:	learn: 0.2701485	total: 24.7s	remaining: 7m 39s
60:	learn: 0.2484519	total: 29.5s	remaining: 7m 34s
70:	learn: 0.2327412	total: 34.4s	remaining: 7m 29s
80:	learn: 0.2214850	total: 39.1s	remaining: 7m 23s
90:	learn: 0.2129569	total: 44s	remaining: 7m 19s
100:	learn: 0.2069734	total: 49s	remaining: 7m 16s
110:	learn: 0.2020307	total: 54.4s	remaining: 7m 15s
120:	learn: 0.1979187	total: 59.4s	remaining: 7m 11s
130:	learn: 0.1948048	total: 1m 4s	remaining: 7m 6s
140:	learn: 0.1924779	total: 1m 9s	remaining: 7m 1s
150:	learn: 0.1903922	total: 1m 14s	remaining: 7m 1s
160:	learn: 0.1885564	total: 1m 20s	remaining: 6m 58s
170:	learn: 0.1869775	total: 1m 26s	remaining: 6m 57s
180:	learn: 0.1857232	total: 1m 32s	remaining: 6m 56s
190:	lea

<catboost.core.CatBoostClassifier at 0x7fb91dbb7e90>

In [11]:
test = pd.read_csv('test_data.csv')


In [12]:


def test_preprocessing(train, test, cat_feats, selected_features, y):
    # --------------- Date times
    for feat in test.columns:
        if "Date" in feat:
            test[feat] = pd.to_datetime(test[feat], format="%Y-%m-%d")
    date_feats = [feat for feat in test.columns if "Date" in feat]
    date_feats.append("Birth Year")
    
    
    test["Birth Year"] = np.where(test["Birth Year"] == 0, test["Accident Date"].dt.year - test["Age at Injury"], test["Birth Year"])
    test["Birth Year"] = pd.to_datetime(test["Birth Year"], format="%Y")
    
    
    test["Accident Date_year"] = test["Accident Date"].dt.year
    test["Accident Date_year"] = np.where(test["Accident Date_year"] < 2019, 2019, test["Accident Date_year"])
    
    
    test["Accident Date_assembly_gap_months"] = (test["Assembly Date"] - test["Accident Date"]).dt.days/30
    test["Accident Date"] = np.where(test["Accident Date_assembly_gap_months"] < 0, test[["C-2 Date", "C-3 Date"]].min(axis=1), test["Accident Date"])
    test["Accident Date_assembly_gap_months"] = (test["Assembly Date"] - test["Accident Date"]).dt.days/30
    test["Assembly Date"] = np.where(test["Accident Date_assembly_gap_months"] < 0, test["Assembly Date"] + pd.DateOffset(years=1), test["Assembly Date"])
    test.drop(columns=["Accident Date_assembly_gap_months"], inplace=True)
    test["Accident Date_assembly_gap_days"] = (test["Assembly Date"] - test["Accident Date"]).dt.days
    test["Accident Date_assembly_gap_days"] = np.where(test["Accident Date_assembly_gap_days"] > 30, 30, test["Accident Date_assembly_gap_days"])
    
    
    test["C3-C2_gap_days"] = (test["C-3 Date"] - test["C-2 Date"]).dt.days
    test["C3-C2_gap_days"] = np.where(test["C3-C2_gap_days"] < -60, -60, test["C3-C2_gap_days"])
    test["C3-C2_gap_days"] = np.where(test["C3-C2_gap_days"] > 60, 60, test["C3-C2_gap_days"])
    
    
    test["C3_Accident_gap_weeks"] = ((test["C-3 Date"] - test["Accident Date"]).dt.days/7)
    test["C3_Accident_gap_weeks"] = np.where(test["C3_Accident_gap_weeks"] < -4, -4, test["C3_Accident_gap_weeks"])
    test["C3_Accident_gap_weeks"] = np.where(test["C3_Accident_gap_weeks"] > 24, 24, test["C3_Accident_gap_weeks"])
    
    
    test["C3_missing"] = np.where(test["C-3 Date"].isna(), True, False)
    
    
    test["C2_missing"] = np.where(test["C-2 Date"].isna(), True, False)


    test["Hearing_C3 gap_months"] = ((test["First Hearing Date"].dt.year - test["C-3 Date"].dt.year) * 12 + (test["First Hearing Date"].dt.month - test["C-3 Date"].dt.month))
    test["Hearing_C3 gap_months"] = np.where(test["Hearing_C3 gap_months"] > 50, 50, test["Hearing_C3 gap_months"])
    test["Hearing_C3 gap_months"] = np.where(test["Hearing_C3 gap_months"] < -20, -20, test["Hearing_C3 gap_months"])
    
    
    test["Hearing_C2 gap_months"] = ((test["First Hearing Date"].dt.year - test["C-2 Date"].dt.year) * 12 + (test["First Hearing Date"].dt.month - test["C-2 Date"].dt.month))
    test["Hearing_C2 gap_months"] = np.where(test["Hearing_C2 gap_months"] > 50, 50, test["Hearing_C2 gap_months"])
    test["Hearing_C2 gap_months"] = np.where(test["Hearing_C2 gap_months"] < -20, -20, test["Hearing_C2 gap_months"])
    
    
    test["Hearing_assembly_gap_months"] = ((test["First Hearing Date"].dt.year - test["Assembly Date"].dt.year) * 12 + (test["First Hearing Date"].dt.month - test["Assembly Date"].dt.month))
    test["First Hearing Date"] = np.where(test["Hearing_assembly_gap_months"] < 0, test["First Hearing Date"] + pd.DateOffset(years=1), test["First Hearing Date"])
    test["Hearing_assembly_gap_months"] = ((test["First Hearing Date"].dt.year - test["Assembly Date"].dt.year) * 12 + (test["First Hearing Date"].dt.month - test["Assembly Date"].dt.month))
    
    
    test["Days to Assembly"] = (test["Assembly Date"] - pd.to_datetime('2020-01-01 00:00:00', format="%Y-%m-%d %H:%M:%S")).dt.days
    
        
    test["Days to First Hearing"] = (test["First Hearing Date"] - pd.to_datetime('2020-01-30 00:00:00', format="%Y-%m-%d %H:%M:%S")).dt.days
    

    # --------------- Numerical feats

    test["Age at Injury"] = np.where(test["Age at Injury"] == 0, test["Accident Date"].dt.year - test["Birth Year"].dt.year, test["Age at Injury"])
    test["Birth Year"] = np.where(test["Birth Year"].isna() & ~(test["Accident Date"].isna()), pd.to_datetime(test["Accident Date"].dt.year - test["Age at Injury"], format="%Y"), test["Birth Year"])
    test["Age at Injury"] = np.where(test["Age at Injury"] == 0, test["Accident Date"].dt.year - test["Birth Year"].dt.year, test["Age at Injury"])
    test["Age at Injury"] = np.where(test["Age at Injury"] < 14, np.nan, test["Age at Injury"])
    test["Age at Injury"] = np.where(test["Age at Injury"] > 90, 90, test["Age at Injury"])
    
    
    test["IME-4 Count"] = np.where(test["IME-4 Count"].isna(), 0, test["IME-4 Count"])
    test["IME-4 Count"] = np.where(test["IME-4 Count"] > 12, 12, test["IME-4 Count"])


    test["Average Weekly Wage_log"] = test["Average Weekly Wage"].apply(lambda x: np.log1p(x))


    # --------------- Boolean feats 
    test["Alternative Dispute Resolution"] = np.where(test["Alternative Dispute Resolution"] == "U", "Y", test["Alternative Dispute Resolution"])
    test["Alternative Dispute Resolution"] = test["Alternative Dispute Resolution"].map({"Y": True, "N": False})


    test["Attorney/Representative"] = test["Attorney/Representative"].map({"Y": True, "N": False})

    
    # --------------- Categorical feats
    train_categories = train["Carrier Name"].value_counts().index
    test["Carrier Name"] = np.where(~test["Carrier Name"].isin(train_categories), "Other", test["Carrier Name"])


    train_categories = train["Carrier Type"].value_counts().index
    test["Carrier Type"] = np.where(~test["Carrier Type"].isin(train_categories), "5D. SPECIAL FUND - UNKNOWN", test["Carrier Type"])
    
    
    test["Industry Code Description"] = np.where(test["Industry Code Description"].isna(), train["Industry Code Description"].min(), test["Industry Code Description"])
    
    
    test["WCIO Cause of Injury Description"] = np.where(test["WCIO Cause of Injury Description"] == "CRASH OF AIRPLANE", np.nan, test["WCIO Cause of Injury Description"])
    test["WCIO Cause of Injury Description"] = np.where(test["WCIO Cause of Injury Description"].isna(), "Missing", test["WCIO Cause of Injury Description"])
    
    
    test["WCIO Nature of Injury Description"] = np.where(test["WCIO Nature of Injury Description"].isna(), "Missing", test["WCIO Nature of Injury Description"])
    underepresented = ['POISONING - METAL', 'SILICOSIS', 'RADIATION', 'ENUCLEATION', 'BLACK LUNG', 'VDT - RELATED DISEASES', 'HEPATITIS C', 'BYSSINOSIS']
    test["WCIO Nature of Injury Description"] = np.where(test["WCIO Nature of Injury Description"].isin(underepresented), "Other", test["WCIO Nature of Injury Description"])
    
    
    test["WCIO Part Of Body Description"] = np.where(test["WCIO Part Of Body Description"].isna(), "Missing", test["WCIO Part Of Body Description"])
    

    # --------------- Categorical encoding
    print(f"Ordinal encoding...")
    X_train_encoded = train.copy()
    X_val_encoded = test.copy()
    for cat in cat_feats:
        X_train_encoded, X_val_encoded, ordinal_mapping = target_guided_ordinal_encoding(X_train_encoded, X_val_encoded, cat, ordinal_target, y, 0)
        
        
    # --------------- Frequency encoding
    print(f"Frequency encoding...")
    for cat in cat_feats:
        X_train_encoded, X_val_encoded, freq_map = frequency_encoding(X_train_encoded, X_val_encoded, cat)

    
    X_train_encoded  = X_train_encoded[selected_features]
    X_val_encoded = X_val_encoded[selected_features]
    
    
    # --------------- Missing values imputation
    print(f"Imputing missing values...")
    X_train_imputed, X_val_imputed = num_imputing(X_train_encoded, X_val_encoded)
    
    
    return X_val_imputed


In [13]:
test_processed = test_preprocessing(df, test, cat_feats, selected_features, y)

test_processed_df = pd.DataFrame(test_processed, columns=selected_features)



Ordinal encoding...
Frequency encoding...
Imputing missing values...


In [14]:
test_processed_df.columns


Index(['Attorney/Representative', 'IME-4 Count', 'Accident Date_year',
       'Accident Date_assembly_gap_days', 'C3-C2_gap_days', 'C2_missing',
       'C3_missing', 'C3_Accident_gap_weeks', 'Hearing_C3 gap_months',
       'Hearing_C2 gap_months', 'Days to Assembly', 'Days to First Hearing',
       'Average Weekly Wage_log', 'Carrier Name_encoded',
       'Carrier Type_encoded', 'Industry Code Description_encoded',
       'WCIO Cause of Injury Description_encoded',
       'WCIO Nature of Injury Description_encoded',
       'WCIO Part Of Body Description_encoded', 'Carrier Name_freq',
       'Carrier Type_freq', 'Industry Code Description_freq',
       'WCIO Nature of Injury Description_freq',
       'WCIO Part Of Body Description_freq'],
      dtype='object')

In [15]:
test_processed_df.isna().sum()

Attorney/Representative                      0
IME-4 Count                                  0
Accident Date_year                           0
Accident Date_assembly_gap_days              0
C3-C2_gap_days                               0
C2_missing                                   0
C3_missing                                   0
C3_Accident_gap_weeks                        0
Hearing_C3 gap_months                        0
Hearing_C2 gap_months                        0
Days to Assembly                             0
Days to First Hearing                        0
Average Weekly Wage_log                      0
Carrier Name_encoded                         0
Carrier Type_encoded                         0
Industry Code Description_encoded            0
WCIO Cause of Injury Description_encoded     0
WCIO Nature of Injury Description_encoded    0
WCIO Part Of Body Description_encoded        0
Carrier Name_freq                            0
Carrier Type_freq                            0
Industry Code

In [16]:

best_thresholds = [0.5523639280183358,
 0.2706995022796855,
 0.24118589499033843,
 0.2641656406998126,
 0.37531474703930623,
 0.3281035672210727,
 0.5098517158018644,
 0.634415672799047]


In [17]:
model_probas = clf.predict_proba(test_processed_df)
model_probas.shape
predictions = predict_with_thresholds(model_probas, best_thresholds)

In [18]:
ordinalencoder = OrdinalEncoder(categories=[["1. CANCELLED", "2. NON-COMP", "3. MED ONLY", "4. TEMPORARY", "5. PPD SCH LOSS", "6. PPD NSL", "7. PTD", "8. DEATH"]])
ordinalencoder.categories
ordinalencoder.fit(df[["Claim Injury Type"]])


In [19]:
model_adjusted_preds = ordinalencoder.inverse_transform(predictions.reshape(-1, 1))


In [20]:
sub_adjuted = pd.DataFrame(model_adjusted_preds, columns=["Claim Injury Type"], index=test["Claim Identifier"])

In [21]:
sub_adjuted.to_csv("submission.csv")