In [1]:
# General Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFE

from sklearn.model_selection import StratifiedKFold

# embedded methods
from sklearn.linear_model import LassoCV
import scipy.stats as stats
from scipy.stats import chi2_contingency

from sklearn.preprocessing import LabelEncoder

#from imblearn.over_sampling import SMOTE
#from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import classification_report, f1_score

#from utils_train import *
from utils import *
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

In [2]:
import ray
ray.init()

2024-11-19 12:26:05,010	INFO worker.py:1819 -- Started a local Ray instance.


0,1
Python version:,3.12.6
Ray version:,2.39.0




In [3]:
reference_date = pd.to_datetime("2023-01-01")

In [4]:
train_df = pd.read_csv("./data/train_data_after_EDA.csv", index_col="Claim Identifier")

In [5]:
numerical_features = [
    "Age at Injury",
    "WCIO Cause of Injury Code",
    "WCIO Nature of Injury Code",
    "WCIO Part Of Body Code",
    "Number of Dependents",
    "Years Past Accident",
    "Assembly Years past Accident",
    "Industry Code",
    "Birth Year",
    "Average Weekly Wage",
    "IME-4 Count"
]

categorical_features = [
    "Carrier Name",
    "Carrier Type",
    "County of Injury",
    "District Name",
    "Gender",
    "Medical Fee Region",
    "Zip Code"
]

binary_features = [
    "Alternative Dispute Resolution",
    "Attorney/Representative",
    "COVID-19 Indicator",
    "First Hearing Date Occurred",
    "C-2 Date Occurred",
    "C-3 Date Occurred"
]


In [6]:
for col in numerical_features+categorical_features+binary_features:
    if col not in train_df.columns:
        print(col)
print("-------")
for col in train_df.columns:
    if col not in numerical_features+categorical_features+binary_features:
        print(col)

-------
Accident Date
Claim Injury Type


# Remove Some NAs

In [7]:
train_df.dropna(subset = ["Accident Date", "Age at Injury","Birth Year","Years Past Accident","Assembly Years past Accident"], inplace=True)

In [8]:
X = train_df.drop(["Claim Injury Type"], axis = 1)
y = train_df["Claim Injury Type"]

In [9]:
# Install Ray and Cuda

# What to do with NAs in Wage (and Industry Code)

# Model Selection

    # Feature Selection

    # Kfold load (Have a script that will create 3 versions for Kfold for 4-6 folds)
    # Impute

    # Which models (catboosted)
    # Compare validation to 10% test

# Model Gridsearch
    # Find 2-3 models that are good
    # Look at the parameters and create dict with them
    # Run Ray.Tune on models and paramenter - how?
    # Compare best for each model to 10%

# Test Predict
    # Train best model
    # Predict
    # Profit

# Need to create function

# 10% data split

In [10]:
 X, X_test, y, y_test = train_test_split(X,y, test_size = 0.1, random_state = None, stratify = y, shuffle = True)

## Train Val Split

In [11]:
 X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.25, random_state = None, stratify = y, shuffle = True)

## Impute NAs

In [12]:
to_impute = ["Average Weekly Wage","Industry Code"]
percent_missing = X_train[to_impute].isnull().mean()
imputation_value = percent_missing / ((1 / 0.97) - 1)
for col in to_impute:
        X_train[col].fillna(imputation_value[col], inplace=True)
        X_val[col].fillna(imputation_value[col], inplace=True)

# Apply Feature Selection

In [13]:
features_to_drop = ["Number of Dependents","Assembly Years past Accident","Birth Year",
                   "Age at Injury","WCIO Part Of Body Code","WCIO Cause of Injury Code","Accident Date"
                   ]
features_to_scale = ["IME-4 Count","WCIO Nature of Injury Code",
                     "Years Past Accident","Industry Code","Average Weekly Wage"] + categorical_features

In [14]:
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_val[features_to_scale] = scaler.transform(X_val[features_to_scale])

In [15]:
X_train.drop(features_to_drop, axis=1, inplace=True)
X_val.drop(features_to_drop, axis=1, inplace=True)

In [16]:
def XGB_GridSearch(config):

    X_train = ray.get(X_train_ref)
    y_train = ray.get(y_train_ref)
    
    # Create and fit the model
    model = xgb.XGBClassifier(
        max_depth=config["max_depth"],
        learning_rate=config["learning_rate"],
        n_estimators=config["n_estimators"]
    )
    
    model.fit(X_train, y_train)
    
    # Validation Predictions and F1 Score
    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average="macro")

    # Report Results to Ray
    session.report({"f1_score": f1})

In [17]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.air import session

In [18]:
X_train_ref = ray.put(X_train)
y_train_ref = ray.put(y_train)

# Define the hyperparameter search space
search_space = {
    "max_depth": tune.grid_search([3, 5, 7]),
    "learning_rate": tune.grid_search([0.01, 0.1, 0.2]),
    "n_estimators": tune.grid_search([50, 100, 200])
}

# Execute the grid search
analysis = tune.run(
    XGB_GridSearch,
    config=search_space,
    resources_per_trial={"cpu": 1},  # Allocate 1 CPU per trial
    scheduler=ASHAScheduler(metric="f1_score", mode="max"),  # Manage trials efficiently
    verbose=1
)

0,1
Current time:,2024-11-19 12:34:18
Running for:,00:08:04.10
Memory:,9.6/15.8 GiB

Trial name,status,loc,learning_rate,max_depth,n_estimators,iter,total time (s),f1_score
XGB_GridSearch_77b9a_00000,TERMINATED,127.0.0.1:16360,0.01,3,50,1,39.668,0.301677
XGB_GridSearch_77b9a_00001,TERMINATED,127.0.0.1:16408,0.1,3,50,1,41.5571,0.343747
XGB_GridSearch_77b9a_00002,TERMINATED,127.0.0.1:12660,0.2,3,50,1,42.0366,0.351795
XGB_GridSearch_77b9a_00003,TERMINATED,127.0.0.1:2540,0.01,5,50,1,45.6763,0.340776
XGB_GridSearch_77b9a_00004,TERMINATED,127.0.0.1:19204,0.1,5,50,1,46.2984,0.357443
XGB_GridSearch_77b9a_00005,TERMINATED,127.0.0.1:7272,0.2,5,50,1,46.2227,0.380638
XGB_GridSearch_77b9a_00006,TERMINATED,127.0.0.1:15528,0.01,7,50,1,48.7423,0.35342
XGB_GridSearch_77b9a_00007,TERMINATED,127.0.0.1:20616,0.1,7,50,1,52.4735,0.392322
XGB_GridSearch_77b9a_00008,TERMINATED,127.0.0.1:15948,0.2,7,50,1,59.5732,0.412069
XGB_GridSearch_77b9a_00009,TERMINATED,127.0.0.1:4052,0.01,3,100,1,90.241,0.304668


2024-11-19 12:34:18,354	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/ruipb/ray_results/XGB_GridSearch_2024-11-19_12-26-14' in 0.0612s.
2024-11-19 12:34:18,374	INFO tune.py:1041 -- Total run time: 484.37 seconds (484.03 seconds for the tuning loop).


In [19]:
# Retrieve the best configuration
best_config = analysis.get_best_config(metric="f1_score", mode="max")
print(f"Best hyperparameters: {best_config}")

Best hyperparameters: {'max_depth': 5, 'learning_rate': 0.2, 'n_estimators': 200}


In [20]:
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])
X_test.drop(features_to_drop, axis=1, inplace=True)

In [21]:
# Instantiate the model with the best configuration
model = xgb.XGBClassifier(**best_config)

# Train the model on the training dataset
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.49      0.59      1074
           1       0.86      0.93      0.90     28803
           2       0.49      0.07      0.13      6862
           3       0.66      0.90      0.76     14791
           4       0.67      0.53      0.59      4826
           5       0.00      0.00      0.00       421
           6       0.00      0.00      0.00        10
           7       0.43      0.21      0.29        47

    accuracy                           0.77     56834
   macro avg       0.48      0.39      0.41     56834
weighted avg       0.74      0.77      0.73     56834



In [None]:
#import joblib

#joblib.dump(model, "best_xgb_model.pkl")

# model = joblib.load("best_xgb_model.pkl")