
- 20240536 Inês Jacinto 20240536@novaims.unl.pt
- 20240561 Antônio Ramos 20240561@novaims.unl.pt 
- 20240598 Sofia Jacinto 20240598@novaims.unl.pt 
- 20240750 Marisa Marques 20240750@novaims.unl.pt

In [1]:
# General Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# embedded methods
from sklearn.linear_model import LassoCV
import scipy.stats as stats
from scipy.stats import chi2_contingency

from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import classification_report, f1_score

from util_train import *
from utils import *
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

In [2]:
!pip install ray



In [3]:
# Importing the Ray library for distributed parallel processing
import ray

In [4]:
# Initializing Ray, necessary for parallel distributed processing
ray.init()

2024-12-06 14:33:07,411	INFO worker.py:1518 -- Started a local Ray instance.


0,1
Python version:,3.9.12
Ray version:,2.0.0


In [5]:
#Set the reference date
reference_date = pd.to_datetime("2023-01-01")

In [6]:
# Loading the dataset after EDA (Exploratory Data Analysis)
train_df = pd.read_csv("train_data_after_EDA.csv", index_col="Claim Identifier")

In [7]:
numerical_features = [
    "Age at Injury",
    "WCIO Cause of Injury Code",
    "WCIO Nature of Injury Code",
    "WCIO Part Of Body Code",
    "Number of Dependents",
    "Years Past Accident",
    "Assembly Years past Accident",
    "Industry Code",
    "Birth Year",
    "Average Weekly Wage",
    "IME-4 Count"
]

categorical_features = [
    "Carrier Name",
    "Carrier Type",
    "County of Injury",
    "District Name",
    "Gender",
    "Medical Fee Region",
    "Zip Code"
]

binary_features = [
    "Alternative Dispute Resolution",
    "Attorney/Representative",
    "COVID-19 Indicator",
    "First Hearing Date Occurred",
    "C-2 Date Occurred",
    "C-3 Date Occurred"
]


In [8]:
# Separating the independent variables (X) and the target variable (y) for the model
X = train_df.drop(["Claim Injury Type"], axis = 1)
y = train_df["Claim Injury Type"]

In [9]:
# Splitting the data into training (90%) and test (10%) sets
X, X_test, y, y_test = train_test_split(X,y, test_size = 0.1, random_state = None, stratify = y, shuffle = True)

In [10]:
# Further splitting the training data into train (75%) and validation (25%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.25, random_state = None, stratify = y)

In [11]:
# Defining the features to be dropped and to be scaled before training
features_to_drop = ["Number of Dependents","Assembly Years past Accident","Birth Year",
                   "Age at Injury","WCIO Part Of Body Code","WCIO Cause of Injury Code","Accident Date"]
features_to_scale = ["IME-4 Count","WCIO Nature of Injury Code",
                     "Years Past Accident","Industry Code","Average Weekly Wage"] + categorical_features

In [12]:
from sklearn.impute import SimpleImputer

In [13]:
# Imputing missing values in the selected columns
to_impute = ["Average Weekly Wage","Industry Code"]
imputer = SimpleImputer(strategy="mean")
X_train[to_impute] = imputer.fit_transform(X_train[to_impute])
X_val[to_impute] = imputer.transform(X_val[to_impute])

In [14]:
# Scaling the numerical columns with StandardScaler
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_val[features_to_scale] = scaler.transform(X_val[features_to_scale])

In [15]:
# Dropping the unnecessary columns
X_train.drop(features_to_drop, axis=1, inplace=True)
X_val.drop(features_to_drop, axis=1, inplace=True)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [17]:
# Function to execute GridSearch using Ray
def RF_GridSearch(config):
    # Retrieving data references from Ray
    X_train = ray.get(X_train_ref)
    y_train = ray.get(y_train_ref)
    # Initializing the RandomForest model with the hyperparameters from the search
    rf = RandomForestClassifier(
        max_depth=config["max_depth"],
        n_estimators=config["n_estimators"],
        random_state=42)
    # Training the model
    rf.fit(X_train, y_train)
    # Returning the F1 score on the validation set
    return {"f1_score": rf.score(X_val, y_val)}

In [18]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler

In [19]:
# Putting data references into Ray for parallel execution
X_train_ref = ray.put(X_train)
y_train_ref = ray.put(y_train)

# Define the hyperparameter search space
search_space = {"max_depth": tune.grid_search([3, 5, 7]),
                "n_estimators": tune.grid_search([50, 100, 200])}

# Execute the grid search
analysis = tune.run(
    RF_GridSearch,
    config=search_space,
    resources_per_trial={"cpu": 1},  
    scheduler=ASHAScheduler(metric="f1_score", mode="max"),  
    verbose=1)

2024-12-06 14:37:17,411	INFO tune.py:758 -- Total run time: 242.42 seconds (241.57 seconds for the tuning loop).


In [20]:
# Retrieving the best configuration
best_config = analysis.get_best_config(metric="f1_score", mode="max")
print(f"Best hyperparameters: {best_config}")

Best hyperparameters: {'max_depth': 7, 'n_estimators': 200}


In [21]:
# Preprocessing the test data
X_test.drop(features_to_drop, axis=1, inplace=True)
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [22]:
# Instantiate the model with the best configuration
model = RandomForestClassifier(**best_config)

# Fit the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

In [23]:
# Printing the classification report with performance metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.36      0.50      1017
           1       0.84      0.99      0.91     27522
           2       0.60      0.04      0.08      6646
           3       0.67      0.91      0.77     14239
           4       0.76      0.31      0.44      4644
           5       0.00      0.00      0.00       403
           6       0.00      0.00      0.00         9
           7       0.00      0.00      0.00        41

    accuracy                           0.77     54521
   macro avg       0.46      0.33      0.34     54521
weighted avg       0.75      0.77      0.72     54521

