# **Imported libraries**

#### **Encoding Categorical Features**

In [66]:
# Names of every column stored as object or category dtype in the training frame
categorical_features = (
    engineered_data_train
    .select_dtypes(include=['object', 'category'])
    .columns
)
categorical_features

Index(['Alternative Dispute Resolution', 'Attorney/Representative',
       'Carrier Name', 'Carrier Type', 'County of Injury',
       'COVID-19 Indicator', 'District Name', 'Gender', 'Industry Code',
       'Industry Code Description', 'Medical Fee Region',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'WCIO Part Of Body Code', 'WCIO Part Of Body Description', 'Zip Code',
       'Carrier Name Standardized', 'Accident_Month', 'Accident_DayOfWeek'],
      dtype='object')

In [67]:
# Summarise column dtypes and non-null counts of the engineered training frame
engineered_data_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 401818 entries, 5479022 to 5566085
Data columns (total 46 columns):
 #   Column                             Non-Null Count   Dtype   
---  ------                             --------------   -----   
 0   Age at Injury                      401818 non-null  Int64   
 1   Alternative Dispute Resolution     401818 non-null  category
 2   Attorney/Representative            401818 non-null  category
 3   Average Weekly Wage                401818 non-null  float64 
 4   Birth Year                         401818 non-null  Int64   
 5   Carrier Name                       401818 non-null  category
 6   Carrier Type                       401818 non-null  category
 7   County of Injury                   401818 non-null  category
 8   COVID-19 Indicator                 401818 non-null  category
 9   District Name                      401818 non-null  category
 10  Gender                             401818 non-null  category
 11  IME-4 Count             

In [68]:
# Work on independent copies so the engineered frames are not modified by the encoding steps
train_data_encoded, test_data_encoded, val_data_encoded = (
    split.copy()
    for split in (engineered_data_train, engineered_data_test, engineered_data_val)
)

In [69]:
# Cast Birth Year and IME-4 Count to plain int in the train, validation and test frames
for split_frame in (train_data_encoded, val_data_encoded, test_data_encoded):
    for int_column in ('Birth Year', 'IME-4 Count'):
        split_frame[int_column] = split_frame[int_column].astype("int")

In [70]:
# Encoder configured with the 'frequency' encoding method
frequency = CountFrequencyEncoder(encoding_method='frequency')

# Learn the per-category mappings from the training split only and encode that split
train_categoricals = train_data_encoded[categorical_features]
train_data_encoded[categorical_features] = frequency.fit_transform(train_categoricals)

# Helper that encodes a dataset and maps categories the encoder never saw to 0
def safe_transform(encoder, X, train_data=None):
    """
    Transform ``X`` with an already-fitted encoder, replacing unseen categories by 0.

    The columns to clean are taken from the encoder itself (``encoder_dict_``),
    not from a notebook-level variable, so the helper also works when the
    encoder is reloaded from disk in another session.

    Parameters
    ----------
    encoder : fitted encoder
        Must expose ``transform(X)`` and a fitted ``encoder_dict_`` mapping
        whose keys are the names of the encoded columns.
    X : pandas.DataFrame
        Data to encode.
    train_data : optional
        Unused. Kept only so existing three-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The result of ``encoder.transform(X)`` where NaN values in the encoded
        columns are replaced by 0. Columns the encoder did not encode are
        returned exactly as the encoder produced them.
    """
    transformed = encoder.transform(X)
    for col in encoder.encoder_dict_:
        if col in transformed.columns:
            # NaN here means the category was absent when the encoder was fitted
            transformed[col] = transformed[col].fillna(0)
    return transformed

# Encode the validation and test splits with the encoder fitted on the training split
for split_frame in (val_data_encoded, test_data_encoded):
    split_frame[categorical_features] = safe_transform(
        frequency, split_frame[categorical_features], train_data_encoded
    )

In [71]:
# Persist the fitted frequency encoder so it can be reloaded later
with open('frequency_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(frequency, encoder_file)

In [72]:
# Count missing values per column in the encoded test frame
test_data_encoded.isna().sum()

Age at Injury                        0
Alternative Dispute Resolution       0
Attorney/Representative              0
Average Weekly Wage                  0
Birth Year                           0
Carrier Name                         0
Carrier Type                         0
County of Injury                     0
COVID-19 Indicator                   0
District Name                        0
Gender                               0
IME-4 Count                          0
Industry Code                        0
Industry Code Description            0
Medical Fee Region                   0
WCIO Cause of Injury Code            0
WCIO Cause of Injury Description     0
WCIO Nature of Injury Code           0
WCIO Nature of Injury Description    0
WCIO Part Of Body Code               0
WCIO Part Of Body Description        0
Zip Code                             0
Number of Dependents                 0
Accident_Date_Flag                   0
Age_Outlier_Flag                     0
C2_After_C3_Flag         

In [73]:
# Preview the first rows of the encoded training frame
train_data_encoded.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,District Name,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents,Accident_Date_Flag,Age_Outlier_Flag,C2_After_C3_Flag,C2_After_Assembly_Flag,C2_After_First_Hearing_Flag,C3_After_Assembly_Flag,C3_After_First_Hearing_Flag,C-3 Date Converted,First Hearing Date Converted,Carrier Name Standardized,IME-4 Count Converted,valid_Zip_Code,Accident_Month,Accident_DayOfWeek,Days_Accident_to_Assembly,Days_Accident_to_C2,Days_Accident_to_C3,Days_Accident_to_First_Hearing,Region_Risk_Score,County_Claims_Normalized,Industry_Claim_Volume,Industry_Avg_Age,Industry_Wage_Rank
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
5479022,24,0.995,0.684,0.0,1996,0.193,0.193,0.008,0.952,0.078,0.584,0,0.004,0.004,0.236,0.021,0.021,0.295,0.295,0.043,0.043,0.0,3,0,0,0,0,0,0,0,0,0,0.193,0,1,0.075,0.176,3,3,0,10833,0.485,0.076,1674.0,41.146,0.5
6023025,36,0.995,0.316,1097.0,1986,0.001,0.498,0.102,0.952,0.473,0.584,4,0.018,0.067,0.464,0.055,0.055,0.193,0.193,0.009,0.009,0.005,2,0,0,1,1,0,0,0,1,1,0.001,1,1,0.075,0.176,46,65,37,111,0.913,0.965,7088.0,41.619,0.5
5851908,41,0.995,0.316,623.35,1980,0.025,0.498,0.042,0.952,0.07,0.407,2,0.03,0.067,0.236,0.109,0.109,0.295,0.295,0.066,0.066,0.002,3,0,0,1,0,0,0,0,1,1,0.025,1,1,0.075,0.174,173,173,169,674,0.485,0.396,12192.0,44.952,0.5
5913931,59,0.995,0.316,1042.13,1963,0.025,0.498,0.024,0.952,0.15,0.584,2,0.018,0.067,0.236,0.021,0.021,0.097,0.097,0.017,0.017,0.002,1,0,0,0,0,0,0,0,0,1,0.025,1,1,0.084,0.17,1,1,0,284,0.485,0.227,7088.0,41.619,0.5
5845976,58,0.995,0.684,0.0,1963,0.005,0.498,0.093,0.952,0.473,0.407,0,0.216,0.216,0.464,0.051,0.051,0.022,0.022,0.04,0.04,0.004,5,0,0,0,0,0,0,0,0,0,0.005,0,1,0.086,0.176,38,38,0,10329,0.913,0.884,86849.0,44.231,0.5


#### **Data Scaling**

In [75]:
# Start the scaled frames as copies of the encoded ones (the scaling cell below reassigns all three)
train_data_scaled, val_data_scaled, test_data_scaled = (
    encoded.copy()
    for encoded in (train_data_encoded, val_data_encoded, test_data_encoded)
)

In [76]:
# Fit the scaler on the training frame only; validation and test reuse the fitted scaler
scaler = RobustScaler()
scaler.fit(train_data_encoded)


def _scaled_frame(frame):
    """Return `frame` transformed by the fitted scaler, keeping its columns and index."""
    return pd.DataFrame(
        scaler.transform(frame),
        columns=frame.columns,
        index=frame.index,
    )


train_data_scaled = _scaled_frame(train_data_encoded)

# Persist the fitted scaler so the same transformation can be reapplied later
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

val_data_scaled = _scaled_frame(val_data_encoded)
test_data_scaled = _scaled_frame(test_data_encoded)

In [77]:
# Preview the first rows of the scaled training frame
train_data_scaled.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,District Name,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents,Accident_Date_Flag,Age_Outlier_Flag,C2_After_C3_Flag,C2_After_Assembly_Flag,C2_After_First_Hearing_Flag,C3_After_Assembly_Flag,C3_After_First_Hearing_Flag,C-3 Date Converted,First Hearing Date Converted,Carrier Name Standardized,IME-4 Count Converted,valid_Zip_Code,Accident_Month,Accident_DayOfWeek,Days_Accident_to_Assembly,Days_Accident_to_C2,Days_Accident_to_C3,Days_Accident_to_First_Hearing,Region_Risk_Score,County_Claims_Normalized,Industry_Claim_Volume,Industry_Avg_Age,Industry_Wage_Rank
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
5479022,-0.783,0.0,0.0,0.0,0.783,6.407,-0.065,-0.609,0.0,-0.183,0.0,0.0,-0.558,-0.842,0.0,-0.219,-0.499,0.791,0.791,0.0,0.0,-0.666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.306,0.0,0.0,-3.67,0.317,-0.368,-0.304,0.0,0.064,-1.209,-0.609,-0.558,-0.428,8.914
6023025,-0.261,0.0,-1.0,1.374,0.348,-0.316,0.935,0.674,0.0,0.821,0.0,4.0,-0.453,-0.256,0.721,0.698,0.429,0.384,0.384,-0.64,-0.648,1.537,-0.25,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,-0.417,1.0,0.0,-3.67,0.317,1.895,2.391,1.947,-1.034,0.0,0.674,-0.453,-0.252,1.711
5851908,-0.043,0.0,-1.0,0.781,0.087,0.538,0.935,-0.147,0.0,-0.202,-1.0,2.0,-0.355,-0.256,0.0,2.109,1.856,0.791,0.791,0.418,0.423,0.378,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.437,1.0,0.0,-3.67,0.249,8.579,7.087,8.895,-0.976,-1.209,-0.147,-0.355,0.992,0.778
5913931,0.739,0.0,-1.0,1.305,-0.652,0.538,0.935,-0.391,0.0,0.0,0.0,2.0,-0.453,-0.256,0.0,-0.212,-0.491,0.0,0.0,-0.491,-0.497,0.318,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.437,1.0,0.0,-0.529,0.0,-0.474,-0.391,0.0,-1.016,-1.209,-0.391,-0.453,-0.252,1.711
5845976,0.696,0.0,0.0,0.0,-0.652,-0.16,0.935,0.558,0.0,0.821,-1.0,0.0,1.086,1.142,0.721,0.58,0.309,-0.299,-0.299,-0.074,-0.074,1.271,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.261,0.0,0.0,0.064,0.317,1.474,1.217,0.0,0.012,0.0,0.558,1.086,0.723,-0.335


In [78]:
# (rows, columns) of the scaled training frame
train_data_scaled.shape

(401818, 46)

In [138]:
# Subset of features that is exported for modelling
columns_modelling = [
    'Days_Accident_to_First_Hearing',
    'IME-4 Count',
    'Average Weekly Wage',
    'Attorney/Representative',
    'WCIO Cause of Injury Code',
    'WCIO Part Of Body Code',
    'Region_Risk_Score',
    'C2_After_C3_Flag',
    'C3_After_Assembly_Flag',
    'C-3 Date Converted',
    'Days_Accident_to_C3',
    'Carrier Name',
    'Birth Year',
    'Days_Accident_to_Assembly',
    'District Name',
    'Medical Fee Region',
    'Accident_Month',
    'Age_Outlier_Flag',
    'County_Claims_Normalized',
]

### **Save preprocessed data**

In [140]:
# Write the modelling columns of each scaled frame to CSV, index included
for file_name, scaled_split in (
    ('train_data_scaled.csv', train_data_scaled),
    ('val_data_scaled.csv', val_data_scaled),
    ('test_data_scaled.csv', test_data_scaled),
):
    scaled_split[columns_modelling].to_csv(file_name, index=True)

# Write the training and validation targets under a fixed column header
for file_name, target in (('y_train.csv', y_train), ('y_val.csv', y_val)):
    target.to_csv(file_name, index=True, header=['Claim Injury Type'])