In [1]:
import pandas as pd
import numpy as np

In [2]:
# Columns identified for removal
DROP_COLUMNS = [
    'accidentdate',
    'claimdate',
]

# Columns to convert to binary
BINARY_COLUMNS = [
    'exceptionalcircumstances',
    'minorpsychologicalinjury',
    'whiplash',
    'policereportfiled',
    'witnesspresent',
]

# Categorical columns for one-hot encoding
CATEGORY_COLUMNS = [
    'dominantinjury',
    'vehicletype',
    'weatherconditions',
    'gender',
    'accidenttype',
    'numberofpassengers',
    'accidentdescription',
    'injurydescription',
]

# Numeric data columns
NUMERIC_COLUMNS = [
    'specialreduction',
    'specialrehabilitation',
    'specialmedications',
    'specialhealthexpenses',
    'specialoverage',
    'generalrest',
    'driverage',
    'specialadditionalinjury',
    'specialearningsloss',
    'specialusageloss',
    'specialassetdamage',
    'specialfixes',
    'generaluplift',
    'specialtripcosts',
    'specialloanervehicle',
    'specialjourneyexpenses',
    'specialtherapy',
    'vehicleage',
]

# Columns that require special handling
SPECIAL_COLUMN = [
    'injuryprognosis',
]

# Target value
TARGET_COLUMN = 'settlementvalue'


In [3]:
# Load the cleaned CSV export into a DataFrame
ml_dataset = pd.read_csv('./dataset-excel-cleaned.csv')

# Report the row count so it can be checked against the expected dataset size
print(f"Number of records imported: {len(ml_dataset.index)}")

Number of records imported: 4894


In [4]:
# Quick exploratory pass over the raw frame.
# NOTE(review): only `info()` prints on its own, and a notebook cell renders
# only its final expression (`dtypes`). The expressions in between are
# evaluated and their results discarded; wrap them in `display(...)` or move
# them to separate cells if their output is wanted.
ml_dataset.info() # General info (prints directly)
ml_dataset.describe() # Summary statistics of the numeric columns
ml_dataset.head(10) # Peek at the first 10 rows
ml_dataset.isnull().sum() # Number of null values per column
ml_dataset.duplicated() # Boolean flag per row: is it a repeat of an earlier row?
ml_dataset.nunique() # Number of unique values per column
ml_dataset.dtypes # Data type of each column (the only rendered result)
# ml_dataset.corr() # Check for any correlations

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4894 entries, 0 to 4893
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   SettlementValue             4894 non-null   int64  
 1   Injury_Prognosis            4748 non-null   float64
 2   Injury Description          4894 non-null   object 
 3   Dominant injury             4894 non-null   object 
 4   Whiplash                    4771 non-null   float64
 5   Minor_Psychological_Injury  4787 non-null   float64
 6   Exceptional_Circumstances   4797 non-null   float64
 7   GeneralFixed                4780 non-null   float64
 8   GeneralUplift               4768 non-null   float64
 9   GeneralRest                 4775 non-null   float64
 10  SpecialHealthExpenses       4777 non-null   float64
 11  SpecialTherapy              4768 non-null   float64
 12  SpecialRehabilitation       4787 non-null   float64
 13  SpecialMedications          4767 

SettlementValue                 int64
Injury_Prognosis              float64
Injury Description             object
Dominant injury                object
Whiplash                      float64
Minor_Psychological_Injury    float64
Exceptional_Circumstances     float64
GeneralFixed                  float64
GeneralUplift                 float64
GeneralRest                   float64
SpecialHealthExpenses         float64
SpecialTherapy                float64
SpecialRehabilitation         float64
SpecialMedications            float64
SpecialAdditionalInjury       float64
SpecialEarningsLoss           float64
SpecialUsageLoss              float64
SpecialReduction              float64
SpecialOverage                float64
SpecialAssetDamage            float64
SpecialFixes                  float64
SpecialLoanerVehicle          float64
SpecialTripCosts              float64
SpecialJourneyExpenses        float64
AccidentType                   object
Accident Description           object
Vehicle Type

In [5]:
import janitor  # pyjanitor: supplies the clean_names / remove_empty / get_dupes DataFrame methods used below

# Standardise the column names and discard empty rows/columns.
clean_df = ml_dataset.clean_names().remove_empty()

# Strip any underscores left in the column names so they match the
# underscore-free names used in the column-group constants.
clean_df = clean_df.rename(columns=lambda name: name.replace("_", ""))

# `get_dupes()` returns the duplicated *rows* (a DataFrame), not a boolean
# mask, so it cannot be negated and used to index `clean_df`. Use it only to
# detect duplicates and let pandas drop the repeats, keeping the first
# occurrence of each duplicated row.
duplicates = clean_df.get_dupes()
if len(duplicates.index) > 0:
    clean_df = clean_df.drop_duplicates()

print("Records remaining after initial processing: " + str(len(clean_df.index)))

Records remaining after initial processing: 4894


In [6]:
# Back-fill each date column from the other one, then discard the rows
# where both dates are still missing.
clean_df = (
    clean_df
    .assign(accidentdate=lambda frame: frame['accidentdate'].fillna(frame['claimdate']))
    .assign(claimdate=lambda frame: frame['claimdate'].fillna(frame['accidentdate']))
    .dropna(subset=['accidentdate', 'claimdate'])
)
print(f"Records remaining after date processing: {len(clean_df.index)}")

Records remaining after date processing: 4887


In [7]:
# Cap numberofpassengers at 2 on rows whose vehicletype is 'Motorcycle'
is_motorcycle = clean_df['vehicletype'] == 'Motorcycle'
capped_passengers = clean_df.loc[is_motorcycle, 'numberofpassengers'].clip(upper=2)
clean_df.loc[is_motorcycle, 'numberofpassengers'] = capped_passengers

In [8]:
# When 'generalfixed' is missing and the row's 'settlementvalue' is one of the
# values already observed in 'generalfixed', copy the settlement value across.
distinct_values = clean_df['generalfixed'].dropna().unique()
fixed_is_missing = clean_df['generalfixed'].isna()
settlement_is_known_fixed = clean_df['settlementvalue'].isin(distinct_values)
fillable_rows = fixed_is_missing & settlement_is_known_fixed
clean_df.loc[fillable_rows, 'generalfixed'] = clean_df.loc[fillable_rows, 'settlementvalue']

In [9]:
# Create bins for injury prognosis based on the whiplash tariff scale.
# The binned label is written to a new 'prognosisgroup' column; later cells
# show labels such as "(3.0, 6.0]", i.e. right-closed intervals.
# NOTE(review): `right=True` is presumably forwarded to pandas.cut; if so, a
# prognosis of exactly 0 falls outside the first bin (0, 3] and would get a
# missing group — confirm whether 0 can occur in the data.
import numpy as np
bin_edges = [0, 3, 6, 9, 12, 15, 18, 24, np.inf]
clean_df = clean_df.bin_numeric(from_column_name='injuryprognosis',
                    to_column_name='prognosisgroup',
                    bins=bin_edges,
                    right=True)

In [10]:
# Fill missing 'generalfixed' values based on 'prognosisgroup'.
# Each row with a missing 'generalfixed' receives the first non-missing
# 'generalfixed' found (in row order) among rows of the same prognosis group.
# Rows whose group is missing, or whose group has no known value, stay NaN.
#
# This replaces a row-wise `apply` that rescanned the whole frame twice per
# row; here the frame is scanned once per affected group instead.
fixed_missing = clean_df['generalfixed'].isna()  # snapshot taken before any fill
for group in clean_df.loc[fixed_missing, 'prognosisgroup'].dropna().unique():
    in_group = clean_df['prognosisgroup'] == group
    # Donors are restricted to values that were known before this cell ran.
    donors = clean_df.loc[in_group & ~fixed_missing, 'generalfixed']
    if not donors.empty:
        clean_df.loc[in_group & fixed_missing, 'generalfixed'] = donors.iloc[0]

In [11]:
# Fill missing prognosisgroup using generalfixed matches.
# Each row with a missing 'prognosisgroup' receives the first non-missing
# group found (in row order) among rows with the same 'generalfixed' value.
# Rows with a missing 'generalfixed', or with no matching donor, stay missing.
#
# This replaces a row-wise `apply` that rescanned the whole frame twice per
# row; here the frame is scanned once per distinct 'generalfixed' value that
# actually needs a group. Values are assigned in place, so the column keeps
# the dtype it had before this cell.
group_missing = clean_df['prognosisgroup'].isna()  # snapshot taken before any fill
for fixed_value in clean_df.loc[group_missing, 'generalfixed'].dropna().unique():
    same_fixed = clean_df['generalfixed'] == fixed_value
    # Donors are restricted to groups that were known before this cell ran.
    donors = clean_df.loc[same_fixed & ~group_missing, 'prognosisgroup']
    if not donors.empty:
        clean_df.loc[same_fixed & group_missing, 'prognosisgroup'] = donors.iloc[0]

In [12]:
# Rows still missing 'generalfixed' are dropped: the value cannot be
# reliably imputed from the available data.
rows_with_generalfixed = clean_df['generalfixed'].notna()
clean_df = clean_df[rows_with_generalfixed]
print(f"Records remaining after generalfixed processing: {len(clean_df.index)}")

Records remaining after generalfixed processing: 4878


In [13]:
# Preview the first 20 rows after the generalfixed / prognosisgroup fills
clean_df.head(20)



Unnamed: 0,settlementvalue,injuryprognosis,injurydescription,dominantinjury,whiplash,minorpsychologicalinjury,exceptionalcircumstances,generalfixed,generaluplift,generalrest,...,weatherconditions,vehicleage,driverage,numberofpassengers,policereportfiled,witnesspresent,gender,accidentdate,claimdate,prognosisgroup
0,520,5.0,Whiplash and minor bruises.,Arms,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,13.0,33.0,2,1,1,Male,10/11/2023 11:22,11/06/2024 11:22,"(3.0, 6.0]"
1,870,2.0,Minor cuts and scrapes.,Multiple,1.0,1.0,0.0,260.0,0.0,520.0,...,Snowy,4.0,45.0,2,1,1,Female,25/06/2023 00:55,09/01/2024 00:55,"(0.0, 3.0]"
2,2140,7.0,Whiplash and minor bruises.,Legs,1.0,0.0,0.0,840.0,0.0,1400.0,...,Sunny,9.0,45.0,2,1,0,Female,23/02/2020 17:43,01/03/2020 17:43,"(6.0, 9.0]"
3,520,4.0,Minor cuts and scrapes.,Arms,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,5.0,62.0,1,1,1,Female,02/10/2021 04:36,13/10/2021 04:36,"(3.0, 6.0]"
4,260,3.0,Concussion and bruised ribs.,Arms,0.0,1.0,0.0,260.0,0.0,0.0,...,Rainy,9.0,78.0,1,1,1,Other,02/04/2023 05:13,14/04/2023 05:13,"(0.0, 3.0]"
5,520,6.0,Minor cuts and scrapes.,Multiple,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,10.0,55.0,1,1,0,Other,12/03/2020 13:18,30/06/2020 13:18,"(3.0, 6.0]"
6,1015,,Whiplash and minor bruises.,Legs,1.0,1.0,0.0,520.0,0.0,0.0,...,Sunny,5.0,72.0,1,0,0,Female,08/04/2023 08:24,19/08/2023 08:24,"(3.0, 6.0]"
7,1032,8.0,Concussion and bruised ribs.,Legs,1.0,0.0,0.0,840.0,0.0,0.0,...,Sunny,9.0,77.0,4,0,1,Female,14/10/2021 11:00,19/02/2022 11:00,"(6.0, 9.0]"
8,808,6.0,Minor cuts and scrapes.,Arms,0.0,1.0,0.0,520.0,0.0,0.0,...,Snowy,13.0,23.0,2,1,1,Female,09/09/2020 15:07,04/02/2021 15:07,"(3.0, 6.0]"
9,500,2.0,Minor cuts and scrapes.,Hips,1.0,1.0,0.0,260.0,0.0,0.0,...,Rainy,19.0,59.0,2,1,1,Female,01/09/2021 12:37,25/10/2021 12:37,"(0.0, 3.0]"


In [14]:
# Snapshot of the frame taken before injuryprognosis is filled in
imputed_prognosis = clean_df.copy()

# For every row without an injuryprognosis, use the rounded midpoint of the
# row's prognosisgroup interval. Rows whose group is not an interval-like
# object (no `left` / `right` attributes, e.g. a missing group) are skipped.
# NOTE(review): for the open-ended (24, inf] bin the midpoint is inf — confirm
# that no row with a missing prognosis falls in that bin.
rows_missing_prognosis = clean_df[clean_df['injuryprognosis'].isna()]
for row_label, claim in rows_missing_prognosis.iterrows():
    group = claim['prognosisgroup']
    if not (hasattr(group, 'left') and hasattr(group, 'right')):
        continue
    midpoint = (group.left + group.right) / 2
    clean_df.at[row_label, 'injuryprognosis'] = midpoint.round(0)

# Verify the number of remaining null values in injuryprognosis
remaining_nulls = clean_df['injuryprognosis'].isna().sum()
print(f"Remaining null values in injuryprognosis: {remaining_nulls}")

Remaining null values in injuryprognosis: 0


In [15]:
# Preview the first 20 rows after the injuryprognosis midpoint fill
clean_df.head(20)

Unnamed: 0,settlementvalue,injuryprognosis,injurydescription,dominantinjury,whiplash,minorpsychologicalinjury,exceptionalcircumstances,generalfixed,generaluplift,generalrest,...,weatherconditions,vehicleage,driverage,numberofpassengers,policereportfiled,witnesspresent,gender,accidentdate,claimdate,prognosisgroup
0,520,5.0,Whiplash and minor bruises.,Arms,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,13.0,33.0,2,1,1,Male,10/11/2023 11:22,11/06/2024 11:22,"(3.0, 6.0]"
1,870,2.0,Minor cuts and scrapes.,Multiple,1.0,1.0,0.0,260.0,0.0,520.0,...,Snowy,4.0,45.0,2,1,1,Female,25/06/2023 00:55,09/01/2024 00:55,"(0.0, 3.0]"
2,2140,7.0,Whiplash and minor bruises.,Legs,1.0,0.0,0.0,840.0,0.0,1400.0,...,Sunny,9.0,45.0,2,1,0,Female,23/02/2020 17:43,01/03/2020 17:43,"(6.0, 9.0]"
3,520,4.0,Minor cuts and scrapes.,Arms,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,5.0,62.0,1,1,1,Female,02/10/2021 04:36,13/10/2021 04:36,"(3.0, 6.0]"
4,260,3.0,Concussion and bruised ribs.,Arms,0.0,1.0,0.0,260.0,0.0,0.0,...,Rainy,9.0,78.0,1,1,1,Other,02/04/2023 05:13,14/04/2023 05:13,"(0.0, 3.0]"
5,520,6.0,Minor cuts and scrapes.,Multiple,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,10.0,55.0,1,1,0,Other,12/03/2020 13:18,30/06/2020 13:18,"(3.0, 6.0]"
6,1015,4.0,Whiplash and minor bruises.,Legs,1.0,1.0,0.0,520.0,0.0,0.0,...,Sunny,5.0,72.0,1,0,0,Female,08/04/2023 08:24,19/08/2023 08:24,"(3.0, 6.0]"
7,1032,8.0,Concussion and bruised ribs.,Legs,1.0,0.0,0.0,840.0,0.0,0.0,...,Sunny,9.0,77.0,4,0,1,Female,14/10/2021 11:00,19/02/2022 11:00,"(6.0, 9.0]"
8,808,6.0,Minor cuts and scrapes.,Arms,0.0,1.0,0.0,520.0,0.0,0.0,...,Snowy,13.0,23.0,2,1,1,Female,09/09/2020 15:07,04/02/2021 15:07,"(3.0, 6.0]"
9,500,2.0,Minor cuts and scrapes.,Hips,1.0,1.0,0.0,260.0,0.0,0.0,...,Rainy,19.0,59.0,2,1,1,Female,01/09/2021 12:37,25/10/2021 12:37,"(0.0, 3.0]"


In [16]:
# A settlement equal to generalfixed is treated as a whiplash claim, so a
# missing 'whiplash' flag on such rows is set to 1.
settled_at_fixed = clean_df['settlementvalue'] == clean_df['generalfixed']
whiplash_unknown = clean_df['whiplash'].isna()
clean_df.loc[settled_at_fixed & whiplash_unknown, 'whiplash'] = 1

In [17]:
# On rows where the settlement equals generalfixed, replace missing
# generaluplift / exceptionalcircumstances values with 0. Values that are
# already present are left as they are.
settled_at_fixed = clean_df['settlementvalue'] == clean_df['generalfixed']
for column_name in ('generaluplift', 'exceptionalcircumstances'):
    value_missing = clean_df[column_name].isna()
    clean_df.loc[settled_at_fixed & value_missing, column_name] = 0

In [18]:
# Fill the remaining missing 'generaluplift' values from 'exceptionalcircumstances':
#   - exceptionalcircumstances == 0 -> uplift is 0
#   - exceptionalcircumstances == 1 -> uplift is 0.2 * generalfixed
# Rows whose exceptionalcircumstances is neither 0 nor 1 (e.g. missing) are untouched.
# NOTE(review): the 0.2 factor presumably encodes a 20% uplift rule — confirm.
uplift_missing = clean_df['generaluplift'].isna()
without_exceptional = uplift_missing & (clean_df['exceptionalcircumstances'] == 0)
with_exceptional = uplift_missing & (clean_df['exceptionalcircumstances'] == 1)

clean_df.loc[without_exceptional, 'generaluplift'] = 0
clean_df.loc[with_exceptional, 'generaluplift'] = clean_df.loc[with_exceptional, 'generalfixed'] * 0.2

In [19]:
# Rows still missing 'generaluplift' are dropped: the value cannot be
# reliably imputed from the available data.
rows_with_generaluplift = clean_df['generaluplift'].notna()
clean_df = clean_df[rows_with_generaluplift]
print(f"Records remaining after generaluplift processing: {len(clean_df.index)}")


Records remaining after generaluplift processing: 4874


In [22]:
from sklearn.impute import KNNImputer

def knn_impute(df, columns_to_impute, n_neighbors=5):
    """Impute missing values in the selected columns with a KNN imputer.

    The imputer is fitted on, and applied to, only ``df[columns_to_impute]``;
    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to impute.
    columns_to_impute : list of str
        Column names passed to the imputer. They are used both as the
        features for the neighbour search and as the columns that get filled.
    n_neighbors : int, default 5
        Number of neighbours handed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``columns_to_impute`` and the same index
        as ``df``. Every value in it - imputed or originally present - is
        rounded to 0 decimal places.

    Raises
    ------
    ValueError
        If the imputer returns a different number of columns than requested,
        since the output could then not be mapped back to the column names.
    """
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_values = imputer.fit_transform(df[columns_to_impute])

    if imputed_values.shape[1] != len(columns_to_impute):
        raise ValueError(
            f"KNNImputer returned {imputed_values.shape[1]} columns for "
            f"{len(columns_to_impute)} requested columns; cannot map the "
            "result back to the column names."
        )

    # fit_transform returns a plain array with one row per input row, in input
    # order, so attaching df.index directly keeps every row aligned with df.
    imputed_df = pd.DataFrame(imputed_values, columns=columns_to_impute, index=df.index)
    return imputed_df.round(0)

In [None]:
# KNN_COLUMNS = ['whiplash','minorpsychologicalinjury','exceptionalcircumstances','generalfixed','generaluplift']
# copy_df = clean_df.copy()
# imputed_df = knn_impute(copy_df, KNN_COLUMNS)
# imputed_df.head()

Unnamed: 0,whiplash,minorpsychologicalinjury,exceptionalcircumstances,generalfixed,generaluplift
0,1.0,1.0,0.0,520.0,0.0
1,1.0,1.0,0.0,260.0,0.0
2,1.0,0.0,0.0,840.0,0.0
3,1.0,1.0,0.0,520.0,0.0
4,0.0,1.0,0.0,260.0,0.0


In [24]:
KNN_COLUMNS = ['whiplash','minorpsychologicalinjury','exceptionalcircumstances','generalfixed','generaluplift']

# Run the imputation on a copy of the working frame
copy_df = clean_df.copy()
imputed_df = knn_impute(copy_df, KNN_COLUMNS)

print(
    "Before copying imputed values:",
    f"clean_df shape: {clean_df.shape}",
    f"imputed_df shape: {imputed_df.shape}",
    "\nMissing values in imputed_df:",
    sep="\n",
)
print(imputed_df[KNN_COLUMNS].isna().sum())

# Give both frames a fresh 0..n-1 index so the column copy lines up row by row
imputed_df = imputed_df.reset_index(drop=True)
clean_df = clean_df.reset_index(drop=True)

# Overwrite the KNN columns in clean_df with their imputed versions
for knn_column in KNN_COLUMNS:
    clean_df[knn_column] = imputed_df[knn_column]

print("\nAfter copying imputed values:", "Missing values in clean_df:", sep="\n")
print(clean_df[KNN_COLUMNS].isna().sum())

# Compare the two indexes position by position
index_mismatch_count = (clean_df.index != imputed_df.index).sum()
print(
    "\nIndex comparison:",
    f"clean_df index is continuous: {clean_df.index.is_monotonic_increasing}",
    f"imputed_df index is continuous: {imputed_df.index.is_monotonic_increasing}",
    f"Number of different indices: {index_mismatch_count}",
    sep="\n",
)

# Final check that no nulls remain in the imputed columns
print("Missing values after fix:")
print(clean_df[KNN_COLUMNS].isna().sum())
clean_df.head()


Before copying imputed values:
clean_df shape: (4874, 37)
imputed_df shape: (4874, 5)

Missing values in imputed_df:
whiplash                    0
minorpsychologicalinjury    0
exceptionalcircumstances    0
generalfixed                0
generaluplift               0
dtype: int64

After copying imputed values:
Missing values in clean_df:
whiplash                    0
minorpsychologicalinjury    0
exceptionalcircumstances    0
generalfixed                0
generaluplift               0
dtype: int64

Index comparison:
clean_df index is continuous: True
imputed_df index is continuous: True
Number of different indices: 0
Missing values after fix:
whiplash                    0
minorpsychologicalinjury    0
exceptionalcircumstances    0
generalfixed                0
generaluplift               0
dtype: int64


Unnamed: 0,settlementvalue,injuryprognosis,injurydescription,dominantinjury,whiplash,minorpsychologicalinjury,exceptionalcircumstances,generalfixed,generaluplift,generalrest,...,weatherconditions,vehicleage,driverage,numberofpassengers,policereportfiled,witnesspresent,gender,accidentdate,claimdate,prognosisgroup
0,520,5.0,Whiplash and minor bruises.,Arms,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,13.0,33.0,2,1,1,Male,10/11/2023 11:22,11/06/2024 11:22,"(3.0, 6.0]"
1,870,2.0,Minor cuts and scrapes.,Multiple,1.0,1.0,0.0,260.0,0.0,520.0,...,Snowy,4.0,45.0,2,1,1,Female,25/06/2023 00:55,09/01/2024 00:55,"(0.0, 3.0]"
2,2140,7.0,Whiplash and minor bruises.,Legs,1.0,0.0,0.0,840.0,0.0,1400.0,...,Sunny,9.0,45.0,2,1,0,Female,23/02/2020 17:43,01/03/2020 17:43,"(6.0, 9.0]"
3,520,4.0,Minor cuts and scrapes.,Arms,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,5.0,62.0,1,1,1,Female,02/10/2021 04:36,13/10/2021 04:36,"(3.0, 6.0]"
4,260,3.0,Concussion and bruised ribs.,Arms,0.0,1.0,0.0,260.0,0.0,0.0,...,Rainy,9.0,78.0,1,1,1,Other,02/04/2023 05:13,14/04/2023 05:13,"(0.0, 3.0]"


In [None]:
# # Impute zeros for specific columns if settlementvalue and generalfixed match
# IMPUTE_ZERO_COLUMNS = [
#     'specialreduction', 'specialrehabilitation', 'specialmedications',
#     'specialhealthexpenses', 'specialoverage', 'generalrest',
#     'specialadditionalinjury', 'specialearningsloss', 'specialusageloss',
#     'specialassetdamage', 'specialfixes', 'generaluplift', 'specialtripcosts',
#     'specialloanervehicle', 'specialjourneyexpenses', 'specialtherapy'
# ]
# # Filter rows where settlementvalue equals generalfixed
# matching_rows = clean_df['settlementvalue'] == clean_df['generalfixed']
# # Impute zeros for missing values in the specified columns
# clean_df.loc[matching_rows, IMPUTE_ZERO_COLUMNS] = clean_df.loc[matching_rows, IMPUTE_ZERO_COLUMNS].fillna(0)

In [None]:
# # Filter rows where settlementvalue equals generalfixed + generaluplift
# matching_rows = clean_df['settlementvalue'] == (clean_df['generalfixed'] + clean_df['generaluplift'])
# # Impute zeros for missing values in the specified columns
# clean_df.loc[matching_rows, IMPUTE_ZERO_COLUMNS] = clean_df.loc[matching_rows, IMPUTE_ZERO_COLUMNS].fillna(0)

In [None]:
# # Print unique value counts for numeric columns only
# for col in IMPUTE_ZERO_COLUMNS:
#     null_count = clean_df[col].isna().sum()
#     unique_count = clean_df[col].nunique()
#     values_count = clean_df[col].value_counts()
#     print(f"{col} - unique values: {unique_count}, null values: {null_count}")
#     print(f"{col} - total of unique values: {values_count}")

specialreduction - unique values: 1, null values: 88
specialreduction - total of unique values: specialreduction
0.0    4786
Name: count, dtype: int64
specialrehabilitation - unique values: 3, null values: 83
specialrehabilitation - total of unique values: specialrehabilitation
0.0     4786
21.0       4
11.0       1
Name: count, dtype: int64
specialmedications - unique values: 10, null values: 93
specialmedications - total of unique values: specialmedications
0.0     4743
19.0      12
14.0      10
6.0        4
1.0        4
9.0        3
28.0       2
30.0       1
10.0       1
20.0       1
Name: count, dtype: int64
specialhealthexpenses - unique values: 7, null values: 90
specialhealthexpenses - total of unique values: specialhealthexpenses
0.0       4768
1152.0       4
120.0        4
3024.0       3
612.0        3
1008.0       1
585.0        1
Name: count, dtype: int64
specialoverage - unique values: 23, null values: 77
specialoverage - total of unique values: specialoverage
0.0       464

In [28]:
# Columns whose missing values are replaced with 0
IMPUTE_ZERO_COLUMNS = [
    'specialreduction',
    'specialrehabilitation',
    'specialmedications',
    'specialhealthexpenses',
    'specialoverage',
    'generalrest',
    'specialadditionalinjury',
    'specialearningsloss',
    'specialusageloss',
    'specialassetdamage',
    'specialfixes',
    'generaluplift',
    'specialtripcosts',
    'specialloanervehicle',
    'specialjourneyexpenses',
]

# Fill all of them in one step
clean_df[IMPUTE_ZERO_COLUMNS] = clean_df[IMPUTE_ZERO_COLUMNS].fillna(0)

In [29]:
# Mean imputation for the driverage and vehicleage columns; the whole column
# (filled and original values alike) is then rounded to 0 decimal places.
for col in ['driverage','vehicleage']:
    column_mean = clean_df[col].mean()
    clean_df[col] = clean_df[col].fillna(column_mean).round(0)

In [30]:
# Cast every categorical column to the pandas 'category' dtype in one call,
# then list the distinct values found in each of them.
clean_df = clean_df.astype({name: 'category' for name in CATEGORY_COLUMNS})
for col in CATEGORY_COLUMNS:
    distinct_categories = clean_df[col].unique()
    print(f"{col} - unique values: {distinct_categories}")

dominantinjury - unique values: ['Arms', 'Multiple', 'Legs', 'Hips', 'unknown']
Categories (5, object): ['Arms', 'Hips', 'Legs', 'Multiple', 'unknown']
vehicletype - unique values: ['Motorcycle', 'Truck', 'Car', 'unknown']
Categories (4, object): ['Car', 'Motorcycle', 'Truck', 'unknown']
weatherconditions - unique values: ['Rainy', 'Snowy', 'Sunny', 'unknown']
Categories (4, object): ['Rainy', 'Snowy', 'Sunny', 'unknown']
gender - unique values: ['Male', 'Female', 'Other']
Categories (3, object): ['Female', 'Male', 'Other']
accidenttype - unique values: ['Rear end', 'Other side pulled out of side road', 'Rear end - Clt pushed into next vehicle', 'Other side pulled on to roundabout', 'unknown', ..., 'Other side pulled from parked position into t..., 'Other side opened their door, hitting clt's v..., 'Other side overtook and hit Clt when pulling in', 'Other side overtook and pulled in too soon', 'Other side overtook whilst clt was turning ri...]
Length: 19
Categories (19, object): ['Othe

In [31]:
# Integer codes for the two injury columns
dominantinjury_dict = {'Arms':1, 'Hips':2, 'Legs':3, 'Multiple':4, 'unknown':5}
injurydescription_dict = {'Concussion and bruised ribs.':1, 'Fractured arm and leg.':2, 'Minor cuts and scrapes.':3, 'Sprained ankle and wrist.':4, 'Whiplash and minor bruises.':5, 'unknown':6}

# Encode on a copy so clean_df keeps the original text labels
temp_df = clean_df.copy()
for column_name, code_map in (
    ('dominantinjury', dominantinjury_dict),
    ('injurydescription', injurydescription_dict),
):
    temp_df[column_name] = temp_df[column_name].map(code_map).astype('category')
temp_df.head()

Unnamed: 0,settlementvalue,injuryprognosis,injurydescription,dominantinjury,whiplash,minorpsychologicalinjury,exceptionalcircumstances,generalfixed,generaluplift,generalrest,...,weatherconditions,vehicleage,driverage,numberofpassengers,policereportfiled,witnesspresent,gender,accidentdate,claimdate,prognosisgroup
0,520,5.0,5,1,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,13.0,33.0,2,1,1,Male,10/11/2023 11:22,11/06/2024 11:22,"(3.0, 6.0]"
1,870,2.0,3,4,1.0,1.0,0.0,260.0,0.0,520.0,...,Snowy,4.0,45.0,2,1,1,Female,25/06/2023 00:55,09/01/2024 00:55,"(0.0, 3.0]"
2,2140,7.0,5,3,1.0,0.0,0.0,840.0,0.0,1400.0,...,Sunny,9.0,45.0,2,1,0,Female,23/02/2020 17:43,01/03/2020 17:43,"(6.0, 9.0]"
3,520,4.0,3,1,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,5.0,62.0,1,1,1,Female,02/10/2021 04:36,13/10/2021 04:36,"(3.0, 6.0]"
4,260,3.0,1,1,0.0,1.0,0.0,260.0,0.0,0.0,...,Rainy,9.0,78.0,1,1,1,Other,02/04/2023 05:13,14/04/2023 05:13,"(0.0, 3.0]"


In [32]:
# Impute missing values in specialtherapy using KNN.
# The imputer runs on the integer-encoded frame (temp_df); only the
# 'specialtherapy' column is copied back into clean_df.
SPECIAL_THERAPY_KNN = ['settlementvalue','injuryprognosis','injurydescription','dominantinjury','specialtherapy']

copy_df = temp_df.copy()
imputed_df = knn_impute(copy_df, SPECIAL_THERAPY_KNN)

print(
    "Before copying imputed values:",
    f"clean_df shape: {clean_df.shape}",
    f"imputed_df shape: {imputed_df.shape}",
    "\nMissing values in imputed_df:",
    sep="\n",
)
print(imputed_df[SPECIAL_THERAPY_KNN].isna().sum())

# Give both frames a fresh 0..n-1 index so the column copy lines up row by row
imputed_df = imputed_df.reset_index(drop=True)
clean_df = clean_df.reset_index(drop=True)

# The other KNN feature columns are deliberately not written back
clean_df['specialtherapy'] = imputed_df['specialtherapy']

print("\nAfter copying imputed values:", "Missing values in clean_df:", sep="\n")
print(clean_df[SPECIAL_THERAPY_KNN].isna().sum())

# Compare the two indexes position by position
index_mismatch_count = (clean_df.index != imputed_df.index).sum()
print(
    "\nIndex comparison:",
    f"clean_df index is continuous: {clean_df.index.is_monotonic_increasing}",
    f"imputed_df index is continuous: {imputed_df.index.is_monotonic_increasing}",
    f"Number of different indices: {index_mismatch_count}",
    sep="\n",
)

# Final check that no nulls remain in the KNN feature columns
print("Missing values after fix:")
print(clean_df[SPECIAL_THERAPY_KNN].isna().sum())
clean_df.head()

Before copying imputed values:
clean_df shape: (4874, 37)
imputed_df shape: (4874, 5)

Missing values in imputed_df:
settlementvalue      0
injuryprognosis      0
injurydescription    0
dominantinjury       0
specialtherapy       0
dtype: int64

After copying imputed values:
Missing values in clean_df:
settlementvalue      0
injuryprognosis      0
injurydescription    0
dominantinjury       0
specialtherapy       0
dtype: int64

Index comparison:
clean_df index is continuous: True
imputed_df index is continuous: True
Number of different indices: 0
Missing values after fix:
settlementvalue      0
injuryprognosis      0
injurydescription    0
dominantinjury       0
specialtherapy       0
dtype: int64


Unnamed: 0,settlementvalue,injuryprognosis,injurydescription,dominantinjury,whiplash,minorpsychologicalinjury,exceptionalcircumstances,generalfixed,generaluplift,generalrest,...,weatherconditions,vehicleage,driverage,numberofpassengers,policereportfiled,witnesspresent,gender,accidentdate,claimdate,prognosisgroup
0,520,5.0,Whiplash and minor bruises.,Arms,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,13.0,33.0,2,1,1,Male,10/11/2023 11:22,11/06/2024 11:22,"(3.0, 6.0]"
1,870,2.0,Minor cuts and scrapes.,Multiple,1.0,1.0,0.0,260.0,0.0,520.0,...,Snowy,4.0,45.0,2,1,1,Female,25/06/2023 00:55,09/01/2024 00:55,"(0.0, 3.0]"
2,2140,7.0,Whiplash and minor bruises.,Legs,1.0,0.0,0.0,840.0,0.0,1400.0,...,Sunny,9.0,45.0,2,1,0,Female,23/02/2020 17:43,01/03/2020 17:43,"(6.0, 9.0]"
3,520,4.0,Minor cuts and scrapes.,Arms,1.0,1.0,0.0,520.0,0.0,0.0,...,Rainy,5.0,62.0,1,1,1,Female,02/10/2021 04:36,13/10/2021 04:36,"(3.0, 6.0]"
4,260,3.0,Concussion and bruised ribs.,Arms,0.0,1.0,0.0,260.0,0.0,0.0,...,Rainy,9.0,78.0,1,1,1,Other,02/04/2023 05:13,14/04/2023 05:13,"(0.0, 3.0]"


In [None]:
# 'prognosisgroup' was only a helper for imputation and is not a column in
# the database, so remove it again.
clean_df = clean_df.drop(columns=['prognosisgroup'])

In [None]:
## Clean up the date format: convert to the ISO version (YYYY-MM-DD) with
## hyphens instead of forward slashes.
# The raw values carry a time of day after the date (e.g. '10/11/2023 11:22'),
# which a strict match against '%d/%m/%Y' rejects. `exact=False` lets the
# day-first date format match the date portion of the string; the time of day
# is discarded, which is fine because only the date is written back.
for date_column in ['accidentdate', 'claimdate']:
    parsed_dates = pd.to_datetime(clean_df[date_column], format='%d/%m/%Y', exact=False)
    clean_df[date_column] = parsed_dates.dt.strftime('%Y-%m-%d')