# Dataset Research

# Global Library Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import sklearn
import ipywidgets as widgets

# set matplotlib to display graphics in the notebook
%matplotlib inline

# Dataset Import

This section imports the synthetic claims dataset provided for the assignment. The dataset is saved in CSV format in the same folder as the project notebook and is loaded into the project as a pandas DataFrame.

In [2]:
# load the dataset CSV stored alongside the notebook into a DataFrame
ml_dataset = pd.read_csv('./Synthetic_Data_For_Students.csv')

# report the number of rows read so it can be checked against the expected size
record_count = len(ml_dataset.index)
print(f"Number of records imported: {record_count}")

Number of records imported: 5000


# Dataset Initial Exploration

## Display Dataset Summary Info

In [3]:
# print each column's name, non-null count and dtype
ml_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   SettlementValue             4894 non-null   float64
 1   AccidentType                4878 non-null   object 
 2   Injury_Prognosis            4844 non-null   object 
 3   SpecialHealthExpenses       4870 non-null   float64
 4   SpecialReduction            4879 non-null   float64
 5   SpecialOverage              4883 non-null   float64
 6   GeneralRest                 4872 non-null   float64
 7   SpecialAdditionalInjury     4866 non-null   float64
 8   SpecialEarningsLoss         4872 non-null   float64
 9   SpecialUsageLoss            4870 non-null   float64
 10  SpecialMedications          4870 non-null   float64
 11  SpecialAssetDamage          4889 non-null   float64
 12  SpecialRehabilitation       4884 non-null   float64
 13  SpecialFixes                4879 

All numeric columns are float64 type. 
All string columns are Object type.

There are null values in multiple columns; these will need to be handled appropriately. Discussion with the client has identified that null values in the SettlementValue column indicate unsettled claims, so these rows should not be used in model training.


## Removal of unsettled claims entries

In [4]:
# a null SettlementValue marks an unsettled claim, so discard those rows
target_column = ['SettlementValue']
ml_dataset = ml_dataset.dropna(subset=target_column, how='any')


## Display Column Value Ranges

In [5]:
# display summary statistics (count, mean, std, min, quartiles, max) for numerical columns
ml_dataset.describe()

Unnamed: 0,SettlementValue,SpecialHealthExpenses,SpecialReduction,SpecialOverage,GeneralRest,SpecialAdditionalInjury,SpecialEarningsLoss,SpecialUsageLoss,SpecialMedications,SpecialAssetDamage,...,SpecialFixes,GeneralFixed,GeneralUplift,SpecialLoanerVehicle,SpecialTripCosts,SpecialJourneyExpenses,SpecialTherapy,Vehicle Age,Driver Age,Number of Passengers
count,4894.0,4777.0,4781.0,4785.0,4775.0,4768.0,4780.0,4773.0,4767.0,4792.0,...,4782.0,4780.0,4768.0,4764.0,4786.0,4758.0,4768.0,4776.0,4776.0,4779.0
mean,1218.010685,3.682018,0.0,13.470086,462.605569,0.278922,51.429741,9.122145,0.112068,33.54038,...,3.185705,687.285565,10.452076,7.84984,1.95189,11.735666,184.580357,9.507956,48.792714,2.482737
std,858.866309,85.870386,0.0,84.760462,766.942861,13.101063,392.065706,65.947574,1.404761,284.998118,...,102.311077,399.701043,50.363289,142.570855,13.052007,49.41796,224.298369,5.730669,17.817638,1.109656
min,240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,240.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,1.0
25%,669.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,495.0,0.0,0.0,0.0,0.0,0.0,4.0,33.0,1.0
50%,988.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,520.0,0.0,0.0,0.0,0.0,56.85,10.0,49.0,2.0
75%,1510.0,0.0,0.0,0.0,903.0,0.0,0.0,0.0,0.0,0.0,...,0.0,895.0,0.0,0.0,0.0,0.0,350.0,14.0,64.0,3.0
max,7862.9,3024.0,0.0,1250.0,3912.64,889.0,7735.58,1050.0,30.25,6070.0,...,4000.0,4345.0,1430.0,4408.16,254.2,880.0,1225.0,19.0,79.0,4.0


## Show total values for each numerical column

In [6]:
# for each numerical column, display the count of non-zero values, the column
# total, and the mean taken over the non-zero values only
print("column - count of non-zero - sum of values - mean values (excluding zero rows)")
for column in ml_dataset.select_dtypes(include='float64'):
    values = ml_dataset[column]
    # NaN != 0 evaluates to True, but count() ignores NaN, so only real
    # non-zero entries are counted
    non_zero_count = values[values != 0].count()
    total = values.sum()
    # guard the division: a column with no non-zero entries would otherwise
    # divide by zero and emit a RuntimeWarning; report nan for it instead
    non_zero_mean = total / non_zero_count if non_zero_count else float('nan')
    print(column + " - " + str(non_zero_count) + " - " + str(total) + " - " + str(non_zero_mean))



column - count of non-zero - sum of values - mean values (excluding zero rows)
SettlementValue - 4894 - 5960944.29 - 1218.010684511647
SpecialHealthExpenses - 16 - 17589.0 - 1099.3125
SpecialReduction - 0 - 0.0 - nan
SpecialOverage - 157 - 64454.36 - 410.5373248407643
GeneralRest - 1633 - 2208941.59 - 1352.6892774035516
SpecialAdditionalInjury - 8 - 1329.9 - 166.2375
SpecialEarningsLoss - 289 - 245834.16 - 850.63723183391
SpecialUsageLoss - 237 - 43540.0 - 183.71308016877637
SpecialMedications - 38 - 534.23 - 14.058684210526316
SpecialAssetDamage - 461 - 160725.5 - 348.6453362255965
SpecialRehabilitation - 5 - 96.19999999999999 - 19.24
SpecialFixes - 9 - 15234.04 - 1692.6711111111113
GeneralFixed - 4780 - 3285225.0 - 687.2855648535565
GeneralUplift - 306 - 49835.5 - 162.86111111111111
SpecialLoanerVehicle - 67 - 37396.64 - 558.1588059701493
SpecialTripCosts - 252 - 9341.747500000001 - 37.07042658730159
SpecialJourneyExpenses - 987 - 55838.3 - 56.57375886524823
SpecialTherapy - 2664 - 8

  print(column + " - " + str(ml_dataset[ml_dataset[column] != 0][column].count()) + " - " + str(ml_dataset[column].sum()) + " - " + str((ml_dataset[column].sum() / ml_dataset[ml_dataset[column] != 0][column].count())))


Looking at the values above, SpecialReduction can be ignored as there are no non-zero values.

SpecialRehabilitation can be ignored as it appears infrequently and contributes little to overall settlement amount (max value £21).

SpecialMedications can be ignored as it appears infrequently and contributes little to overall settlement amount (max value £30).

Columns with a high maximum value but low mean are candidates for grouping and further analysis is required.

SpecialAdditionalInjury, SpecialFixes, SpecialHealthExpenses and SpecialLoanerVehicle could all be relevant groups, as they have low frequency but can be high value.

GeneralUplift - there are a small number of high values in this column

SpecialTripCosts and SpecialJourneyExpenses may be important - need to perform further analysis


Recommend investigating the relationship and correlations between the different types of columns (eg. can medical / trip+journey expenses be grouped)


## Removal of Columns

Along with the numerical columns identified above, Gender and age values should be dropped as they are protected characteristics and should not be used for training models due to the possibility of introducing bias into the model.

In [7]:
# columns excluded from modelling: three numeric columns plus driver age and gender
excluded_columns = [
    'SpecialReduction',
    'SpecialRehabilitation',
    'SpecialMedications',
    'Driver Age',
    'Gender',
]
ml_dataset_dropcols = ml_dataset.drop(excluded_columns, axis='columns')
ml_dataset = ml_dataset_dropcols

## Display the first x rows of the dataset using a slider widget

In [8]:
# slider chooses how many leading rows to preview (5 to 50, in steps of 5)
row_slider = widgets.IntSlider(value=5, min=5, max=50, step=5)
widgets.interact(lambda x: ml_dataset.head(x), x=row_slider)

Unnamed: 0,SettlementValue,AccidentType,Injury_Prognosis,SpecialHealthExpenses,SpecialOverage,GeneralRest,SpecialAdditionalInjury,SpecialEarningsLoss,SpecialUsageLoss,SpecialAssetDamage,...,Vehicle Type,Weather Conditions,Accident Date,Claim Date,Vehicle Age,Number of Passengers,Accident Description,Injury Description,Police Report Filed,Witness Present
0,520.0,Rear end,E. 5 months,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Motorcycle,Rainy,2023-11-10 11:22:24.508901,2024-06-11 11:22:24.508901,13.0,4.0,Side collision at an intersection.,Whiplash and minor bruises.,Yes,Yes
1,870.0,Rear end,B. 2 months,0.0,0.0,520.0,0.0,0.0,90.0,0.0,...,Motorcycle,Snowy,2023-06-25 00:55:01.140228,2024-01-09 00:55:01.140228,4.0,2.0,Side collision at an intersection.,Minor cuts and scrapes.,Yes,Yes
2,2140.0,Other side pulled out of side road,G. 7 months,0.0,0.0,1400.0,0.0,0.0,0.0,0.0,...,Motorcycle,Sunny,2020-02-23 17:43:47.805561,2020-03-01 17:43:47.805561,9.0,4.0,Lost control on a snowy road.,Whiplash and minor bruises.,Yes,No
3,520.0,Rear end - Clt pushed into next vehicle,D. 4 months,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Truck,Rainy,2021-10-02 04:36:32.118423,2021-10-13 04:36:32.118423,5.0,1.0,Side collision at an intersection.,Minor cuts and scrapes.,Yes,Yes
4,260.0,Rear end,C. 3 months,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Motorcycle,Rainy,2023-04-02 05:13:07.117423,2023-04-14 05:13:07.117423,9.0,1.0,Lost control on a snowy road.,Concussion and bruised ribs.,Yes,Yes


interactive(children=(IntSlider(value=5, description='x', max=50, min=5, step=5), Output()), _dom_classes=('wi…

<function __main__.<lambda>(x)>

# Data Preparation

## Show unique values for each text field

In [12]:
def display_text_values(col):
    """Print a Series' name followed by its distinct values.

    Parameters
    ----------
    col : pandas.Series
        Column to summarise; ``col.name`` is used as the printed label.
    """
    distinct_values = col.unique()
    print(col.name, 'Unique values:', distinct_values)

In [16]:
# print the distinct values of every object-dtype (text) column
text_columns = ml_dataset.select_dtypes(include='object')
for column_name in text_columns.columns:
    display_text_values(ml_dataset[column_name])

AccidentType Unique values: ['Rear end' 'Other side pulled out of side road'
 'Rear end - Clt pushed into next vehicle'
 'Other side pulled on to roundabout' nan
 "Other side reversed into Clt's vehicle"
 "Other side changed lanes and collided with clt's vehicle" 'Other'
 "Other side turned across Clt's path"
 'Other side drove on wrong side of the road'
 "Other side changed lanes on a roundabout colliding with clt's vehicle"
 'Rear end - 3 car - Clt at front'
 "Other side reversed into clt's stationary vehicle"
 "Other side collided with Clt's parked vehicle"
 "Other side pulled from parked position into the path of clt's vehicle"
 "Other side opened their door, hitting clt's vehicle"
 'Other side overtook and hit Clt when pulling in'
 'Other side overtook and pulled in too soon'
 'Other side overtook whilst clt was turning right']
Exceptional_Circumstances Unique values: ['No' 'Yes' nan]
Minor_Psychological_Injury Unique values: ['Yes' 'No' nan]
Dominant injury Unique values: ['Arms'

The Injury_Prognosis column should be converted to an integer value representing the number of months.

Exceptional_Circumstances, Minor_Psychological_Injury, Whiplash, Police Report Filed, Witness Present should be converted to binary value using one-hot, however handling of any null values must be addressed first. Most likely conclusion is that a blank field is treated as a no, however the claim entry form should use a drop down allowing only Yes or No, with a default value of No to prevent missing data in future records. Likewise for other input fields (weather, vehicle, injury type) these should offer a drop down selection menu, but be blank by default and input validation should prevent record creation without a value being entered.

Accident Date and Claim Date are unlikely to be relevant to the claim details, however the difference between these dates may be indicative of claim success due to difficulty in proving fault retrospectively. As such, a new column could be created showing the time elapsed between these dates.

Unless there are significant correlations to the settlement outcomes, retaining Injury description and dominant injury type is probably not needed, will perform analysis to determine which can be dropped. Likewise for accident type and accident description.

## Convert prognosis to integer
Prognosis is likely to be an important factor in determining the outcome. I initially considered dropping the rows with NaN for prognosis; however, some of the claims with a NaN prognosis resulted in large payouts, so it may be important to keep the data. I suspect that once the remaining categorical data has been properly encoded or removed, it will be possible to use a KNN imputer to fill in the null values in the prognosis column.

In [14]:
# Pull the month count out of labels such as 'E. 5 months' and store it as a
# nullable integer so that rows without a prognosis stay <NA>.
# The pattern is a raw string: '\d' inside a normal literal is an invalid
# escape sequence that current Python versions warn about.
# NOTE: this cell is not re-runnable as-is - once the column is Int64 the
# .str accessor is no longer available.
ml_dataset['Injury_Prognosis'] = (
    ml_dataset['Injury_Prognosis']
    .str.extract(r'(\d+)', expand=False)
    .astype('Int64')
)
print(ml_dataset['Injury_Prognosis'].head(20))

0        5
1        2
2        7
3        4
4        3
5        6
6     <NA>
7        8
8        6
9        2
10       6
11       6
12       7
13       5
14       9
15       9
16       5
17       5
18       7
19       4
Name: Injury_Prognosis, dtype: Int64


## Check for Duplicate Rows

Duplicate rows contaminate the data and could skew the training process, negatively impacting prediction performance.

In [31]:
# flag every row that exactly repeats an earlier row
duplicates = ml_dataset.duplicated()

# True when at least one duplicate row exists
any_duplicates = duplicates.any()
print(any_duplicates)

# show the duplicated rows (an empty frame when there are none)
duplicate_rows = ml_dataset[duplicates]
print(duplicate_rows)

False
Empty DataFrame
Columns: [SettlementValue, AccidentType, Injury_Prognosis, SpecialHealthExpenses, SpecialOverage, GeneralRest, SpecialAdditionalInjury, SpecialEarningsLoss, SpecialUsageLoss, SpecialAssetDamage, SpecialFixes, GeneralFixed, GeneralUplift, SpecialLoanerVehicle, SpecialTripCosts, SpecialJourneyExpenses, SpecialTherapy, Exceptional_Circumstances, Minor_Psychological_Injury, Dominant injury, Whiplash, Vehicle Type, Weather Conditions, Accident Date, Claim Date, Vehicle Age, Number of Passengers, Accident Description, Injury Description, Police Report Filed, Witness Present]
Index: []

[0 rows x 31 columns]


*No Duplicate rows exist*

## Check for Negative Values

Negative values where they are not expected/possible would also affect the reliability of the analysis and should be removed prior to proceeding

In [18]:
# Count the negative entries in each float64 column of the DataFrame
for name, values in ml_dataset.select_dtypes(include='float64').items():
    # missing values compare as not-negative, so only real negatives are counted
    negative_count = values.lt(0).sum()
    print('Count of negative values in column ', name, ' is : ', negative_count)

Count of negative values in column  SettlementValue  is :  0
Count of negative values in column  SpecialHealthExpenses  is :  0
Count of negative values in column  SpecialOverage  is :  0
Count of negative values in column  GeneralRest  is :  0
Count of negative values in column  SpecialAdditionalInjury  is :  0
Count of negative values in column  SpecialEarningsLoss  is :  0
Count of negative values in column  SpecialUsageLoss  is :  0
Count of negative values in column  SpecialAssetDamage  is :  0
Count of negative values in column  SpecialFixes  is :  0
Count of negative values in column  GeneralFixed  is :  0
Count of negative values in column  GeneralUplift  is :  0
Count of negative values in column  SpecialLoanerVehicle  is :  0
Count of negative values in column  SpecialTripCosts  is :  0
Count of negative values in column  SpecialJourneyExpenses  is :  0
Count of negative values in column  SpecialTherapy  is :  0
Count of negative values in column  Vehicle Age  is :  0
Count o

*No negative values exist*

## Convert numerical values to integers
Numerical columns are primarily money amounts so can be rounded to whole pounds - prediction amount does not require accuracy in pence

It doesn't make sense to have number of passengers as float

Age of car also does not need to be accurate to fractions of a year

In [21]:
# round every float64 column to whole numbers and store it as nullable Int64,
# which keeps missing entries as <NA> rather than forcing the column back to float
for name in ml_dataset.select_dtypes(include='float64').columns:
    ml_dataset[name] = ml_dataset[name].round().astype('Int64')

In [22]:
# preview the first 20 rows after the integer conversion above
ml_dataset.head(20)

Unnamed: 0,SettlementValue,AccidentType,Injury_Prognosis,SpecialHealthExpenses,SpecialOverage,GeneralRest,SpecialAdditionalInjury,SpecialEarningsLoss,SpecialUsageLoss,SpecialAssetDamage,...,Vehicle Type,Weather Conditions,Accident Date,Claim Date,Vehicle Age,Number of Passengers,Accident Description,Injury Description,Police Report Filed,Witness Present
0,520,Rear end,5.0,0,0,0,0,0.0,0,0,...,Motorcycle,Rainy,2023-11-10 11:22:24.508901,2024-06-11 11:22:24.508901,13.0,4,Side collision at an intersection.,Whiplash and minor bruises.,Yes,Yes
1,870,Rear end,2.0,0,0,520,0,0.0,90,0,...,Motorcycle,Snowy,2023-06-25 00:55:01.140228,2024-01-09 00:55:01.140228,4.0,2,Side collision at an intersection.,Minor cuts and scrapes.,Yes,Yes
2,2140,Other side pulled out of side road,7.0,0,0,1400,0,0.0,0,0,...,Motorcycle,Sunny,2020-02-23 17:43:47.805561,2020-03-01 17:43:47.805561,9.0,4,Lost control on a snowy road.,Whiplash and minor bruises.,Yes,No
3,520,Rear end - Clt pushed into next vehicle,4.0,0,0,0,0,0.0,0,0,...,Truck,Rainy,2021-10-02 04:36:32.118423,2021-10-13 04:36:32.118423,5.0,1,Side collision at an intersection.,Minor cuts and scrapes.,Yes,Yes
4,260,Rear end,3.0,0,0,0,0,0.0,0,0,...,Motorcycle,Rainy,2023-04-02 05:13:07.117423,2023-04-14 05:13:07.117423,9.0,1,Lost control on a snowy road.,Concussion and bruised ribs.,Yes,Yes
5,520,Rear end,6.0,0,0,0,0,0.0,0,0,...,Car,Rainy,2020-03-12 13:18:12.458491,2020-06-30 13:18:12.458491,10.0,1,Lost control on a snowy road.,Minor cuts and scrapes.,Yes,No
6,1015,Rear end,,0,0,0,0,0.0,0,9,...,Motorcycle,Sunny,2023-04-08 08:24:57.899579,2023-08-19 08:24:57.899579,5.0,1,Hit a deer on the highway.,Whiplash and minor bruises.,No,No
7,1032,Other side pulled out of side road,8.0,0,0,0,0,0.0,0,0,...,Car,Sunny,2021-10-14 11:00:13.682736,2022-02-19 11:00:13.682736,9.0,4,Side collision at an intersection.,Concussion and bruised ribs.,No,Yes
8,808,Rear end,6.0,0,0,0,0,0.0,0,0,...,Motorcycle,Snowy,2020-09-09 15:07:57.455491,2021-02-04 15:07:57.455491,13.0,4,Swerved to avoid another vehicle.,Minor cuts and scrapes.,Yes,Yes
9,500,Rear end,2.0,0,0,0,0,0.0,0,0,...,Motorcycle,Rainy,2021-09-01 12:37:18.207641,2021-10-25 12:37:18.207641,19.0,4,Rear-ended at a stoplight.,Minor cuts and scrapes.,Yes,Yes


## Display Null Value Counts

Null values indicate missing data and impact a feature column's usefulness in the training process

In [23]:
# count the null entries in each column
ml_dataset.isnull().sum()

SettlementValue                 0
AccidentType                  111
Injury_Prognosis              146
SpecialHealthExpenses         117
SpecialOverage                109
GeneralRest                   119
SpecialAdditionalInjury       126
SpecialEarningsLoss           114
SpecialUsageLoss              121
SpecialAssetDamage            102
SpecialFixes                  112
GeneralFixed                  114
GeneralUplift                 126
SpecialLoanerVehicle          130
SpecialTripCosts              108
SpecialJourneyExpenses        136
SpecialTherapy                126
Exceptional_Circumstances      97
Minor_Psychological_Injury    107
Dominant injury               103
Whiplash                      123
Vehicle Type                  122
Weather Conditions            110
Accident Date                 131
Claim Date                    104
Vehicle Age                   118
Number of Passengers          115
Accident Description          111
Injury Description            109
Police Report 

There are significant numbers of null values across the dataset, and removing all of them would most likely reduce the number of rows drastically, negatively impacting model training. An approach is needed to identify which feature columns are correlated with the settlement value before determining how to proceed with NaN values in any column.

## Visualise correlations

In [24]:
def check_correlation(df, target_col, feature_col):
    """Print the correlation between a target column and one feature column.

    Rows in which either of the two values is missing are discarded before
    the correlation is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    target_col : str
        Name of the target column.
    feature_col : str
        Name of the feature column compared against the target.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    paired = df[[target_col, feature_col]].dropna()
    # Take BOTH columns from the cleaned frame so values are paired row by row.
    # Reading the feature from `df` instead would pair values by index label,
    # which mis-pairs rows whenever index labels repeat.
    correlation = paired[target_col].corr(paired[feature_col])
    print(f'Correlation between {target_col} and {feature_col}: {correlation}')

In [30]:
# display summary statistics for the numeric columns of the current frame
ml_dataset.describe()

Unnamed: 0,SettlementValue,Injury_Prognosis,SpecialHealthExpenses,SpecialOverage,GeneralRest,SpecialAdditionalInjury,SpecialEarningsLoss,SpecialUsageLoss,SpecialAssetDamage,SpecialFixes,GeneralFixed,GeneralUplift,SpecialLoanerVehicle,SpecialTripCosts,SpecialJourneyExpenses,SpecialTherapy,Vehicle Age,Number of Passengers
count,4894.0,4748.0,4777.0,4785.0,4775.0,4768.0,4780.0,4773.0,4792.0,4782.0,4780.0,4768.0,4764.0,4786.0,4758.0,4768.0,4776.0,4779.0
mean,1217.869841,6.105518,3.682018,13.470219,462.595602,0.278943,51.430962,9.122145,33.538606,3.185905,687.285565,10.45344,7.849916,1.950063,11.732871,184.576971,9.507956,2.482737
std,858.824382,2.753099,85.870386,84.760448,766.934839,13.10117,392.074827,65.947574,285.0,102.311695,399.701043,50.365019,142.568894,13.044567,49.420832,224.298158,5.730669,1.109656
min,240.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,240.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,669.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,495.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
50%,988.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,520.0,0.0,0.0,0.0,0.0,57.0,10.0,2.0
75%,1510.0,8.0,0.0,0.0,903.0,0.0,0.0,0.0,0.0,0.0,895.0,0.0,0.0,0.0,0.0,350.0,14.0,3.0
max,7862.0,24.0,3024.0,1250.0,3913.0,889.0,7736.0,1050.0,6070.0,4000.0,4345.0,1430.0,4408.0,254.0,880.0,1225.0,19.0,4.0


In [29]:
# every non-object column except the target is compared against SettlementValue
numeric_features = [
    name
    for name in ml_dataset.columns
    if name != 'SettlementValue' and ml_dataset[name].dtype != 'object'
]
for feature in numeric_features:
    check_correlation(ml_dataset, 'SettlementValue', feature)


Correlation between SettlementValue and Injury_Prognosis: 0.5243528386266065
Correlation between SettlementValue and SpecialHealthExpenses: 0.024042364865717303
Correlation between SettlementValue and SpecialOverage: 0.06265953199816122
Correlation between SettlementValue and GeneralRest: 0.5763339019958738
Correlation between SettlementValue and SpecialAdditionalInjury: 0.014337219382340035
Correlation between SettlementValue and SpecialEarningsLoss: 0.3371770913408868
Correlation between SettlementValue and SpecialUsageLoss: 0.1296267645448482
Correlation between SettlementValue and SpecialAssetDamage: 0.2702249745742846
Correlation between SettlementValue and SpecialFixes: 0.10789129549610699
Correlation between SettlementValue and GeneralFixed: 0.5340317479419371
Correlation between SettlementValue and GeneralUplift: 0.09521772096319822
Correlation between SettlementValue and SpecialLoanerVehicle: 0.13171460587347641
Correlation between SettlementValue and SpecialTripCosts: 0.15848

## Convert simple binary columns
Replace binary column text to int and convert NaNs to 0

Columns to convert:

Exceptional_Circumstances, Minor_Psychological_Injury, Whiplash, Police Report Filed, Witness Present

In [None]:
def binary_encode(df, column, positive_value):
    """Encode one column as 1/0 flags and return the frame.

    Entries equal to ``positive_value`` become 1; everything else, including
    missing values, becomes 0. The column is stored with the nullable
    ``Int8`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the column to encode.
    positive_value : object
        Value that should map to 1.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Calling this again on an already-encoded column compares the 1/0 flags
    with ``positive_value`` and therefore resets every entry to 0 (unless
    ``positive_value`` is itself 1).
    """
    # Vectorised comparison instead of a per-row Python lambda. Missing values
    # never equal positive_value; fillna(False) covers dtypes whose comparison
    # yields <NA> rather than False.
    is_positive = (df[column] == positive_value).fillna(False)
    df[column] = is_positive.astype('Int8')
    return df

In [None]:
# Yes/No columns to turn into 1/0 flags
binary_cols = [
    'Exceptional_Circumstances',
    'Minor_Psychological_Injury',
    'Whiplash',
    'Police Report Filed',
    'Witness Present',
]
for flag_column in binary_cols:
    ml_dataset = binary_encode(ml_dataset, flag_column, 'Yes')

In [None]:
# preview the first 20 rows to inspect the binary-encoded columns
ml_dataset.head(20)

## Fill NaN values in numeric columns with zero
As Injury prognosis, Number of passengers and age of vehicle could be significant in determining the settlement value, the NaN rows will be removed until further analysis has determined whether they should be imputed based on other rows, replaced with a fixed value or dropped altogether


## Imputing NaN text fields to No

In [None]:
# for column in ml_dataset.select_dtypes(include=[object]):
#     ml_dataset[column] = ml_dataset[column].fillna('No')

In [None]:
# count the null entries remaining in each column
ml_dataset.isnull().sum()

## Display count of zero values for all columns

Zero values are often used as placeholders for null values, so any row containing a zero value should be validated to determine if it is appropriate.

In [None]:
# Report how many entries equal zero in each column of the DataFrame
for column_name, values in ml_dataset.items():
    # element-wise equality test; summing the True results gives the zero count
    zero_count = values.eq(0).sum()
    print('Count of zeros in column ', column_name, ' is : ', zero_count)

## Impute Missing Values

### Convert zero values to NaN

In [None]:
# display describe summary table to confirm the changes
# NOTE(review): no zero-to-NaN conversion code precedes this cell in the visible
# notebook - confirm which changes this summary is meant to verify
ml_dataset.describe()

### Impute mean average

In [None]:
# # replace NaN values with mean value for the SkinThickness column
# ml_dataset["SkinThickness"] = ml_dataset["SkinThickness"].fillna((ml_dataset["SkinThickness"].mean())).round(2)

# # display column summary to confirm changes
# ml_dataset["SkinThickness"].describe()

### Impute using KNN

In [None]:
# from sklearn.impute import KNNImputer

# # create a KNN imputer object
# imputer = KNNImputer(n_neighbors=5)

# # impute missing values using KNN for specified columns
# columns = ['Glucose','BloodPressure','BMI']
# for col in columns:
#     temp_df = pd.DataFrame(ml_dataset[col])
#     ml_dataset[col] = pd.DataFrame(imputer.fit_transform(temp_df)).round(2)

# ml_dataset.describe()


## Outlier Detection

### The following section is retained for reference only, as applying outlier removal to the dataset ultimately reduced performance of the trained model. Code relating to inspecting the outliers has been left intact, but all code relating to removal of outlier data has been commented out to prevent it from impacting model training.

Outlier detection is useful to remove datapoints that may disproportionately affect the model training

### Extreme Value Analysis
Calculate the interquartile range (IQR)

IQR (interquartile range) = 75th percentile − 25th percentile

An outlier lies outside the following upper and lower boundaries:
- Upper Boundary = 75th percentile + (IQR × 1.5)
- Lower Boundary = 25th percentile − (IQR × 1.5)

Or for extreme cases:
- Upper Boundary = 75th percentile + (IQR × 3)
- Lower Boundary = 25th percentile − (IQR × 3)

If the data point is above the upper boundary or below the lower boundary, it can be considered as an outlier.

In [None]:
# # function to calculate inter-quartile ranges for a given column
# def calc_iqr(column):
#     temp_df = pd.DataFrame(ml_dataset[column])

#     # calculate inter-quartile range
#     IQR = (temp_df.quantile(0.75) - temp_df.quantile(0.25)).round(3)

#     # Calculate lower limit and lower limit extreme
#     lower_limit = (temp_df.quantile(0.25) - (IQR * 1.5)).round(3)
#     lower_limit_extreme = (temp_df.quantile(0.25) - (IQR * 3)).round(3)

#     # prevent negative numbers being evaluated
#     lower_limit[lower_limit < 0] = 0
#     lower_limit_extreme[lower_limit_extreme < 0] = 0

#     # get lower boundary and lower boundary extreme from the dataframe
#     compare_lower = lower_limit.iloc[0]
#     compare_lower_ex = lower_limit_extreme.iloc[0]

#     # compare the column data with the boundary value
#     lower_criteria = temp_df[(temp_df.iloc[:,0]) < compare_lower]
#     lower_ex_criteria = temp_df[(temp_df.iloc[:,0]) < compare_lower_ex]

#     # Calculate upper limit and upper limit extreme
#     upper_limit = (temp_df.quantile(0.75) + (IQR * 1.5)).round(3)
#     upper_limit_extreme = (temp_df.quantile(0.75) + (IQR * 3)).round(3)

#     # get upper boundary and upper boundary extreme from the dataframe
#     compare_upper = upper_limit.iloc[0]
#     compare_upper_ex = upper_limit_extreme.iloc[0]

#     # compare the column data with the boundary value
#     upper_criteria = temp_df[(temp_df.iloc[:,0]) > compare_upper]
#     upper_ex_criteria = temp_df[(temp_df.iloc[:,0]) > compare_upper_ex]

#     # display results of the calculations
#     print('\nTotal participants:',temp_df.size)
#     print(column, 'Inter-Quartile Range (IQR) = ', IQR[0])

#     print('\n', column, 'Lower Limit = ', lower_limit[0])
#     print('Participants with', column, 'below Lower Limit:', lower_criteria.size)

#     print('\n', column, 'Lower Limit Extreme = ', lower_limit_extreme[0])
#     print('Participants with', column, 'below Lower Limit Extreme:', lower_ex_criteria.size)

#     print('\n', column, 'Upper Limit = ', upper_limit[0])
#     print('Participants with', column, 'above Upper Limit:', upper_criteria.size)

#     print('\n', column, 'Upper Limit Extreme = ', upper_limit_extreme[0])
#     print('Participants with', column, 'above Upper Limit Extreme:', upper_ex_criteria.size)

In [None]:
# # widget to display IQR values for the column selected in a drop-down box
# widgets.interact(lambda column: calc_iqr(column), column=['Age','Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction'])

### Removal of Extreme Outliers

Remove extreme outliers identified in the previous section

*The following code section is commented out to prevent data removal*

In [None]:
# ml_dataset_ex_outs = ml_dataset

# index = ml_dataset[(ml_dataset['SkinThickness'] > 53)].index
# ml_dataset_ex_outs.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['BMI'] > 63.90)].index
# ml_dataset_ex_outs.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['DiabetesPedigreeFunction'] > 1.77375)].index
# ml_dataset_ex_outs.drop(index, inplace=True)


### Visualizing Outliers

Using a box plot is a quick method of visualizing outliers

In [None]:
# # sns.boxplot(y='annual_inc', data = data)
# widgets.interact(lambda X: sns.boxplot(data=ml_dataset, x=X), X=['Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction','Age'])


### List Outliers Based on Limits Identified in the Boxplot

In [None]:
# print(ml_dataset[(ml_dataset['SkinThickness'] > 80)].index)
# print(ml_dataset[(ml_dataset['Age'] > 65)].index)
# print(ml_dataset[(ml_dataset['BMI'] > 50)].index)
# print(ml_dataset[(ml_dataset['BloodPressure'] > 105)].index)
# print(ml_dataset[(ml_dataset['BloodPressure'] < 40)].index)
# print(ml_dataset[(ml_dataset['Pregnancies'] > 13)].index)

### Outlier Removal Based on Boxplot

*This section is commented out to prevent data removal*

In [None]:
# index = ml_dataset[(ml_dataset['SkinThickness'] > 80)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['Age'] > 65)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['BMI'] > 50)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['BloodPressure'] > 105)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['BloodPressure'] < 40)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['Pregnancies'] > 13)].index
# ml_dataset.drop(index, inplace=True)

# Initial Visualizations

## Outcome Distribution

In [None]:
# sns.countplot(x=ml_dataset['Settlement'])

## Histogram chart

Histogram chart using dropdown widget to allow switching between x values. This allows quick viewing of the recorded frequency of the dataset feature values.

In [None]:
# widgets.interact(lambda X: ml_dataset[X].plot.hist(bins=10, figsize=(10,5)), X=['Age','Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction'])

## Distribution Plot Chart

Distribution Plot chart using dropdown widget to allow switching between x values. This allows comparison of the feature distributions for each outcome.

In [None]:
# widgets.interact(lambda X: sns.displot(data=ml_dataset, x=X, col='Outcome', kind='kde'), X=['Age','Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction'])

## Scatter Plot Chart

Scatter Plot chart, using X-axis and Y-axis dropdown widgets to allow bivariate analysis for identifying potential relationships between features.

In [None]:
# widgets.interact(lambda X, Y: sns.scatterplot(data=ml_dataset, style=ml_dataset['Outcome'], hue=ml_dataset['Outcome'], x=X, y=Y), X=['Age','Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction'], Y=['Glucose','Pregnancies','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction','Age'])

## Correlation Matrix

A correlation matrix quantifies and visualizes the linear relationships between variables, aiding in feature selection and understanding variable interactions.

In [None]:
# # calculate the feature correlation values
# c = ml_dataset.select_dtypes('number').corr().round(3)

# # Plot the correlation matrix as a heatmap
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.heatmap(c, annot=True)
# plt.show()

# Recursive Feature Elimination

As a final data processing step, we will apply recursive feature elimination with cross-validation (RFECV) to identify the most useful features on which to train the models. Reducing the dimensionality and overall size of the dataset decreases training time and improves efficiency, both of which contribute to lower financial costs through reduced compute requirements and reduced energy consumption, resulting in a more sustainable approach.

In [None]:
# # import required library functions
# from sklearn.feature_selection import RFECV
# from sklearn.linear_model import LinearRegression

# # split dataset to features and classifications
# # NOTE(review): "Outcome" looks like a column from a different dataset; the target
# # loaded in this notebook appears to be "SettlementValue" - confirm before re-enabling
# X = ml_dataset.drop(["Outcome"], axis = 1)
# y = ml_dataset["Outcome"]

# # use a linear regression model for cross validation testing
# regressor = LinearRegression()
# feature_selector = RFECV(regressor)

# # train the model
# fit = feature_selector.fit(X,y)

# # determine and print result of feature evaluation
# optimal_feature_count = feature_selector.n_features_
# print(f"Optimal numer of features: {optimal_feature_count}")

# print(X.columns)
# print(feature_selector.ranking_)
# print(feature_selector.support_)

# # plot chart of evaluation runs
# # NOTE(review): RFECV.grid_scores_ was removed in scikit-learn 1.2; on newer versions
# # the per-step scores are available as cv_results_["mean_test_score"] instead
# plt.plot(range(1, len(fit.grid_scores_) + 1), fit.grid_scores_, marker = "o")
# plt.ylabel("Model Score")
# plt.xlabel("Number of Features")
# plt.title(f"Feature Selection using RFE")
# plt.tight_layout()
# plt.show()

### Removal of features recommended by the RFECV process.

In [None]:
# # drop specified columns to a new dataframe
# ml_dataset_4col = ml_dataset.drop(['BloodPressure', 'SkinThickness','Age'], axis=1)

# Model 1: Baseline for Comparison

## Split Features and Classifications

Split the dataset into two pandas objects, one for the feature matrix and another for the corresponding classifications.

In [None]:
# # Create X for features
# # NOTE(review): the dataset is loaded as `ml_dataset` and its target column is listed as
# # "SettlementValue"; confirm that `df` and "Settlement" exist before re-enabling this cell
# X=df.drop(['Settlement'],axis=1)

# # Create y for classes
# y=df['Settlement']

# # Display first 5 rows of X
# X[:5]

## Divide Train and Test

Split the dataset to utilise 70% of the data for training and 30% for testing. The model training was repeated with an 80:20 split (the usual recommendation) and with a 90:10 split, but the 70:30 ratio outperformed both of those options.

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)

# print('Training dataframes have shape:', X_train.shape, y_train.shape, '\nTest dataframes have shape:',X_test.shape, y_test.shape)

## Stratification

In [None]:
## split dataframe using stratification
# train, test = train_test_split(ml_dataset_4col, test_size=0.3, random_state=1, stratify=ml_dataset_4col['Outcome'])

## split stratified training data into features and classes
# X_train=train.drop(['Outcome'],axis=1)
# y_train=train['Outcome']

## split stratified test data into features and classes
# X_test=test.drop(['Outcome'],axis=1)
# y_test=test['Outcome']

## Display Train and Test dataframes

Use dropdown menu to display initial records for selected dataframe - choose between features (X) and classes (y) for either train or test.

In [None]:
# def display_head(display):
#     if display == 'X_train':
#         print(X_train.head(5))

#     if display == 'X_test':
#         print(X_test.head(5))

#     if display == 'y_train':
#         print(y_train.head(5))

#     if display == 'y_test':
#         print(y_test.head(5))

# widgets.interact(lambda Selection: display_head(Selection), Selection=['X_train', 'y_train', 'X_test', 'y_test'])

## Create Processing and Training Pipeline

In [None]:
# # Import libraries required for SVM model, pipeline and scaling
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.svm import SVC

# # create pipeline to scale features
# default_pipe = Pipeline(steps=[('scaler', StandardScaler()), ('svm', SVC())])

## Train Default Model

In [None]:
# # import libraries required for reporting
# from sklearn import metrics

# # train default model using the pipeline
# default_pipe.fit(X_train, y_train)

# # make predictions based on the default model
# y_pred_def = default_pipe.predict(X_test)

## Performance Metrics Function

As we will be re-using these performance metrics throughout the notebook, it makes sense to create a function that can be called as needed.

In [None]:
# # import library functions
# # NOTE(review): only classification_report is imported here, but the function below also
# # calls confusion_matrix, plot_confusion_matrix, f1_score and roc_auc_score; confirm they
# # are imported or defined elsewhere before re-enabling this cell
# from sklearn.metrics import classification_report

# # display_metrics: for true labels y_test and predictions y_pred, displays a confusion
# # matrix, then prints the classification report, F1 score and ROC-AUC score
# def display_metrics(y_test,y_pred):
#     # generate and display confusion matrix
#     conf_matrix = confusion_matrix(y_test,y_pred)
#     plot_confusion_matrix(conf_matrix)

#     # display classification report
#     print(classification_report(y_test,y_pred))

#     # display f1 score
#     print('F1 Score:', f1_score(y_test,y_pred))

#     # display roc-auc score
#     print('ROC-AUC Score:', roc_auc_score(y_test,y_pred))

## Default Model Performance Metrics Report

To demonstrate the complete metrics report, it is now applied to the default model predictions.

In [None]:
# # display metrics using previously defined function
# display_metrics(y_test,y_pred_def)

## Apply Class Weighting

In [None]:
# # create weighted pipeline to scale features
# weighted_pipe = Pipeline(steps=[('scaler', StandardScaler()), ('svm', SVC(class_weight='balanced'))])

# # train model using the weighted pipeline
# weighted_pipe.fit(X_train, y_train)

# # make predictions based on the weighted model
# y_pred_weighted = weighted_pipe.predict(X_test)

# # display metrics using previously defined function
# display_metrics(y_test,y_pred_weighted)

## Cross Validation

Rather than just training the model once on the entire training dataset, cross validation partitions the data into multiple subsets, training the model on some subsets and validating it on the others. The results are then averaged to give a better estimate of the model's performance on unseen data, which helps in choosing a more generalized model with a reduced tendency to overfit.

The sklearn RepeatedStratifiedKFold class is a cross-validation splitter that stratifies the training and validation partitions during cross validation. It splits the dataset into 'K' stratified folds, ensuring that each fold is a good representative of the whole, and repeats this splitting several times with different randomization to provide a more robust estimate of model performance.

In [None]:
# # importing libraries and functions needed for cross validation
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold

# # define reusable cross validation test harness
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)