# Dataset Research

# Global Library Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import sklearn
import ipywidgets as widgets

# set matplotlib to display graphics in the notebook
%matplotlib inline

# Dataset Import

This section imports the diabetes dataset provided for the assignment. The dataset is saved in csv format in the same folder as the project notebook. The file is imported into the project as a pandas dataframe object.

In [3]:
# import dataset from csv
ml_dataset = pd.read_csv('./Synthetic_Data_For_Students.csv')

# confirm number of imported records matches expected size
print("Number of records imported: " + str(len(ml_dataset.index)))

Number of records imported: 5000


# Dataset Initial Exploration

## Display Dataset Summary Info

In [4]:
ml_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   SettlementValue             4894 non-null   float64
 1   AccidentType                4878 non-null   object 
 2   Injury_Prognosis            4844 non-null   object 
 3   SpecialHealthExpenses       4870 non-null   float64
 4   SpecialReduction            4879 non-null   float64
 5   SpecialOverage              4883 non-null   float64
 6   GeneralRest                 4872 non-null   float64
 7   SpecialAdditionalInjury     4866 non-null   float64
 8   SpecialEarningsLoss         4872 non-null   float64
 9   SpecialUsageLoss            4870 non-null   float64
 10  SpecialMedications          4870 non-null   float64
 11  SpecialAssetDamage          4889 non-null   float64
 12  SpecialRehabilitation       4884 non-null   float64
 13  SpecialFixes                4879 

There are null values in multiple columns, these will need to be handled appropriately. Discussion with the client has identified that null values in the settlement column indicate unsettled claims, so these should not be used in model training.


## Display Column Value Ranges

In [None]:
# display summary statistics for numerical columns
ml_dataset.describe()

Unnamed: 0,SettlementValue,SpecialHealthExpenses,SpecialReduction,SpecialOverage,GeneralRest,SpecialAdditionalInjury,SpecialEarningsLoss,SpecialUsageLoss,SpecialMedications,SpecialAssetDamage,...,SpecialFixes,GeneralFixed,GeneralUplift,SpecialLoanerVehicle,SpecialTripCosts,SpecialJourneyExpenses,SpecialTherapy,Vehicle Age,Driver Age,Number of Passengers
count,4894.0,4870.0,4879.0,4883.0,4872.0,4866.0,4872.0,4870.0,4870.0,4889.0,...,4879.0,4879.0,4863.0,4861.0,4885.0,4853.0,4868.0,4874.0,4871.0,4878.0
mean,1218.010685,3.611704,0.0,13.36358,463.305386,0.28358,52.191115,9.10883,0.109698,33.460761,...,3.942209,687.509736,10.407465,7.71972,1.959881,11.63912,183.600286,9.508617,48.78916,2.48237
std,858.866309,85.047845,0.0,84.223612,766.187669,12.988075,392.90913,65.505181,1.389916,282.692529,...,116.335053,399.361279,50.165743,141.155658,13.117419,49.086924,223.88578,5.727625,17.819725,1.109911
min,240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,240.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,1.0
25%,669.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,495.0,0.0,0.0,0.0,0.0,0.0,4.0,33.0,1.0
50%,988.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,520.0,0.0,0.0,0.0,0.0,50.0,10.0,49.0,2.0
75%,1510.0,0.0,0.0,0.0,906.0,0.0,0.0,0.0,0.0,0.0,...,0.0,895.0,0.0,0.0,0.0,0.0,350.0,14.0,64.0,3.0
max,7862.9,3024.0,0.0,1250.0,3912.64,889.0,7735.58,1050.0,30.25,6070.0,...,4000.0,4345.0,1430.0,4408.16,254.2,880.0,1225.0,19.0,79.0,4.0


## Show total values for each numerical column

In [None]:
# for each numerical column, diplay the count of non-zero values
print("column - count of non-zero - sum of values - mean values (excluding zero rows)")
for column in ml_dataset.select_dtypes(include=[np.number]):
    print(column + " - " + str(ml_dataset[ml_dataset[column] != 0][column].count()) + " - " + str(ml_dataset[column].sum()) + " - " + str((ml_dataset[column].sum() / ml_dataset[ml_dataset[column] != 0][column].count())))



column - count of non-zero - sum of values - mean values
SettlementValue - 4894 - 5960944.289999999 - 1218.0106845116468
SpecialHealthExpenses - 16 - 17589.0 - 1099.3125
SpecialReduction - 0 - 0.0 - nan
SpecialOverage - 160 - 65254.36 - 407.83975
GeneralRest - 1671 - 2257223.84 - 1350.8221663674447
SpecialAdditionalInjury - 9 - 1379.9 - 153.32222222222222
SpecialEarningsLoss - 296 - 254275.11 - 859.0375337837837
SpecialUsageLoss - 244 - 44360.0 - 181.80327868852459
SpecialMedications - 38 - 534.23 - 14.058684210526316
SpecialAssetDamage - 472 - 163589.66 - 346.5882627118644
SpecialRehabilitation - 5 - 96.19999999999999 - 19.24
SpecialFixes - 10 - 19234.04 - 1923.404
GeneralFixed - 4879 - 3354360.0 - 687.5097356015577
GeneralUplift - 311 - 50611.5 - 162.7379421221865
SpecialLoanerVehicle - 69 - 37525.56 - 543.8486956521739
SpecialTripCosts - 255 - 9574.0175 - 37.54516666666667
SpecialJourneyExpenses - 1003 - 56484.65 - 56.315702891326026
SpecialTherapy - 2711 - 893766.1900000001 - 329.6

  print(column + " - " + str(ml_dataset[ml_dataset[column] != 0][column].count()) + " - " + str(ml_dataset[column].sum()) + " - " + str((ml_dataset[column].sum() / ml_dataset[ml_dataset[column] != 0][column].count())))


Looking at the values above, SpecialReduction can be ignored as there are no non-zero values.

SpecialRehabilitation can be ignored as it appears infrequently and contributes little to overall settlement amount (max value £21).

SpecialMedication can be ignored as it appears infrequently and contributes little to overall settlement amount (max value £30).

Columns with a high maximum value but low mean are candidates for grouping and further analysis is required.

SpecialAdditionalInjury, SpecialFixes, SpecialHealthExpenses and SpecialLoanerVehicle could all be relevant groups, as they have low frequency but can be high value.

GeneralUplift - there are a small number of high values in this column

SpecialTripCosts and SpecialJourneyExpenses may be important - need to perform further analysis


Recommend investigating the relationship and correlations between the different types of columns (eg. can medical / trip+journey expenses be grouped)


## Display the first x rows of the dataset using a slider widget

In [None]:
widgets.interact(lambda x: ml_dataset.head(x), x=(widgets.IntSlider(min=5, max=50, step=5, value=5)))

# Data Preparation

## Display Null Value Counts

Null values indicate missing data and impact a feature column's usefulness in the training process

In [None]:
# find nulls
ml_dataset.isnull()
ml_dataset.isnull().sum()

## Check for Duplicate Rows

Duplicate rows contaminate the data and could skew the training process, negatively impacting prediction performance.

In [None]:
# calculate duplicates
duplicates = ml_dataset.duplicated()
# report duplicates
print(duplicates.any())
# list duplicate rows
print(ml_dataset[duplicates])

## Check for Negative Values

Negative values where they are not expected/possible would also affect the reliability of the analysis and should be removed prior to proceeding

In [None]:
# Check for negative values in all columns of Dataframe
for column_name in ml_dataset.columns:
    column = ml_dataset[column_name]
    # Get the count of negatives in column
    count = (column < 0).sum()
    print('Count of negative values in column ', column_name, ' is : ', count)

## Display count of zero values for all columns

Zero values are often used as placeholders for null values, so any row containing a zero value should be validated to determine if it is appropriate.

In [None]:
# Count number of zeros in all columns of Dataframe
for column_name in ml_dataset.columns:
    column = ml_dataset[column_name]
    # Get the count of Zeros in column
    count = (column == 0).sum()
    print('Count of zeros in column ', column_name, ' is : ', count)

## Removal of Columns

In [None]:
# ml_dataset_dropcols = ml_dataset.drop(columns=['Insulin'])
# ml_dataset = ml_dataset_dropcols

## Impute Missing Values

### Convert zero values to NaN

In [None]:
# replace zero values with NaN in selected columns
# ml_dataset.loc[ml_dataset.Glucose == 0, 'Glucose'] = np.NaN
# ml_dataset.loc[ml_dataset.BloodPressure == 0, 'BloodPressure'] = np.NaN
# ml_dataset.loc[ml_dataset.SkinThickness == 0, 'SkinThickness'] = np.NaN
# ml_dataset.loc[ml_dataset.BMI == 0, 'BMI'] = np.NaN

# display describe summary table to confirm the changes
ml_dataset.describe()

### Impute mean average

In [None]:
# # replace NaN values with mean value for the SkinThickness column
# ml_dataset["SkinThickness"] = ml_dataset["SkinThickness"].fillna((ml_dataset["SkinThickness"].mean())).round(2)

# # display column summary to confirm changes
# ml_dataset["SkinThickness"].describe()

### Impute using KNN

In [None]:
# from sklearn.impute import KNNImputer

# # create a KNN imputer object
# imputer = KNNImputer(n_neighbors=5)

# # impute missing values using KNN for specified columns
# columns = ['Glucose','BloodPressure','BMI']
# for col in columns:
#     temp_df = pd.DataFrame(ml_dataset[col])
#     ml_dataset[col] = pd.DataFrame(imputer.fit_transform(temp_df)).round(2)

# ml_dataset.describe()


## Outlier Detection

### The following section is retained for reference only, as applying outlier removal to the dataset ultimately reduced performance of the trained model. Code relating to inspecting the outliers has been left intact, but all code relating to removal of outlier data has been commented out to prevent it from impacting model training.

Outlier detection is useful to remove datapoints that may disproportionately affect the model training

### Extreme Value Analysis
Calculate the interquartile range (IQR)

IQR (Inter quantiles range)= 75th quantile — 25th quantile

An outlier will be in the following upper and lower boundaries:
- Upper Boundary = 75th quantile +(IQR * 1.5)
- Lower Boundary = 25th quantile — (IQR * 1.5)

Or for extreme cases:
- Upper Boundary = 75th quantile +(IQR * 3)
- Lower Boundary = 25th quantile — (IQR * 3)

If the data point is above the upper boundary or below the lower boundary, it can be considered as an outlier.

In [None]:
# # function to calculate inter-quartile ranges for a given column
# def calc_iqr(column):
#     temp_df = pd.DataFrame(ml_dataset[column])

#     # calculate inter-quartile range
#     IQR = (temp_df.quantile(0.75) - temp_df.quantile(0.25)).round(3)

#     # Calculate lower limit and lower limit extreme
#     lower_limit = (temp_df.quantile(0.25) - (IQR * 1.5)).round(3)
#     lower_limit_extreme = (temp_df.quantile(0.25) - (IQR * 3)).round(3)

#     # prevent negative numbers being evaluated
#     lower_limit[lower_limit < 0] = 0
#     lower_limit_extreme[lower_limit_extreme < 0] = 0

#     # get lower boundary and lower boundary extreme from the dataframe
#     compare_lower = lower_limit.iloc[0]
#     compare_lower_ex = lower_limit_extreme.iloc[0]

#     # compare the column data with the boundary value
#     lower_criteria = temp_df[(temp_df.iloc[:,0]) < compare_lower]
#     lower_ex_criteria = temp_df[(temp_df.iloc[:,0]) < compare_lower_ex]

#     # Calculate upper limit and upper limit extreme
#     upper_limit = (temp_df.quantile(0.75) + (IQR * 1.5)).round(3)
#     upper_limit_extreme = (temp_df.quantile(0.75) + (IQR * 3)).round(3)

#     # get upper boundary and upper boundary extreme from the dataframe
#     compare_upper = upper_limit.iloc[0]
#     compare_upper_ex = upper_limit_extreme.iloc[0]

#     # compare the column data with the boundary value
#     upper_criteria = temp_df[(temp_df.iloc[:,0]) > compare_upper]
#     upper_ex_criteria = temp_df[(temp_df.iloc[:,0]) > compare_upper_ex]

#     # display results of the calculations
#     print('\nTotal participants:',temp_df.size)
#     print(column, 'Inter-Quartile Range (IQR) = ', IQR[0])

#     print('\n', column, 'Lower Limit = ', lower_limit[0])
#     print('Participants with', column, 'below Lower Limit:', lower_criteria.size)

#     print('\n', column, 'Lower Limit Extreme = ', lower_limit_extreme[0])
#     print('Participants with', column, 'below Lower Limit Extreme:', lower_ex_criteria.size)

#     print('\n', column, 'Upper Limit = ', upper_limit[0])
#     print('Participants with', column, 'above Upper Limit:', upper_criteria.size)

#     print('\n', column, 'Upper Limit Extreme = ', upper_limit_extreme[0])
#     print('Participants with', column, 'above Upper Limit Extreme:', upper_ex_criteria.size)

In [None]:
# # widget to display IQR values for the column selected in a drop-down box
# widgets.interact(lambda column: calc_iqr(column), column=['Age','Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction'])

### Removal of Extreme Outliers

Remove extreme outliers identified in the previous section

*The following code section is commented out to prevent data removal*

In [None]:
# ml_dataset_ex_outs = ml_dataset

# index = ml_dataset[(ml_dataset['SkinThickness'] > 53)].index
# ml_dataset_ex_outs.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['BMI'] > 63.90)].index
# ml_dataset_ex_outs.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['DiabetesPedigreeFunction'] > 1.77375)].index
# ml_dataset_ex_outs.drop(index, inplace=True)


### Visualizing Outliers

Using a box plot is a quick method of visualizing outliers

In [None]:
# # sns.boxplot(y='annual_inc', data = data)
# widgets.interact(lambda X: sns.boxplot(data=ml_dataset, x=X), X=['Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction','Age'])


### List Outliers Based on Limits Identifed in the Boxplot

In [None]:
# print(ml_dataset[(ml_dataset['SkinThickness'] > 80)].index)
# print(ml_dataset[(ml_dataset['Age'] > 65)].index)
# print(ml_dataset[(ml_dataset['BMI'] > 50)].index)
# print(ml_dataset[(ml_dataset['BloodPressure'] > 105)].index)
# print(ml_dataset[(ml_dataset['BloodPressure'] < 40)].index)
# print(ml_dataset[(ml_dataset['Pregnancies'] > 13)].index)

### Outlier Removal Based on Boxplot

*This section is commented out to prevent data removal*

In [None]:
# index = ml_dataset[(ml_dataset['SkinThickness'] > 80)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['Age'] > 65)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['BMI'] > 50)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['BloodPressure'] > 105)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['BloodPressure'] < 40)].index
# ml_dataset.drop(index, inplace=True)

# index = ml_dataset[(ml_dataset['Pregnancies'] > 13)].index
# ml_dataset.drop(index, inplace=True)

# Initial Visualizations

## Outcome Distribution

In [None]:
sns.countplot(x=ml_dataset['Settlement'])

## Histogram chart

Histogram chart using dropdown widget to allow switching between x values. This allows quick viewing of the recorded frequency of the dataset feature values.

In [None]:
# widgets.interact(lambda X: ml_dataset[X].plot.hist(bins=10, figsize=(10,5)), X=['Age','Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction'])

: 

## Distribution Plot Chart

Distribution Plot chart using dropdown widget to allow switching between x values. This allows comparison of the feature distributions for each outcome.

In [None]:
# widgets.interact(lambda X: sns.displot(data=ml_dataset, x=X, col='Outcome', kind='kde'), X=['Age','Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction'])

## Scatter Plot Chart

Scatter Plot chart, using X-axis and Y-axis dropdown widgets to allow bivariate analysis for identifying potential relationships between features.

In [None]:
# widgets.interact(lambda X, Y: sns.scatterplot(data=ml_dataset, style=ml_dataset['Outcome'], hue=ml_dataset['Outcome'], x=X, y=Y), X=['Age','Pregnancies','Glucose','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction'], Y=['Glucose','Pregnancies','BloodPressure','SkinThickness','BMI','DiabetesPedigreeFunction','Age'])

## Correlation Matrix

A correlation matrix quantifies and visualizes the linear relationships between variables, aiding in feature selection and understanding variable interactions.

In [None]:
# # calculate the feature correlation values
# c = ml_dataset.select_dtypes('number').corr().round(3)

# # Plot the correlation matrix as a heatmap
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.heatmap(c, annot=True)
# plt.show()

# Recursive Feature Elimination

As a final data processing step, we will apply recursive feature elimination with cross validation (RFECV) to identify the most useful features on which to train the models. Reducing the dimensionality and overall size of the dataset decreases training time and improves efficiency, which both contribute to a lower financial costs through reduced compute requirements and reduction in energy consumption, resulting in a more sustainable approach.

In [None]:
# # import required library functions
# from sklearn.feature_selection import RFECV
# from sklearn.linear_model import LinearRegression

# # split dataset to features and classifications
# X = ml_dataset.drop(["Outcome"], axis = 1)
# y = ml_dataset["Outcome"]

# # use a linear regressino model for cross validation testing
# regressor = LinearRegression()
# feature_selector = RFECV(regressor)

# # train the model
# fit = feature_selector.fit(X,y)

# # determine and print result of feature evaluation
# optimal_feature_count = feature_selector.n_features_
# print(f"Optimal numer of features: {optimal_feature_count}")

# print(X.columns)
# print(feature_selector.ranking_)
# print(feature_selector.support_)

# # plot chart of evaluation runs
# plt.plot(range(1, len(fit.grid_scores_) + 1), fit.grid_scores_, marker = "o")
# plt.ylabel("Model Score")
# plt.xlabel("Number of Features")
# plt.title(f"Feature Selection using RFE")
# plt.tight_layout()
# plt.show()

### Removal of features recommended by the RFECV process.

In [None]:
# # drop specificed columns to a new dataframe
# ml_dataset_4col = ml_dataset.drop(['BloodPressure', 'SkinThickness','Age'], axis=1)

# Model 1: Baseline for Comparison

## Split Features and Classifications

Split the dataset into two ndarrays, one for the feature matrix and another for the corresponding classifications.

In [None]:
# # Create X for features
# X=df.drop(['Settlement'],axis=1)

# # Create y for classes
# y=df['Settlement']

# # Display first 5 rows of X
# X[:5]

## Divide Train and Test

Split the dataset to utilise 70% of the data for training and 30% for testing. The model training was repeated with an 80:20 split (the usual recommendation) and with a 90:10 split, but the 70:30 ratio outperformed both of those options.

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)

# print('Training dataframes have shape:', X_train.shape, y_train.shape, '\nTest dataframes have shape:',X_test.shape, y_test.shape)

## Stratification

In [None]:
## split dataframe using stratification
# train, test = train_test_split(ml_dataset_4col, test_size=0.3, random_state=1, stratify=ml_dataset_4col['Outcome'])

## split stratified training data into features and classes
# X_train=train.drop(['Outcome'],axis=1)
# y_train=train['Outcome']

## split stratified test data into features and classes
# X_test=test.drop(['Outcome'],axis=1)
# y_test=test['Outcome']

## Display Train and Test dataframes

Use dropdown menu to display initial records for selected dataframe - choose between features (X) and classes (y) for either train or test.

In [None]:
# def display_head(display):
#     if display == 'X_train':
#         print(X_train.head(5))

#     if display == 'X_test':
#         print(X_test.head(5))

#     if display == 'y_train':
#         print(y_train.head(5))

#     if display == 'y_test':
#         print(y_test.head(5))

# widgets.interact(lambda Selection: display_head(Selection), Selection=['X_train', 'y_train', 'X_test', 'y_test'])

## Create Processing and Training Pipeline

In [None]:
# # Import libraries required for SVM model, pipeline and scaling
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.svm import SVC

# # create pipeline to scale features
# default_pipe = Pipeline(steps=[('scaler', StandardScaler()), ('svm', SVC())])

## Train Default Model

In [None]:
# # import libraries required for reporting
# from sklearn import metrics

# # train default model using the pipeline
# default_pipe.fit(X_train, y_train)

# # make predictions based on the default model
# y_pred_def = default_pipe.predict(X_test)

NameError: name 'default_pipe' is not defined

## Performance Metrics Function

As we will be re-using these performance metrics throughout the notebook, it makes sense to create a function that can be called as needed.

In [None]:
# # import library functions
# from sklearn.metrics import classification_report

# def display_metrics(y_test,y_pred):
#     # generate and display confusion matrix
#     conf_matrix = confusion_matrix(y_test,y_pred)
#     plot_confusion_matrix(conf_matrix)

#     # display classification report
#     print(classification_report(y_test,y_pred))

#     # display f1 score
#     print('F1 Score:', f1_score(y_test,y_pred))

#     # display roc-auc score
#     print('ROC-AUC Score:', roc_auc_score(y_test,y_pred))

## Default Model Performance Metrics Report

To demonstrate the complete metrics report, it is now applied to the default model predictions.

In [None]:
# # display metrics using previously defined function
# display_metrics(y_test,y_pred_def)

## Apply Class Weighting

In [None]:
# # create weighted pipeline to scale features
# weighted_pipe = Pipeline(steps=[('scaler', StandardScaler()), ('svm', SVC(class_weight='balanced'))])

# # train model using the weighted pipeline
# weighted_pipe.fit(X_train, y_train)

# # make predictions based on the weighted model
# y_pred_weighted = weighted_pipe.predict(X_test)

# # display metrics using previously defined function
# display_metrics(y_test,y_pred_weighted)

## Cross Validation

Rather than just training the model once on the entire training dataset, cross validation partitions the data into multiple subsets, training the model on some subsets and validating it on others, and then averaging the results to better estimate its performance on unseen data, resulting in a more generalized model with reduced tendency to overfit.

The sklearn RepeatedStratifiedKFold function is a cross-validation method that implements stratification of the training and validation partitions during cross validation. It achieves this by repeatedly splitting the dataset into 'K' stratified folds, ensuring that each fold is a good representative of the whole, and it is used multiple times to provide a more robust estimate of model performance.

In [None]:
# # importing libraries and functions needed for cross validation
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold

# # define reusable cross validation test harness
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)