# Team Project - Team 3

Objective: Create a model to predict if a car insurance claim will be filed based on data primarily relating to information about vehicles

## Import packages and libraries

In [6]:
!pip install imblearn
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5


In [7]:
import pandas as pd
import numpy as np 
import seaborn as sns
import sagemaker
import boto3


from matplotlib import pyplot as plt
from numpy import where
from sklearn import metrics
from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import make_classification
from collections import Counter
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from sagemaker.xgboost.estimator import XGBoost
from time import gmtime, strftime
import statsmodels.api as sm
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)


%matplotlib inline
pd.set_option('display.max_columns', None)

## Read data

In [8]:
X_train = pd.read_csv('s3://techexcellence.ml.project.team3/CarInsuranceClaim/processed_data/X_train.csv')
X_test = pd.read_csv('s3://techexcellence.ml.project.team3/CarInsuranceClaim/processed_data/X_test.csv')
y_train = pd.read_csv('s3://techexcellence.ml.project.team3/CarInsuranceClaim/processed_data/y_train.csv')
y_test = pd.read_csv('s3://techexcellence.ml.project.team3/CarInsuranceClaim/processed_data/y_test.csv')
df = pd.read_csv('s3://techexcellence.ml.project.team3/CarInsuranceClaim/processed_data/processed_df.csv')

## Dummy Model

In [9]:
features = list(df.columns[:-1])

In [10]:
X_dummy = df[features]
y_dummy = df['is_claim']

In [11]:
dummy_clf = DummyClassifier(strategy="most_frequent", random_state=42)

In [12]:
dummy_clf.fit(X_dummy, y_dummy)

In [13]:
y_pred = dummy_clf.predict(X_dummy)

In [14]:
print('Summary of performance:', '\n')

#Accuracy
print("The model's accuracy is: ", dummy_clf.score(X_dummy, y_dummy))

#F1 score
print("The model's F1 score is: ",f1_score(y_dummy, y_pred))

Summary of performance: 

The model's accuracy is:  0.9360322228290552
The model's F1 score is:  0.0


SUMMARY: The high level of accuracy does not actually come with the best predictive power as shown by the F1 sore. Moving forward, F1 score will be the primary evaluation metric.

## Linear Regression Model

In [15]:
reg_df = pd.DataFrame(df)

In [16]:
# Splitting train/test and balancing data

X = reg_df.loc[:, reg_df.columns != 'is_claim']
y = reg_df.loc[:, reg_df.columns == 'is_claim']

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
os_data_X,os_data_y=os.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['is_claim'])

# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['is_claim']==0]))
print("Number of subscription",len(os_data_y[os_data_y['is_claim']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['is_claim']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['is_claim']==1])/len(os_data_X))

length of oversampled data is  76772
Number of no subscription in oversampled data 38386
Number of subscription 38386
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [17]:
# Recursive Feature Selection

data_final_vars=reg_df.columns.values.tolist()
y=['is_claim']
X=[i for i in data_final_vars if i not in y]

logreg = LogisticRegression()

rfe = RFE(logreg, step = 20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)

[ True  True False  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True  True False  True False False False
 False False  True False False False  True False  True False  True  True
  True False False False False False  True False False False False False
 False False False False  True False False False  True  True False False
 False False False  True False  True False False  True  True False False
 False  True  True False  True  True  True  True  True  True False  True
  True  True False]


In [18]:
cols = ['policy_tenure','age_of_car','population_density','airbags','displacement','gear_box','turning_radius',
        'gross_weight','ncap_rating','volume','max_torque_NM','max_torque_RPM','max_power_NM','max_power_RPM',
        'area_cluster_C10','area_cluster_C11','area_cluster_C12','area_cluster_C14','area_cluster_C2','area_cluster_C3',
        'area_cluster_C5','area_cluster_C7','area_cluster_C8','area_cluster_C9','segment_B2','model_M6',
        'fuel_type_Diesel','fuel_type_Petrol','engine_type_F8D Petrol Engine','engine_type_K Series Dual jet',
        'is_esc_Yes','is_adjustable_steering_Yes','rear_brakes_type_Drum','transmission_type_Manual',
        'steering_type_Power','is_front_fog_lights_Yes','is_rear_window_wiper_Yes','is_rear_window_defogger_Yes',
        'is_brake_assist_Yes','is_power_door_locks_Yes','is_driver_seat_height_adjustable_Yes',
        'is_day_night_rear_view_mirror_Yes','is_ecw_Yes']

X=os_data_X[cols]
y=os_data_y['is_claim']

In [19]:
logit_model=sm.Logit(y,X)

result=logit_model.fit()
print(result.summary2())

  return 1/(1+np.exp(-X))


         Current function value: 0.658093
         Iterations: 35
                                          Results: Logit
Model:                         Logit                       Pseudo R-squared:            0.051      
Dependent Variable:            is_claim                    AIC:                         101098.2814
Date:                          2023-04-14 13:43            BIC:                         101338.7449
No. Observations:              76772                       Log-Likelihood:              -50523.    
Df Model:                      25                          LL-Null:                     -53214.    
Df Residuals:                  76746                       LLR p-value:                 0.0000     
Converged:                     0.0000                      Scale:                       1.0000     
No. Iterations:                35.0000                                                             
-----------------------------------------------------------------------------

  return 1/(1+np.exp(-X))


In [20]:
cols = ['policy_tenure', 'age_of_car','population_density', 'area_cluster_C10',
       'area_cluster_C11', 'area_cluster_C12', 'area_cluster_C14', 'area_cluster_C2', 'area_cluster_C5',
       'area_cluster_C7', 'area_cluster_C8', 'area_cluster_C9']
X=os_data_X[cols]
y=os_data_y['is_claim']

logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

  return 1/(1+np.exp(-X))


Optimization terminated successfully.
         Current function value: 0.660466
         Iterations 5
                          Results: Logit
Model:               Logit            Pseudo R-squared: 0.047      
Dependent Variable:  is_claim         AIC:              101434.5405
Date:                2023-04-14 13:43 BIC:              101545.5236
No. Observations:    76772            Log-Likelihood:   -50705.    
Df Model:            11               LL-Null:          -53214.    
Df Residuals:        76760            LLR p-value:      0.0000     
Converged:           1.0000           Scale:            1.0000     
No. Iterations:      5.0000                                        
-------------------------------------------------------------------
                    Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-------------------------------------------------------------------
policy_tenure       0.9359   0.0182  51.4530 0.0000  0.9002  0.9715
age_of_car         -6.2996   0.1488 -42.3

  return 1/(1+np.exp(-X))


In [21]:
# Build a model with the selected features

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [22]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.50


In [23]:
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[11446     0]
 [11586     0]]


In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.66     11446
           1       0.00      0.00      0.00     11586

    accuracy                           0.50     23032
   macro avg       0.25      0.50      0.33     23032
weighted avg       0.25      0.50      0.33     23032



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SUMMARY: A logistic regression model does poorly at predicting the "is_claim" target variable. Also, when looking at p-values for feature selection, it appears tha the majority of values do not show an indication of contributing to the outcome varible in this type of model.

## XGBoost Model

### Initial Model

In [25]:
xgb_train = pd.DataFrame(y_train).join(X_train)
xgb_train.rename(columns={"is_claim": "label"}, inplace = True)
xgb_train.dtypes

xgb_validate = pd.DataFrame(y_test).join(X_test)
xgb_validate.rename(columns={"is_claim": "label"}, inplace = True)

In [26]:
### write training and validation data to s3 bucket to be read by sagemaker model 
# xgb_train.to_csv("s3://techexcellence.ml.project.team3/scaledTrainingData", index = False, header = False)
# xgb_validate.to_csv("s3://techexcellence.ml.project.team3/scaledValidateData", index = False, header = False)

### Build Model

In [27]:
my_region = boto3.session.Session().region_name
sess = sagemaker.Session()
role = get_execution_role()
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")


xgb = sagemaker.estimator.Estimator(xgboost_container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    sagemaker_session=sess)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [28]:
s3_input_train = sagemaker.TrainingInput(s3_data='s3://techexcellence.ml.project.team3/scaledTrainingData', content_type='csv')
s3_input_validate = sagemaker.TrainingInput(s3_data='s3://techexcellence.ml.project.team3/scaledValidateData', content_type='csv')

In [29]:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        num_round=100)

# xgb_train = pd.DataFrame(y_train).join(X_train)
# xgb_train = sagemaker.TrainingInput(xgb_train.to_json())
xgb.fit({'train': s3_input_train})

INFO:sagemaker:Creating training-job with name: xgboost-2023-04-14-13-43-38-878


2023-04-14 13:43:39 Starting - Starting the training job......
2023-04-14 13:44:15 Starting - Preparing the instances for training......
2023-04-14 13:45:30 Downloading - Downloading input data......
2023-04-14 13:46:15 Training - Downloading the training image...
2023-04-14 13:46:51 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2023-04-14:13:47:04:INFO] Running standalone xgboost training.[0m
[34m[2023-04-14:13:47:04:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2023-04-14:13:47:04:INFO] File size need to be processed in the node: 145.39mb. Available memory size in the node: 8597.07mb[0m
[34m[2023-04-14:13:47:04:INFO] Determined delimiter of CSV input is ','[0m
[34m[13:47:04] S3DistributionType set as FullyReplicated[0m
[34m[13:47:04] 87762x87 matrix with 7635294 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[13:47:04] src/tree/updater_prune.cc:74: tree pruni

In [30]:
test_data = pd.concat([y_test, X_test], axis = 1)

### Deploy model endpoint

In [32]:
# Deploy or read in endpoint 

xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.2xlarge')
# xgb_predictor = sagemaker.predictor.Predictor(
#     endpoint_name="xgboost-2023-04-12-17-22-35-143",
#     sagemaker_session=sagemaker.Session(),
#     serializer=sagemaker.serializers.CSVSerializer()
# )

INFO:sagemaker:Creating model with name: xgboost-2023-04-14-13-47-58-759
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-04-14-13-47-58-759
INFO:sagemaker:Creating endpoint with name xgboost-2023-04-14-13-47-58-759


--------!

### Split data into batches and send to endpoint for inference

In [35]:
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [30]:
xgb_predictor = sagemaker.predictor.Predictor(
    endpoint_name="xgboost-2023-04-12-17-22-35-143",
    sagemaker_session=sagemaker.Session(),
    serializer=sagemaker.serializers.CSVSerializer()
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [32]:
predictions_list = []

for i in chunker(test_data, round(len(test_data)/4)):
    test_data_array = i.drop(['is_claim'], axis=1).values
    xgb_predictor.serializer = csv_serializer

    predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
    predictions_array = np.fromstring(predictions[1:], sep=',')
    predictions_list.append(predictions_array)
    print(predictions_array.shape)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2930,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2930,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2930,)
(2929,)


In [33]:
preds = np.concatenate(predictions_list, axis=None)
preds

array([0.14282593, 0.15730155, 0.09496378, ..., 0.21587121, 0.20863397,
       0.10911147])

In [34]:
predictions_list = []
s3_input_train_df = pd.read_csv('s3://techexcellence.ml.project.team3/scaledTrainingData', header=None)

for i in chunker(s3_input_train_df, 3000):
    test_data_array = i.drop([0], axis=1).values
    xgb_predictor.serializer = csv_serializer

    predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
    predictions_array = np.fromstring(predictions[1:], sep=',')
    predictions_list.append(predictions_array)
    print(predictions_array.shape)

y_hat = np.concatenate(predictions_list, axis=None)
y_hat

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(3000,)
(762,)


array([0.19345088, 0.06379937, 0.0742424 , ..., 0.37187734, 0.88787252,
       0.90471619])

### Get classification report for both training and testing data 

In [35]:
print(classification_report(s3_input_train_df[0], np.round(y_hat)))

              precision    recall  f1-score   support

           0       0.91      0.99      0.94     43881
           1       0.98      0.90      0.94     43881

    accuracy                           0.94     87762
   macro avg       0.94      0.94      0.94     87762
weighted avg       0.94      0.94      0.94     87762



In [36]:
print(classification_report(test_data[['is_claim']], np.round(preds)))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96     10963
           1       0.16      0.04      0.07       756

    accuracy                           0.92     11719
   macro avg       0.55      0.51      0.51     11719
weighted avg       0.89      0.92      0.90     11719



### Model with hyperparameter tuning

In [37]:
objective_metric_name = "validation:auc"
MAX_JOBS = 5
MAX_PARALLEL_JOBS = 4
STRATEGY = 'Bayesian'
SCALING_TYPE = 'Linear'

In [38]:
tuninig_job_name = "xgb-linsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())

hyperparameter_ranges_linear = {
    "alpha": ContinuousParameter(0.05, 1, scaling_type=SCALING_TYPE),
    "lambda": ContinuousParameter(0.05, 1, scaling_type=SCALING_TYPE),
}

tuner_linear = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges_linear,
    max_jobs=MAX_JOBS,
    max_parallel_jobs=MAX_PARALLEL_JOBS,
    strategy=STRATEGY,
)

tuner_linear.fit(
    {"train": s3_input_train, "validation": s3_input_validate},
    include_cls_metadata=False,
    job_name=tuninig_job_name)

INFO:sagemaker:Creating hyperparameter tuning job with name: xgb-linsearch-20230413-15-46-42


............................................................................!


In [39]:
boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_linear.latest_tuning_job.job_name
)["HyperParameterTuningJobStatus"]

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


'Completed'

In [40]:
tuner = sagemaker.HyperparameterTuningJobAnalytics(tuninig_job_name)

full_df = tuner.dataframe()

if len(full_df) > 0:
    df = full_df[full_df["FinalObjectiveValue"] > -float("inf")]
    if len(df) > 0:
        df = df.sort_values("FinalObjectiveValue", ascending=False)
        print("Number of training jobs with valid objective: %d" % len(df))
        print({"lowest": min(df["FinalObjectiveValue"]), "highest": max(df["FinalObjectiveValue"])})
        pd.set_option("display.max_colwidth", None)  # Don't truncate TrainingJobName
    else:
        print("No training jobs have reported valid results yet.")

df

Number of training jobs with valid objective: 5
{'lowest': 0.6493110060691833, 'highest': 0.6522229909896851}


Unnamed: 0,alpha,lambda,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
3,0.181908,0.814959,xgb-linsearch-20230413-15-46-42-002-38d5ac0c,Completed,0.652223,2023-04-13 15:48:51+00:00,2023-04-13 15:50:54+00:00,123.0
1,0.973252,0.773653,xgb-linsearch-20230413-15-46-42-004-772edde8,Completed,0.651717,2023-04-13 15:48:56+00:00,2023-04-13 15:50:59+00:00,123.0
2,0.249562,0.213178,xgb-linsearch-20230413-15-46-42-003-5544840b,Completed,0.650437,2023-04-13 15:48:33+00:00,2023-04-13 15:50:42+00:00,129.0
4,0.125055,0.791197,xgb-linsearch-20230413-15-46-42-001-2f5ac8b7,Completed,0.650227,2023-04-13 15:48:49+00:00,2023-04-13 15:50:41+00:00,112.0
0,0.828146,0.07784,xgb-linsearch-20230413-15-46-42-005-82f7d352,Completed,0.649311,2023-04-13 15:52:14+00:00,2023-04-13 15:53:06+00:00,52.0


### Deploy best tuned model

In [142]:
# xgb_predictor = tuner_linear.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
xgb_tuned_predictor = sagemaker.predictor.Predictor(
    endpoint_name="xgb-linsearch-20230412-17-50-07-005-ef87c0f1",
    sagemaker_session=sagemaker.Session(),
    serializer=sagemaker.serializers.CSVSerializer()
)

In [143]:
#send y_train to endpoint deployed above
predictions_list = []
s3_input_validate_df = pd.read_csv('s3://techexcellence.ml.project.team3/scaledValidateData', header=None)

for i in chunker(s3_input_validate_df, 2500):
    test_data_array = i.drop([0], axis=1).values
    xgb_tuned_predictor.serializer = csv_serializer

    predictions = xgb_tuned_predictor.predict(test_data_array).decode('utf-8')
    predictions_array = np.fromstring(predictions[1:], sep=',')
    predictions_list.append(predictions_array)
    print(predictions_array.shape)

y_hat_2 = np.concatenate(predictions_list, axis=None)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2500,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2500,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2500,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2500,)
(1719,)


array([0.19345088, 0.06379937, 0.0742424 , ..., 0.37187734, 0.88787252,
       0.90471619])

In [144]:
print(classification_report(test_data[['is_claim']], np.round(y_hat_2)))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96     10963
           1       0.16      0.04      0.06       756

    accuracy                           0.93     11719
   macro avg       0.55      0.51      0.51     11719
weighted avg       0.89      0.93      0.90     11719



Summary: Model performed slightly better before hyperparameter tuning in terms of F1 score when predicting that a claim would be filed. Overall, there does not seem to be enough indicators in the data to build this type of prediction model in a way that provides value when predicting if a claim will be filed.

### Send test data to best model built by AutoML AutoPilot

In [37]:
# xgb_predictor = tuner_linear.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
xgb_tuned_predictor = sagemaker.predictor.Predictor(
    endpoint_name="SageMakerEndpoint-04142023",
    sagemaker_session=sagemaker.Session(),
    serializer=sagemaker.serializers.CSVSerializer()
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [38]:
s3_input_validate_df = pd.read_csv('s3://techexcellence.ml.project.team3/CarInsuranceClaim/data/test.csv')
s3_input_validate_df.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,max_power,engine_type,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,rear_brakes_type,displacement,cylinder,transmission_type,gear_box,steering_type,turning_radius,length,width,height,gross_weight,is_front_fog_lights,is_rear_window_wiper,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating
0,ID58593,0.341732,0.0,0.586538,C3,4076,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0
1,ID58594,0.307241,0.13,0.442308,C8,8794,1,B2,M6,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,K Series Dual jet,2,No,Yes,No,Yes,No,Drum,1197,4,Manual,5,Electric,4.8,3845,1735,1530,1335,Yes,No,No,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2
2,ID58595,0.327924,0.12,0.451923,C8,8794,2,A,M3,Petrol,91Nm@4250rpm,67.06bhp@5500rpm,1.0 SCe,2,No,No,No,No,Yes,Drum,999,3,Automatic,5,Electric,5.0,3731,1579,1490,1155,No,No,No,No,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2
3,ID58596,0.782654,0.01,0.461538,C5,34738,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0
4,ID58597,1.233404,0.02,0.634615,C5,34738,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0


In [39]:
predictions_list = []

for i in chunker(s3_input_validate_df, 2500):
    test_data_array = i.values
    xgb_tuned_predictor.serializer = csv_serializer

    predictions = xgb_tuned_predictor.predict(test_data_array).decode('utf-8')
    predictions_array = np.fromstring(predictions[1:], sep='\n')
    predictions_list.append(predictions_array)
    print(predictions_array.shape)

y_hat_2 = np.concatenate(predictions_list, axis=None)
y_hat_2

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(2499,)
(1562,)


array([0., 0., 0., ..., 0., 1., 0.])

In [40]:
output_df = pd.DataFrame(data=[s3_input_validate_df['policy_id'], y_hat_2]).T
output_df.columns = ['policy_id', 'is_claim_hat']
output_df.head(10)

Unnamed: 0,policy_id,is_claim_hat
0,ID58593,0.0
1,ID58594,0.0
2,ID58595,0.0
3,ID58596,0.0
4,ID58597,0.0
5,ID58598,0.0
6,ID58599,1.0
7,ID58600,0.0
8,ID58601,0.0
9,ID58602,0.0
