In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install imbalanced-learn

In [3]:
import pickle

In [4]:
df = pd.read_csv('insurance_claims.csv')

In [5]:
df.replace('?', np.nan, inplace = True)

In [6]:
df['collision_type'] = df['collision_type'].fillna(df['collision_type'].mode()[0])

df['property_damage'] = df['property_damage'].fillna(df['property_damage'].mode()[0])

df['police_report_available'] = df['police_report_available'].fillna(df['police_report_available'].mode()[0])

to_drop = ['policy_number','policy_bind_date','policy_state','insured_zip','incident_location','incident_date',
           'incident_state','incident_city','insured_hobbies','auto_model','auto_year', '_c39', 'insured_occupation', 'insured_relationship', 'policy_csl']

df.drop(to_drop, inplace = True, axis = 1)

df.drop(columns = ['total_claim_amount'], inplace = True, axis = 1)

In [7]:
X = df.drop('fraud_reported', axis = 1)
y = df['fraud_reported']

In [8]:
cat_df = X.select_dtypes(include = ['object'])


cleanup_nums = {"insured_sex": {"MALE": 1, "FEMALE": 2},
                "insured_education_level": {"JD": 5,"PhD": 4,"Associate": 7,"MD": 6,"High School": 1,"Masters": 3,"College": 2},
                "auto_make": {"Saab": 13,"Dodge": 12,"Suburu": 6,"Nissan": 3,"Chevrolet": 4,"Ford": 10,"BMW": 7,"Toyota": 1,"Audi": 9,"Volkswagen": 11,"Accura": 5,"Jeep": 14,"Mercedes": 8,"Honda": 2},
                "incident_severity": {"Minor Damage": 1,"Major Damage": 2,"Total Loss": 3,"Trivial Damage": 4},
                "collision_type": {"Rear Collision": 1,"Side Collision": 2,"Front Collision": 3},
                "incident_type": {"Multi-vehicle Collision": 1,"Single Vehicle Collision": 2,"Vehicle Theft": 3,"Parked Car": 4},
                "property_damage": {"NO": 1,"YES": 2},
                "authorities_contacted": {"Police": 1,"Fire": 2,"Other": 3,"Ambulance": 4,"None": 5},
                "police_report_available": {"NO": 1,"YES": 2}}

cat_df = cat_df.replace(cleanup_nums)

num_df = X.select_dtypes(include = ['int64'])

In [9]:
cat_df.head()

Unnamed: 0,insured_sex,insured_education_level,incident_type,collision_type,incident_severity,authorities_contacted,property_damage,police_report_available,auto_make
0,1,6,2,2,2,1,2,2,13
1,1,6,3,1,1,1,1,1,8
2,2,4,1,1,1,1,1,1,12
3,2,4,2,3,2,1,1,1,4
4,1,7,3,1,1,5,1,1,5


In [10]:
X = pd.concat([num_df, cat_df], axis = 1)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
x_resampled, y_resampled = SMOTE().fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled,test_size = 0.25)
num_df = X_train[['age', 'months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]



In [None]:
import sys
!{sys.executable} -m pip install xgboost

In [12]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier as xgb
from sklearn.ensemble import RandomForestClassifier
models = xgb(use_label_encoder = True, random_seed = 42)
model = RandomForestClassifier()

xgb_grid = {"min_child_weight": [1],
            "gamma": [0, 1, 2],
            "subsample": [0.6, 0.8],
            "colsample_bytree": [0.5, 0.6],
            "max_depth": [6],
            "n_estimators": [100],
            "booster": ["gbtree"]}

rand_grid = {"n_estimators": [100, 200],
             "max_depth": [10,20],
             "max_features": ["sqrt"],
             "min_samples_split": [2],
             "min_samples_leaf": [2, 4]}

In [16]:
gs_clf = GridSearchCV(estimator = model, param_grid = rand_grid, cv = 5)
gs_clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 20], 'max_features': ['sqrt'],
                         'min_samples_leaf': [2, 4], 'min_samples_split': [2],
                         'n_estimators': [100, 200]})

In [17]:
y_preds = gs_clf.predict(X_test)
print(classification_report(y_test, y_preds))
print(gs_clf.best_params_)

              precision    recall  f1-score   support

           N       0.81      0.87      0.84       180
           Y       0.88      0.82      0.85       197

    accuracy                           0.84       377
   macro avg       0.84      0.84      0.84       377
weighted avg       0.85      0.84      0.84       377

{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
pickle.dump(gs_clf, open('model.pkl', 'wb'))