In [30]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns  
import plotly.express as px  
import warnings 
import missingno as msno
warnings.filterwarnings('ignore')  
  
plt.style.use('ggplot')  

In [31]:
data_set=pd.read_csv('insurance_claims.csv')

In [32]:
data_set.replace('0',np.nan,inplace=True)

In [33]:
data_set.isnull().sum()

months_as_customer              0
age                             0
policy_number                   0
policy_bind_date                0
policy_state                    0
policy_csl                      0
policy_deductable               0
policy_annual_premium           0
umbrella_limit                  0
insured_zip                     0
insured_sex                     0
insured_education_level         0
insured_occupation              0
insured_hobbies                 0
insured_relationship            0
capital-gains                   0
capital-loss                    0
incident_date                   0
incident_type                   0
collision_type                  0
incident_severity               0
authorities_contacted          91
incident_state                  0
incident_city                   0
incident_location               0
incident_hour_of_the_day        0
number_of_vehicles_involved     0
property_damage                 0
bodily_injuries                 0
witnesses     

In [34]:
# allocating 0 to missing value
data_set['authorities_contacted'] = data_set['authorities_contacted'].fillna(data_set['authorities_contacted'].mode()[0])  

In [35]:
# dropping columns not necessary for prediction
to_drop = ['policy_number','policy_bind_date','policy_state','insured_zip','incident_location','incident_date',  
           'incident_state','incident_city','insured_hobbies','auto_make','auto_model','auto_year']
data_set.drop(to_drop,inplace=True,axis=1)

In [36]:
x=data_set.drop('fraud_reported',axis=1)
y=data_set['fraud_reported']

In [37]:
dataset_cate=x.select_dtypes(include=['object'])
data_set.head(5)

Unnamed: 0,months_as_customer,age,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_occupation,insured_relationship,...,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,fraud_reported
0,328,48,250/500,1000,1406.91,0,MALE,MD,craft-repair,husband,...,1,YES,1,2,YES,71610,6510,13020,52080,Y
1,228,42,250/500,2000,1197.22,5000000,MALE,MD,machine-op-inspct,other-relative,...,1,?,0,0,?,5070,780,780,3510,Y
2,134,29,100/300,2000,1413.14,5000000,FEMALE,PhD,sales,own-child,...,3,NO,2,3,NO,34650,7700,3850,23100,N
3,256,41,250/500,2000,1415.74,6000000,FEMALE,PhD,armed-forces,unmarried,...,1,?,1,2,NO,63400,6340,6340,50720,Y
4,228,44,500/1000,1000,1583.91,6000000,MALE,Associate,sales,unmarried,...,1,NO,0,1,NO,6500,1300,650,4550,N


In [38]:
for col in dataset_cate.columns:  
    print(f"{col}: \n{dataset_cate[col].unique()}\n")  

policy_csl: 
['250/500' '100/300' '500/1000']

insured_sex: 
['MALE' 'FEMALE']

insured_education_level: 
['MD' 'PhD' 'Associate' 'Masters' 'High School' 'College' 'JD']

insured_occupation: 
['craft-repair' 'machine-op-inspct' 'sales' 'armed-forces' 'tech-support'
 'prof-specialty' 'other-service' 'priv-house-serv' 'exec-managerial'
 'protective-serv' 'transport-moving' 'handlers-cleaners' 'adm-clerical'
 'farming-fishing']

insured_relationship: 
['husband' 'other-relative' 'own-child' 'unmarried' 'wife' 'not-in-family']

incident_type: 
['Single Vehicle Collision' 'Vehicle Theft' 'Multi-vehicle Collision'
 'Parked Car']

collision_type: 
['Side Collision' '?' 'Rear Collision' 'Front Collision']

incident_severity: 
['Major Damage' 'Minor Damage' 'Total Loss' 'Trivial Damage']

authorities_contacted: 
['Police' 'Fire' 'Other' 'Ambulance']

property_damage: 
['YES' '?' 'NO']

police_report_available: 
['YES' '?' 'NO']



In [39]:
dataset_cate=pd.get_dummies(dataset_cate,drop_first=True)
dataset_cate.head()

Unnamed: 0,policy_csl_250/500,policy_csl_500/1000,insured_sex_MALE,insured_education_level_College,insured_education_level_High School,insured_education_level_JD,insured_education_level_MD,insured_education_level_Masters,insured_education_level_PhD,insured_occupation_armed-forces,...,incident_severity_Minor Damage,incident_severity_Total Loss,incident_severity_Trivial Damage,authorities_contacted_Fire,authorities_contacted_Other,authorities_contacted_Police,property_damage_NO,property_damage_YES,police_report_available_NO,police_report_available_YES
0,True,False,True,False,False,False,True,False,False,False,...,False,False,False,False,False,True,False,True,False,True
1,True,False,True,False,False,False,True,False,False,False,...,True,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,...,True,False,False,False,False,True,True,False,True,False
3,True,False,False,False,False,False,False,False,True,True,...,False,False,False,False,False,True,False,False,True,False
4,False,True,True,False,False,False,False,False,False,False,...,True,False,False,False,False,True,True,False,True,False


In [40]:
# extracting the numerical columns  
dataset_num = x.select_dtypes(include = ['int64'])  
dataset_num.head()  

Unnamed: 0,months_as_customer,age,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,total_claim_amount,injury_claim,property_claim,vehicle_claim
0,328,48,1000,0,53300,0,5,1,1,2,71610,6510,13020,52080
1,228,42,2000,5000000,0,0,8,1,0,0,5070,780,780,3510
2,134,29,2000,5000000,35100,0,7,3,2,3,34650,7700,3850,23100
3,256,41,2000,6000000,48900,-62400,5,1,1,2,63400,6340,6340,50720
4,228,44,1000,6000000,66000,-46000,20,1,0,1,6500,1300,650,4550


In [41]:
# combine
x= pd.concat([dataset_num, dataset_cate], axis = 1)  
x.head()  

Unnamed: 0,months_as_customer,age,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,...,incident_severity_Minor Damage,incident_severity_Total Loss,incident_severity_Trivial Damage,authorities_contacted_Fire,authorities_contacted_Other,authorities_contacted_Police,property_damage_NO,property_damage_YES,police_report_available_NO,police_report_available_YES
0,328,48,1000,0,53300,0,5,1,1,2,...,False,False,False,False,False,True,False,True,False,True
1,228,42,2000,5000000,0,0,8,1,0,0,...,True,False,False,False,False,True,False,False,False,False
2,134,29,2000,5000000,35100,0,7,3,2,3,...,True,False,False,False,False,True,True,False,True,False
3,256,41,2000,6000000,48900,-62400,5,1,1,2,...,False,False,False,False,False,True,False,False,True,False
4,228,44,1000,6000000,66000,-46000,20,1,0,1,...,True,False,False,False,False,True,True,False,True,False


In [42]:
x = x.replace({True: 1, False: 0})

In [43]:
# splitting data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [44]:
x_train.head()

Unnamed: 0,months_as_customer,age,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,...,incident_severity_Minor Damage,incident_severity_Total Loss,incident_severity_Trivial Damage,authorities_contacted_Fire,authorities_contacted_Other,authorities_contacted_Police,property_damage_NO,property_damage_YES,police_report_available_NO,police_report_available_YES
687,194,41,500,0,52500,-51300,17,3,0,2,...,1,0,0,0,0,1,0,0,0,0
500,1,29,500,0,52200,0,15,1,2,3,...,1,0,0,0,0,1,0,0,0,1
332,85,25,500,0,67000,-53600,8,1,2,2,...,0,0,1,0,0,1,1,0,0,0
979,229,37,1000,0,0,-55400,17,1,0,2,...,0,1,0,0,1,0,1,0,0,1
817,250,42,500,0,69500,-40700,16,1,1,1,...,0,0,0,0,1,0,0,0,0,0


In [45]:
dataframe_num= x_train[['months_as_customer', 'policy_deductable', 'umbrella_limit',  
       'capital-gains', 'capital-loss', 'incident_hour_of_the_day',  
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',  
       'vehicle_claim']]  

In [46]:
# scaling specific data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_data=scaler.fit_transform(dataframe_num)
scaled_dataframe_num  = pd.DataFrame(data=scaled_data,columns=dataframe_num.columns,index=x_train.index)
scaled_dataframe_num

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
687,-0.094195,-1.021723,-0.489168,0.960020,-0.855061,0.739671,1.139326,-1.211098,0.443634,1.307162,2.720780,1.263562
500,-1.749713,-1.021723,-0.489168,0.949272,0.958258,0.452560,-0.820805,1.223270,1.339864,0.108737,3.366941,0.922620
332,-1.029177,-1.021723,-0.489168,1.479509,-0.936360,-0.552331,-0.820805,1.223270,0.443634,-1.324834,-1.340510,-1.810751
979,0.206028,-0.200237,-0.489168,-0.920888,-0.999985,0.739671,-0.820805,-1.211098,0.443634,0.527464,0.519275,-0.166586
817,0.386163,-1.021723,-0.489168,1.569076,-0.480379,0.596115,-0.820805,0.006086,-0.452596,1.286535,-0.126887,0.878542
...,...,...,...,...,...,...,...,...,...,...,...,...
835,1.415500,1.442734,2.534387,-0.920888,-0.922221,-0.839442,-0.820805,-1.211098,0.443634,-1.333085,-1.162402,-1.675862
192,1.321144,1.442734,-0.489168,-0.920888,0.958258,0.165448,-0.820805,-1.211098,1.339864,-0.373933,0.763656,0.052742
629,1.072388,1.442734,-0.489168,-0.920888,-1.035333,1.170338,0.159261,-1.211098,0.443634,0.750235,-0.396121,0.326239
559,-0.094195,1.442734,-0.489168,0.745059,0.958258,0.452560,1.139326,0.006086,-1.348827,-0.512133,0.486138,0.062832


In [47]:
x_train.drop(columns = scaled_dataframe_num , inplace=True)
x_train=pd.concat([scaled_dataframe_num,x_train],axis=1)
y_train = y_train.replace({'Y': 1, 'N': 0})
y_test = y_test.replace({'Y': 1, 'N': 0})
x_train

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,...,incident_severity_Minor Damage,incident_severity_Total Loss,incident_severity_Trivial Damage,authorities_contacted_Fire,authorities_contacted_Other,authorities_contacted_Police,property_damage_NO,property_damage_YES,police_report_available_NO,police_report_available_YES
687,-0.094195,-1.021723,-0.489168,0.960020,-0.855061,0.739671,1.139326,-1.211098,0.443634,1.307162,...,1,0,0,0,0,1,0,0,0,0
500,-1.749713,-1.021723,-0.489168,0.949272,0.958258,0.452560,-0.820805,1.223270,1.339864,0.108737,...,1,0,0,0,0,1,0,0,0,1
332,-1.029177,-1.021723,-0.489168,1.479509,-0.936360,-0.552331,-0.820805,1.223270,0.443634,-1.324834,...,0,0,1,0,0,1,1,0,0,0
979,0.206028,-0.200237,-0.489168,-0.920888,-0.999985,0.739671,-0.820805,-1.211098,0.443634,0.527464,...,0,1,0,0,1,0,1,0,0,1
817,0.386163,-1.021723,-0.489168,1.569076,-0.480379,0.596115,-0.820805,0.006086,-0.452596,1.286535,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,1.415500,1.442734,2.534387,-0.920888,-0.922221,-0.839442,-0.820805,-1.211098,0.443634,-1.333085,...,1,0,0,0,0,1,0,0,0,0
192,1.321144,1.442734,-0.489168,-0.920888,0.958258,0.165448,-0.820805,-1.211098,1.339864,-0.373933,...,0,1,0,0,0,1,1,0,1,0
629,1.072388,1.442734,-0.489168,-0.920888,-1.035333,1.170338,0.159261,-1.211098,0.443634,0.750235,...,0,1,0,0,1,0,0,0,0,1
559,-0.094195,1.442734,-0.489168,0.745059,0.958258,0.452560,1.139326,0.006086,-1.348827,-0.512133,...,1,0,0,0,0,0,0,0,0,0


In [55]:
# training model with the use of svm classifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  
from sklearn.svm import SVC
classifier1=SVC()
classifier1.fit(x_train,y_train)
x_test = x_test[x_train.columns]
y_pred=classifier1.predict(x_test)
acc1=accuracy_score(y_test,y_pred)
print(acc1)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [54]:
# training model with the use of KNN classifier
from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier(n_neighbors=30)
# x_test = x_test[x_train.columns]
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()
x_test = x_test.to_numpy()
knn.fit(x_train, y_train)
y_pred=knn.predict(x_test)
acc2=accuracy_score(y_test,y_pred)
print(acc2)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'