In [2]:
!pip install dalex
!pip install scikit-plot
!pip install shap
!pip install eli5
!pip install lime

Collecting dalex
  Downloading dalex-1.4.1.tar.gz (1.0 MB)
[?25l[K     |▎                               | 10 kB 28.1 MB/s eta 0:00:01[K     |▋                               | 20 kB 29.3 MB/s eta 0:00:01[K     |█                               | 30 kB 34.3 MB/s eta 0:00:01[K     |█▎                              | 40 kB 24.9 MB/s eta 0:00:01[K     |█▋                              | 51 kB 18.4 MB/s eta 0:00:01[K     |██                              | 61 kB 20.9 MB/s eta 0:00:01[K     |██▎                             | 71 kB 21.2 MB/s eta 0:00:01[K     |██▋                             | 81 kB 22.4 MB/s eta 0:00:01[K     |███                             | 92 kB 24.3 MB/s eta 0:00:01[K     |███▏                            | 102 kB 26.2 MB/s eta 0:00:01[K     |███▌                            | 112 kB 26.2 MB/s eta 0:00:01[K     |███▉                            | 122 kB 26.2 MB/s eta 0:00:01[K     |████▏                           | 133 kB 26.2 MB/s eta 0:00:01[K    

In [103]:
# import pandas for data wrangling
import pandas as pd
# import numpy for vectorize data manipulation
import numpy as np
# import matplotlib.pyplot module for data visualization
import matplotlib.pyplot as plt
# import seaborn for data visualization
import seaborn as sns
# import scipy for certain statistical function
from scipy import stats

# import train and test split method from scikit-learn
from sklearn.model_selection import train_test_split
# import metrics method for model evaluation
import sklearn.metrics as metrics
# import random forest classifier
from sklearn.ensemble import RandomForestClassifier
# import multi-layer perceptron
from sklearn.neural_network import MLPClassifier
# import decision tree model as surrogate model
from sklearn.tree import DecisionTreeClassifier
# import tree module
from sklearn import tree

# import xgboost classifier
from xgboost import XGBClassifier

# import dalex to explain complex model
import dalex as dx

# load statsmodel module 
import statsmodels.api as sm
import statsmodels.formula.api as smf

# load scikit-plot modules
import scikitplot as skplt

# load shap package for shap explanation
import shap

# load eli5
import eli5

# load LimeTabularExplainer for LIME method
from lime.lime_tabular import LimeTabularExplainer 

In [104]:
# load dataset
fraud_data = pd.read_csv("https://raw.githubusercontent.com/hadimaster65555/dataset_for_teaching/main/dataset/car_insurance_fraud_dataset/insuranceFraud.csv")

In [105]:
fraud_data.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,10/17/2014,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,6/27/2006,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,9/6/2000,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,5/25/1990,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,6/6/2014,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [106]:
fraud_data[fraud_data.columns[19:32]]

Unnamed: 0,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount
0,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,5,1,YES,1,2,YES,71610
1,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,8,1,?,0,0,?,5070
2,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,7,3,NO,2,3,NO,34650
3,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,5,1,?,1,2,NO,63400
4,?,Minor Damage,,NY,Arlington,3041 3rd Ave,20,1,NO,0,1,NO,6500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Front Collision,Minor Damage,Fire,NC,Northbrook,6045 Andromedia St,20,1,YES,0,1,?,87200
996,Rear Collision,Major Damage,Fire,SC,Northbend,3092 Texas Drive,23,1,YES,2,3,?,108480
997,Side Collision,Minor Damage,Police,NC,Arlington,7629 5th St,4,3,?,2,3,YES,67500
998,Rear Collision,Major Damage,Other,NY,Arlington,6128 Elm Lane,2,1,?,0,1,YES,46980


In [107]:
# check churn_data data structure and its types
fraud_data.dtypes

months_as_customer               int64
age                              int64
policy_number                    int64
policy_bind_date                object
policy_state                    object
policy_csl                      object
policy_deductable                int64
policy_annual_premium          float64
umbrella_limit                   int64
insured_zip                      int64
insured_sex                     object
insured_education_level         object
insured_occupation              object
insured_hobbies                 object
insured_relationship            object
capital-gains                    int64
capital-loss                     int64
incident_date                   object
incident_type                   object
collision_type                  object
incident_severity               object
authorities_contacted           object
incident_state                  object
incident_city                   object
incident_location               object
incident_hour_of_the_day 

In [108]:
# check data dimension
fraud_data.shape

(1000, 39)

In [109]:
# check null values inside dataset
fraud_data.isna().sum()

months_as_customer             0
age                            0
policy_number                  0
policy_bind_date               0
policy_state                   0
policy_csl                     0
policy_deductable              0
policy_annual_premium          0
umbrella_limit                 0
insured_zip                    0
insured_sex                    0
insured_education_level        0
insured_occupation             0
insured_hobbies                0
insured_relationship           0
capital-gains                  0
capital-loss                   0
incident_date                  0
incident_type                  0
collision_type                 0
incident_severity              0
authorities_contacted          0
incident_state                 0
incident_city                  0
incident_location              0
incident_hour_of_the_day       0
number_of_vehicles_involved    0
property_damage                0
bodily_injuries                0
witnesses                      0
police_rep

Soal no 1. Identifikasi berapa banyak nilai null yang terdapat di dalam dataset
Jawab : Sebelumnya Peneliti sudah mengambil data asuransi digithub, didalam data tersebut tidak ada data yang null akan tetapi dari variable yang dilihat ada ketidak cocokan nilai dari varibale (berbeda) maka dari itu peneliti ingin mengetahui data mana sih yang salah tidak tepat didapatlah seperti dibawah ini 
untuk variable data police_report_available terdapat nilai yang berisi ? sebanyak 343, untuk variable data umbrella_limit terdapat nilai yang berisi 0 sebanyak 798, untuk variable data capital gains terdapat nilai yang berisi 0 sebanyak 508, untuk variable data capital loss terdapat nilai yang berisi 0 sebanyak 475, untuk variable data collision terdapat nilai yang berisi ? sebanyak 178, untuk variable data prpoperty damage terdapat nilai yang berisi ? sebanyak 360.

In [110]:
Police =(fraud_data['police_report_available'] == '?').sum()
Police

343

In [111]:
umbrella =(fraud_data['umbrella_limit'] == 0 ).sum()
umbrella

798

In [112]:
capital =(fraud_data['capital-gains'] == 0 ).sum()
capital

508

In [113]:
capital1 =(fraud_data['capital-loss'] == 0 ).sum()
capital1

475

In [114]:
collision =(fraud_data['collision_type'] == '?' ).sum()
collision

178

In [115]:
collision =(fraud_data['property_damage'] == '?' ).sum()
collision

360

2. Lakukan treatment pada kolom yang bernilai NA atau NULL. Berikan alasan anda mengapa anda melakukan treatment tertentu pada nilai NA tersebut
Jawab : Peneliti ingin menghapus nilai2 yang tidak sesuai atau salah dalam pengisiannya, karena akan berpengaruh pada analisa selanjutnya, maka dari itu perlu menghapus baris yang dirasa kurang sesuai pada variable2 tertentu seperti berikut.

In [116]:
baru = fraud_data[ fraud_data['umbrella_limit'] == 0 ].index
fraud_data.drop(baru , inplace=True)


In [117]:
baru = fraud_data[ fraud_data['police_report_available'] == '?' ].index
fraud_data.drop(baru , inplace=True)


In [118]:
baru = fraud_data[ fraud_data['capital-gains'] == 0 ].index
fraud_data.drop(baru , inplace=True)

In [119]:
baru = fraud_data[ fraud_data['capital-loss'] == 0 ].index
fraud_data.drop(baru , inplace=True)

In [120]:
baru = fraud_data[ fraud_data['collision_type'] == '?' ].index
fraud_data.drop(baru , inplace=True)

In [121]:
baru = fraud_data[ fraud_data['property_damage'] == '?' ].index
fraud_data.drop(baru , inplace=True)
fraud_data.head(10)

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
35,147,33,129872,8/8/2010,OH,100/300,1000,1334.15,6000000,479224,...,0,YES,53100,10620,5310,37170,Mercedes,C300,1995,Y
66,107,31,356590,8/17/2011,IN,250/500,500,1239.22,7000000,476458,...,1,NO,89700,13800,13800,62100,Audi,A5,2009,Y
125,206,39,965768,7/27/2014,IN,250/500,1000,1302.4,6000000,603948,...,3,YES,36300,3300,9900,23100,Ford,Escape,2013,N
212,335,50,565564,2/7/2007,OH,100/300,1000,1538.26,6000000,615346,...,3,YES,34320,8580,4290,21450,Volkswagen,Passat,2009,N
231,298,46,667021,5/2/2007,OH,500/1000,1000,1138.42,6000000,477678,...,3,YES,33550,3050,6100,24400,Volkswagen,Passat,2005,N
346,429,56,804410,12/12/1998,OH,250/500,1000,1127.89,6000000,460722,...,0,YES,39480,6580,6580,26320,Suburu,Forrestor,2002,N
394,157,31,121439,8/2/1990,IN,500/1000,500,1257.83,7000000,458622,...,2,NO,47700,4770,9540,33390,Accura,TL,2011,Y
415,108,32,439828,9/7/2006,OH,500/1000,2000,1257.0,4000000,616341,...,3,NO,61270,5570,11140,44560,Suburu,Legacy,1999,N
471,195,38,238412,5/18/1993,IL,500/1000,2000,1294.93,6000000,477356,...,2,NO,64620,7180,0,57440,Dodge,Neon,2003,N
561,272,41,337158,4/8/1991,OH,250/500,2000,945.73,5000000,435663,...,0,NO,84100,16820,8410,58870,Ford,Escape,2009,Y


In [122]:
# Peneliti ingin mengubah Variable fraud_reported menjadi kategori angka
fraud_data['fraud_reported'] = np.where((fraud_data.fraud_reported == 'Y'),1 ,0)
fraud_data

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
35,147,33,129872,8/8/2010,OH,100/300,1000,1334.15,6000000,479224,...,0,YES,53100,10620,5310,37170,Mercedes,C300,1995,1
66,107,31,356590,8/17/2011,IN,250/500,500,1239.22,7000000,476458,...,1,NO,89700,13800,13800,62100,Audi,A5,2009,1
125,206,39,965768,7/27/2014,IN,250/500,1000,1302.4,6000000,603948,...,3,YES,36300,3300,9900,23100,Ford,Escape,2013,0
212,335,50,565564,2/7/2007,OH,100/300,1000,1538.26,6000000,615346,...,3,YES,34320,8580,4290,21450,Volkswagen,Passat,2009,0
231,298,46,667021,5/2/2007,OH,500/1000,1000,1138.42,6000000,477678,...,3,YES,33550,3050,6100,24400,Volkswagen,Passat,2005,0
346,429,56,804410,12/12/1998,OH,250/500,1000,1127.89,6000000,460722,...,0,YES,39480,6580,6580,26320,Suburu,Forrestor,2002,0
394,157,31,121439,8/2/1990,IN,500/1000,500,1257.83,7000000,458622,...,2,NO,47700,4770,9540,33390,Accura,TL,2011,1
415,108,32,439828,9/7/2006,OH,500/1000,2000,1257.0,4000000,616341,...,3,NO,61270,5570,11140,44560,Suburu,Legacy,1999,0
471,195,38,238412,5/18/1993,IL,500/1000,2000,1294.93,6000000,477356,...,2,NO,64620,7180,0,57440,Dodge,Neon,2003,0
561,272,41,337158,4/8/1991,OH,250/500,2000,945.73,5000000,435663,...,0,NO,84100,16820,8410,58870,Ford,Escape,2009,1


3. Analisis modeling berdasarkan data diatas
Jawab : Dalam menganalisa variable prediktor dan target peneliti menyeleksi dari variable yang sudah tersedia apakah ada pengaruh atau tidak sebelumnya yaitu dengan menggunakan korelasi.
peneliti sudah membuang data dikarenakan memiliki korelasi negatif dan rendah = months_as_customer	, age, policy_number, policy_deductable, policy_annual_premium, capital gains, capital loss, insured_zip: kode pos nasabah, insured_hobbies: hobi nasabah, authorities_contacted: pihak berwajib yang dihubungi, witness: jumlah saksi mata, auto_make, auto_model, auto_year

In [124]:
fraud_data.corr()

Unnamed: 0,months_as_customer,age,policy_number,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_year,fraud_reported
months_as_customer,1.0,0.982977,0.571218,0.111182,0.076459,0.167585,-0.295086,0.260956,0.449292,-0.335938,-0.133595,0.328981,-0.280841,-0.387001,-0.100697,-0.366944,-0.36581,-0.028927,-0.505532
age,0.982977,1.0,0.652989,0.155139,0.113079,0.142269,-0.239588,0.339371,0.38801,-0.288421,-0.163396,0.360245,-0.204967,-0.392376,-0.139291,-0.373735,-0.359858,-0.044885,-0.57178
policy_number,0.571218,0.652989,1.0,0.078224,0.113404,0.004078,0.108898,0.317875,-0.129318,-0.12863,-0.081071,0.475089,0.185876,-0.359999,-0.434536,0.146797,-0.368507,0.147856,-0.512508
policy_deductable,0.111182,0.155139,0.078224,1.0,-0.073666,-0.331416,-0.197082,0.055634,0.346985,0.569634,-0.020921,-0.246693,-0.1682,0.228564,0.120051,-0.338937,0.341471,-0.291559,-0.240063
policy_annual_premium,0.076459,0.113079,0.113404,-0.073666,1.0,0.243003,0.344028,0.073128,0.238379,-0.190999,0.390908,0.321425,0.040467,-0.226666,-0.373622,0.032794,-0.18821,-0.458247,-0.420163
umbrella_limit,0.167585,0.142269,0.004078,-0.331416,0.243003,1.0,-0.537016,-0.265533,-0.233199,-0.534692,0.401041,0.089562,-0.328395,-0.118534,-0.026786,-0.106141,-0.114805,0.188639,0.127827
insured_zip,-0.295086,-0.239588,0.108898,-0.197082,0.344028,-0.537016,1.0,0.28656,0.040413,0.17451,0.1767,0.108599,0.670561,-0.306997,-0.335063,0.22408,-0.349761,0.010751,-0.330278
capital-gains,0.260956,0.339371,0.317875,0.055634,0.073128,-0.265533,0.28656,1.0,0.280416,-0.158915,-0.197381,0.301588,0.00413,-0.301954,-0.081872,-0.016635,-0.353726,-0.19156,-0.235731
capital-loss,0.449292,0.38801,-0.129318,0.346985,0.238379,-0.233199,0.040413,0.280416,1.0,-0.024967,0.215026,0.248267,-0.095534,-0.398017,-0.151958,-0.46547,-0.339816,-0.33464,-0.315257
incident_hour_of_the_day,-0.335938,-0.288421,-0.12863,0.569634,-0.190999,-0.534692,0.17451,-0.158915,-0.024967,1.0,-0.297521,-0.444054,0.140782,0.303452,0.303886,0.046882,0.284591,-0.135598,0.064223


In [126]:
# remove data from dataset
data = fraud_data.drop(
    ['months_as_customer','age','policy_number', 'policy_deductable','policy_annual_premium','capital-gains','capital-loss','insured_zip', 'insured_hobbies', 'authorities_contacted','witnesses','auto_make','auto_model','auto_year',],
    axis = 1
)
# check its first 20 rows
data.head(20)

Unnamed: 0,policy_bind_date,policy_state,policy_csl,umbrella_limit,insured_sex,insured_education_level,insured_occupation,insured_relationship,incident_date,incident_type,...,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,fraud_reported
35,8/8/2010,OH,100/300,6000000,MALE,High School,craft-repair,not-in-family,1/24/2015,Single Vehicle Collision,...,15,1,YES,2,YES,53100,10620,5310,37170,1
66,8/17/2011,IN,250/500,7000000,FEMALE,High School,tech-support,not-in-family,1/30/2015,Single Vehicle Collision,...,12,1,YES,0,NO,89700,13800,13800,62100,1
125,7/27/2014,IN,250/500,6000000,MALE,JD,craft-repair,unmarried,2/17/2015,Multi-vehicle Collision,...,12,3,NO,2,YES,36300,3300,9900,23100,0
212,2/7/2007,OH,100/300,6000000,MALE,High School,sales,other-relative,1/24/2015,Multi-vehicle Collision,...,12,3,YES,2,YES,34320,8580,4290,21450,0
231,5/2/2007,OH,500/1000,6000000,MALE,JD,prof-specialty,own-child,2/16/2015,Single Vehicle Collision,...,18,1,NO,2,YES,33550,3050,6100,24400,0
346,12/12/1998,OH,250/500,6000000,MALE,Associate,machine-op-inspct,own-child,1/28/2015,Single Vehicle Collision,...,0,1,YES,2,YES,39480,6580,6580,26320,0
394,8/2/1990,IN,500/1000,7000000,MALE,High School,farming-fishing,own-child,2/14/2015,Multi-vehicle Collision,...,2,4,NO,2,NO,47700,4770,9540,33390,1
415,9/7/2006,OH,500/1000,4000000,FEMALE,High School,machine-op-inspct,unmarried,1/11/2015,Single Vehicle Collision,...,23,1,NO,1,NO,61270,5570,11140,44560,0
471,5/18/1993,IL,500/1000,6000000,MALE,MD,tech-support,unmarried,2/14/2015,Multi-vehicle Collision,...,12,3,YES,1,NO,64620,7180,0,57440,0
561,4/8/1991,OH,250/500,5000000,MALE,MD,protective-serv,wife,2/4/2015,Single Vehicle Collision,...,23,1,NO,0,NO,84100,16820,8410,58870,1


In [129]:
# typecasting date data to datetime
data['policy_bind_date']= data.policy_bind_date.astype('datetime64')
data['incident_date']= data.incident_date.astype('datetime64')
data.head()

Unnamed: 0,policy_bind_date,policy_state,policy_csl,umbrella_limit,insured_sex,insured_education_level,insured_occupation,insured_relationship,incident_date,incident_type,...,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,fraud_reported
35,2010-08-08,OH,100/300,6000000,MALE,High School,craft-repair,not-in-family,2015-01-24,Single Vehicle Collision,...,15,1,YES,2,YES,53100,10620,5310,37170,1
66,2011-08-17,IN,250/500,7000000,FEMALE,High School,tech-support,not-in-family,2015-01-30,Single Vehicle Collision,...,12,1,YES,0,NO,89700,13800,13800,62100,1
125,2014-07-27,IN,250/500,6000000,MALE,JD,craft-repair,unmarried,2015-02-17,Multi-vehicle Collision,...,12,3,NO,2,YES,36300,3300,9900,23100,0
212,2007-02-07,OH,100/300,6000000,MALE,High School,sales,other-relative,2015-01-24,Multi-vehicle Collision,...,12,3,YES,2,YES,34320,8580,4290,21450,0
231,2007-05-02,OH,500/1000,6000000,MALE,JD,prof-specialty,own-child,2015-02-16,Single Vehicle Collision,...,18,1,NO,2,YES,33550,3050,6100,24400,0


4. Buatlah model machine learning dengan regresi logistik dan KNN sebagai baseline dari model, lalu random forest dan XgBoost sebagai model lanjutan
Jawab : Sebelumnya Peneliti ingin menggunakan model regresi logistik terlebih dahulu untuk melihat R-Squarenya

In [130]:
# assign 'fraud_reported' column to y 
y = fraud_data['fraud_reported'].values
# assign all columns except 'fraud_reported' to X
X = data.drop(['fraud_reported'], axis = 1).values

In [131]:
# check y dimension
y.shape

(14,)

In [132]:
# check X dimension
X.shape

(14, 24)

In [133]:
# split data to train and test data
# with test size is 30% of overall data
# use stratified sampling
# and set RNG to 1000
X_train_fraud_reported, X_test_fraud_reported, y_train_fraud_reported, y_test_fraud_reported = train_test_split(
    X,
    y,
    test_size = 0.3,
    stratify = y,
    random_state = 1000
)

In [134]:
# Check X_train_Response dimension
X_train_fraud_reported.shape

(9, 24)

In [135]:
# check y_train_Response class distribution
np.unique(y_train_fraud_reported, return_counts=True)[1]

array([6, 3])

In [136]:
# check X_test_Response dimension
X_test_fraud_reported.shape

(5, 24)

In [137]:
# check y_test_Response class distribution
np.unique(y_test_fraud_reported, return_counts=True)[1]

array([3, 2])

In [138]:
# Check y_train_sales dimension
y_train_fraud_reported.shape

(9,)

In [139]:
# Check y_test_sales dimension
y_test_fraud_reported.shape

(5,)

In [140]:
# define model
X_train_int = sm.add_constant(X_train_fraud_reported)
# fit regression model to data
logistic_reg_sm = sm.Logit(y_train_fraud_reported, X_train_int).fit()

TypeError: ignored

* Masih eror dalam analisa pengujian regresi logistik, dari peneliti berpikir eror karena data variable dependent masih dalam bentuk kata bukan angka seharusnya dirubah terlebih dahulu.