# **Import Libraries**

In [204]:
import pandas as pd
import numpy as np
#for data exploration
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
#feature selection 
from scipy import stats
from sklearn.feature_selection import chi2


**Dataset**

In [205]:
df = pd.read_csv('insurance_claims.csv')

#Convert the column containing date values to a datetime format
df['incident_date'] = pd.to_datetime(df['incident_date'])

# Filter the data based on a date before February 20, 2021
df_model = df[df['incident_date'] <= '2015-02-20']
df_actual = df[df['incident_date'] > '2015-02-20']


df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,10/17/2014,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,6/27/2006,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,9/6/2000,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,5/25/1990,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,6/6/2014,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [206]:
df_model.shape

(846, 39)

In [207]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 39 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   months_as_customer           1000 non-null   int64         
 1   age                          1000 non-null   int64         
 2   policy_number                1000 non-null   int64         
 3   policy_bind_date             1000 non-null   object        
 4   policy_state                 1000 non-null   object        
 5   policy_csl                   1000 non-null   object        
 6   policy_deductable            1000 non-null   int64         
 7   policy_annual_premium        1000 non-null   float64       
 8   umbrella_limit               1000 non-null   int64         
 9   insured_zip                  1000 non-null   int64         
 10  insured_sex                  1000 non-null   object        
 11  insured_education_level      1000 non-null  

In [208]:
df.describe()

Unnamed: 0,months_as_customer,age,policy_number,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_year
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,203.954,38.948,546238.648,1136.0,1256.40615,1101000.0,501214.488,25126.1,-26793.7,11.644,1.839,0.992,1.487,52761.94,7433.42,7399.57,37928.95,2005.103
std,115.113174,9.140287,257063.005276,611.864673,244.167395,2297407.0,71701.610941,27872.187708,28104.096686,6.951373,1.01888,0.820127,1.111335,26401.53319,4880.951853,4824.726179,18886.252893,6.015861
min,0.0,19.0,100804.0,500.0,433.33,-1000000.0,430104.0,0.0,-111100.0,0.0,1.0,0.0,0.0,100.0,0.0,0.0,70.0,1995.0
25%,115.75,32.0,335980.25,500.0,1089.6075,0.0,448404.5,0.0,-51500.0,6.0,1.0,0.0,1.0,41812.5,4295.0,4445.0,30292.5,2000.0
50%,199.5,38.0,533135.0,1000.0,1257.2,0.0,466445.5,0.0,-23250.0,12.0,1.0,1.0,1.0,58055.0,6775.0,6750.0,42100.0,2005.0
75%,276.25,44.0,759099.75,2000.0,1415.695,0.0,603251.0,51025.0,0.0,17.0,3.0,2.0,2.0,70592.5,11305.0,10885.0,50822.5,2010.0
max,479.0,64.0,999435.0,2000.0,2047.59,10000000.0,620962.0,100500.0,0.0,23.0,4.0,2.0,3.0,114920.0,21450.0,23670.0,79560.0,2015.0


**Data Preparation dan Data Cleaning**

In [209]:
df_clean = df.replace('?',np.NaN)


**Imputasi Data menggunakan Nilai Modus Data**

In [210]:
df_clean['collision_type'].fillna(df_clean['collision_type'].mode()[0], inplace = True)
df_clean['property_damage'].fillna(df_clean['property_damage'].mode()[0], inplace = True)
df_clean['police_report_available'].fillna(df_clean['police_report_available'].mode()[0], inplace = True)
#cari refrensinya untuk imputasi menggunakan modus


In [211]:
#mengecek nilai kosong pada data
# df_clean = df
df_clean.isnull().any().any()

False

In [212]:
df_clean

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,10/17/2014,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,6/27/2006,IN,250/500,2000,1197.22,5000000,468176,...,0,NO,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,9/6/2000,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,5/25/1990,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,6/6/2014,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3,38,941851,7/16/1991,OH,500/1000,1000,1310.80,0,431289,...,1,NO,87200,17440,8720,61040,Honda,Accord,2006,N
996,285,41,186934,1/5/2014,IL,100/300,1000,1436.79,0,608177,...,3,NO,108480,18080,18080,72320,Volkswagen,Passat,2015,N
997,130,34,918516,2/17/2003,OH,250/500,500,1383.49,3000000,442797,...,3,YES,67500,7500,7500,52500,Suburu,Impreza,1996,N
998,458,62,533940,11/18/2011,IL,500/1000,2000,1356.92,5000000,441714,...,1,YES,46980,5220,5220,36540,Audi,A5,1998,N


In [213]:
for i in df_clean.columns:
  if df_clean[i].dtype == 'object' :
    print(i, ":", df_clean[i].nunique())


policy_bind_date : 951
policy_state : 3
policy_csl : 3
insured_sex : 2
insured_education_level : 7
insured_occupation : 14
insured_hobbies : 20
insured_relationship : 6
incident_type : 4
collision_type : 3
incident_severity : 4
authorities_contacted : 5
incident_state : 7
incident_city : 7
incident_location : 1000
property_damage : 2
police_report_available : 2
auto_make : 14
auto_model : 39
fraud_reported : 2


In [214]:
df_new = df_clean.drop(['policy_state', 'policy_csl', 'incident_state', 'incident_city', 'incident_location'],axis=1)
df_new['fraud_reported'] = df_new['fraud_reported'].str.replace('Y', '1')
df_new['fraud_reported'] = df_new['fraud_reported'].str.replace('N', '0')
df_new['fraud_reported'] = df_new['fraud_reported'].astype(int)
df_new['fraud_reported'].unique()

array([1, 0])

# **Exploratory Data Analysis**

In [215]:
# sns.countplot(df_new['fraud_reported'])

In [216]:
df_new.fraud_reported.value_counts()

0    753
1    247
Name: fraud_reported, dtype: int64

In [217]:
# def vis_data(df, x, y = 'fraud_reported', graph = 'countplot'):
#     if graph == 'hist':
#         fig = px.histogram(df, x = x)
#         fig.update_layout(title = 'Distribution of {x}'.format(x = x))
#         fig.show()
#     elif graph == 'bar':
#       fig = px.bar(df, x = x, y = y)
#       fig.update_layout(title = '{x} vs. {y}'.format(x = x, y = y))
#       fig.show()
#     elif graph == 'countplot':
#       a = df.groupby([x,y]).count()
#       a.reset_index(inplace = True)
#       no_fraud = a[a['fraud_reported'] == 0]
#       yes_fraud = a[a['fraud_reported'] == 1]
#       trace1 = go.Bar(x = no_fraud[x], y = no_fraud['policy_number'], name = 'No Fraud')
#       trace2 = go.Bar(x = yes_fraud[x], y = yes_fraud['policy_number'], name = 'Fraud')
#       fig = go.Figure(data = [trace1, trace2])
#       fig.update_layout(title = '{x} vs. {y}'.format(x=x, y = y))
#       fig.update_layout(barmode = 'group')
#       fig.show()

In [218]:
# vis_data(df_new, 'insured_sex')

In [219]:
# vis_data(df_new, 'insured_education_level')

In [220]:
# vis_data(df_new, 'insured_occupation')

In [221]:
# vis_data(df_new, 'insured_relationship')

In [222]:
# vis_data(df_new, 'incident_type')

In [223]:
# vis_data(df_new, 'collision_type')

In [224]:
# vis_data(df_new, 'incident_severity')

high amount of repair cost which will be incurred by the insurer due to major damage.

In [225]:
# vis_data(df_new, 'authorities_contacted')

In [226]:
# vis_data(df_new, 'insured_hobbies')

In [227]:
hobbies = df_new['insured_hobbies'].unique()
for hobby in hobbies:
  if (hobby != 'chess') & (hobby != 'cross-fit'):
    df_new['insured_hobbies'] = df_new['insured_hobbies'].str.replace(hobby, 'other')

df_new['insured_hobbies'].unique()

array(['other', 'chess', 'cross-fit'], dtype=object)

In [228]:
# vis_data(df_new, 'age', 'anything', 'hist')

In [229]:
# df_new['age'].describe()

In [230]:
bin_labels = ['15-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65']
bins = [15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65]

df_new['age_groups'] = pd.cut(df_new['age'], bins = bins, labels = bin_labels, include_lowest = True)

In [231]:
# vis_data(df_new, 'age_group')

In [232]:
# vis_data(df_new, 'months_as_customer', 'y', 'hist')

In [233]:
# df_new['months_as_customer'].describe()

In [234]:
bins = [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
bin_labels = ['0-50','51-100','101-150','151-200','201-250','251-300','301-350','351-400','401-450','451-500']

df_new['months_as_customer_groups'] = pd.cut(df_new['months_as_customer'], bins = 10, labels = bin_labels, include_lowest= True)

In [235]:
# vis_data(df_new, 'months_as_customer_groups')

In [236]:
# vis_data(df_new, 'auto_make')

In [237]:
# vis_data(df_new, 'number_of_vehicles_involved')

In [238]:
# vis_data(df_new, 'witnesses')

In [239]:
# vis_data(df_new, 'bodily_injuries')

In [240]:
# vis_data(df_new, 'total_claim_amount', 'y', 'hist')


In [241]:
# df_new['total_claim_amount'].describe()

In [242]:
# bins = [0, 20000, 40000, 60000, 80000, 100000, 120000]
# bin_labels = ['0k-20k','20k-40k','40k-60k','60k-80k','80k-100k','100k-120k']

# df_new['total_claim_amount_groups'] = pd.cut(df_new['total_claim_amount'], bins = 6, labels = bin_labels, include_lowest= True)

In [243]:
# vis_data(df_new,'total_claim_amount_groups')

In [244]:
# vis_data(df_new, 'incident_hour_of_the_day')

In [245]:
# vis_data(df_new, 'auto_year')


In [246]:
# vis_data(df_new,'policy_annual_premium', 'y', 'hist')

In [247]:
# df['policy_annual_premium'].describe()

In [248]:
bins = list(np.linspace(0,2500, 6, dtype = int))
bin_labels = ['very low', 'low', 'medium', 'high', 'very high']

df_new['policy_annual_premium_groups'] = pd.cut(df_new['policy_annual_premium'], bins = bins, labels=bin_labels)

In [249]:
# vis_data(df_new, 'policy_annual_premium_groups')

In [250]:
# vis_data(df_new,'policy_deductable', 'y', 'hist')

In [251]:
# df['policy_deductable'].describe()

In [252]:
bins = list(np.linspace(0,2000, 5, dtype = int))
bin_labels = ['0-500', '501-1000', '1001-1500', '1501-2000']

df_new['policy_deductable_groups'] = pd.cut(df_new['policy_deductable'], bins = bins, labels = bin_labels)


In [253]:
# vis_data(df_new, 'policy_deductable_group')

In [254]:
# vis_data(df_new, 'property_damage')

In [255]:
# vis_data(df_new, 'police_report_available')

In [256]:
# vis_data(df_new, 'umbrella_limit')

In [257]:
#menghapus kolom yang sudah di binning
df_new = df_new.drop(['age', 'months_as_customer', 'policy_deductable', 'policy_annual_premium'], axis = 1)
df_new.columns

Index(['policy_number', 'policy_bind_date', 'umbrella_limit', 'insured_zip',
       'insured_sex', 'insured_education_level', 'insured_occupation',
       'insured_hobbies', 'insured_relationship', 'capital-gains',
       'capital-loss', 'incident_date', 'incident_type', 'collision_type',
       'incident_severity', 'authorities_contacted',
       'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'property_damage', 'bodily_injuries', 'witnesses',
       'police_report_available', 'total_claim_amount', 'injury_claim',
       'property_claim', 'vehicle_claim', 'auto_make', 'auto_model',
       'auto_year', 'fraud_reported', 'age_groups',
       'months_as_customer_groups', 'policy_annual_premium_groups',
       'policy_deductable_groups'],
      dtype='object')

In [258]:
#berdasarkan EDA, beberapa kolom akan dihilangkan
df1 = df_new.drop(['policy_number', 'umbrella_limit', 'insured_zip','policy_bind_date'],axis=1)

In [259]:
corr_matrix = df1.corr()

fig = go.Figure(data = go.Heatmap(
                                z = corr_matrix.values,
                                x = list(corr_matrix.columns),
                                y = list(corr_matrix.index)))

fig.update_layout(title = 'Correlation')

fig.show()

In [260]:
#akan dibuang injury_claim, property_claim, dan vehicle_claim karena memiliki korelasi yang tinggi
df1 = df1.drop(['injury_claim','property_claim', 'vehicle_claim'],axis=1)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   insured_sex                   1000 non-null   object        
 1   insured_education_level       1000 non-null   object        
 2   insured_occupation            1000 non-null   object        
 3   insured_hobbies               1000 non-null   object        
 4   insured_relationship          1000 non-null   object        
 5   capital-gains                 1000 non-null   int64         
 6   capital-loss                  1000 non-null   int64         
 7   incident_date                 1000 non-null   datetime64[ns]
 8   incident_type                 1000 non-null   object        
 9   collision_type                1000 non-null   object        
 10  incident_severity             1000 non-null   object        
 11  authorities_contacted         1

In [261]:
df1.head()

Unnamed: 0,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital-gains,capital-loss,incident_date,incident_type,collision_type,...,police_report_available,total_claim_amount,auto_make,auto_model,auto_year,fraud_reported,age_groups,months_as_customer_groups,policy_annual_premium_groups,policy_deductable_groups
0,MALE,MD,craft-repair,other,husband,53300,0,2015-01-25,Single Vehicle Collision,Side Collision,...,YES,71610,Saab,92x,2004,1,46-50,301-350,medium,501-1000
1,MALE,MD,machine-op-inspct,other,other-relative,0,0,2015-01-21,Vehicle Theft,Rear Collision,...,NO,5070,Mercedes,E400,2007,1,41-45,201-250,medium,1501-2000
2,FEMALE,PhD,sales,other,own-child,35100,0,2015-02-22,Multi-vehicle Collision,Rear Collision,...,NO,34650,Dodge,RAM,2007,0,26-30,101-150,medium,1501-2000
3,FEMALE,PhD,armed-forces,other,unmarried,48900,-62400,2015-01-10,Single Vehicle Collision,Front Collision,...,NO,63400,Chevrolet,Tahoe,2014,1,41-45,251-300,medium,1501-2000
4,MALE,Associate,sales,other,unmarried,66000,-46000,2015-02-17,Vehicle Theft,Rear Collision,...,NO,6500,Accura,RSX,2009,0,41-45,201-250,high,501-1000


In [262]:
# Define the number of rows to include in the head
num_rows = 5

# Save the head of the DataFrame as a CSV file
df1.head(num_rows).to_csv('head.csv', index=False)

 **Categorical Encoding**

In [263]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for i in df1.columns:
  if df1[i].dtype == 'category' :
    df1[i] = label_encoder.fit_transform(df_new[i])
    
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore')
for i in df1.columns:
  if df1[i].dtype == object :
    # Fit the encoder to the column data
    ohe.fit(df1[[i]])
    
    # Reshape the column data to a 2-dimensional array
    col_data = df1[i].values.reshape(-1, 1)
    
    # Transform the column data and add it to the DataFrame as new columns
    new_cols = ohe.transform(col_data).toarray()
    for j, col_name in enumerate(ohe.get_feature_names_out([i])):
        df1[col_name] = new_cols[:, j]
    
# Drop the original object type column
df1 = df1.drop(df1.select_dtypes(include=['object']), axis=1)

# Filter the data based on a date before February 20, 2021
df_mod = df1[df1['incident_date'] <= '2015-02-20']
df_mod['incident_date'] = label_encoder.fit_transform(df_mod['incident_date'])
# df_mod = df_mod.drop('incident_date',axis=1)
df_act = df1[df1['incident_date'] > '2015-02-20']
df_act['incident_date'] = label_encoder.fit_transform(df_act['incident_date'])
# df_act = df_act.drop('incident_date',axis=1)

df1.head()


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but OneHotEncoder was fitted with feature names


X does not have valid feature names, but O

Unnamed: 0,capital-gains,capital-loss,incident_date,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,total_claim_amount,auto_year,fraud_reported,...,auto_model_Pathfinder,auto_model_RAM,auto_model_RSX,auto_model_Silverado,auto_model_TL,auto_model_Tahoe,auto_model_Ultima,auto_model_Wrangler,auto_model_X5,auto_model_X6
0,53300,0,2015-01-25,5,1,1,2,71610,2004,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,2015-01-21,8,1,0,0,5070,2007,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,35100,0,2015-02-22,7,3,2,3,34650,2007,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48900,-62400,2015-01-10,5,1,1,2,63400,2014,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,66000,-46000,2015-02-17,20,1,0,1,6500,2009,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Chi-Squared Test

In [264]:
# from scipy.stats import chi2_contingency
# X = df1.drop(['fraud_reported','capital-gains','capital-loss'],axis=1)
# y = df1['fraud_reported']

# # Perform chi-square test on OHE-encoded features
# ohe_features = [col for col in X.columns if X[col].nunique() > 2]
# ohe_pvals = []
# for feature in ohe_features:
#     contingency_table = pd.crosstab(X[feature], y)
#     chi2, pval, dof, expected = chi2_contingency(contingency_table)
#     ohe_pvals.append(pval)

        
# # Perform chi-square test on label-encoded features
# label_features = [col for col in X.columns if X[col].nunique() == 2]
# label_pvals = []
# for feature in label_features:
#     contingency_table = pd.crosstab(X[feature], y)
#     chi2, pval, dof, expected = chi2_contingency(contingency_table)
#     label_pvals.append(pval)
        
# # Combine p-values for all features
# all_pvals = ohe_pvals + label_pvals

# # Select significant features based on p-values
# sig_features = [X.columns[i] for i in range(len(all_pvals)) if all_pvals[i] < 0.05]

# print(sig_features)

In [265]:
# #membuang kolom-kolom dgn nilai p-value yang tinggi
# for i in X.columns:
#     if i not in sig_features:
#         df1 = df1.drop([i],axis=1)
# df1.info()

**Feature Scaling**


In [266]:
features = []
for col in df_mod.columns:
  if col != 'fraud_reported':
    features.append(col)

target = 'fraud_reported'

X_mod= df_mod[features]
X_act= df_act[features]
y = df_mod[target]
y_act = df_act[target]

In [267]:
# from sklearn.preprocessing import StandardScaler

# sc = StandardScaler()
# X_mod = sc.fit_transform(X_mod)
# X_act = sc.fit_transform(X_act)

# **Modelling**


# Modelling AdaBoost

In [268]:
#Importing AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
#train test split
from sklearn.model_selection import train_test_split
#metrics
from sklearn import metrics
#Library KFold CV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [269]:
i = 12428

In [270]:
# %%time 
# Acc_AB=[]
# F1_AB=[]
# for i in range(1,1000):
#   train_x,test_x,train_y,test_y = train_test_split(X_mod,y,random_state=i,stratify=y,test_size=0.3,train_size=0.7)
#   ab_clf = AdaBoostClassifier()
#   model = ab_clf.fit(train_x,train_y)
#   y_pred = model.predict(test_x)
#   y_pred_proba = model.predict_proba(test_x)
#   Accuracy = metrics.accuracy_score(test_y,y_pred)
#   F1_score = metrics.f1_score(test_y,y_pred)
#   Acc_AB.append(Accuracy)
#   F1_AB.append(F1_score)

# confusion_matrix = metrics.confusion_matrix(test_y,y_pred)
# confusion_matrix

In [271]:
# sns.histplot(data=Acc_AB)

In [272]:
# sns.histplot(data=F1_AB)

Hyperparameter Tuning AdaBoost

In [273]:
# %%time

# train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=1,stratify=y,test_size=0.3,train_size=0.7)
# clf = AdaBoostClassifier()
# grid = dict()
# base_estimator = []
# base_estimator_dict = {}
# for i in range(1,100,10):
#   base_estimator.append(DecisionTreeClassifier(max_depth=i))
#   base_estimator_dict['DecisionTreeClassifier(max_depth='+ str(i)+')'] = i   
    
# grid['n_estimators'] = range(1,1000,100)
# grid['learning_rate'] = [0.1, 0.01, 0.05]
# grid['base_estimator']= base_estimator

# cv=StratifiedKFold(n_splits=10)
# grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1')
# grid_result = grid_search.fit(train_x,train_y)


# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# grid_result.best_params_

In [274]:
# # Extract the relevant information from cv_results_
# results_dict = {'n_estimators': [], 'learning_rate': [],'base_estimator': [], 'mean_test_score': []}

# for i, params in enumerate(grid_search.cv_results_['params']):
#     results_dict['n_estimators'].append(params['n_estimators'])
#     results_dict['learning_rate'].append(params['learning_rate'])
#     results_dict['base_estimator'].append(params['base_estimator'])
#     results_dict['mean_test_score'].append(grid_search.cv_results_['mean_test_score'][i])
    

# # Create a DataFrame from the extracted information
# results_df_AB = pd.DataFrame(results_dict)

# results_df_AB.info()

In [275]:
# results_df_AB['base_estimator'] =results_df_AB['base_estimator'].astype(str).map(base_estimator_dict)
# results_df_AB

In [276]:
# pd.pivot_table(results_df_AB, values='mean_test_score', index=['n_estimators','learning_rate'], columns='base_estimator', aggfunc=np.mean)

In [277]:
# %%time

# train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=1,stratify=y,test_size=0.3,train_size=0.7)
# clf = AdaBoostClassifier()
# grid = dict()
# base_estimator = [DecisionTreeClassifier(max_depth=1)]
# base_estimator_dict = {}
# for i in range(1,6,1):
#   base_estimator.append(DecisionTreeClassifier(max_depth=i))
#   base_estimator_dict['DecisionTreeClassifier(max_depth='+ str(i)+')'] = i   
    
# grid['n_estimators'] = range(1,11,1)
# grid['learning_rate'] = np.arange(0.01,0.3,0.01)
# grid['base_estimator']= base_estimator

# cv=StratifiedKFold(n_splits=10)
# grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1')
# grid_result = grid_search.fit(train_x,train_y)


# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# grid_result.best_params_

In [278]:
# # Extract the relevant information from cv_results_
# results_dict = {'n_estimators': [], 'learning_rate': [],'base_estimator': [], 'mean_test_score': []}

# for i, params in enumerate(grid_search.cv_results_['params']):
#     results_dict['n_estimators'].append(params['n_estimators'])
#     results_dict['learning_rate'].append(params['learning_rate'])
#     results_dict['base_estimator'].append(params['base_estimator'])
#     results_dict['mean_test_score'].append(grid_search.cv_results_['mean_test_score'][i])
    

# # Create a DataFrame from the extracted information
# results_df_AB = pd.DataFrame(results_dict)

# results_df_AB['base_estimator'] =results_df_AB['base_estimator'].astype(str).map(base_estimator_dict)
# results_df_AB

In [279]:
i = 12428

In [280]:
Acc_AB_tuned=[]
F1_AB_tuned=[]
grid_result= {'base_estimator': DecisionTreeClassifier(max_depth=2),'learning_rate': 0.23,
                            'n_estimators': 8}
# for i in range(1,1000):
train_x,test_x,train_y,test_y = train_test_split(X_mod,y,random_state=i,stratify=y,test_size=0.3,train_size=0.7)
#   ab_clf = AdaBoostClassifier(base_estimator = grid_result.best_params_['base_estimator'],learning_rate = grid_result.best_params_['learning_rate'],n_estimators =grid_result.best_params_['n_estimators'])
ab_clf = AdaBoostClassifier(base_estimator = grid_result['base_estimator'],learning_rate = grid_result['learning_rate'],n_estimators =grid_result['n_estimators'])
model = ab_clf.fit(train_x,train_y)
y_pred = model.predict(test_x)
y_pred_proba = model.predict_proba(test_x)
Accuracy = metrics.accuracy_score(test_y,y_pred)
F1_score = metrics.f1_score(test_y,y_pred)
Acc_AB_tuned.append(Accuracy)
F1_AB_tuned.append(F1_score)

confusion_matrix_tuned = metrics.confusion_matrix(test_y,y_pred)
confusion_matrix_tuned

array([[168,  20],
       [ 12,  54]], dtype=int64)

In [281]:
# # Add a new column to the DataFrame with the probabilities(AdaBoost)
# fraud_prob = model.predict_proba(X_mod)
# fraud_prob_act = model.predict_proba(X_act)
# df_model['fraud_prob'] = pd.Series(fraud_prob[:, 1], index=df_model.index)
# df_actual['fraud_prob'] = pd.Series(fraud_prob_act[:, 1], index=df_actual.index)
# # Save the updated DataFrame as a new CSV file
# df_model.to_csv('insurance_claims_with_probs(846)(AdaBoost).csv', index=False)
# df_actual.to_csv('insurance_claims_with_probs(154)(AdaBoost).csv', index=False)

In [282]:
# sns.histplot(data=Acc_AB_tuned)

In [283]:
# sns.histplot(data=F1_AB_tuned)

In [284]:
# AB_columns = [Acc_AB,Acc_AB_tuned,F1_AB,F1_AB_tuned]
# AB = pd.DataFrame(AB_columns).transpose()
# AB.columns = ['Acc_AB','Acc_AB_tuned','F1_AB','F1_AB_tuned']
# AB.describe()

Confusion Matrix AdaBoost


In [285]:
# confusion_matrix = metrics.confusion_matrix(test_y,y_pred)
# Accuracy = metrics.accuracy_score(test_y,y_pred)
# Precision = metrics.precision_score(test_y,y_pred)
# Sensitivity_recall = metrics.recall_score(test_y,y_pred)
# Specificity = metrics.recall_score(test_y,y_pred,pos_label=0)
# F1_score = metrics.f1_score(test_y,y_pred)
# print('Accuracy:', Accuracy)
# print('Precision:', Precision)
# print('Sensitivity:', Sensitivity_recall)
# print('Specificity:', Specificity)
# print('F1-score:', F1_score)
# confusion_matrix



# Modelling GBDT

In [286]:
#Importing GBDT Classifier
from sklearn.ensemble import GradientBoostingClassifier

In [287]:
# Acc_GBDT=[]
# F1_GBDT=[]
# for i in range(1,1000):
#   train_x,test_x,train_y,test_y = train_test_split(X_mod,y,random_state=i,stratify=y,test_size=0.3,train_size=0.7)
#   gbdt_clf = GradientBoostingClassifier()
#   model = gbdt_clf.fit(train_x,train_y)
#   y_pred = model.predict(test_x)
#   y_pred_proba = model.predict_proba(test_x)
#   Accuracy = metrics.accuracy_score(test_y,y_pred)
#   F1_score = metrics.f1_score(test_y,y_pred)
#   Acc_GBDT.append(Accuracy)
#   F1_GBDT.append(F1_score)

# confusion_matrix = metrics.confusion_matrix(test_y,y_pred)
# confusion_matrix

In [288]:
# sns.histplot(data=Acc_GBDT)

In [289]:
# sns.histplot(data=F1_GBDT)

Hyperparameter Tuning GBDT

In [290]:
# %%time

# train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=1,stratify=y,test_size=0.3,train_size=0.7)
# gbdt_clf = GradientBoostingClassifier()
# grid = dict()   
    
# grid['n_estimators'] = range(1,1000,100)
# grid['learning_rate'] = [0.1, 0.01, 0.05]
# grid['max_depth']= range(1,100,10)
# cv=StratifiedKFold(n_splits=10)
# grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1')
# grid_result = grid_search.fit(train_x,train_y)


# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# grid_result.best_params_

In [291]:
# # Extract the relevant information from cv_results_
# results_dict = {'n_estimators': [], 'learning_rate': [],'max_depth': [], 'mean_test_score': []}

# for i, params in enumerate(grid_search.cv_results_['params']):
#     results_dict['n_estimators'].append(params['n_estimators'])
#     results_dict['learning_rate'].append(params['learning_rate'])
#     results_dict['max_depth'].append(params['max_depth'])
#     results_dict['mean_test_score'].append(grid_search.cv_results_['mean_test_score'][i])
    

# # Create a DataFrame from the extracted information
# results_df_GBDT = pd.DataFrame(results_dict)

# results_df_GBDT.info()

In [292]:
# pd.pivot_table(results_df_GBDT, values='mean_test_score', index=['n_estimators','learning_rate'], columns='max_depth', aggfunc=np.mean)

In [293]:
# %%time

# train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=1,stratify=y,test_size=0.3,train_size=0.7)
# clf = GradientBoostingClassifier()
# grid = dict()   
    
# grid['n_estimators'] = range(470,480,1)
# grid['learning_rate'] = np.arange(0.01,0.51,0.01)
# grid['max_depth']= range(1,11,1)
# cv=StratifiedKFold(n_splits=10)
# grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1')
# grid_result = grid_search.fit(train_x,train_y)


# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# grid_result.best_params_

In [294]:
# # Extract the relevant information from cv_results_
# results_dict = {'n_estimators': [], 'learning_rate': [],'max_depth': [], 'mean_test_score': []}

# for i, params in enumerate(grid_search.cv_results_['params']):
#     results_dict['n_estimators'].append(params['n_estimators'])
#     results_dict['learning_rate'].append(params['learning_rate'])
#     results_dict['max_depth'].append(params['max_depth'])
#     results_dict['mean_test_score'].append(grid_search.cv_results_['mean_test_score'][i])
    

# # Create a DataFrame from the extracted information
# results_df_GBDT = pd.DataFrame(results_dict)

# results_df_GBDT.info()

In [295]:
# pd.pivot_table(results_df_GBDT, values='mean_test_score', index=['n_estimators','learning_rate'], columns='max_depth', aggfunc=np.mean)

In [296]:
Acc_GBDT_tuned=[]
F1_GBDT_tuned=[]
grid_result= {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 471}
# for i in range(1,1000):
train_x,test_x,train_y,test_y = train_test_split(X_mod,y,random_state=i,stratify=y,test_size=0.3,train_size=0.7)
#   gbdt_clf = GradientBoostingClassifier(max_depth = grid_result.best_params_['max_depth'],learning_rate = grid_result.best_params_['learning_rate'],n_estimators =grid_result.best_params_['n_estimators'])
gbdt_clf = GradientBoostingClassifier(max_depth = grid_result['max_depth'],learning_rate = grid_result['learning_rate'],n_estimators =grid_result['n_estimators'])  
model = gbdt_clf.fit(train_x,train_y)
y_pred = model.predict(test_x)
y_pred_proba = model.predict_proba(test_x)
Accuracy = metrics.accuracy_score(test_y,y_pred)
F1_score = metrics.f1_score(test_y,y_pred)
Acc_GBDT_tuned.append(Accuracy)
F1_GBDT_tuned.append(F1_score)

confusion_matrix_tuned = metrics.confusion_matrix(test_y,y_pred)
confusion_matrix_tuned

array([[168,  20],
       [ 10,  56]], dtype=int64)

In [297]:
# # Add a new column to the DataFrame with the probabilities(GBDT)
# fraud_prob = model.predict_proba(X_mod)
# fraud_prob_act = model.predict_proba(X_act)
# df_model['fraud_prob'] = pd.Series(fraud_prob[:, 1], index=df_model.index)
# df_actual['fraud_prob'] = pd.Series(fraud_prob_act[:, 1], index=df_actual.index)
# # Save the updated DataFrame as a new CSV file
# df_model.to_csv('insurance_claims_with_probs(846)(GBDT).csv', index=False)
# df_actual.to_csv('insurance_claims_with_probs(154)(GBDT).csv', index=False)

In [298]:
# sns.histplot(data=Acc_GBDT_tuned)

In [299]:
# sns.histplot(data=F1_GBDT_tuned)

In [300]:
# GBDT_columns = [Acc_GBDT,Acc_GBDT_tuned,F1_GBDT,F1_GBDT_tuned]
# GBDT = pd.DataFrame(GBDT_columns).transpose()
# GBDT.columns = ['Acc_GBDT','Acc_GBDT_tuned','F1_GBDT','F1_GBDT_tuned']
# GBDT.describe()

Confusion Matrix GBDT

In [301]:
# confusion_matrix = metrics.confusion_matrix(test_y,y_pred)
# Accuracy = metrics.accuracy_score(test_y,y_pred)
# Precision = metrics.precision_score(test_y,y_pred)
# Sensitivity_recall = metrics.recall_score(test_y,y_pred)
# Specificity = metrics.recall_score(test_y,y_pred,pos_label=0)
# F1_score = metrics.f1_score(test_y,y_pred)
# print('Accuracy:', Accuracy)
# print('Precision:', Precision)
# print('Sensitivity:', Sensitivity_recall)
# print('Specificity:', Specificity)
# print('F1-score:', F1_score)
# confusion_matrix



# Modelling XGBoost


In [302]:
#Importing XGB Classifier
from xgboost import XGBClassifier

In [303]:
# Acc_XGB=[]
# F1_XGB=[]
# for i in range(1,1000):
#   train_x,test_x,train_y,test_y = train_test_split(X_mod,y,random_state=i,stratify=y,test_size=0.3,train_size=0.7)
#   xgb_clf = XGBClassifier()
#   model = xgb_clf.fit(train_x,train_y)
#   y_pred = model.predict(test_x)
#   y_pred_proba = model.predict_proba(test_x)
#   Accuracy = metrics.accuracy_score(test_y,y_pred)
#   F1_score = metrics.f1_score(test_y,y_pred)
#   Acc_XGB.append(Accuracy)
#   F1_XGB.append(F1_score)

# confusion_matrix = metrics.confusion_matrix(test_y,y_pred)
# confusion_matrix

In [304]:
# sns.histplot(data=Acc_XGB)

In [305]:
# sns.histplot(data=F1_XGB)

Hyperparameter Tuning XGBoost

In [306]:
# %%time

# train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=1,stratify=y,test_size=0.3,train_size=0.7)
# xgb_clf = XGBClassifier()
# grid = dict()   
    
# grid['n_estimators'] = range(1,1000,100)
# grid['learning_rate'] = [0.1, 0.01, 0.05]
# grid['max_depth']= range(1,100,10)
# cv=StratifiedKFold(n_splits=10)
# grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1')
# grid_result = grid_search.fit(train_x,train_y)


# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# grid_result.best_params_

In [307]:
# # Extract the relevant information from cv_results_
# results_dict = {'n_estimators': [], 'learning_rate': [],'max_depth': [], 'mean_test_score': []}

# for i, params in enumerate(grid_search.cv_results_['params']):
#     results_dict['n_estimators'].append(params['n_estimators'])
#     results_dict['learning_rate'].append(params['learning_rate'])
#     results_dict['max_depth'].append(params['max_depth'])
#     results_dict['mean_test_score'].append(grid_search.cv_results_['mean_test_score'][i])
    

# # Create a DataFrame from the extracted information
# results_df_XGB = pd.DataFrame(results_dict)

# results_df_XGB.info()

In [308]:
# pd.pivot_table(results_df_XGB, values='mean_test_score', index=['n_estimators','learning_rate'], columns='max_depth', aggfunc=np.mean)

In [309]:
# %%time

# train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=1,stratify=y,test_size=0.3,train_size=0.7)
# clf = XGBClassifier()
# grid = dict()   
    
# grid['n_estimators'] = range(525,531,1)
# grid['learning_rate'] = np.arange(0.01,0.51,0.01)
# grid['max_depth']= range(1,6,1)
# cv=StratifiedKFold(n_splits=10)
# grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1')
# grid_result = grid_search.fit(train_x,train_y)


# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# grid_result.best_params_

In [310]:
# # Extract the relevant information from cv_results_
# results_dict = {'n_estimators': [], 'learning_rate': [],'max_depth': [], 'mean_test_score': []}

# for i, params in enumerate(grid_search.cv_results_['params']):
#     results_dict['n_estimators'].append(params['n_estimators'])
#     results_dict['learning_rate'].append(params['learning_rate'])
#     results_dict['max_depth'].append(params['max_depth'])
#     results_dict['mean_test_score'].append(grid_search.cv_results_['mean_test_score'][i])
    

# # Create a DataFrame from the extracted information
# results_df_XGB = pd.DataFrame(results_dict)

# results_df_XGB.info()

In [311]:
# pd.pivot_table(results_df_XGB, values='mean_test_score', index=['n_estimators','learning_rate'], columns='max_depth', aggfunc=np.mean)

In [312]:
Acc_XGB_tuned=[]
F1_XGB_tuned=[]
grid_result= {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 525}
# for i in range(1,1000):
train_x,test_x,train_y,test_y = train_test_split(X_mod,y,random_state=i,stratify=y,test_size=0.3,train_size=0.7)
#   xgb_clf = XGBClassifier(max_depth = grid_result.best_params_['max_depth'],learning_rate = grid_result.best_params_['learning_rate'],n_estimators =grid_result.best_params_['n_estimators'])
xgb_clf = XGBClassifier(max_depth = grid_result['max_depth'],learning_rate = grid_result['learning_rate'],n_estimators =grid_result['n_estimators'])
model = xgb_clf.fit(train_x,train_y)
y_pred = model.predict(test_x)
y_pred_proba = model.predict_proba(test_x)
Accuracy = metrics.accuracy_score(test_y,y_pred)
F1_score = metrics.f1_score(test_y,y_pred)
Acc_XGB_tuned.append(Accuracy)
F1_XGB_tuned.append(F1_score)

confusion_matrix_tuned = metrics.confusion_matrix(test_y,y_pred)
confusion_matrix_tuned

array([[168,  20],
       [ 10,  56]], dtype=int64)

In [313]:
# # Add a new column to the DataFrame with the probabilities(XGBoost)
# fraud_prob = model.predict_proba(X_mod)
# fraud_prob_act = model.predict_proba(X_act)
# df_model['fraud_prob'] = pd.Series(fraud_prob[:, 1], index=df_model.index)
# df_actual['fraud_prob'] = pd.Series(fraud_prob_act[:, 1], index=df_actual.index)
# # Save the updated DataFrame as a new CSV file
# df_model.to_csv('insurance_claims_with_probs(846)(XGBoost).csv', index=False)
# df_actual.to_csv('insurance_claims_with_probs(154)(XGBoost).csv', index=False)

In [314]:
# sns.histplot(data=Acc_XGB_tuned)

In [315]:
# sns.histplot(data=F1_XGB_tuned)

In [316]:
# XGB_columns = [Acc_XGB,Acc_XGB_tuned,F1_XGB,F1_XGB_tuned]
# XGB = pd.DataFrame(XGB_columns).transpose()
# XGB.columns = ['Acc_XGB','Acc_XGB_tuned','F1_XGB','F1_XGB_tuned']
# XGB.describe()

Confusion Matrix XGB

In [317]:
# confusion_matrix = metrics.confusion_matrix(test_y,y_pred)
# Accuracy = metrics.accuracy_score(test_y,y_pred)
# Precision = metrics.precision_score(test_y,y_pred)
# Sensitivity_recall = metrics.recall_score(test_y,y_pred)
# Specificity = metrics.recall_score(test_y,y_pred,pos_label=0)
# F1_score = metrics.f1_score(test_y,y_pred)
# print('Accuracy:', Accuracy)
# print('Precision:', Precision)
# print('Sensitivity:', Sensitivity_recall)
# print('Specificity:', Specificity)
# print('F1-score:', F1_score)
# confusion_matrix




# Soft Voting Classifier

In [318]:
#Importing soft voting classifier
from sklearn.ensemble import VotingClassifier

In [319]:
# Acc_SV=[]
# F1_SV=[]
# for i in range(1,1000):
#   train_x,test_x,train_y,test_y = train_test_split(X_mod,y,random_state=i,stratify=y,test_size=0.3,train_size=0.7)
#   sv_clf = VotingClassifier(estimators=[('AB', ab_clf), ('GBDT', gbdt_clf), ('XGB', xgb_clf)],
#            voting='soft',n_jobs=-1)
#   model = sv_clf.fit(train_x,train_y)
#   y_pred = model.predict(test_x)
#   y_pred_proba = model.predict_proba(test_x)
#   Accuracy = metrics.accuracy_score(test_y,y_pred)
#   F1_score = metrics.f1_score(test_y,y_pred)
#   Acc_SV.append(Accuracy)
#   F1_SV.append(F1_score)

# confusion_matrix = metrics.confusion_matrix(test_y,y_pred)
# confusion_matrix

In [320]:
# sns.histplot(data=Acc_SV)

In [321]:
# sns.histplot(data=F1_SV)

In [322]:
# total_weight = 100
# weights = []

# for i in range(total_weight+1):
#     for j in range(total_weight+1):
#         for k in range(total_weight+1):
#             if i+j+k == total_weight:
#                 weights.append([i,j,k])

In [323]:
# from sklearn.ensemble import VotingClassifier

# train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=1,stratify=y,test_size=0.3,train_size=0.7)
# voting_clf = VotingClassifier(
#     estimators=[('AB', ab_clf), ('GBDT', gbdt_clf), ('XGB', xgb_clf)],
#     voting='soft',n_jobs=-1)
# # Define the hyperparameters to tune
# grid = {
#     'weights': weights
# }

# cv=StratifiedKFold(n_splits=10)
# # Create a GridSearchCV object and fit it on the data
# grid_search = GridSearchCV(estimator=voting_clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1')
# grid_result = grid_search.fit(train_x,train_y)

# # Print the best hyperparameters and the corresponding accuracy score
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [324]:
Acc_SV_tuned=[]
F1_SV_tuned=[]
# for i in range(1,1000):
train_x,test_x,train_y,test_y = train_test_split(X_mod,y,random_state=i,stratify=y,test_size=0.3,train_size=0.7)
#   sv_clf = VotingClassifier(estimators=[('AB', ab_clf), ('GBDT', gbdt_clf), ('XGB', xgb_clf)],
#            voting='soft',weights = grid_result.best_params_['weights'])
sv_clf = VotingClassifier(estimators=[('AB', ab_clf), ('GBDT', gbdt_clf), ('XGB', xgb_clf)],
       voting='soft',weights = [45, 16, 39])
model = sv_clf.fit(train_x,train_y)
y_pred = model.predict(test_x)
y_pred_proba = model.predict_proba(test_x)
Accuracy = metrics.accuracy_score(test_y,y_pred)
F1_score = metrics.f1_score(test_y,y_pred)
Acc_SV_tuned.append(Accuracy)
F1_SV_tuned.append(F1_score)

confusion_matrix_tuned = metrics.confusion_matrix(test_y,y_pred)
confusion_matrix_tuned

array([[168,  20],
       [ 12,  54]], dtype=int64)

In [325]:
# sns.histplot(data=Acc_SV_tuned)

In [326]:
# sns.histplot(data=F1_SV_tuned)

In [327]:
# SV_columns = [Acc_SV,Acc_SV_tuned,F1_SV,F1_SV_tuned]
# SV = pd.DataFrame(SV_columns).transpose()
# SV.columns = ['Acc_SV','Acc_SV_tuned','F1_SV','F1_SV_tuned']
# SV.describe()

In [328]:
fraud_prob = model.predict_proba(X_mod)
fraud_class = model.predict(X_mod)

fraud_prob_act = model.predict_proba(X_act)
fraud_class_act = model.predict(X_act)

In [329]:
fraud_prob

array([[0.46058361, 0.53941639],
       [0.89968572, 0.10031427],
       [0.39658839, 0.60341161],
       ...,
       [0.88810719, 0.11189282],
       [0.40940489, 0.59059511],
       [0.87387535, 0.12612466]])

In [330]:
# Add a new column to the DataFrame with the probabilities
df_model['fraud_prob'] = pd.Series(fraud_prob[:, 1], index=df_model.index)
df_actual['fraud_prob'] = pd.Series(fraud_prob_act[:, 1], index=df_actual.index)
# Save the updated DataFrame as a new CSV file
df_model.to_csv('insurance_claims_with_probs(846).csv', index=False)
df_actual.to_csv('insurance_claims_with_probs(154).csv', index=False)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [331]:
confusion_matrix = metrics.confusion_matrix(y,fraud_class)
Accuracy = metrics.accuracy_score(y,fraud_class)
Precision = metrics.precision_score(y,fraud_class)
Sensitivity_recall = metrics.recall_score(y,fraud_class)
Specificity = metrics.recall_score(y,fraud_class,pos_label=0)
F1_score = metrics.f1_score(y,fraud_class)
print('Accuracy:', Accuracy)
print('Precision:', Precision)
print('Sensitivity:', Sensitivity_recall)
print('Specificity:', Specificity)
print('F1-score:', F1_score)
confusion_matrix

Accuracy: 0.8687943262411347
Precision: 0.6914893617021277
Sensitivity: 0.8904109589041096
Specificity: 0.861244019138756
F1-score: 0.7784431137724552


array([[540,  87],
       [ 24, 195]], dtype=int64)

In [332]:
confusion_matrix = metrics.confusion_matrix(y_act,fraud_class_act)
Accuracy = metrics.accuracy_score(y_act,fraud_class_act)
Precision = metrics.precision_score(y_act,fraud_class_act)
Sensitivity_recall = metrics.recall_score(y_act,fraud_class_act)
Specificity = metrics.recall_score(y_act,fraud_class_act,pos_label=0)
F1_score = metrics.f1_score(y_act,fraud_class_act)
print('Accuracy:', Accuracy)
print('Precision:', Precision)
print('Sensitivity:', Sensitivity_recall)
print('Specificity:', Specificity)
print('F1-score:', F1_score)
confusion_matrix

Accuracy: 0.7922077922077922
Precision: 0.4583333333333333
Sensitivity: 0.7857142857142857
Specificity: 0.7936507936507936
F1-score: 0.5789473684210527


array([[100,  26],
       [  6,  22]], dtype=int64)