## Car Insurance Claims Fraud Detection

In [1]:
# Optional one-time setup: uncomment to install the extra packages this notebook uses.
#%pip install --user deepchecks
#%pip install ipywidgets

In [2]:
import pandas as pd
import datetime
import numpy as np

# Load the raw claims table (the file is parsed with ';' as the field separator)
# and preview its first rows.
CLAIMS_CSV_PATH = "carclaims.csv"
data_df = pd.read_csv(CLAIMS_CSV_PATH, sep=";")
data_df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,FraudFound_P
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,No
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision,No
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability,No
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision,No


### Data ingestion and validation

In [6]:
from deepchecks.tabular.suites import data_integrity

# Run the deepchecks data-integrity suite on the raw frame and display its report.
integrity_suite = data_integrity()
result = integrity_suite.run(data_df)
result.show()



Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_MSJUMYNAWQPMKMJOE5Q2KLDJY">Data Integrity Sui…

### Data pre-processing and data validation

In [7]:
# Lookup from numeric month strings ('1'..'12') to abbreviated month names.
# NOTE: strftime('%b') output depends on the active locale.
month_number_to_abbr = {
    str(month): datetime.date(1900, month, 1).strftime('%b')
    for month in range(1, 13)
}

# Build the cleaned frame from the raw one without mutating `data_df`:
#   * drop the row labelled 1146 (presumably flagged by the integrity report - TODO confirm),
#   * fill missing PastNumberOfClaims with the literal 'none',
#   * replace numeric month strings with their abbreviations,
#   * lower-case DayOfWeek.
data_clean_df = data_df.drop(index=1146).assign(
    PastNumberOfClaims=lambda frame: frame['PastNumberOfClaims'].fillna('none'),
    Month=lambda frame: frame['Month'].replace(month_number_to_abbr),
    DayOfWeek=lambda frame: frame['DayOfWeek'].str.lower(),
)

In [8]:
import re
def str_to_mean(str_val):
    """Collapse a textual numeric description into one representative number.

    Every number found in the text is extracted and the arithmetic mean is
    returned, so ``'20000 to 29000'`` becomes ``24500.0`` and
    ``'more than 69000'`` becomes ``69000.0``.

    Parameters
    ----------
    str_val : str or number
        Text containing zero or more numbers, or an already-numeric value.
        Python and NumPy numeric scalars (including NaN) are returned as-is.

    Returns
    -------
    number
        * the input itself when it is already numeric,
        * ``np.nan`` for ``'none'`` (any case, surrounding whitespace ignored),
        * ``0`` for ``'new'`` (any case, surrounding whitespace ignored),
        * otherwise the mean of all numbers found in the text.

    Raises
    ------
    ValueError
        If the text is not a recognised keyword and contains no number.
    """
    # Already numeric: nothing to parse. NumPy scalar types are listed explicitly
    # because np.int64 / np.float32 are not subclasses of the built-in int / float.
    if isinstance(str_val, (int, float, np.integer, np.floating)):
        return str_val

    # Normalise once so both keywords are matched the same way.
    text = str_val.strip().lower()
    if text == 'none':
        return np.nan
    if text == 'new':
        return 0

    # Accept an optional fractional part so '2.5' is read as 2.5, not as 2 and 5.
    numbers_found = [float(token) for token in re.findall(r'\d+(?:\.\d+)?', text)]
    if not numbers_found:
        raise ValueError(str_val)
    return np.mean(numbers_found)

In [9]:
# Columns whose values are run through `str_to_mean` to obtain a single number each.
range_like_columns = [
    'VehiclePrice',
    'Days_Policy_Accident',
    'Days_Policy_Claim',
    'AgeOfVehicle',
    'AgeOfPolicyHolder',
    'NumberOfCars',
    'PastNumberOfClaims',
]

for column_name in range_like_columns:
    data_clean_df[column_name] = data_clean_df[column_name].apply(str_to_mean)

In [10]:
# Numeric stand-ins for the textual AddressChange_Claim buckets.
# NOTE(review): 'no change' is encoded as 10, presumably a value larger than
# every real bucket - confirm this is the intended encoding.
address_change_to_number = {
    'no change': 10,
    '4 to 8 years': 6,
    '2 to 3 years': 2.5,
    '1 year': 1,
    'under 6 months': 0.5,
}

address_change_raw = data_clean_df['AddressChange_Claim']
data_clean_df['AddressChange_Claim'] = address_change_raw.replace(address_change_to_number)

In [11]:
# Number of rows per (Year, FraudFound_P) pair, counted on the Month column.
data_clean_df.groupby(['Year', 'FraudFound_P'])[['Month']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Month
Year,FraudFound_P,Unnamed: 2_level_1
1994,No,5732
1994,Yes,409
1995,No,4894
1995,Yes,301
1996,No,3870
1996,Yes,213


In [12]:
# Split by year: rows from 1994 form the training set, later years the test set.
TRAIN_YEAR = 1994
claim_year = data_clean_df['Year']

train_df = data_clean_df.loc[claim_year == TRAIN_YEAR]
test_df = data_clean_df.loc[claim_year > TRAIN_YEAR]

In [13]:
from deepchecks.tabular import Dataset

# Columns treated as categorical features by the deepchecks Datasets and the
# CatBoost models below. The order matches the original definition.
cat_cols = [
    'Month', 'WeekOfMonth', 'DayOfWeek',
    'Make', 'AccidentArea',
    'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed',
    'Sex', 'MaritalStatus',
    'Fault', 'PolicyType', 'VehicleCategory',
    'PoliceReportFiled', 'WitnessPresent',
    'AgentType', 'NumberOfSuppliments', 'BasePolicy',
]

In [14]:
# Wrap both splits as deepchecks Datasets with identical column roles.
dataset_options = dict(label='FraudFound_P', datetime_name='Year', cat_features=cat_cols)

train_ds = Dataset(train_df, **dataset_options)
test_ds = Dataset(test_df, **dataset_options)

In [15]:
from deepchecks.tabular.suites import train_test_validation

# Compare the train and test Datasets with the train-test validation suite and show the report.
validation_suite = train_test_validation()
res = validation_suite.run(train_ds, test_ds)
res.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_O9WY2RIFGF2L3GI6DGAKARK0H">Train Test Validat…

In [16]:
# Remove test rows for these makes (presumably absent from the training year - TODO confirm).
excluded_makes = ['Ferrari', 'Lexus']
keep_row = ~test_df['Make'].isin(excluded_makes)
test_df = test_df.loc[keep_row]

In [17]:
# Rebuild both Datasets from the current frames, this time declaring
# PolicyNumber as the index column.
dataset_options = dict(
    label='FraudFound_P',
    datetime_name='Year',
    index_name='PolicyNumber',
    cat_features=cat_cols,
)

train_ds = Dataset(train_df, **dataset_options)
test_ds = Dataset(test_df, **dataset_options)

In [18]:
# Re-run the train-test validation suite on the rebuilt Datasets and show the report.
validation_suite = train_test_validation()
res = validation_suite.run(train_ds, test_ds)
res.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_5LRDRVD7LQM8C0IWH5NYGKGWR">Train Test Validat…

In [19]:
# True only when every check result in the suite reports its conditions as passed.
all(check_result.passed_conditions() for check_result in res.results)

True

### Model training and validation

In [20]:
from catboost import CatBoostClassifier

# Train on every column except the label, the split year and the policy number.
non_feature_columns = ['FraudFound_P', 'Year', 'PolicyNumber']
train_features = train_df.drop(columns=non_feature_columns)
train_labels = train_df['FraudFound_P']

model = CatBoostClassifier(iterations=100, random_seed=42, verbose=0)
model.fit(train_features, train_labels, cat_features=cat_cols)

<catboost.core.CatBoostClassifier at 0x233d7046610>

In [21]:
from deepchecks.tabular.suites import model_evaluation

# Evaluate the fitted model on both Datasets with the model-evaluation suite and show the report.
evaluation_suite = model_evaluation()
res = evaluation_suite.run(train_ds, test_ds, model)
res.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_IIC72DPPLM1OO99ZEB4DRZJL2">Model Evaluation S…

In [22]:
from deepchecks.tabular import Suite
from deepchecks.tabular.checks import TrainTestPerformance, BoostingOverfit, SimpleModelComparison, ConfusionMatrixReport

# Individual checks, each with the condition it must satisfy.
performance_check = TrainTestPerformance().add_condition_train_test_relative_degradation_less_than(0.1)
confusion_matrix_check = ConfusionMatrixReport()
overfit_check = BoostingOverfit(alternative_scorer=['f1', 'f1']).add_condition_test_score_percent_decline_less_than(0.01)
baseline_check = SimpleModelComparison().add_condition_gain_greater_than(0.1)

# Bundle the checks into one suite and run it against the current model.
custom_suite = Suite(
    'My Custom Performance Suite',
    performance_check,
    confusion_matrix_check,
    overfit_check,
    baseline_check,
)

custom_suite.run(train_ds, test_ds, model)

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_9BBW3O86TO51WAGM4G4JLFZFI">My Custom Performa…

In [23]:
# Second model with a different hyperparameter set
# (presumably chosen in response to the previous suite run - TODO confirm).
tuned_params = dict(
    iterations=50,
    random_seed=42,
    verbose=0,
    learning_rate=0.2,
    colsample_bylevel=0.03,
    subsample=0.5,
    depth=4,
)
model = CatBoostClassifier(**tuned_params)

# Same training inputs as before: all columns except label, year and policy number.
train_features = train_df.drop(columns=['FraudFound_P', 'Year', 'PolicyNumber'])
model.fit(train_features, train_df['FraudFound_P'], cat_features=cat_cols)

<catboost.core.CatBoostClassifier at 0x233d511d340>

In [25]:
# Rebuild the custom performance suite (same checks and thresholds) and run it
# against the current `model`.
suite_checks = [
    TrainTestPerformance().add_condition_train_test_relative_degradation_less_than(0.1),
    ConfusionMatrixReport(),
    BoostingOverfit(alternative_scorer=['f1', 'f1']).add_condition_test_score_percent_decline_less_than(0.01),
    SimpleModelComparison().add_condition_gain_greater_than(0.1),
]
custom_suite = Suite('My Custom Performance Suite', *suite_checks)

custom_suite.run(train_ds, test_ds, model)


invalid value encountered in double_scalars



Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_0PK5O580HIQ6SCVM7O6BDFKVW">My Custom Performa…