In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# insurance fraud detection

## Data Preparation

import, prepare/clean the dataset


In [85]:
df = pd.read_csv('insurance_fraud.csv')
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,OH,250/500,1000,1406.91,0,466132,MALE,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,IN,250/500,2000,1197.22,5000000,468176,MALE,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,OH,100/300,2000,1413.14,5000000,430632,FEMALE,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,IL,250/500,2000,1415.74,6000000,608117,FEMALE,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,IL,500/1000,1000,1583.91,6000000,610706,MALE,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,


In [None]:
# remove uneccesary variables
df.drop(['policy_number','policy_state','incident_location','incident_date',
           'incident_state','incident_city','collision_type','insured_hobbies','property_damage','auto_make','auto_model','auto_year', '_c39'], axis = 1, inplace = True)

In [87]:
# create catergorical dataframe
cat_var = df.select_dtypes(include=['object'])
cat_var.head()

Unnamed: 0,policy_csl,insured_sex,insured_education_level,insured_occupation,insured_relationship,incident_type,incident_severity,authorities_contacted,police_report_available,fraud_reported
0,250/500,MALE,MD,craft-repair,husband,Single Vehicle Collision,Major Damage,Police,YES,Y
1,250/500,MALE,MD,machine-op-inspct,other-relative,Vehicle Theft,Minor Damage,Police,?,Y
2,100/300,FEMALE,PhD,sales,own-child,Multi-vehicle Collision,Minor Damage,Police,NO,N
3,250/500,FEMALE,PhD,armed-forces,unmarried,Single Vehicle Collision,Major Damage,Police,NO,Y
4,500/1000,MALE,Associate,sales,unmarried,Vehicle Theft,Minor Damage,,NO,N


In [82]:
#inspect
for c in cat_var.columns:
    print(f"{c} : \n length: {len(cat_var[c].unique())}\n {cat_var[c].unique()} \n")


policy_csl : 
 length: 3
 ['250/500' '100/300' '500/1000'] 

insured_education_level : 
 length: 7
 ['MD' 'PhD' 'Associate' 'Masters' 'High School' 'College' 'JD'] 

insured_occupation : 
 length: 14
 ['craft-repair' 'machine-op-inspct' 'sales' 'armed-forces' 'tech-support'
 'prof-specialty' 'other-service' 'priv-house-serv' 'exec-managerial'
 'protective-serv' 'transport-moving' 'handlers-cleaners' 'adm-clerical'
 'farming-fishing'] 

insured_relationship : 
 length: 6
 ['husband' 'other-relative' 'own-child' 'unmarried' 'wife' 'not-in-family'] 

incident_type : 
 length: 4
 ['Single Vehicle Collision' 'Vehicle Theft' 'Multi-vehicle Collision'
 'Parked Car'] 

incident_severity : 
 length: 4
 ['Major Damage' 'Minor Damage' 'Total Loss' 'Trivial Damage'] 

authorities_contacted : 
 length: 5
 ['Police' 'None' 'Fire' 'Other' 'Ambulance'] 



In [88]:
# encode catergorical variables
cat_var = pd.get_dummies(cat_var, drop_first = True) 
cat_var.head()

Unnamed: 0,policy_csl_250/500,policy_csl_500/1000,insured_sex_MALE,insured_education_level_College,insured_education_level_High School,insured_education_level_JD,insured_education_level_MD,insured_education_level_Masters,insured_education_level_PhD,insured_occupation_armed-forces,...,incident_severity_Minor Damage,incident_severity_Total Loss,incident_severity_Trivial Damage,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Other,authorities_contacted_Police,police_report_available_NO,police_report_available_YES,fraud_reported_Y
0,1,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1
1,1,0,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,1
2,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,1,0,0
3,1,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,1,1,0,1
4,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0


In [89]:
int_var = df.select_dtypes(include=["int64"])
int_var.head()

Unnamed: 0,months_as_customer,age,policy_deductable,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,total_claim_amount,injury_claim,property_claim,vehicle_claim
0,328,48,1000,0,466132,53300,0,5,1,1,2,71610,6510,13020,52080
1,228,42,2000,5000000,468176,0,0,8,1,0,0,5070,780,780,3510
2,134,29,2000,5000000,430632,35100,0,7,3,2,3,34650,7700,3850,23100
3,256,41,2000,6000000,608117,48900,-62400,5,1,1,2,63400,6340,6340,50720
4,228,44,1000,6000000,610706,66000,-46000,20,1,0,1,6500,1300,650,4550


In [96]:
#combine numerical and catergorical dataframes
df = pd.concat([int_var,cat_var] , axis = 1)
df.head()

Unnamed: 0,months_as_customer,age,policy_deductable,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,...,incident_severity_Minor Damage,incident_severity_Total Loss,incident_severity_Trivial Damage,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Other,authorities_contacted_Police,police_report_available_NO,police_report_available_YES,fraud_reported_Y
0,328,48,1000,0,466132,53300,0,5,1,1,...,0,0,0,0,0,0,1,0,1,1
1,228,42,2000,5000000,468176,0,0,8,1,0,...,1,0,0,0,0,0,1,0,0,1
2,134,29,2000,5000000,430632,35100,0,7,3,2,...,1,0,0,0,0,0,1,1,0,0
3,256,41,2000,6000000,608117,48900,-62400,5,1,1,...,0,0,0,0,0,0,1,1,0,1
4,228,44,1000,6000000,610706,66000,-46000,20,1,0,...,1,0,0,0,1,0,0,1,0,0


In [None]:
#plot variables against claim freq
for label in df.columns[:-1]:
    plt.hist(df[df['fraud_reported_Y']==1 ][label], color='blue', label='fraud',alpha = 0.6,density=True) #desity being true to normalise the distribution
    plt.hist(df[df['fraud_reported_Y']==0 ][label], color='red', label='no_fraud',alpha = 0.6,density=True)
    plt.title(label)
    plt.ylabel('Probability')
    plt.xlabel(label)
    plt.legend()
    plt.show()



### Train, Valid, Test datasets

In [97]:
train, valid,test = np.split(df.sample(frac =1),[ int(0.6*len(df)),int(0.8*len(df))])

In [98]:
#scale the dataset and define x and y

def scale_dataset(dataframe,oversample = False):
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    x = scaler.fit_transform(x)

    if oversample:
        ros = RandomOverSampler()
        x,y= ros.fit_resample(x,y)

    data = np.hstack((x,np.reshape(y,(-1,1))))

    return data, x, y

In [99]:
train,x_train,y_train = scale_dataset(train, oversample = True)
valid,x_valid,y_valid = scale_dataset(valid, oversample = False)
test,x_test,y_test = scale_dataset(test, oversample = False)

### Fit ML models

For this project we trying to detect if a claim is fraud or not, we are going to use a couple of classifications and compare then to a neuro network model to see which one will produce better results

#### KN classifier

In [100]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [101]:

knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(x_train,y_train)
y_pred = knn_model.predict(x_test)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.78      0.59      0.67       148
           1       0.31      0.52      0.39        52

    accuracy                           0.57       200
   macro avg       0.54      0.55      0.53       200
weighted avg       0.65      0.57      0.60       200



### Naive Bayes

In [102]:
from sklearn.naive_bayes import GaussianNB

In [103]:
nb_model = GaussianNB()
nb_model.fit(x_train,y_train)
y_pred = nb_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.49      0.61       148
           1       0.33      0.71      0.45        52

    accuracy                           0.55       200
   macro avg       0.58      0.60      0.53       200
weighted avg       0.70      0.55      0.57       200



### Linear Regression

In [104]:
from sklearn.linear_model import LogisticRegression

In [105]:
lg_model = LogisticRegression()
lg_model.fit(x_train,y_train)
y_pred = lg_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.75      0.81       148
           1       0.51      0.73      0.60        52

    accuracy                           0.74       200
   macro avg       0.70      0.74      0.71       200
weighted avg       0.79      0.74      0.76       200



### Surport vector Mechines

In [106]:
from sklearn.svm import SVC

In [117]:
svm_model = SVC()
svm_model.fit(x_train,y_train)
y_pred = svm_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       148
           1       0.57      0.56      0.56        52

    accuracy                           0.78       200
   macro avg       0.71      0.70      0.71       200
weighted avg       0.77      0.78      0.77       200



### NN
From our classification models, the highest accuracy was just over 78%. Now, lets see if we can beat that with Neuron Nework

In [109]:
import tensorflow as tf

In [110]:
def plot_loss(history):

    plt.plot(history.history['loss'],label = 'loss')
    plt.plot(history.history['val_loss'],label = 'val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('Binary crossentropy')
    plt.legend()
    plt.show()

    plt.plot(history.history['accuracy'],label = 'accuracy')
    plt.plot(history.history['val_accuracy'],label = 'val_accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()


In [114]:
def train_model(x_train,y_train,num_nodes,dropout_prob, lr, batch_size ,epochs):


    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation = 'relu', input_shape = (54,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation = "relu"),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1,activation = "sigmoid")
    ])

    nn_model.compile(optimizer = tf.keras.optimizers.Adam(lr), loss ="binary_crossentropy" , metrics = ['accuracy'])

    history = nn_model.fit(
    x_train, y_train ,epochs = epochs , batch_size=batch_size, validation_split = 0.2 ,verbose = 0
    )

    return nn_model,history

In [None]:
least_val_loss = float('inf')
least_loss_model = None
epochs = 100
for num_nodes in [16, 32, 64]:
    for dropout_prob in [0,0.2]:
        for lr in [0.01, 0.005,0.001]:
            for batch_size in [32,64,128]: 
                print(f"{num_nodes} nodes and {dropout_prob} dropout prob, learning rate of {lr}, batch size {batch_size} ")
                model,history = train_model(x_train,y_train,num_nodes,dropout_prob, lr, batch_size ,epochs)
                #plot_loss(history)
                val_loss = model.evaluate(x_valid,y_valid)
                if val_loss[0] < least_val_loss:
                    least_val_loss = val_loss[0]
                    least_loss_model = model
            


In [116]:
least_loss_model.predict(x_test)
y_pred = (y_pred >0.5).astype(int).reshape(-1,)
print(classification_report(y_test,y_pred))



              precision    recall  f1-score   support

           0       0.85      0.85      0.85       148
           1       0.57      0.56      0.56        52

    accuracy                           0.78       200
   macro avg       0.71      0.70      0.71       200
weighted avg       0.77      0.78      0.77       200



### Results

KNN : 56%
Naive bayse : 54%
Logistic regression : 74%
Surpport vector mechine: 78%
Neuro Network: 78%