# **Introduction**: The goal of this notebook is to build a machine learning model that accurately predicts cases of car insurance fraud and to understand which characteristics of a claim are most indicative of potential fraud.

Data: https://www.kaggle.com/roshansharma/insurance-claim

GitHub: https://github.com/ArielJosephCohen/capstone

Presentation: https://docs.google.com/presentation/d/1IQdYSxrzyGvMpurhM-i097Btp4ksqL70WLEiM6yc5Sw/edit#slide=id.g35f391192_00

# **Notebook**

## Suppress warnings to keep the notebook output readable

In [1]:
import warnings

# Install a blanket 'ignore' filter so no warning of any category is shown.
# NOTE(review): this also hides deprecation and similar diagnostics.
warnings.simplefilter('ignore')

## Load helper module with custom functions

In [2]:
import helper_module as hm
from helper_module import *

## Load central data for analysis

In [3]:
# Read the claims data from 'Claims.csv' (path is relative to the working directory).
# NOTE(review): `pd` is not imported explicitly in any visible cell; presumably it
# is provided by `from helper_module import *` — confirm.
df = pd.read_csv('Claims.csv')

## Set a single random seed for the entire project

In [4]:
# Single seed value passed to every helper below that accepts a `seed` argument.
seed = 14

## Clean data

In [5]:
# Handle the '?' placeholder values in the raw data.
df = hm.clean_data(df)

In [6]:
# Add separate columns for the policy bind year and month; the helper's
# return value replaces `df`.
df = hm.reassign_year_and_month(df)

In [7]:
# Build the lookup that assigns a car type to each auto model
# (indexed by auto model in the next cell).
auto_model_dict = hm.create_auto_dict()

In [8]:
# Replace each auto model with its car type from `auto_model_dict`.
# Bracket indexing is used for the assignment so the result is always stored
# as a DataFrame column; attribute-style assignment is discouraged by pandas.
# The explicit lookup raises KeyError for a model missing from the dictionary.
df['auto_model'] = df['auto_model'].map(lambda model: auto_model_dict[model])

In [9]:
# Create a timeline between the policy bind date and the claim.
df = hm.create_timeline(df)

In [10]:
# The timeline and month/year information has been extracted, so the raw
# date columns are no longer needed. Reassign instead of mutating in place so
# the cell produces a new frame rather than editing the existing object.
df = df.drop(columns=['incident_date', 'policy_bind_date'])

In [11]:
# Express the 'capital-loss' column as a positive value.
df = hm.quantify_absolute_value(df, 'capital-loss')

In [12]:
# Convert 'insured_sex' ('MALE' / 'FEMALE') to a numerical binary column.
df = hm.map_binary_dict(df, 'insured_sex', 'MALE', 'FEMALE')

In [13]:
# Convert the target feature 'fraud_reported' ('Y' / 'N') to a numerical binary column.
df = hm.map_binary_dict(df, 'fraud_reported', 'Y', 'N')

## Address categorical and numerical features

In [14]:
# Build the feature lists used below:
#   num_list   - numerical features (later used to select columns of df)
#   cat_list   - categorical features (later used to select columns of df)
#   cat_list_2 - the features iterated over in the encoding step
num_list = hm.create_num_list()
cat_list = hm.create_cat_list()
cat_list_2 = hm.create_cat_list_2()

In [15]:
# Drop correlated features; the helper returns both the updated frame and the
# updated numerical feature list.
df, num_list = hm.remove_correlation(df, num_list)

In [16]:
# Select one frame per feature list: numerical columns and categorical columns.
df_num = df[num_list]
df_cat = df[cat_list]

## Encode categorical data as numerical values

In [17]:
# Encode the categorical features using their correlation with the target
# variable, one feature at a time; each call returns the updated `df_cat`.
# The helper is referenced through the `hm` namespace, like every other helper
# in this notebook, instead of relying on the wildcard import.
for col in cat_list_2:
    df_cat = hm.create_encoding(col, df_cat, df)

## Combine data frames and revisit correlation

In [18]:
# Merge the categorical and numerical frames into a single frame.
df_atg = hm.combine_data_frames(df_cat, df_num)

In [19]:
# Remove correlation among the encoded categorical features
# ('incident_type' is the column name handed to the helper).
df_atg = hm.remove_categorical_correlation(df_atg, 'incident_type')

## Recursive Feature Elimination

In [20]:
# Eliminate features that are not needed; the result holds both the kept
# features and the 'fraud_reported' target (they are separated further below).
x_and_y = hm.reduce_features(df_atg, seed)

## Filter, normalize, and scale

In [21]:
# Filter outliers.
# NOTE(review): 2.5 is presumably the outlier threshold — confirm its meaning in helper_module.
x_and_y = hm.filter_outliers(x_and_y, 2.5)

In [22]:
# Separate the target column (y) from the remaining feature columns (X).
y = x_and_y['fraud_reported']
X = x_and_y.drop(columns='fraud_reported')

In [23]:
# Normalize the feature columns; the helper's return value replaces X.
X = hm.normalize_features(X)

In [24]:
# Scale the feature columns; the helper's return value replaces X.
# NOTE(review): this runs on the full X before the train/test split below; if the
# helper derives scaling parameters from the data, test rows influence them — confirm.
X = hm.min_max_scale_data(X)

## Train-Test-Split

In [25]:
# Split the data into a training set and a validation set (t_s=0.25).
X_train, X_test, y_train, y_test = hm.split_data(
    X, y, seed, t_s=0.25
)

## Balance data

In [26]:
# Balance the training data for more meaningful results; only the training
# split is passed to the upsampling helper.
X_train, y_train = hm.upsample_data(X_train, y_train, seed)

## Training scores

In [27]:
# Logistic regression: training scores (the training split is passed as both (X, y) pairs).
lr_train_scores, lr_train_class_rep, lr_train_cm = hm.models.logistic_regression_model(
    X_train, y_train, X_train, y_train, seed
)

In [28]:
# Support vector machine: training scores (the training split is passed as both (X, y) pairs).
svc_train_scores, svc_train_class_rep, svc_train_cm = hm.models.support_vector_machine_model(
    X_train, y_train, X_train, y_train, seed
)

In [29]:
# K nearest neighbors: training scores (the training split is passed as both (X, y) pairs; no seed argument).
knn_train_scores, knn_train_class_rep, knn_train_cm = hm.models.knn_model(
    X_train, y_train, X_train, y_train
)

In [30]:
# Gaussian naive Bayes: training scores (the training split is passed as both (X, y) pairs; no seed argument).
gnb_train_scores, gnb_train_class_rep, gnb_train_cm = hm.models.gaussian_naive_bayes_model(
    X_train, y_train, X_train, y_train
)

In [31]:
# Linear SVC: training scores (the training split is passed as both (X, y) pairs).
lsvc_train_scores, lsvc_train_class_rep, lsvc_train_cm = hm.models.linear_svc_model(
    X_train, y_train, X_train, y_train, seed
)

In [32]:
# Stochastic gradient descent: training scores (the training split is passed as both (X, y) pairs).
sgd_train_scores, sgd_train_class_rep, sgd_train_cm = hm.models.stochastic_gradient_descent_model(
    X_train, y_train, X_train, y_train, seed
)

In [33]:
# Decision tree: training scores (the training split is passed as both (X, y) pairs).
dt_train_scores, dt_train_class_rep, dt_train_cm = hm.models.decision_tree_model(
    X_train, y_train, X_train, y_train, seed
)

In [34]:
# Random forest: training scores (the training split is passed as both (X, y) pairs).
rf_train_scores, rf_train_class_rep, rf_train_cm = hm.models.random_forest_model(
    X_train, y_train, X_train, y_train, seed
)

In [35]:
# XGBoost: training scores (the training split is passed as both (X, y) pairs).
xgb_train_scores, xgb_train_class_rep, xgb_train_cm = hm.models.XGBoost_model(
    X_train, y_train, X_train, y_train, seed
)

## Train validation summary

In [51]:
# Summary data frame of the training metrics, indexed by the 'models' column.
# NOTE(review): hm.models.metrics_train is presumably filled as a side effect of
# the model calls above — confirm; if so, re-running those cells may add duplicate rows.
train_summary = pd.DataFrame(hm.models.metrics_train).set_index('models')

## Test scores

In [37]:
# Logistic regression: validation scores (training split first, test split second).
lr_test_scores, lr_test_class_rep, lr_test_cm = hm.models.logistic_regression_model(
    X_train, y_train, X_test, y_test, seed
)

In [38]:
# Support vector machine: validation scores (training split first, test split second).
svc_test_scores, svc_test_class_rep, svc_test_cm = hm.models.support_vector_machine_model(
    X_train, y_train, X_test, y_test, seed
)

In [39]:
# K nearest neighbors: validation scores (training split first, test split second; no seed argument).
knn_test_scores, knn_test_class_rep, knn_test_cm = hm.models.knn_model(
    X_train, y_train, X_test, y_test
)

In [40]:
# Gaussian naive Bayes: validation scores (training split first, test split second; no seed argument).
gnb_test_scores, gnb_test_class_rep, gnb_test_cm = hm.models.gaussian_naive_bayes_model(
    X_train, y_train, X_test, y_test
)

In [41]:
# Linear SVC: validation scores (training split first, test split second).
lsvc_test_scores, lsvc_test_class_rep, lsvc_test_cm = hm.models.linear_svc_model(
    X_train, y_train, X_test, y_test, seed
)

In [42]:
# Stochastic gradient descent: validation scores (training split first, test split second).
sgd_test_scores, sgd_test_class_rep, sgd_test_cm = hm.models.stochastic_gradient_descent_model(
    X_train, y_train, X_test, y_test, seed
)

In [43]:
# Decision tree: validation scores (training split first, test split second).
dt_test_scores, dt_test_class_rep, dt_test_cm = hm.models.decision_tree_model(
    X_train, y_train, X_test, y_test, seed
)

In [44]:
# Random forest: validation scores (training split first, test split second).
rf_test_scores, rf_test_class_rep, rf_test_cm = hm.models.random_forest_model(
    X_train, y_train, X_test, y_test, seed
)

In [45]:
# XGBoost: validation scores (training split first, test split second).
xgb_test_scores, xgb_test_class_rep, xgb_test_cm = hm.models.XGBoost_model(
    X_train, y_train, X_test, y_test, seed
)

## Test validation summary

In [52]:
# Summary data frame of the test metrics, indexed by the 'models' column.
# NOTE(review): hm.models.metrics_test is presumably filled as a side effect of
# the model calls above — confirm; if so, re-running those cells may add duplicate rows.
test_summary = pd.DataFrame(hm.models.metrics_test).set_index('models')

## Find most predictive features

In [49]:
# Breakdown of the most indicative features, computed from the (upsampled) training split.
important_features = hm.find_feature_importance(X_train, y_train, seed)

## Visualize decision tree

In [50]:
# Visual of how the decision tree operates. Note that `seed` comes first in this
# helper's argument order.
# NOTE(review): the trailing 3 is presumably a tree-depth setting — confirm in helper_module.
decision_tree_visual = hm.draw_decision_tree(seed, X_train, y_train, 3)