In [66]:
import pandas as pd
import numpy as np
import re
import itertools
import csv
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

# Show up to 500 columns when a frame is displayed.
pd.set_option('display.max_columns', 500)

# zip_code is read as text (object dtype) rather than as a number.
ZIP_AS_TEXT = {"zip_code": object}

train, test = (
    pd.read_csv(csv_path, dtype=ZIP_AS_TEXT)
    for csv_path in ("train_2021.csv", "test_2021.csv")
)


In [67]:
# Sentinel clean-up.
#   * rows whose `fraud` label is -1 are removed from train
#   * annual_income == -1 is treated as missing
#   * age_of_driver > 100 is treated as missing
MISSING_SENTINEL = -1
MAX_DRIVER_AGE = 100

## fraud
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column writes below never target a view/slice.
train = train[train.fraud != MISSING_SENTINEL].copy()

## annual_income / age_of_driver
# Series.mask returns a new column with NaN where the condition holds, instead
# of writing NaN in place into a column that may hold integers.
for frame in (train, test):
    frame['annual_income'] = frame['annual_income'].mask(
        frame['annual_income'] == MISSING_SENTINEL
    )
    frame['age_of_driver'] = frame['age_of_driver'].mask(
        frame['age_of_driver'] > MAX_DRIVER_AGE
    )

## Encode Categorical Variables

In [68]:
# Use claim_number as the row index of both frames.
train = train.set_index('claim_number')
test = test.set_index('claim_number')

# Columns present in both frames that are stored as pandas Categoricals.
SHARED_CATEGORICALS = (
    "marital_status",
    "high_education_ind",
    "address_change_ind",
    "zip_code",
    "witness_present_ind",
    "policy_report_filed_ind",
    "claim_day_of_week",
)

for column in SHARED_CATEGORICALS:
    train[column] = pd.Categorical(train[column])
    test[column] = pd.Categorical(test[column])

# The label is only converted on train.
train["fraud"] = pd.Categorical(train["fraud"])

## Add new features: Lat/Lon/State

In [69]:
def load_zip_lookup(path="zip_code_database.csv"):
    """Read the zip-code reference CSV into three lookup dicts.

    The file must have the columns `zip`, `latitude`, `longitude` and `state`.

    Returns:
        (zip -> latitude, zip -> longitude, zip -> state); keys are the zip
        strings exactly as they appear in the file, latitude/longitude are
        converted with float().

    Raises:
        ValueError: if a latitude/longitude cell cannot be parsed by float().
    """
    lat_by_zip, lon_by_zip, state_by_zip = {}, {}, {}
    with open(path, newline='') as csvfile:
        for zip_data in csv.DictReader(csvfile, delimiter=','):
            zip_key = zip_data['zip']
            lat_by_zip[zip_key] = float(zip_data['latitude'])
            lon_by_zip[zip_key] = float(zip_data['longitude'])
            state_by_zip[zip_key] = zip_data['state']
    return lat_by_zip, lon_by_zip, state_by_zip


zip_to_lat, zip_to_lon, zip_to_state = load_zip_lookup()

### a missing zip code and the '0' zip code both map to NaN
### (assumption: there is no such thing as a zip code of 0)
for lookup in (zip_to_lat, zip_to_lon, zip_to_state):
    lookup[np.nan] = np.nan
    lookup['0'] = np.nan


def add_geo_features(frame):
    """Return `frame` plus `latitude`, `longitude` and `state` columns.

    The values are looked up from `frame['zip_code']`. Series.map with a dict
    yields NaN for any zip code that is not a key, so an unknown zip code
    becomes a missing value instead of raising KeyError.
    """
    # Cast to object first so the lookup runs over the row values and the
    # result is an ordinary (non-categorical) column regardless of the data.
    zip_codes = frame['zip_code'].astype(object)
    return frame.assign(
        latitude=zip_codes.map(zip_to_lat).astype(float),
        longitude=zip_codes.map(zip_to_lon).astype(float),
        state=pd.Categorical(zip_codes.map(zip_to_state)),
    )


### Add these new features to the data frame
train = add_geo_features(train)
test = add_geo_features(test)

## Imputation of Missing Values 
(Use mode and mean to impute)

In [70]:
# Imputation: every fill value is computed on train and applied to both frames.
MEAN_IMPUTED = (
    'age_of_driver',
    'annual_income',
    'claim_est_payout',
    'age_of_vehicle',
    'latitude',
    'longitude',
)
MODE_IMPUTED = (
    'marital_status',
    'witness_present_ind',
    'state',
)


def fill_missing(frame, column, value):
    """Replace missing entries of `frame[column]` with `value`, in `frame`.

    The filled column is assigned back to the frame. The previous form,
    `frame[column].fillna(value, inplace=True)`, fills an intermediate Series
    and only reaches the frame when that Series happens to share memory with it.
    """
    series = frame[column]
    # A categorical column rejects a fill value that is not one of its levels
    # (possible when the train mode never occurs in test), so register it first.
    if isinstance(series.dtype, pd.CategoricalDtype) and value not in series.cat.categories:
        series = series.cat.add_categories([value])
    frame[column] = series.fillna(value)


fill_values = {column: train[column].mean() for column in MEAN_IMPUTED}
fill_values.update(
    {column: train[column].mode().values[0] for column in MODE_IMPUTED}
)

for column, value in fill_values.items():
    fill_missing(train, column, value)
    fill_missing(test, column, value)

# # print the list of missing columns
# print(list(itertools.compress(list(train), list(train.isna().any()))))
# print(list(itertools.compress(list(test), list(test.isna().any()))))

## One-Hot Encoding of Categorical Variables

In [71]:
# Source column -> prefix used for its dummy columns.
ONE_HOT_PREFIXES = {
    "claim_day_of_week": "claim_day",
    "accident_site": "accident_site",
    "channel": "channel",
    "vehicle_category": "vehicle_category",
    "vehicle_color": "vehicle_color",
    "state": "state",
}


def one_hot_encode(frame, levels_from=None):
    """One-hot encode the ONE_HOT_PREFIXES columns of `frame`.

    Each listed column is replaced by dummy columns appended after the
    remaining columns; the first level of every column is dropped
    (`drop_first=True`).

    Args:
        frame: DataFrame containing every column in ONE_HOT_PREFIXES.
        levels_from: optional DataFrame whose columns define the category
            levels. When given, each encoded column of `frame` is first cast
            to exactly those levels, so the resulting dummy columns (and the
            dropped baseline level) match the ones produced for `levels_from`.
            Values of `frame` that are not among those levels become missing
            and therefore encode as all zeros.

    Returns:
        A new DataFrame; `frame` is not modified.
    """
    prepared = frame
    if levels_from is not None:
        prepared = frame.copy()
        for column in ONE_HOT_PREFIXES:
            levels = pd.Categorical(levels_from[column]).categories
            prepared[column] = pd.Categorical(prepared[column], categories=levels)
    return pd.get_dummies(
        prepared,
        columns=list(ONE_HOT_PREFIXES),
        prefix=ONE_HOT_PREFIXES,
        drop_first=True,
    )


# test is encoded first because it needs train's still-unencoded columns to
# learn the category levels.
test = one_hot_encode(test, levels_from=train)
train = one_hot_encode(train)

In [72]:
def normalise_label(label):
    """Lower-case a column label and replace spaces with underscores."""
    return label.lower().replace(' ', '_')


### apply the same naming convention to both frames
train.columns = [normalise_label(label) for label in train.columns]
test.columns = [normalise_label(label) for label in test.columns]

## Add grouped-by fraud means as new features

In [62]:
# train["fraud"] = train["fraud"].cat.codes

In [None]:
# ## gender
# grouped_gender = train["fraud_1"].groupby(train['gender'])
# grouped_gender_mean = grouped_gender.mean().to_frame()
# grouped_gender_mean['gender']=grouped_gender_mean.index
# grouped_gender_mean['fraud_gender'] = grouped_gender_mean['fraud']
# grouped_gender_mean.drop('fraud', axis = 1, inplace = True)
# grouped_gender_mean=grouped_gender_mean.drop(['gender'],axis=1)
# train = pd.merge(train, grouped_gender_mean, on = "gender", how = "left")
# test = pd.merge(test, grouped_gender_mean, on = "gender", how = "left")
# grouped_gender_mean

# ## marital_status
# grouped_marital_status = train["fraud"].groupby(train['marital_status'])
# grouped_marital_status_mean = grouped_marital_status.mean().to_frame()
# grouped_marital_status_mean['marital_status']=grouped_marital_status_mean.index
# grouped_marital_status_mean['fraud_marital_status'] = grouped_marital_status_mean['fraud']
# grouped_marital_status_mean.drop('fraud', axis = 1, inplace = True)
# grouped_marital_status_mean=grouped_marital_status_mean.drop(['marital_status'],axis=1)
# train = pd.merge(train, grouped_marital_status_mean, on = "marital_status", how = "left")
# test = pd.merge(test, grouped_marital_status_mean, on = "marital_status", how = "left")
# grouped_marital_status_mean

# ## high_education_ind
# grouped_high_education_ind = train["fraud"].groupby(train['high_education_ind'])
# grouped_high_education_ind_mean = grouped_high_education_ind.mean().to_frame()
# grouped_high_education_ind_mean['high_education_ind']=grouped_high_education_ind_mean.index
# grouped_high_education_ind_mean['fraud_high_education_ind'] = grouped_high_education_ind_mean['fraud']
# grouped_high_education_ind_mean.drop('fraud', axis = 1, inplace = True)
# grouped_high_education_ind_mean=grouped_high_education_ind_mean.drop(['high_education_ind'],axis=1)
# train = pd.merge(train, grouped_high_education_ind_mean, on = "high_education_ind", how = "left")
# test = pd.merge(test, grouped_high_education_ind_mean, on = "high_education_ind", how = "left")
# grouped_high_education_ind_mean


# ## address_change_ind
# grouped_address_change_ind = train["fraud"].groupby(train['address_change_ind'])
# grouped_address_change_ind_mean = grouped_address_change_ind.mean().to_frame()
# grouped_address_change_ind_mean['address_change_ind']=grouped_address_change_ind_mean.index
# grouped_address_change_ind_mean['fraud_address_change_ind'] = grouped_address_change_ind_mean['fraud']
# grouped_address_change_ind_mean.drop('fraud', axis = 1, inplace = True)
# grouped_address_change_ind_mean=grouped_address_change_ind_mean.drop(['address_change_ind'],axis=1)
# train = pd.merge(train, grouped_address_change_ind_mean, on = "address_change_ind", how = "left")
# test = pd.merge(test, grouped_address_change_ind_mean, on = "address_change_ind", how = "left")
# grouped_address_change_ind_mean

# ## living_status
# grouped_living_status = train["fraud"].groupby(train['living_status'])
# grouped_living_status_mean = grouped_living_status.mean().to_frame()
# grouped_living_status_mean['living_status']=grouped_living_status_mean.index
# grouped_living_status_mean['fraud_living_status'] = grouped_living_status_mean['fraud']
# grouped_living_status_mean.drop('fraud', axis = 1, inplace = True)
# grouped_living_status_mean=grouped_living_status_mean.drop(['living_status'],axis=1)
# train = pd.merge(train, grouped_living_status_mean, on = "living_status", how = "left")
# test = pd.merge(test, grouped_living_status_mean, on = "living_status", how = "left")
# grouped_living_status_mean

# ## zip_code
# grouped_zip_code = train["fraud"].groupby(train['zip_code'])
# grouped_zip_code_mean = grouped_zip_code.mean().to_frame()
# grouped_zip_code_mean['zip_code']=grouped_zip_code_mean.index
# grouped_zip_code_mean['fraud_zip_code'] = grouped_zip_code_mean['fraud']
# grouped_zip_code_mean.drop('fraud', axis = 1, inplace = True)
# grouped_zip_code_mean=grouped_zip_code_mean.drop(['zip_code'],axis=1)
# train = pd.merge(train, grouped_zip_code_mean, on = "zip_code", how = "left")
# test = pd.merge(test, grouped_zip_code_mean, on = "zip_code", how = "left")
# grouped_zip_code_mean

# ## claim_date
# grouped_claim_date = train["fraud"].groupby(train['claim_date'])
# grouped_claim_date_mean = grouped_claim_date.mean().to_frame()
# grouped_claim_date_mean['claim_date']=grouped_claim_date_mean.index
# grouped_claim_date_mean['fraud_claim_date'] = grouped_claim_date_mean['fraud']
# grouped_claim_date_mean.drop('fraud', axis = 1, inplace = True)
# grouped_claim_date_mean=grouped_claim_date_mean.drop(['claim_date'],axis=1)
# train = pd.merge(train, grouped_claim_date_mean, on = "claim_date", how = "left")
# test = pd.merge(test, grouped_claim_date_mean, on = "claim_date", how = "left")
# grouped_claim_date_mean

# ## witness_present_ind
# grouped_witness_present_ind = train["fraud"].groupby(train['witness_present_ind'])
# grouped_witness_present_ind_mean = grouped_witness_present_ind.mean().to_frame()
# grouped_witness_present_ind_mean['witness_present_ind']=grouped_witness_present_ind_mean.index
# grouped_witness_present_ind_mean['fraud_witness_present_ind'] = grouped_witness_present_ind_mean['fraud']
# grouped_witness_present_ind_mean.drop('fraud', axis = 1, inplace = True)
# grouped_witness_present_ind_mean=grouped_witness_present_ind_mean.drop(['witness_present_ind'],axis=1)
# train = pd.merge(train, grouped_witness_present_ind_mean, on = "witness_present_ind", how = "left")
# test = pd.merge(test, grouped_witness_present_ind_mean, on = "witness_present_ind", how = "left")
# grouped_witness_present_ind_mean

# ## policy_report_filed_ind
# grouped_policy_report_filed_ind = train["fraud"].groupby(train['policy_report_filed_ind'])
# grouped_policy_report_filed_ind_mean = grouped_policy_report_filed_ind.mean().to_frame()
# grouped_policy_report_filed_ind_mean['policy_report_filed_ind']=grouped_policy_report_filed_ind_mean.index
# grouped_policy_report_filed_ind_mean['fraud_policy_report_filed_ind'] = grouped_policy_report_filed_ind_mean['fraud']
# grouped_policy_report_filed_ind_mean.drop('fraud', axis = 1, inplace = True)
# grouped_policy_report_filed_ind_mean=grouped_policy_report_filed_ind_mean.drop(['policy_report_filed_ind'],axis=1)
# train = pd.merge(train, grouped_policy_report_filed_ind_mean, on = "policy_report_filed_ind", how = "left")
# test = pd.merge(test, grouped_policy_report_filed_ind_mean, on = "policy_report_filed_ind", how = "left")
# grouped_policy_report_filed_ind_mean

# ## state
# grouped_state = train["fraud"].groupby(train['state'])
# grouped_state_mean = grouped_state.mean().to_frame()
# grouped_state_mean['state']=grouped_state_mean.index
# grouped_state_mean['fraud_state'] = grouped_state_mean['fraud']
# grouped_state_mean.drop('fraud', axis = 1, inplace = True)
# grouped_state_mean=grouped_state_mean.drop(['state'],axis=1)
# train = pd.merge(train, grouped_state_mean, on = "state", how = "left")
# test = pd.merge(test, grouped_state_mean, on = "state", how = "left")
# grouped_state_mean

# ## accident_site
# grouped_accident_site = raw_train["fraud"].groupby(raw_train['accident_site'])
# grouped_accident_site_mean = grouped_accident_site.mean().to_frame()
# grouped_accident_site_mean['accident_site']=grouped_accident_site_mean.index
# grouped_accident_site_mean['fraud_accident_site'] = grouped_accident_site_mean['fraud']
# grouped_accident_site_mean.drop('fraud', axis = 1, inplace = True)
# grouped_accident_site_mean=grouped_accident_site_mean.drop(['accident_site'],axis=1)
# raw_train = pd.merge(raw_train, grouped_accident_site_mean, on = "accident_site", how = "left")
# train['fraud_accident_site'] = raw_train['fraud_accident_site']
# raw_test = pd.merge(raw_test, grouped_accident_site_mean, on = "accident_site", how = "left")
# test['fraud_accident_site'] = raw_test['fraud_accident_site']
# grouped_accident_site_mean

# ## channel
# grouped_channel = raw_train["fraud"].groupby(raw_train['channel'])
# grouped_channel_mean = grouped_channel.mean().to_frame()
# grouped_channel_mean['channel']=grouped_channel_mean.index
# grouped_channel_mean['fraud_channel'] = grouped_channel_mean['fraud']
# grouped_channel_mean.drop('fraud', axis = 1, inplace = True)
# grouped_channel_mean=grouped_channel_mean.drop(['channel'],axis=1)
# raw_train = pd.merge(raw_train, grouped_channel_mean, on = "channel", how = "left")
# train['fraud_channel'] = raw_train['fraud_channel']
# raw_test = pd.merge(raw_test, grouped_channel_mean, on = "channel", how = "left")
# test['fraud_channel'] = raw_test['fraud_channel']
              
# grouped_channel_mean

# ## vehicle_category
# grouped_vehicle_category = raw_train["fraud"].groupby(raw_train['vehicle_category'])
# grouped_vehicle_category_mean = grouped_vehicle_category.mean().to_frame()
# grouped_vehicle_category_mean['vehicle_category']=grouped_vehicle_category_mean.index
# grouped_vehicle_category_mean['fraud_vehicle_category'] = grouped_vehicle_category_mean['fraud']
# grouped_vehicle_category_mean.drop('fraud', axis = 1, inplace = True)
# grouped_vehicle_category_mean=grouped_vehicle_category_mean.drop(['vehicle_category'],axis=1)
# raw_train = pd.merge(raw_train, grouped_vehicle_category_mean, on = "vehicle_category", how = "left")
# train['fraud_vehicle_category'] = raw_train['fraud_vehicle_category']
# raw_test = pd.merge(raw_test, grouped_vehicle_category_mean, on = "vehicle_category", how = "left")
# test['fraud_vehicle_category'] = raw_test['fraud_vehicle_category']              
# grouped_vehicle_category_mean

# ## vehicle_color
# grouped_vehicle_color = raw_train["fraud"].groupby(raw_train['vehicle_color'])
# grouped_vehicle_color_mean = grouped_vehicle_color.mean().to_frame()
# grouped_vehicle_color_mean['vehicle_color']=grouped_vehicle_color_mean.index
# grouped_vehicle_color_mean['fraud_vehicle_color'] = grouped_vehicle_color_mean['fraud']
# grouped_vehicle_color_mean.drop('fraud', axis = 1, inplace = True)
# grouped_vehicle_color_mean=grouped_vehicle_color_mean.drop(['vehicle_color'],axis=1)
# raw_train = pd.merge(raw_train, grouped_vehicle_color_mean, on = "vehicle_color", how = "left")
# train['fraud_vehicle_color'] = raw_train['fraud_vehicle_color']
# raw_test = pd.merge(raw_test, grouped_vehicle_color_mean, on = "vehicle_color", how = "left")
# test['fraud_vehicle_color'] = raw_test['fraud_vehicle_color']              
# grouped_vehicle_color_mean

In [None]:
def contains_flag(series, token):
    """Return an int array: 1 where the string value contains `token`, else 0.

    `na=False` makes a missing value count as "does not contain". Without it
    `str.contains` returns NaN for missing values, and `np.where` treats NaN as
    truthy, so a missing value would be encoded as 1.
    """
    return np.where(series.str.contains(token, na=False), 1, 0)


#  Encode gender (1 when the value contains "M") and
#  living status (1 when the value contains "Rent")  #####
for frame in (train, test):
    frame["gender"] = contains_flag(frame["gender"], "M")
    frame["living_status"] = contains_flag(frame["living_status"], "Rent")

In [79]:
# Write both cleaned frames to CSV files.
for frame, destination in (
    (train, "train_data_cleaned.csv"),
    (test, "test_data_cleaned.csv"),
):
    frame.to_csv(destination)

In [82]:
# Display the first rows of the cleaned training frame.
train.head()

Unnamed: 0_level_0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_date,past_num_of_claims,witness_present_ind,liab_prct,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_price,vehicle_weight,fraud,latitude,longitude,claim_day_monday,claim_day_saturday,claim_day_sunday,claim_day_thursday,claim_day_tuesday,claim_day_wednesday,accident_site_local,accident_site_parking_lot,channel_online,channel_phone,vehicle_category_large,vehicle_category_medium,vehicle_color_blue,vehicle_color_gray,vehicle_color_other,vehicle_color_red,vehicle_color_silver,vehicle_color_white,state_co,state_ia,state_pa,state_va
claim_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
1,46.0,1,1.0,85,38301.0,1,1,1,80006,12/16/2016,1,0.0,74,0,7530.940993,9.0,12885.45235,16161.33381,0,39.82,-105.1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,21.0,0,0.0,75,30445.0,0,1,1,15021,2/12/2015,1,1.0,79,0,2966.024895,4.0,29429.45218,28691.96422,0,40.38,-80.39,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0
4,49.0,0,0.0,87,38923.0,0,1,0,20158,12/6/2016,0,0.0,0,0,6283.888333,3.0,21701.18195,22090.94758,1,39.13,-77.66,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
5,58.0,0,1.0,58,40605.0,1,0,0,15024,5/5/2016,3,0.0,99,1,6169.747994,4.0,13198.27344,38329.58106,1,40.54,-79.8,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0
6,38.0,1,1.0,95,36380.0,1,0,1,50034,10/27/2015,0,1.0,7,0,4541.38715,7.0,38060.21122,25876.56319,0,42.47,-93.64,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0


# Naive Bayes

In [84]:
# Drop the day-of-week dummies, zip_code, claim_date and the vehicle colour
# dummies from both frames  #####
UNUSED_FEATURES = [
    "claim_day_monday", "claim_day_tuesday", "claim_day_wednesday",
    "claim_day_thursday", "claim_day_saturday", "claim_day_sunday",
    "zip_code", "claim_date",
    "vehicle_color_blue", "vehicle_color_gray", "vehicle_color_other",
    "vehicle_color_red", "vehicle_color_silver", "vehicle_color_white",
]

for frame in (train, test):
    frame.drop(columns=UNUSED_FEATURES, inplace=True)

In [85]:
# Keep only the columns whose name does not start with "state_".
NOT_STATE_DUMMY = "^(?!state_).*$"

train, test = (frame.filter(regex=NOT_STATE_DUMMY) for frame in (train, test))

In [87]:
from sklearn.naive_bayes import GaussianNB
# Target vector and feature matrix for the Naive Bayes model.
y = train["fraud"]
# `columns=` replaces the positional axis argument (`drop("fraud", 1)`),
# which pandas 2.0 no longer accepts.
X = train.drop(columns="fraud")


In [102]:
from sklearn import model_selection
from sklearn.metrics import f1_score
# 5-fold cross-validated F1 score of a Gaussian Naive Bayes classifier.
nb = GaussianNB()
scores = model_selection.cross_val_score(
    estimator=nb,
    X=X.values,
    y=y.values,
    scoring='f1',
    cv=5,
)
f1_mean, f1_std = scores.mean(), scores.std()
print("F1: %0.9f (+/- %0.4f)" % (f1_mean, f1_std))

F1: 0.153834916 (+/- 0.0145)
