### Data Cleaning and Preparation

In [20]:
import pandas as pd
import numpy as np

In [21]:
# Load the raw claims extract and list every column it provides.
claims_csv_path = 'Data/claims-data.csv'
df = pd.read_csv(claims_csv_path)

print(list(df.columns))

['Customer ID', 'Age', 'Gender', 'Marital Status', 'Occupation', 'Income Level', 'Education Level', 'Geographic Information', 'Location', 'Behavioral Data', 'Purchase History', 'Policy Start Date', 'Policy Renewal Date', 'Claim History', 'Interactions with Customer Service', 'Insurance Products Owned', 'Coverage Amount', 'Premium Amount', 'Deductible', 'Policy Type', 'Customer Preferences', 'Preferred Communication Channel', 'Preferred Contact Time', 'Preferred Language', 'Risk Profile', 'Previous Claims History', 'Credit Score', 'Driving Record', 'Life Events', 'Segmentation Group']


In [22]:
# Restrict the frame to the customer, policy and claim fields kept for the
# rest of the notebook.
selected_columns = [
    'Customer ID', 'Age', 'Gender', 'Marital Status', 'Occupation',
    'Income Level', 'Education Level',
    'Policy Start Date', 'Policy Renewal Date',
    'Claim History', 'Insurance Products Owned', 'Coverage Amount',
    'Premium Amount', 'Deductible',
    'Policy Type', 'Risk Profile', 'Previous Claims History',
    'Credit Score', 'Driving Record',
]
df = df[selected_columns]

# Number of columns retained.
len(df.columns)

19

In [23]:
# Frequency of each risk-profile code, most common first.
risk_profile_counts = df['Risk Profile'].value_counts()
risk_profile_counts

Risk Profile
3    16647
1    15393
0    11405
2    10058
Name: count, dtype: int64

In [24]:
# Preview the first five rows of the trimmed frame.
df.head(n=5)

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Occupation,Income Level,Education Level,Policy Start Date,Policy Renewal Date,Claim History,Insurance Products Owned,Coverage Amount,Premium Amount,Deductible,Policy Type,Risk Profile,Previous Claims History,Credit Score,Driving Record
0,84966,23,Female,Married,Entrepreneur,70541,Associate Degree,08-01-2023,12-03-2023,5,policy2,366603,2749,1604,Group,1,3,728,DUI
1,95568,26,Male,Widowed,Manager,54168,Doctorate,09-06-2020,06-09-2023,0,policy1,780236,1966,1445,Group,1,2,792,Clean
2,10544,29,Female,Single,Entrepreneur,73899,Associate Degree,09-03-2023,11-03-2024,4,policy3,773926,4413,1612,Group,2,1,719,Accident
3,77033,20,Male,Divorced,Entrepreneur,63381,Bachelor's Degree,4/14/2018,05-04-2023,5,policy2,787815,4342,1817,Family,3,0,639,DUI
4,88160,25,Female,Separated,Manager,38794,Bachelor's Degree,12-02-2022,09-10-2023,3,policy4,366506,1276,133,Family,0,3,720,Major Violations


In [25]:
def assign_approval(row, rng=None):
    """Randomly draw a synthetic approval label (1 = approved, 0 = rejected).

    The rejection probability depends on the first rule the row matches:

    * ``Driving Record`` is "DUI" or "Major Violations"       -> 60% rejection
    * ``Claim History`` > 3 or ``Previous Claims History`` > 2 -> 30% rejection
    * ``Credit Score`` < 650                                   -> 10% rejection
    * anything else                                            -> 10% rejection

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "Driving Record", "Claim History",
        "Previous Claims History" and "Credit Score".
    rng : object with a ``choice(a, p=...)`` method, optional
        Source of randomness, e.g. ``np.random.default_rng(seed)``. When
        omitted, NumPy's global generator (``np.random.choice``) is used, so
        existing callers such as ``df.apply(assign_approval, axis=1)`` behave
        exactly as before.

    Returns
    -------
    The value drawn from ``[0, 1]`` by the chosen generator.
    """
    # Fall back to the global generator so the one-argument call is unchanged.
    draw = np.random.choice if rng is None else rng.choice

    if row["Driving Record"] in ["DUI", "Major Violations"]:
        return draw([0, 1], p=[0.6, 0.4])  # 60% rejection chance
    elif row["Claim History"] > 3 or row["Previous Claims History"] > 2:
        return draw([0, 1], p=[0.3, 0.7])  # 30% rejection chance
    elif row["Credit Score"] < 650:
        # NOTE(review): same odds as the default branch below, so a low credit
        # score currently has no effect on the label -- confirm this is intended.
        return draw([0, 1], p=[0.1, 0.9])  # 10% rejection chance
    else:
        return draw([0, 1], p=[0.1, 0.9])  # 90% approval chance

# Seed NumPy's global generator so the randomly drawn labels are identical on
# every Restart & Run All; without a seed each run produces a different
# "Approved" column (and therefore a different training target).
np.random.seed(42)

# Apply function to create "Approved" column
df["Approved"] = df.apply(assign_approval, axis=1)


# Display updated DataFrame
df.head()

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Occupation,Income Level,Education Level,Policy Start Date,Policy Renewal Date,Claim History,Insurance Products Owned,Coverage Amount,Premium Amount,Deductible,Policy Type,Risk Profile,Previous Claims History,Credit Score,Driving Record,Approved
0,84966,23,Female,Married,Entrepreneur,70541,Associate Degree,08-01-2023,12-03-2023,5,policy2,366603,2749,1604,Group,1,3,728,DUI,1
1,95568,26,Male,Widowed,Manager,54168,Doctorate,09-06-2020,06-09-2023,0,policy1,780236,1966,1445,Group,1,2,792,Clean,1
2,10544,29,Female,Single,Entrepreneur,73899,Associate Degree,09-03-2023,11-03-2024,4,policy3,773926,4413,1612,Group,2,1,719,Accident,1
3,77033,20,Male,Divorced,Entrepreneur,63381,Bachelor's Degree,4/14/2018,05-04-2023,5,policy2,787815,4342,1817,Family,3,0,639,DUI,1
4,88160,25,Female,Separated,Manager,38794,Bachelor's Degree,12-02-2022,09-10-2023,3,policy4,366506,1276,133,Family,0,3,720,Major Violations,0


In [26]:
# Class balance of the synthetic target (1 = approved, 0 = rejected).
df["Approved"].value_counts()

Approved
1    33539
0    19964
Name: count, dtype: int64

In [27]:
# Show the frame with the new "Approved" column; the notebook display elides
# the middle rows, so only the head and tail are rendered.
df

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Occupation,Income Level,Education Level,Policy Start Date,Policy Renewal Date,Claim History,Insurance Products Owned,Coverage Amount,Premium Amount,Deductible,Policy Type,Risk Profile,Previous Claims History,Credit Score,Driving Record,Approved
0,84966,23,Female,Married,Entrepreneur,70541,Associate Degree,08-01-2023,12-03-2023,5,policy2,366603,2749,1604,Group,1,3,728,DUI,1
1,95568,26,Male,Widowed,Manager,54168,Doctorate,09-06-2020,06-09-2023,0,policy1,780236,1966,1445,Group,1,2,792,Clean,1
2,10544,29,Female,Single,Entrepreneur,73899,Associate Degree,09-03-2023,11-03-2024,4,policy3,773926,4413,1612,Group,2,1,719,Accident,1
3,77033,20,Male,Divorced,Entrepreneur,63381,Bachelor's Degree,4/14/2018,05-04-2023,5,policy2,787815,4342,1817,Family,3,0,639,DUI,1
4,88160,25,Female,Separated,Manager,38794,Bachelor's Degree,12-02-2022,09-10-2023,3,policy4,366506,1276,133,Family,0,3,720,Major Violations,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53498,44809,35,Female,Divorced,Salesperson,120850,Associate Degree,10/20/2023,2/15/2024,1,policy1,586401,4404,1010,Family,3,3,506,Clean,1
53499,65485,61,Male,Single,Entrepreneur,122309,Doctorate,4/16/2022,10/15/2023,1,policy4,637733,1285,1531,Group,0,0,543,Major Violations,0
53500,26213,25,Male,Divorced,Teacher,49258,Doctorate,3/18/2023,9/25/2024,2,policy1,631057,4407,732,Individual,0,0,803,Major Violations,0
53501,63136,42,Male,Married,Artist,66301,Doctorate,05-06-2021,10-12-2023,0,policy1,730385,4482,1855,Business,1,3,803,Clean,0


In [28]:
# Column names after adding the target, as a plain Python list.
list(df.columns)

['Customer ID',
 'Age',
 'Gender',
 'Marital Status',
 'Occupation',
 'Income Level',
 'Education Level',
 'Policy Start Date',
 'Policy Renewal Date',
 'Claim History',
 'Insurance Products Owned',
 'Coverage Amount',
 'Premium Amount',
 'Deductible',
 'Policy Type',
 'Risk Profile',
 'Previous Claims History',
 'Credit Score',
 'Driving Record',
 'Approved']

In [29]:
# Inspect the dtype of every column (object columns are non-numeric and the
# two date columns are still stored as object here).
df.dtypes

Customer ID                  int64
Age                          int64
Gender                      object
Marital Status              object
Occupation                  object
Income Level                 int64
Education Level             object
Policy Start Date           object
Policy Renewal Date         object
Claim History                int64
Insurance Products Owned    object
Coverage Amount              int64
Premium Amount               int64
Deductible                   int64
Policy Type                 object
Risk Profile                 int64
Previous Claims History      int64
Credit Score                 int64
Driving Record              object
Approved                     int64
dtype: object

### Building the Classification Model for Approval and Denial of Claims

In [30]:
# Build the modelling frame without the Customer ID column; `df` itself is
# left untouched because drop returns a new frame.
data = df.drop('Customer ID', axis=1)

In [31]:
# Confirm which columns remain in the modelling frame.
data.columns

Index(['Age', 'Gender', 'Marital Status', 'Occupation', 'Income Level',
       'Education Level', 'Policy Start Date', 'Policy Renewal Date',
       'Claim History', 'Insurance Products Owned', 'Coverage Amount',
       'Premium Amount', 'Deductible', 'Policy Type', 'Risk Profile',
       'Previous Claims History', 'Credit Score', 'Driving Record',
       'Approved'],
      dtype='object')

In [33]:
# Check dtypes of the modelling frame; several columns are still object dtype.
data.dtypes

Age                          int64
Gender                      object
Marital Status              object
Occupation                  object
Income Level                 int64
Education Level             object
Policy Start Date           object
Policy Renewal Date         object
Claim History                int64
Insurance Products Owned    object
Coverage Amount              int64
Premium Amount               int64
Deductible                   int64
Policy Type                 object
Risk Profile                 int64
Previous Claims History      int64
Credit Score                 int64
Driving Record              object
Approved                     int64
dtype: object

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

ImportError: dlopen(/opt/anaconda3/lib/python3.11/site-packages/scipy/sparse/linalg/_isolve/_iterative.cpython-311-darwin.so, 0x0002): Library not loaded: @rpath/liblapack.3.dylib
  Referenced from: <CB06B8E9-5573-3234-A309-5114AFCED6F5> /opt/anaconda3/lib/python3.11/site-packages/scipy/sparse/linalg/_isolve/_iterative.cpython-311-darwin.so
  Reason: tried: '/opt/anaconda3/lib/python3.11/site-packages/scipy/sparse/linalg/_isolve/liblapack.3.dylib' (no such file), '/opt/anaconda3/lib/python3.11/site-packages/scipy/sparse/linalg/_isolve/../../../../../../liblapack.3.dylib' (no such file), '/opt/anaconda3/lib/python3.11/site-packages/scipy/sparse/linalg/_isolve/liblapack.3.dylib' (no such file), '/opt/anaconda3/lib/python3.11/site-packages/scipy/sparse/linalg/_isolve/../../../../../../liblapack.3.dylib' (no such file), '/opt/anaconda3/bin/../lib/liblapack.3.dylib' (no such file), '/opt/anaconda3/bin/../lib/liblapack.3.dylib' (no such file), '/usr/local/lib/liblapack.3.dylib' (no such file), '/usr/lib/liblapack.3.dylib' (no such file, not in dyld cache)