### Data Cleaning and Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Data/claims-data.csv')

print(df.columns.to_list())

['Customer ID', 'Age', 'Gender', 'Marital Status', 'Occupation', 'Income Level', 'Education Level', 'Geographic Information', 'Location', 'Behavioral Data', 'Purchase History', 'Policy Start Date', 'Policy Renewal Date', 'Claim History', 'Interactions with Customer Service', 'Insurance Products Owned', 'Coverage Amount', 'Premium Amount', 'Deductible', 'Policy Type', 'Customer Preferences', 'Preferred Communication Channel', 'Preferred Contact Time', 'Preferred Language', 'Risk Profile', 'Previous Claims History', 'Credit Score', 'Driving Record', 'Life Events', 'Segmentation Group']


In [3]:
df = df[['Customer ID', 'Age', 'Gender', 'Marital Status', 'Occupation',
        'Income Level', 'Education Level',
        'Policy Start Date', 'Policy Renewal Date', 
        'Claim History', 'Insurance Products Owned', 'Coverage Amount', 'Premium Amount', 'Deductible', 
        'Policy Type','Risk Profile', 'Previous Claims History', 'Credit Score', 'Driving Record']]

df.shape[1]

19

In [4]:
df['Risk Profile'].value_counts()

Risk Profile
3    16647
1    15393
0    11405
2    10058
Name: count, dtype: int64

In [5]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Occupation,Income Level,Education Level,Policy Start Date,Policy Renewal Date,Claim History,Insurance Products Owned,Coverage Amount,Premium Amount,Deductible,Policy Type,Risk Profile,Previous Claims History,Credit Score,Driving Record
0,84966,23,Female,Married,Entrepreneur,70541,Associate Degree,08-01-2023,12-03-2023,5,policy2,366603,2749,1604,Group,1,3,728,DUI
1,95568,26,Male,Widowed,Manager,54168,Doctorate,09-06-2020,06-09-2023,0,policy1,780236,1966,1445,Group,1,2,792,Clean
2,10544,29,Female,Single,Entrepreneur,73899,Associate Degree,09-03-2023,11-03-2024,4,policy3,773926,4413,1612,Group,2,1,719,Accident
3,77033,20,Male,Divorced,Entrepreneur,63381,Bachelor's Degree,4/14/2018,05-04-2023,5,policy2,787815,4342,1817,Family,3,0,639,DUI
4,88160,25,Female,Separated,Manager,38794,Bachelor's Degree,12-02-2022,09-10-2023,3,policy4,366506,1276,133,Family,0,3,720,Major Violations


In [6]:
def assign_approval(row):
    if row["Driving Record"] in ["DUI", "Major Violations"]:
        return np.random.choice([0, 1], p=[0.6, 0.4])  # 60% rejection chance
    elif row["Claim History"] > 3 or row["Previous Claims History"] > 2:
        return np.random.choice([0, 1], p=[0.3, 0.7])  # 50% rejection chance
    elif row["Credit Score"] < 650:
        return np.random.choice([0, 1], p=[0.1, 0.9])  # 40% rejection chance
    else:
        return np.random.choice([0, 1], p=[0.1, 0.9])  # 90% approval chance

# Apply function to create "Approved" column
df["Approved"] = df.apply(assign_approval, axis=1)


# Display updated DataFrame
df.head()

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Occupation,Income Level,Education Level,Policy Start Date,Policy Renewal Date,Claim History,Insurance Products Owned,Coverage Amount,Premium Amount,Deductible,Policy Type,Risk Profile,Previous Claims History,Credit Score,Driving Record,Approved
0,84966,23,Female,Married,Entrepreneur,70541,Associate Degree,08-01-2023,12-03-2023,5,policy2,366603,2749,1604,Group,1,3,728,DUI,0
1,95568,26,Male,Widowed,Manager,54168,Doctorate,09-06-2020,06-09-2023,0,policy1,780236,1966,1445,Group,1,2,792,Clean,1
2,10544,29,Female,Single,Entrepreneur,73899,Associate Degree,09-03-2023,11-03-2024,4,policy3,773926,4413,1612,Group,2,1,719,Accident,1
3,77033,20,Male,Divorced,Entrepreneur,63381,Bachelor's Degree,4/14/2018,05-04-2023,5,policy2,787815,4342,1817,Family,3,0,639,DUI,0
4,88160,25,Female,Separated,Manager,38794,Bachelor's Degree,12-02-2022,09-10-2023,3,policy4,366506,1276,133,Family,0,3,720,Major Violations,0


In [7]:
df.Approved.value_counts()

Approved
1    33507
0    19996
Name: count, dtype: int64

In [8]:
df

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Occupation,Income Level,Education Level,Policy Start Date,Policy Renewal Date,Claim History,Insurance Products Owned,Coverage Amount,Premium Amount,Deductible,Policy Type,Risk Profile,Previous Claims History,Credit Score,Driving Record,Approved
0,84966,23,Female,Married,Entrepreneur,70541,Associate Degree,08-01-2023,12-03-2023,5,policy2,366603,2749,1604,Group,1,3,728,DUI,0
1,95568,26,Male,Widowed,Manager,54168,Doctorate,09-06-2020,06-09-2023,0,policy1,780236,1966,1445,Group,1,2,792,Clean,1
2,10544,29,Female,Single,Entrepreneur,73899,Associate Degree,09-03-2023,11-03-2024,4,policy3,773926,4413,1612,Group,2,1,719,Accident,1
3,77033,20,Male,Divorced,Entrepreneur,63381,Bachelor's Degree,4/14/2018,05-04-2023,5,policy2,787815,4342,1817,Family,3,0,639,DUI,0
4,88160,25,Female,Separated,Manager,38794,Bachelor's Degree,12-02-2022,09-10-2023,3,policy4,366506,1276,133,Family,0,3,720,Major Violations,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53498,44809,35,Female,Divorced,Salesperson,120850,Associate Degree,10/20/2023,2/15/2024,1,policy1,586401,4404,1010,Family,3,3,506,Clean,0
53499,65485,61,Male,Single,Entrepreneur,122309,Doctorate,4/16/2022,10/15/2023,1,policy4,637733,1285,1531,Group,0,0,543,Major Violations,1
53500,26213,25,Male,Divorced,Teacher,49258,Doctorate,3/18/2023,9/25/2024,2,policy1,631057,4407,732,Individual,0,0,803,Major Violations,0
53501,63136,42,Male,Married,Artist,66301,Doctorate,05-06-2021,10-12-2023,0,policy1,730385,4482,1855,Business,1,3,803,Clean,1


In [9]:
df.columns.to_list()

['Customer ID',
 'Age',
 'Gender',
 'Marital Status',
 'Occupation',
 'Income Level',
 'Education Level',
 'Policy Start Date',
 'Policy Renewal Date',
 'Claim History',
 'Insurance Products Owned',
 'Coverage Amount',
 'Premium Amount',
 'Deductible',
 'Policy Type',
 'Risk Profile',
 'Previous Claims History',
 'Credit Score',
 'Driving Record',
 'Approved']

In [10]:
df.dtypes

Customer ID                  int64
Age                          int64
Gender                      object
Marital Status              object
Occupation                  object
Income Level                 int64
Education Level             object
Policy Start Date           object
Policy Renewal Date         object
Claim History                int64
Insurance Products Owned    object
Coverage Amount              int64
Premium Amount               int64
Deductible                   int64
Policy Type                 object
Risk Profile                 int64
Previous Claims History      int64
Credit Score                 int64
Driving Record              object
Approved                     int64
dtype: object

### Building the Classification Model for Approval and Denial of Claims

#### Data Preprocessing

In [11]:
data = df.drop(columns=['Customer ID'])

In [12]:
data.columns

Index(['Age', 'Gender', 'Marital Status', 'Occupation', 'Income Level',
       'Education Level', 'Policy Start Date', 'Policy Renewal Date',
       'Claim History', 'Insurance Products Owned', 'Coverage Amount',
       'Premium Amount', 'Deductible', 'Policy Type', 'Risk Profile',
       'Previous Claims History', 'Credit Score', 'Driving Record',
       'Approved'],
      dtype='object')

In [13]:
data.dtypes

Age                          int64
Gender                      object
Marital Status              object
Occupation                  object
Income Level                 int64
Education Level             object
Policy Start Date           object
Policy Renewal Date         object
Claim History                int64
Insurance Products Owned    object
Coverage Amount              int64
Premium Amount               int64
Deductible                   int64
Policy Type                 object
Risk Profile                 int64
Previous Claims History      int64
Credit Score                 int64
Driving Record              object
Approved                     int64
dtype: object

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [15]:
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [16]:
def prepare_data(df):
    data = df.copy()
    
    data['Policy_Start_Date'] = pd.to_datetime(data['Policy Start Date'],format='mixed')
    data['Policy_Renewal_Date'] = pd.to_datetime(data['Policy Renewal Date'],format = 'mixed')
    data['Policy_Duration'] = (data['Policy_Renewal_Date'] - data['Policy_Start_Date']).dt.days

    data['Start_Year'] = data['Policy_Start_Date'].dt.year
    data['Start_Month'] = data['Policy_Start_Date'].dt.month
    data['Renewal_Year'] = data['Policy_Renewal_Date'].dt.year
    data['Renewal_Month'] = data['Policy_Renewal_Date'].dt.month
    
    # Create boolean feature for whether policy is close to renewal (within 30 days) - Convert to int
    now = pd.Timestamp.now()
    data['Near_Renewal'] = ((data['Policy_Renewal_Date'] - now).dt.days < 30).astype(int)
        
    
    categorical_features = ['Gender', 'Marital Status', 'Occupation', 'Education Level', 
                           'Insurance Products Owned', 'Policy Type', 'Driving Record']
    
    encoders = {}
    for feature in categorical_features:
        encoder = LabelEncoder()
        data[feature + '_Encoded'] = encoder.fit_transform(data[feature])
        encoders[feature] = encoder
    
    data = data.drop(['Gender', 'Marital Status', 'Occupation', 'Education Level', 
                     'Insurance Products Owned', 'Policy Type', 'Driving Record','Policy_Renewal_Date',
                     'Policy Start Date', 'Policy Renewal Date', 'Policy_Start_Date',], axis=1)
    

    # numeric check
    for col in data.columns:
        if data[col].dtype == 'object':
            print(f"Warning: Column {col} is still of type object, converting to numeric")
            data[col] = pd.to_numeric(data[col], errors='coerce')
    
    data = data.fillna(0)

    
    X = data.drop('Approved', axis=1)
    y = data['Approved']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test, encoders

#### Training the tree-model 

In [17]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
X_train, X_test, y_train, y_test, encoders = prepare_data(df)

In [19]:
X_train.head()

Unnamed: 0,Customer ID,Age,Income Level,Claim History,Coverage Amount,Premium Amount,Deductible,Risk Profile,Previous Claims History,Credit Score,...,Renewal_Year,Renewal_Month,Near_Renewal,Gender_Encoded,Marital Status_Encoded,Occupation_Encoded,Education Level_Encoded,Insurance Products Owned_Encoded,Policy Type_Encoded,Driving Record_Encoded
14009,6301,56,86334,3,781040,1076,1838,3,0,580,...,2024,7,1,1,0,0,2,2,0,1
32091,66142,24,47647,0,88626,4380,385,2,1,650,...,2023,2,1,1,4,8,0,1,2,3
33852,95593,52,111357,1,207125,4208,760,0,2,723,...,2024,10,1,0,1,7,3,4,3,0
3231,54453,44,66373,1,820081,4436,143,2,2,761,...,2024,6,1,1,0,4,1,0,2,3
41586,80177,66,33240,4,748608,1910,1358,1,2,813,...,2024,3,1,1,0,7,2,4,2,4


In [20]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Random Forest Model Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Random Forest Model Metrics:
Accuracy: 0.6979
Precision: 0.7504
Recall: 0.7712
F1 Score: 0.7607


In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define and train the XGBoost model
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"XGBoost Model Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Model Metrics:
Accuracy: 0.7043
Precision: 0.7702
Recall: 0.7484
F1 Score: 0.7591
