In [249]:
# for data manipulation
import pandas as pd
import numpy as np
# for plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go # note that github wont be able to display these plots because they are interactive
# for some processing
import math
from datetime import datetime, timedelta

In [250]:
import sys
import os

class SuppressPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout  # Save the original stdout
        sys.stdout = open(os.devnull, 'w')  # Redirect stdout to null

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()  # Close the null file
        sys.stdout = self._original_stdout  # Restore original stdout

In [251]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, make_scorer
from sklearn.utils import resample

In [252]:
folder = "kaggle/input/"
df = pd.read_csv(folder + "train.csv")
df

Unnamed: 0,CustomerID,State,Customer Lifetime Value,Response,Coverage,Coverage Index,Education,Education Index,Effective To Date,Employment Status,...,Policy Type,Policy Type Index,Policy,Policy Index,Renew Offer Type,Sales Channel,Sales Channel Index,Vehicle Size,Vehicle Size Index,Claim over 1k
0,QC35222,California,3622.69,No,Basic,0,Bachelor,2,1/1/2024,Employed,...,Corporate Auto,1,Corporate L2,4,3,Web,0,Medsize,1,0
1,AE98193,Washington,10610.21,No,Basic,0,High School or Below,0,1/1/2024,Unemployed,...,Personal Auto,0,Personal L1,0,1,Branch,1,Medsize,1,1
2,TM23514,Oregon,13868.02,No,Extended,1,College,1,1/1/2024,Employed,...,Personal Auto,0,Personal L3,2,3,Web,0,Medsize,1,0
3,QZ42725,Washington,3119.69,No,Basic,0,Bachelor,2,1/1/2024,Unemployed,...,Personal Auto,0,Personal L3,2,2,Agent,2,Medsize,1,0
4,SG81493,Arizona,5999.04,No,Premium,2,Bachelor,2,1/1/2024,Employed,...,Corporate Auto,1,Corporate L1,3,2,Web,0,Medsize,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7285,HC90344,California,27500.54,No,Basic,0,Bachelor,2,9/2/2024,Unemployed,...,Corporate Auto,1,Corporate L2,4,1,Branch,1,Medsize,1,0
7286,ZU83252,California,11750.03,No,Basic,0,Master,3,9/2/2024,Employed,...,Personal Auto,0,Personal L3,2,3,Web,0,Medsize,1,0
7287,PR80703,California,7757.04,No,Basic,0,Bachelor,2,9/2/2024,Employed,...,Personal Auto,0,Personal L3,2,1,Branch,1,Medsize,1,0
7288,NS23754,California,3465.16,No,Basic,0,High School or Below,0,9/2/2024,Employed,...,Corporate Auto,1,Corporate L3,5,3,Call Center,3,Large,2,0


In [253]:
df_train = df.drop(columns = ["CustomerID", "Coverage", "Education", "Employment Status", "Marital Status", "Policy Type", "Policy", "Sales Channel", "Vehicle Size"])
df_train.info()
df_train["Claim over 1k"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290 entries, 0 to 7289
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   State                          7290 non-null   object 
 1   Customer Lifetime Value        7290 non-null   float64
 2   Response                       7290 non-null   object 
 3   Coverage Index                 7290 non-null   int64  
 4   Education Index                7290 non-null   int64  
 5   Effective To Date              7290 non-null   object 
 6   Employment Status Index        7290 non-null   int64  
 7   Gender                         7290 non-null   object 
 8   Income                         7290 non-null   int64  
 9   Marital Status Index           7290 non-null   int64  
 10  Months Since Last Claim        7290 non-null   int64  
 11  Months Since Policy Inception  7290 non-null   int64  
 12  Number of Open Complaints      7290 non-null   i

Claim over 1k
0    6458
1     832
Name: count, dtype: int64

In [254]:
def numeric_info(df, col):
  print(f"Min of {col}: ", df[col].min())
  print(f"Max of {col}: ", df[col].max())
  print(f"Mean of {col}: ", df[col].mean())
  print(f"Mendian of {col}: ", df[col].median())
  print(f"Std of {col}: ", df[col].std())

In [255]:
numeric_info(df_train, "Customer Lifetime Value")

Min of Customer Lifetime Value:  2562.31
Max of Customer Lifetime Value:  100208.5
Mean of Customer Lifetime Value:  10798.11662277092
Mendian of Customer Lifetime Value:  7845.015
Std of Customer Lifetime Value:  9180.878152795805


In [256]:
from plotly.subplots import make_subplots

def relation_to_claim_over_1k(df):
  plt.figure(figsize=(30, 80))
  numerical_cols = df.columns.tolist()
  not_include = ["Customer Lifetime Value", "Income", "Claim over 1k"]
  numerical_cols = [col for col in numerical_cols if col not in not_include]
  
  number_of_rows = math.ceil(len(numerical_cols) / 2)
  fig = make_subplots(rows = number_of_rows, cols=2, subplot_titles=numerical_cols, vertical_spacing=0.01, horizontal_spacing=0.05)
  for idx, col in enumerate(numerical_cols):
    df_count = df.groupby([col, "Claim over 1k"]).size().reset_index(name='count')
    unique_val = df_count[col].unique()
    i, j = (idx // 2) + 1, (idx % 2) + 1
    for val in unique_val:
      df_cur = df_count[df_count[col] == val]
      fig.add_trace(
        go.Bar(
          x = df_cur["Claim over 1k"],
          y = df_cur["count"],
          name = f"{val}"
        ),
        row=i, col=j
      )
  fig.update_layout(
      height=300 * number_of_rows,  # Adjust figure height dynamically based on the number of rows
      showlegend=False,
      title_text="How Each Column Affects 'Claim over 1k'"
  )
  # Show the figure with all subplots
  fig.show()

In [257]:
def binning(df, col, size):
  df[col + "_bin"] = pd.qcut(df[col], q=size, labels=False, duplicates="drop")

def pre_process(df):
  # State
  unique_states = df["State"].unique()
  dict_states = {}
  for i in range(len(unique_states)):
    dict_states[unique_states[i]] = i
  df["State"] = df["State"].apply(lambda x: dict_states[x])
  
  # Response
  # note that resonse should be either yes or no
  df["Response"] = df["Response"].apply(lambda x: 1 if x == "Yes" else 0)

  # Effective To Date
  df["Effective To Date"] = pd.to_datetime(df["Effective To Date"])
  df["Effective To Month"] = (df["Effective To Date"].dt.month - 1) // 3 # split into 4 seasons

  # Months Since Last Claim, Months Since Last Inception
  # group them into year
  df["Years Since Last Claim"] = df["Months Since Last Claim"] // 12
  df["Years Since Policy Inception"] = df["Months Since Policy Inception"] // 36 # period of 2 years

  # Gender
  df["Gender"] = df["Gender"].apply(lambda x: 1 if x == "M" else 0)

  # CLV
  binning(df, "Customer Lifetime Value", 60)
  binning(df, "Income", 5)

  df.drop(columns=["Effective To Date", "Months Since Last Claim", "Months Since Policy Inception", "Customer Lifetime Value", "Income"], axis=1, inplace=True)

  # New col: Accident Likelihood
  max_education_index = df["Education Index"].max()
  max_income_bin = df["Income_bin"].max()
  max_employment_status_index = df["Employment Status Index"].max()
  max_marital_status_index = df["Marital Status Index"].max()
  df["Accident Likelihood"] = np.exp( 4 * (max_education_index - df["Education Index"]) + 3 * (max_income_bin - df["Income_bin"]) \
    + 3 * (df["Marital Status Index"] == 0))

  # New col: Claim over 1k Likelihood based on State and Gender
  group_over = ["State", "Gender", "Education Index", "Marital Status Index", "Employment Status Index"]
  likelihood_df = df.groupby(group_over)["Accident Likelihood"].mean().reset_index()
  likelihood_df.rename(columns={"Accident Likelihood": "Likelihood of Claim over 1k"}, inplace=True)
  # Merge the likelihood values back to the original dataframe
  df = pd.merge(df, likelihood_df, on=group_over, how="left")
  
  binning(df, "Likelihood of Claim over 1k", 30)
  df.drop(columns=["Likelihood of Claim over 1k"], axis=1, inplace=True)

  # New col: may relate to the Customer Lifetime Policy
  group_insurance = ["Number of Policies", "Coverage Index", "Years Since Policy Inception", "Policy Type Index", "Policy Index", "Sales Channel Index", "Renew Offer Type"]
  insurance_df = df.groupby(group_insurance)["Customer Lifetime Value_bin"].mean().reset_index()
  insurance_df.rename(columns={"Customer Lifetime Value_bin": "Insurance"}, inplace=True)
  # Merge
  df = pd.merge(df, insurance_df, on=group_insurance, how="left")

  binning(df, "Insurance", 30)
  remove_insurance = [string for string in group_insurance if string not in ["Coverage Index", "Number of Policies"]]
  df.drop(columns=remove_insurance+["Insurance"], inplace=True)

  # New col: may relate to Accident Likelihood
  group_outlier = ["Response", "Number of Open Complaints", "State", "Effective To Month", "Years Since Last Claim"]
  other_df = df.groupby(group_outlier)["Accident Likelihood"].mean().reset_index()
  other_df.rename(columns={"Accident Likelihood": "Other"}, inplace=True)
  # Merge
  df = pd.merge(df, other_df, on=group_outlier, how="left")

  binning(df, "Other", 30)
  df.drop(columns=group_outlier+["Other"], axis=1, inplace=True)

  binning(df, "Accident Likelihood", 4)
  df.drop(columns=["Accident Likelihood"], axis=1, inplace=True)

  return df

In [258]:
numeric_info(df_train, "Customer Lifetime Value")
numeric_info(df_train, "Income")

Min of Customer Lifetime Value:  2562.31
Max of Customer Lifetime Value:  100208.5
Mean of Customer Lifetime Value:  10798.11662277092
Mendian of Customer Lifetime Value:  7845.015
Std of Customer Lifetime Value:  9180.878152795805
Min of Income:  0
Max of Income:  134947
Mean of Income:  50573.28257887517
Mendian of Income:  45398.5
Std of Income:  41090.96814180818


In [259]:
df_process = pre_process(df_train.copy())
df_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290 entries, 0 to 7289
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   Coverage Index                   7290 non-null   int64
 1   Education Index                  7290 non-null   int64
 2   Employment Status Index          7290 non-null   int64
 3   Gender                           7290 non-null   int64
 4   Marital Status Index             7290 non-null   int64
 5   Number of Policies               7290 non-null   int64
 6   Vehicle Size Index               7290 non-null   int64
 7   Claim over 1k                    7290 non-null   int64
 8   Customer Lifetime Value_bin      7290 non-null   int64
 9   Income_bin                       7290 non-null   int64
 10  Likelihood of Claim over 1k_bin  7290 non-null   int64
 11  Insurance_bin                    7290 non-null   int64
 12  Other_bin                        7290 non-null  

In [260]:
relation_to_claim_over_1k(df_process)

<Figure size 3000x8000 with 0 Axes>

In [261]:
import shap

def shap_plot(df_train, df_test):
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  test_size = 0.2
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)

  # Train a Random Forest model
  model = RandomForestClassifier()
  model.fit(X_train, y_train)
  # Initialize the SHAP explainer
  explainer = shap.Explainer(model, feature_names=X.columns)
  
  # Calculate SHAP values for the test dataset
  shap_values = explainer(X_test)
  return shap_values
# shap_values = shap_plot(df_process, None)

In [262]:
# print(shap_values.shape)
# shap_values_0 = shap_values[:, :, 0]
# shap_values_1 = shap_values[:, :, 1]
# shap.plots.bar(shap_values_0, max_display=400)
# shap.plots.bar(shap_values_1, max_display=400)
# print(shap_values_0.shape, shap_values_1.shape)

In [263]:
standard_scaler = StandardScaler()

def apply_scaler(df, scaler):
  scaled_data = scaler.fit_transform(df)
  df_scaled = pd.DataFrame(scaled_data, columns=df.columns)
  return df_scaled

In [264]:
relation_to_claim_over_1k(apply_scaler(df_process, standard_scaler))

<Figure size 3000x8000 with 0 Axes>

In [265]:
def shuffle_df(df):
  df_shuffle = df.sample(frac=1).reset_index(drop=True) # shuffle df
  number_of_test_cases = 900
  df_train = df_shuffle.tail(len(df_shuffle) - number_of_test_cases).copy()
  df_test = df_shuffle.head(number_of_test_cases).copy()
  return df_train, df_test

In [266]:

df_train_from_train_dataset, df_test_from_train_dataset = shuffle_df(df_train)

In [267]:
df_train_from_train_dataset = pre_process(df_train_from_train_dataset)
df_train_from_train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6390 entries, 0 to 6389
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   Coverage Index                   6390 non-null   int64
 1   Education Index                  6390 non-null   int64
 2   Employment Status Index          6390 non-null   int64
 3   Gender                           6390 non-null   int64
 4   Marital Status Index             6390 non-null   int64
 5   Number of Policies               6390 non-null   int64
 6   Vehicle Size Index               6390 non-null   int64
 7   Claim over 1k                    6390 non-null   int64
 8   Customer Lifetime Value_bin      6390 non-null   int64
 9   Income_bin                       6390 non-null   int64
 10  Likelihood of Claim over 1k_bin  6390 non-null   int64
 11  Insurance_bin                    6390 non-null   int64
 12  Other_bin                        6390 non-null  

In [268]:
df_test_from_train_dataset = pre_process(df_test_from_train_dataset)
df_test_from_train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   Coverage Index                   900 non-null    int64
 1   Education Index                  900 non-null    int64
 2   Employment Status Index          900 non-null    int64
 3   Gender                           900 non-null    int64
 4   Marital Status Index             900 non-null    int64
 5   Number of Policies               900 non-null    int64
 6   Vehicle Size Index               900 non-null    int64
 7   Claim over 1k                    900 non-null    int64
 8   Customer Lifetime Value_bin      900 non-null    int64
 9   Income_bin                       900 non-null    int64
 10  Likelihood of Claim over 1k_bin  900 non-null    int64
 11  Insurance_bin                    900 non-null    int64
 12  Other_bin                        900 non-null    i

In [269]:
from imblearn.over_sampling import SMOTE

def balance_with_ratio(df, ratio):
  X = df.drop(columns=['Claim over 1k'])  # Features
  y = df['Claim over 1k']  # Target

  # Calculate the number of samples needed based on the target ratio
  count_majority = sum(y == 0)
  count_minority = sum(y == 1)
  # The target number of minority samples to reach the desired ratio
  desired_minority_count = count_majority // ratio

  # Set the sampling strategy to the target ratio
  sampling_strategy = desired_minority_count / count_majority

  # Apply SMOTE with the custom sampling strategy
  smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)

  # Create a balanced DataFrame
  df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
  df_balanced['Claim over 1k'] = y_resampled

  return df_balanced

def balance(df):
  # Separate features and target
  X = df.drop(columns=['Claim over 1k'])  # Features
  y = df['Claim over 1k']  # Target

  # Apply SMOTE to balance the classes
  smote = SMOTE(random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)

  # Check the original and resampled class distributions
  print("Original class distribution:")
  print(y.value_counts())
  print("\nResampled class distribution:")
  print(pd.Series(y_resampled).value_counts())
  
  # Create a df and Return
  df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
  df_balanced['Claim over 1k'] = y_resampled

  return df_balanced

def balance_old(df):
  df_majority = df[df["Claim over 1k"] == 0]
  df_minority = df[df["Claim over 1k"] == 1]
  # Upsample minority class
  df_minority_upsampled = resample(df_minority,
                                    replace=True,  # Sample with replacement
                                    n_samples=len(df_majority),  # Match the majority class
                                    random_state=42)  # Reproducible results

  # Combine majority class with upsampled minority class
  df_balanced = pd.concat([df_majority, df_minority_upsampled])
  print(df_balanced["Claim over 1k"].value_counts())
  relation_to_claim_over_1k(df_balanced)
  from imblearn.over_sampling import SMOTE

  X = df_balanced.drop(columns=['Claim over 1k'])  # Features
  y = df_balanced['Claim over 1k']  # Target

  smote = SMOTE(random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)
  
  # Create a df and Return
  df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
  df_balanced['Claim over 1k'] = y_resampled
  return df_balanced

In [270]:
ratio = 1
df_balance_train_from_train_dataset = balance_with_ratio(df_train_from_train_dataset, ratio)
print(df_balance_train_from_train_dataset["Claim over 1k"].value_counts())
relation_to_claim_over_1k(df_balance_train_from_train_dataset)

Claim over 1k
0    5657
1    5657
Name: count, dtype: int64


<Figure size 3000x8000 with 0 Axes>

In [353]:
def test_monitor(model, X, y):
  threshold = 0.5
  # Predict the test set
  y_proba = model.predict_proba(X)
  y_pred = (y_proba[:, 1] >= threshold).astype(int)
  # Evaluate the model
  accuracy = accuracy_score(y, y_pred)
  conf_matrix = confusion_matrix(y, y_pred).T
  class_report = classification_report(y, y_pred)

  # Calculate accuracy and F1 score
  print(f"Accuracy: {accuracy}")
  print("Confusion Matrix:")
  print(conf_matrix)
  print("Classification Report:")
  print(class_report)
  f1 = f1_score(y, y_pred)
  print(f"f1 score: ${f1}")
  
  #            Actual
  #             0   1
  # Predict 0   TP  FP
  #         1   FN  TN
  # Evaluate the model
  tp, fp, fn, tn = conf_matrix.ravel()
  # Calculate sensitivity and specificity
  sensitivity = tp / (tp + fn)  # True Positive Rate
  specificity = tn / (tn + fp)  # True Negative Rate

  print("Sensitivity (Recall):", sensitivity)
  print("Specificity:", specificity)

  return accuracy, sensitivity, specificity

In [354]:
def normalize(X, scaler):
  return scaler.fit_transform(X)

def get_train_params(X, y, scaler, test_size):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)
  X_train_scaled = normalize(X_train, scaler)
  X_test_scaled = normalize(X_test, scaler)
  return X_train_scaled, X_test_scaled, y_train, y_test

In [392]:
def run_random_forest_oversample(X_train, y_train):
  # Initialize the Random Forest Classifier
  rf_classifier = RandomForestClassifier(n_estimators=50, criterion="entropy", max_depth=30, min_samples_leaf=5, min_samples_split=5, bootstrap=False)
  # Train the model on the training data
  # Note that random forest is a decision tree (if-else statment on each node), so the data does not have to be scaled
  rf_classifier.fit(X_train, y_train)
  return rf_classifier

def run_random_forest_normal(X_train, y_train):
  # Initialize the Random Forest Classifier
  rf_classifier = RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=20, min_samples_leaf=2, min_samples_split=5, bootstrap=False, class_weight="balanced")

  # Train the model on the training data
  # Note that random forest is a decision tree (if-else statment on each node), so the data does not have to be scaled
  rf_classifier.fit(X_train, y_train)
  return rf_classifier

def random_forest(df_train, df_test = None):
  df_train.value_counts()
  # Get the params
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  test_size = 0.2 if df_test is None else 0.05
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, standard_scaler, test_size)

  rf_classifier = run_random_forest_normal(X_train_scaled, y_train) if df_test is None else run_random_forest_oversample(X_train_scaled, y_train)
#   rf_classifier = run_random_forest_normal(X_train_scaled, y_train)
  accuracy_from_train, sensitivity_from_train, specificity_from_train = test_monitor(rf_classifier, X_test_scaled, y_test)
  
  if (df_test is None):
    return accuracy_from_train, sensitivity_from_train, specificity_from_train
  print("----------------")
  X_from_test = df_test.drop("Claim over 1k", axis=1)
  X_from_test_scaled = normalize(X_from_test, standard_scaler)
  y_from_test = df_test["Claim over 1k"]

  accuracy_from_test, sensitivity_from_test, specificity_from_test = test_monitor(rf_classifier, X_from_test_scaled, y_from_test)
  return accuracy_from_test, sensitivity_from_test, specificity_from_test

In [393]:
# # import shap
# def shap_plot(df_train):
#   # Get feature importance
#   X = df_train.drop(columns=["Claim over 1k"], axis=1)
#   y = df_train["Claim over 1k"]
#   model = RandomForestClassifier()
#   model.fit(X, y)

#   explainer = shap.Explainer(model, X, feature_names=X.columns)
#   shap_vals = explainer(X)

#   shap.plots.bar(shap_vals, max_display = 40)
# # shap_plot(df_balance_train)

In [394]:
def run_iteration(df_train, df_test, times, cols=[]):
  accuracy, sensitivity, specificity = 0, 0, 0
  df_shorten_train = df_train.drop(columns=cols, axis=1)
  df_shorten_test = df_test.drop(columns=cols, axis=1)
  for _ in range(times):
    with SuppressPrints():
      accuracy_get, sensitivity_get, specificity_get = random_forest(df_shorten_train, df_shorten_test)
      accuracy += float(accuracy_get)
      sensitivity += float(sensitivity_get)
      specificity += float(specificity_get)
  print(f"Average over {times} runs: ", accuracy / times, sensitivity / times, specificity / times)

def run_iteration_simple(df_train, times, cols=[]):
  accuracy, sensitivity, specificity = 0, 0, 0
  df_shorten_train = df_train.drop(columns=cols, axis=1)
  for _ in range(times):
    with SuppressPrints():
      accuracy_get, sensitivity_get, specificity_get = random_forest(df_shorten_train)
      accuracy += float(accuracy_get)
      sensitivity += float(sensitivity_get)
      specificity += float(specificity_get)
  print(f"Average over {times} runs: ", accuracy / times, sensitivity / times, specificity / times)
    

In [395]:
random_forest(df_process, None) # test with no oversample

Accuracy: 0.9218106995884774
Confusion Matrix:
[[1254   68]
 [  46   90]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      1300
           1       0.66      0.57      0.61       158

    accuracy                           0.92      1458
   macro avg       0.81      0.77      0.78      1458
weighted avg       0.92      0.92      0.92      1458

f1 score: $0.6122448979591837
Sensitivity (Recall): 0.9646153846153847
Specificity: 0.569620253164557


(0.9218106995884774, 0.9646153846153847, 0.569620253164557)

In [396]:
run_iteration_simple(df_process, 5)

Average over 5 runs:  0.9213991769547324 0.9629230769230768 0.579746835443038


In [397]:
random_forest(df_balance_train_from_train_dataset, df_test_from_train_dataset) # oversample

Accuracy: 0.9558303886925795
Confusion Matrix:
[[272  11]
 [ 14 269]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       286
           1       0.95      0.96      0.96       280

    accuracy                           0.96       566
   macro avg       0.96      0.96      0.96       566
weighted avg       0.96      0.96      0.96       566

f1 score: $0.955595026642984
Sensitivity (Recall): 0.951048951048951
Specificity: 0.9607142857142857
----------------
Accuracy: 0.8722222222222222
Confusion Matrix:
[[709  23]
 [ 92  76]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.89      0.92       801
           1       0.45      0.77      0.57        99

    accuracy                           0.87       900
   macro avg       0.71      0.83      0.75       900
weighted avg       0.91      0.87      0.89       900

f1 score: $0.5692883895131086
Sensitivity 

(0.8722222222222222, 0.885143570536829, 0.7676767676767676)

In [398]:
run_iteration(df_balance_train_from_train_dataset, df_test_from_train_dataset, 5)

Average over 5 runs:  0.8644444444444443 0.8779026217228465 0.7555555555555555


In [399]:
# df_balance_train_from_train_dataset.columns

In [400]:
cols_to_drop = ["Education Index", "Vehicle Size Index"]

def run_drop(df_train, df_test):
  df_shorten_train = df_train.drop(columns=cols_to_drop, axis=1)
  df_shorten_test = df_test.drop(columns=cols_to_drop, axis=1)
  random_forest(df_shorten_train, df_shorten_test)
run_drop(df_balance_train_from_train_dataset, df_test_from_train_dataset)

Accuracy: 0.9363957597173145
Confusion Matrix:
[[266  16]
 [ 20 264]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       286
           1       0.93      0.94      0.94       280

    accuracy                           0.94       566
   macro avg       0.94      0.94      0.94       566
weighted avg       0.94      0.94      0.94       566

f1 score: $0.9361702127659575
Sensitivity (Recall): 0.9300699300699301
Specificity: 0.9428571428571428
----------------
Accuracy: 0.84
Confusion Matrix:
[[681  24]
 [120  75]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.85      0.90       801
           1       0.38      0.76      0.51        99

    accuracy                           0.84       900
   macro avg       0.68      0.80      0.71       900
weighted avg       0.90      0.84      0.86       900

f1 score: $0.5102040816326531
Sensitivity (Recall): 0.

In [401]:
run_iteration(df_balance_train_from_train_dataset, df_test_from_train_dataset, 5, cols_to_drop)

Average over 5 runs:  0.8264444444444443 0.8324594257178528 0.7777777777777779


In [402]:
random_forest(df_process.drop(columns=cols_to_drop, axis=1), None) # test with no oversample

Accuracy: 0.9197530864197531
Confusion Matrix:
[[1247   64]
 [  53   94]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      1300
           1       0.64      0.59      0.62       158

    accuracy                           0.92      1458
   macro avg       0.80      0.78      0.79      1458
weighted avg       0.92      0.92      0.92      1458

f1 score: $0.6163934426229508
Sensitivity (Recall): 0.9592307692307692
Specificity: 0.5949367088607594


(0.9197530864197531, 0.9592307692307692, 0.5949367088607594)

In [403]:
run_iteration_simple(df_process, 5)

Average over 5 runs:  0.921536351165981 0.962923076923077 0.5810126582278481


In [330]:
def hyper_tune_random_forest(df_train):
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, standard_scaler, 0.2)
  # Step 2: Define the model
  rf_classifier = RandomForestClassifier(random_state=42)

  # Step 3: Specify hyperparameters to tune
  param_grid = {
    'n_estimators': [100, 200, 250],          # Number of trees
    'max_depth': [10, 20, 30],             # Maximum depth of the trees
    'min_samples_split': [5, 10],         # Minimum samples to split an internal node
    'min_samples_leaf': [5, 10],          # Minimum samples at a leaf node
    'bootstrap': [False],
    'class_weight': ['balanced']
  }

  # Step 4: Set up GridSearchCV
  grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid,
                 cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

  # Step 5: Fit the model
  grid_search.fit(X_train_scaled, y_train)

  # Step 6: Evaluate the best model
  best_model = grid_search.best_estimator_
  
  return best_model
print(hyper_tune_random_forest(df_process))
# RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2,
#                        min_samples_split=5, n_estimators=250, random_state=42)
# RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=5,
#                        min_samples_split=5, n_estimators=50, random_state=42)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END bootstrap=False, class_weight=balanced, max_depth=10, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, class_weight=balanced, max_depth=10, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, class_weight=balanced, max_depth=10, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, class_weight=balanced, max_depth=10, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, class_weight=balanced, max_depth=10, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, class_weight=balanced, max_depth=10, min_samples_leaf=5, min_samples_split=5, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, class_weight=balanced, max_depth=10, min_samples_leaf=5, min_sampl

In [283]:
def train(df_train, test_size):
  df_train.value_counts()
  # Get the params
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, standard_scaler, test_size)

  rf_classifier = run_random_forest_oversample(X_train_scaled, y_train)
  test_monitor(rf_classifier, X_test_scaled, y_test)
  return rf_classifier

def train_simple(df_train, test_size):
  df_train.value_counts()
  # Get the params
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, standard_scaler, test_size)

  rf_classifier = run_random_forest_normal(X_train_scaled, y_train)
  test_monitor(rf_classifier, X_test_scaled, y_test)
  return rf_classifier

In [284]:
# train to get the model out
my_model = train(df_balance_train_from_train_dataset.drop(columns=cols_to_drop, axis=1), 0.2)
# my_model = train_simple(df_process, 0.2)

Accuracy: 0.9279717189571366
Confusion Matrix:
[[1035   41]
 [ 122 1065]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.89      0.93      1157
           1       0.90      0.96      0.93      1106

    accuracy                           0.93      2263
   macro avg       0.93      0.93      0.93      2263
weighted avg       0.93      0.93      0.93      2263

f1 score: $0.9289140863497601
Sensitivity (Recall): 0.8945548833189283
Specificity: 0.9629294755877035


In [285]:
print(my_model)
def view_performance(model, df_test):
  df_test_shorten = df_test.drop(columns=cols_to_drop, axis=1)
  X = df_test_shorten.drop("Claim over 1k", axis=1)
  X_scaled = normalize(X, standard_scaler)
  y = df_test_shorten["Claim over 1k"]
  test_monitor(model, X_scaled, y)
view_performance(my_model, df_test_from_train_dataset)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=30,
                       min_samples_leaf=5, min_samples_split=5,
                       n_estimators=50)
Accuracy: 0.8055555555555556
Confusion Matrix:
[[645  19]
 [156  80]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.81      0.88       801
           1       0.34      0.81      0.48        99

    accuracy                           0.81       900
   macro avg       0.66      0.81      0.68       900
weighted avg       0.90      0.81      0.84       900

f1 score: $0.47761194029850745
Sensitivity (Recall): 0.8052434456928839
Specificity: 0.8080808080808081


In [286]:
submission_folder = "submission/"
df_my_submission = pd.read_csv(submission_folder + "my_best_submission.csv")
df_my_submission["Claim over 1k"].value_counts()

Claim over 1k
0    1693
1     151
Name: count, dtype: int64

In [287]:
df_best_submission = pd.read_csv(submission_folder + "best_submission.csv")
df_best_submission["Claim over 1k"].value_counts()

Claim over 1k
0    1695
1     149
Name: count, dtype: int64

In [288]:
df_bad_submission = pd.read_csv(submission_folder + "bad_submission.csv")
df_bad_submission["Claim over 1k"].value_counts()

Claim over 1k
0    1662
1     182
Name: count, dtype: int64

In [289]:
# compare the two results
def compare(df_1, df_2):
  cnt = 0
  for i in range(len(df_1)):
    if (df_1["Claim over 1k"][i] != df_2["Claim over 1k"][i]):
      cnt += 1
    if (df_1["CustomerID"][i] != df_2["CustomerID"][i]):
      print("Wrong order")
  print(cnt)

In [290]:
compare(df_my_submission, df_best_submission)

66


In [291]:
compare(df_my_submission, df_bad_submission)

71


In [292]:
compare(df_best_submission, df_bad_submission)

113


In [293]:
df_test = pd.read_csv(folder + "test.csv").drop(columns=["Coverage", "Education", "Employment Status", "Marital Status", "Policy Type", "Policy", "Sales Channel", "Vehicle Size"])
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1844 entries, 0 to 1843
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   CustomerID                     1844 non-null   object 
 1   State                          1844 non-null   object 
 2   Customer Lifetime Value        1844 non-null   float64
 3   Response                       1844 non-null   object 
 4   Coverage Index                 1844 non-null   int64  
 5   Education Index                1844 non-null   int64  
 6   Effective To Date              1844 non-null   object 
 7   Employment Status Index        1844 non-null   int64  
 8   Gender                         1844 non-null   object 
 9   Income                         1844 non-null   int64  
 10  Marital Status Index           1844 non-null   int64  
 11  Months Since Last Claim        1844 non-null   int64  
 12  Months Since Policy Inception  1844 non-null   i

In [294]:
X_test = df_test.drop("CustomerID", axis=1)
X_test = pre_process(X_test)
X_test.drop(columns=cols_to_drop, axis=1, inplace=True)
X_test_scaled = normalize(X_test, standard_scaler)

In [295]:
def predict(model, X_test):
  y_test_pred = model.predict(X_test)
  customer_ID = np.array(df_test["CustomerID"])
  result = np.column_stack((customer_ID, y_test_pred))
  return pd.DataFrame(result).rename(columns={0: "CustomerID", 1: "Claim over 1k"})

In [296]:
df_run_test_ouput = predict(my_model, X_test_scaled)

In [297]:
df_run_test_ouput["Claim over 1k"].value_counts()

Claim over 1k
0    1385
1     459
Name: count, dtype: int64

In [298]:
compare(df_my_submission, df_run_test_ouput)

330


In [299]:
compare(df_best_submission, df_run_test_ouput)

344


In [300]:
df_balance_process = balance_with_ratio(df_process, ratio) # note that this data has not been splitted like above to create df_train_from_train_dataset, df_test_from_train_dataset
df_balance_process_shorten = df_balance_process.drop(columns=cols_to_drop, axis=1)
# relation_to_claim_over_1k(df_balance_process_shorten)

In [301]:
random_forest(df_balance_process_shorten) # using run_random_forest_simple, with no df_test

Accuracy: 0.934984520123839
Confusion Matrix:
[[1147   64]
 [ 104 1269]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      1251
           1       0.92      0.95      0.94      1333

    accuracy                           0.93      2584
   macro avg       0.94      0.93      0.93      2584
weighted avg       0.94      0.93      0.93      2584

f1 score: $0.9379157427937915
Sensitivity (Recall): 0.9168665067945644
Specificity: 0.9519879969992499


(0.934984520123839, 0.9168665067945644, 0.9519879969992499)

In [302]:
new_model = train(df_balance_process_shorten, 0.01)

Accuracy: 0.9153846153846154
Confusion Matrix:
[[55  3]
 [ 8 64]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.87      0.91        63
           1       0.89      0.96      0.92        67

    accuracy                           0.92       130
   macro avg       0.92      0.91      0.91       130
weighted avg       0.92      0.92      0.92       130

f1 score: $0.920863309352518
Sensitivity (Recall): 0.873015873015873
Specificity: 0.9552238805970149


In [303]:
print(new_model)
view_performance(new_model, df_test_from_train_dataset)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=30,
                       min_samples_leaf=5, min_samples_split=5,
                       n_estimators=50)
Accuracy: 0.81
Confusion Matrix:
[[639   9]
 [162  90]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.80      0.88       801
           1       0.36      0.91      0.51        99

    accuracy                           0.81       900
   macro avg       0.67      0.85      0.70       900
weighted avg       0.92      0.81      0.84       900

f1 score: $0.5128205128205128
Sensitivity (Recall): 0.797752808988764
Specificity: 0.9090909090909091


In [304]:
df_new_run_test_output = predict(new_model, X_test_scaled)
df_new_run_test_output["Claim over 1k"].value_counts()

Claim over 1k
0    1364
1     480
Name: count, dtype: int64

In [305]:
compare(df_my_submission, df_new_run_test_output)

341


In [306]:
compare(df_best_submission, df_new_run_test_output)

355


In [307]:
compare(df_bad_submission, df_new_run_test_output)

310


In [308]:
compare(df_run_test_ouput, df_new_run_test_output)

139


In [309]:
df_new_run_test_output.to_csv("submission.csv", index=False)

In [310]:
df_new_run_test_output["Claim over 1k"].value_counts()

Claim over 1k
0    1364
1     480
Name: count, dtype: int64