In [683]:
# for data manipulation
import pandas as pd
import numpy as np
# for plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go # note that github wont be able to display these plots because they are interactive
# for some processing
import math
from datetime import datetime, timedelta

In [684]:
import sys
import os

class SuppressPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout  # Save the original stdout
        sys.stdout = open(os.devnull, 'w')  # Redirect stdout to null

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()  # Close the null file
        sys.stdout = self._original_stdout  # Restore original stdout

In [685]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, make_scorer
from sklearn.utils import resample

In [686]:
folder = "/kaggle/input/asna-september-2024/"
df = pd.read_csv(folder + "train.csv")
df

Unnamed: 0,CustomerID,State,Customer Lifetime Value,Response,Coverage,Coverage Index,Education,Education Index,Effective To Date,Employment Status,...,Policy Type,Policy Type Index,Policy,Policy Index,Renew Offer Type,Sales Channel,Sales Channel Index,Vehicle Size,Vehicle Size Index,Claim over 1k
0,QC35222,California,3622.69,No,Basic,0,Bachelor,2,1/1/2024,Employed,...,Corporate Auto,1,Corporate L2,4,3,Web,0,Medsize,1,0
1,AE98193,Washington,10610.21,No,Basic,0,High School or Below,0,1/1/2024,Unemployed,...,Personal Auto,0,Personal L1,0,1,Branch,1,Medsize,1,1
2,TM23514,Oregon,13868.02,No,Extended,1,College,1,1/1/2024,Employed,...,Personal Auto,0,Personal L3,2,3,Web,0,Medsize,1,0
3,QZ42725,Washington,3119.69,No,Basic,0,Bachelor,2,1/1/2024,Unemployed,...,Personal Auto,0,Personal L3,2,2,Agent,2,Medsize,1,0
4,SG81493,Arizona,5999.04,No,Premium,2,Bachelor,2,1/1/2024,Employed,...,Corporate Auto,1,Corporate L1,3,2,Web,0,Medsize,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7285,HC90344,California,27500.54,No,Basic,0,Bachelor,2,9/2/2024,Unemployed,...,Corporate Auto,1,Corporate L2,4,1,Branch,1,Medsize,1,0
7286,ZU83252,California,11750.03,No,Basic,0,Master,3,9/2/2024,Employed,...,Personal Auto,0,Personal L3,2,3,Web,0,Medsize,1,0
7287,PR80703,California,7757.04,No,Basic,0,Bachelor,2,9/2/2024,Employed,...,Personal Auto,0,Personal L3,2,1,Branch,1,Medsize,1,0
7288,NS23754,California,3465.16,No,Basic,0,High School or Below,0,9/2/2024,Employed,...,Corporate Auto,1,Corporate L3,5,3,Call Center,3,Large,2,0


In [687]:
df_train = df.drop(columns = ["CustomerID", "Coverage", "Education", "Employment Status", "Marital Status", "Policy Type", "Policy", "Sales Channel", "Vehicle Size"])
df_train.info()
df_train["Claim over 1k"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290 entries, 0 to 7289
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   State                          7290 non-null   object 
 1   Customer Lifetime Value        7290 non-null   float64
 2   Response                       7290 non-null   object 
 3   Coverage Index                 7290 non-null   int64  
 4   Education Index                7290 non-null   int64  
 5   Effective To Date              7290 non-null   object 
 6   Employment Status Index        7290 non-null   int64  
 7   Gender                         7290 non-null   object 
 8   Income                         7290 non-null   int64  
 9   Marital Status Index           7290 non-null   int64  
 10  Months Since Last Claim        7290 non-null   int64  
 11  Months Since Policy Inception  7290 non-null   int64  
 12  Number of Open Complaints      7290 non-null   i

Claim over 1k
0    6458
1     832
Name: count, dtype: int64

In [688]:
def numeric_info(df, col):
  print(f"Min of {col}: ", df[col].min())
  print(f"Max of {col}: ", df[col].max())
  print(f"Mean of {col}: ", df[col].mean())
  print(f"Mendian of {col}: ", df[col].median())
  print(f"Std of {col}: ", df[col].std())

In [689]:
numeric_info(df_train, "Customer Lifetime Value")

Min of Customer Lifetime Value:  2562.31
Max of Customer Lifetime Value:  100208.5
Mean of Customer Lifetime Value:  10798.11662277092
Mendian of Customer Lifetime Value:  7845.015
Std of Customer Lifetime Value:  9180.878152795793


In [690]:
from plotly.subplots import make_subplots

def relation_to_claim_over_1k(df):
  plt.figure(figsize=(30, 80))
  numerical_cols = df.columns.tolist()
  not_include = ["Customer Lifetime Value", "Income", "Claim over 1k"]
  numerical_cols = [col for col in numerical_cols if col not in not_include]
  
  number_of_rows = math.ceil(len(numerical_cols) / 2)
  fig = make_subplots(rows = number_of_rows, cols=2, subplot_titles=numerical_cols, vertical_spacing=0.01, horizontal_spacing=0.05)
  for idx, col in enumerate(numerical_cols):
    df_count = df.groupby([col, "Claim over 1k"]).size().reset_index(name='count')
    unique_val = df_count[col].unique()
    i, j = (idx // 2) + 1, (idx % 2) + 1
    for val in unique_val:
      df_cur = df_count[df_count[col] == val]
      fig.add_trace(
        go.Bar(
          x = df_cur["Claim over 1k"],
          y = df_cur["count"],
          name = f"{val}"
        ),
        row=i, col=j
      )
  fig.update_layout(
      height=300 * number_of_rows,  # Adjust figure height dynamically based on the number of rows
      showlegend=False,
      title_text="How Each Column Affects 'Claim over 1k'"
  )
  # Show the figure with all subplots
  fig.show()

In [691]:
def binning(df, col, size):
  df[col + "_bin"] = pd.qcut(df[col], q=size, labels=False, duplicates="drop")

def pre_process(df):
  # State
  unique_states = df["State"].unique()
  dict_states = {}
  for i in range(len(unique_states)):
    dict_states[unique_states[i]] = i
  df["State"] = df["State"].apply(lambda x: dict_states[x])
  
  # Response
  # note that resonse should be either yes or no
  df["Response"] = df["Response"].apply(lambda x: 1 if x == "Yes" else 0)

  # Effective To Date
  df["Effective To Date"] = pd.to_datetime(df["Effective To Date"])
  df["Effective To Month"] = (df["Effective To Date"].dt.month - 1) // 3 # split into 4 seasons

  # Months Since Last Claim, Months Since Last Inception
  # group them into year
  df["Years Since Last Claim"] = df["Months Since Last Claim"] // 12
  df["Years Since Policy Inception"] = df["Months Since Policy Inception"] // 36 # period of 2 years

  # Gender
  df["Gender"] = df["Gender"].apply(lambda x: 1 if x == "M" else 0)

  # CLV
  binning(df, "Customer Lifetime Value", 30)
  binning(df, "Income", 5)

  df.drop(columns=["Effective To Date", "Months Since Last Claim", "Months Since Policy Inception", "Customer Lifetime Value", "Income"], axis=1, inplace=True)

  # New col: Accident Likelihood
  max_education_index = df["Education Index"].max()
  max_income_bin = df["Income_bin"].max()
  max_employment_status_index = df["Employment Status Index"].max()
  max_marital_status_index = df["Marital Status Index"].max()
  df["Accident Likelihood"] = np.exp( 0.4 * (max_education_index - df["Education Index"]) + 0.3 * (max_income_bin - df["Income_bin"]) \
    + 0.3 * (df["Marital Status Index"] == 0))

  binning(df, "Accident Likelihood", 4)
  df.drop(columns=["Accident Likelihood"], axis=1, inplace=True)
  return df

In [692]:
numeric_info(df_train, "Customer Lifetime Value")
numeric_info(df_train, "Income")

Min of Customer Lifetime Value:  2562.31
Max of Customer Lifetime Value:  100208.5
Mean of Customer Lifetime Value:  10798.11662277092
Mendian of Customer Lifetime Value:  7845.015
Std of Customer Lifetime Value:  9180.878152795793
Min of Income:  0
Max of Income:  134947
Mean of Income:  50573.28257887517
Mendian of Income:  45398.5
Std of Income:  41090.96814180824


In [693]:
df_process = pre_process(df_train.copy())
df_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290 entries, 0 to 7289
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   State                         7290 non-null   int64
 1   Response                      7290 non-null   int64
 2   Coverage Index                7290 non-null   int64
 3   Education Index               7290 non-null   int64
 4   Employment Status Index       7290 non-null   int64
 5   Gender                        7290 non-null   int64
 6   Marital Status Index          7290 non-null   int64
 7   Number of Open Complaints     7290 non-null   int64
 8   Number of Policies            7290 non-null   int64
 9   Policy Type Index             7290 non-null   int64
 10  Policy Index                  7290 non-null   int64
 11  Renew Offer Type              7290 non-null   int64
 12  Sales Channel Index           7290 non-null   int64
 13  Vehicle Size Index            729

In [694]:
relation_to_claim_over_1k(df_process)

<Figure size 3000x8000 with 0 Axes>

In [695]:
standard_scaler = StandardScaler()

def apply_scaler(df, scaler):
  scaled_data = scaler.fit_transform(df)
  df_scaled = pd.DataFrame(scaled_data, columns=df.columns)
  return df_scaled

In [696]:
relation_to_claim_over_1k(apply_scaler(df_process, standard_scaler))

<Figure size 3000x8000 with 0 Axes>

In [697]:
def shuffle_df(df):
  df_shuffle = df.sample(frac=1).reset_index(drop=True) # shuffle df
  number_of_test_cases = 1000
  df_train = df_shuffle.tail(len(df_shuffle) - number_of_test_cases).copy()
  df_test = df_shuffle.head(number_of_test_cases).copy()
  return df_train, df_test

In [698]:

df_train_from_train_dataset, df_test_from_train_dataset = shuffle_df(df_train)

In [699]:
pre_process(df_train_from_train_dataset)
df_train_from_train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6290 entries, 1000 to 7289
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   State                         6290 non-null   int64
 1   Response                      6290 non-null   int64
 2   Coverage Index                6290 non-null   int64
 3   Education Index               6290 non-null   int64
 4   Employment Status Index       6290 non-null   int64
 5   Gender                        6290 non-null   int64
 6   Marital Status Index          6290 non-null   int64
 7   Number of Open Complaints     6290 non-null   int64
 8   Number of Policies            6290 non-null   int64
 9   Policy Type Index             6290 non-null   int64
 10  Policy Index                  6290 non-null   int64
 11  Renew Offer Type              6290 non-null   int64
 12  Sales Channel Index           6290 non-null   int64
 13  Vehicle Size Index            

In [700]:
pre_process(df_test_from_train_dataset)
df_test_from_train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   State                         1000 non-null   int64
 1   Response                      1000 non-null   int64
 2   Coverage Index                1000 non-null   int64
 3   Education Index               1000 non-null   int64
 4   Employment Status Index       1000 non-null   int64
 5   Gender                        1000 non-null   int64
 6   Marital Status Index          1000 non-null   int64
 7   Number of Open Complaints     1000 non-null   int64
 8   Number of Policies            1000 non-null   int64
 9   Policy Type Index             1000 non-null   int64
 10  Policy Index                  1000 non-null   int64
 11  Renew Offer Type              1000 non-null   int64
 12  Sales Channel Index           1000 non-null   int64
 13  Vehicle Size Index            1000

In [701]:
from imblearn.over_sampling import SMOTE

def balance_with_ratio(df, ratio):
  X = df.drop(columns=['Claim over 1k'])  # Features
  y = df['Claim over 1k']  # Target

  # Calculate the number of samples needed based on the target ratio
  count_majority = sum(y == 0)
  count_minority = sum(y == 1)
  # The target number of minority samples to reach the desired ratio
  desired_minority_count = count_majority // ratio

  # Set the sampling strategy to the target ratio
  sampling_strategy = desired_minority_count / count_majority

  # Apply SMOTE with the custom sampling strategy
  smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)

  # Create a balanced DataFrame
  df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
  df_balanced['Claim over 1k'] = y_resampled

  return df_balanced

def balance(df):
  # Separate features and target
  X = df.drop(columns=['Claim over 1k'])  # Features
  y = df['Claim over 1k']  # Target

  # Apply SMOTE to balance the classes
  smote = SMOTE(random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)

  # Check the original and resampled class distributions
  print("Original class distribution:")
  print(y.value_counts())
  print("\nResampled class distribution:")
  print(pd.Series(y_resampled).value_counts())
  
  # Create a df and Return
  df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
  df_balanced['Claim over 1k'] = y_resampled

  return df_balanced

def balance_old(df):
  df_majority = df[df["Claim over 1k"] == 0]
  df_minority = df[df["Claim over 1k"] == 1]
  # Upsample minority class
  df_minority_upsampled = resample(df_minority,
                                    replace=True,  # Sample with replacement
                                    n_samples=len(df_majority),  # Match the majority class
                                    random_state=42)  # Reproducible results

  # Combine majority class with upsampled minority class
  df_balanced = pd.concat([df_majority, df_minority_upsampled])
  print(df_balanced["Claim over 1k"].value_counts())
  relation_to_claim_over_1k(df_balanced)
  from imblearn.over_sampling import SMOTE

  X = df_balanced.drop(columns=['Claim over 1k'])  # Features
  y = df_balanced['Claim over 1k']  # Target

  smote = SMOTE(random_state=42)
  X_resampled, y_resampled = smote.fit_resample(X, y)
  
  # Create a df and Return
  df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
  df_balanced['Claim over 1k'] = y_resampled
  return df_balanced

In [702]:
ratio = np.exp(2)
df_balance_train = balance_with_ratio(df_train_from_train_dataset, ratio)
print(df_balance_train["Claim over 1k"].value_counts())
relation_to_claim_over_1k(df_balance_train)

Claim over 1k
0    5557
1     752
Name: count, dtype: int64


<Figure size 3000x8000 with 0 Axes>

In [703]:
def test_monitor(model, X, y):
  threshold = 0.43
  # Predict the test set
  y_proba = model.predict_proba(X)
  y_pred = (y_proba[:, 1] >= threshold).astype(int)
  # Evaluate the model
  accuracy = accuracy_score(y, y_pred)
  conf_matrix = confusion_matrix(y, y_pred).T
  class_report = classification_report(y, y_pred)

  # Calculate accuracy and F1 score
  print(f"Accuracy: {accuracy}")
  print("Confusion Matrix:")
  print(conf_matrix)
  print("Classification Report:")
  print(class_report)
  f1 = f1_score(y, y_pred)
  print(f"f1 score: ${f1}")
  
  #            Actual
  #             0   1
  # Predict 0   TP  FP
  #         1   FN  TN
  # Evaluate the model
  tp, fp, fn, tn = conf_matrix.ravel()
  # Calculate sensitivity and specificity
  sensitivity = tp / (tp + fn)  # True Positive Rate
  specificity = tn / (tn + fp)  # True Negative Rate

  print("Sensitivity (Recall):", sensitivity)
  print("Specificity:", specificity)

  return accuracy, sensitivity, specificity

In [704]:
def normalize(X, scaler):
  return scaler.fit_transform(X)

def get_train_params(X, y, scaler, test_size):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)
  X_train_scaled = normalize(X_train, scaler)
  X_test_scaled = normalize(X_test, scaler)
  return X_train_scaled, X_test_scaled, y_train, y_test

In [705]:
def run_random_forest_oversample(X_train, y_train):
  # Initialize the Random Forest Classifier
  rf_classifier = RandomForestClassifier(n_estimators=200, criterion="entropy", random_state=42, max_depth=22, max_features="sqrt", min_samples_leaf=5, min_samples_split=10, bootstrap=False)
  # Train the model on the training data
  # Note that random forest is a decision tree (if-else statment on each node), so the data does not have to be scaled
  rf_classifier.fit(X_train, y_train)
  return rf_classifier

def run_random_forest_normal(X_train, y_train):
  # Initialize the Random Forest Classifier
  rf_classifier = RandomForestClassifier(n_estimators=200, criterion="entropy", bootstrap=False)

  # Train the model on the training data
  # Note that random forest is a decision tree (if-else statment on each node), so the data does not have to be scaled
  rf_classifier.fit(X_train, y_train)
  return rf_classifier

def random_forest(df_train, df_test = None):
  df_train.value_counts()
  # Get the params
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  test_size = 0.2
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, standard_scaler, test_size)

  rf_classifier = run_random_forest_normal(X_train_scaled, y_train) if df_test is None else run_random_forest_oversample(X_train_scaled, y_train)
#   rf_classifier = run_random_forest_normal(X_train_scaled, y_train)
  accuracy_from_train, sensitivity_from_train, specificity_from_train = test_monitor(rf_classifier, X_test_scaled, y_test)
  
  if (df_test is None):
    return accuracy_from_train, sensitivity_from_train, specificity_from_train
  print("----------------")
  X_from_test = df_test.drop("Claim over 1k", axis=1)
  X_from_test_scaled = normalize(X_from_test, standard_scaler)
  y_from_test = df_test["Claim over 1k"]

  accuracy_from_test, sensitivity_from_test, specificity_from_test = test_monitor(rf_classifier, X_from_test_scaled, y_from_test)
  return accuracy_from_test, sensitivity_from_test, specificity_from_test

In [753]:
!pip install shap



In [755]:
import shap
def shap_plot(df_train):
    # Get feature importance
    X = df_train.drop(columns=["Claim over 1k"], axis=1)
    y = df_train["Claim over 1k"]
    model = RandomForestClassifier()
    model.fit(X, y)

    explainer = shap.Explainer(model, X, feature_names=X.columns)
    shap_vals = explainer(X)

    shap.plots.bar(shap_vals, max_display = 40)
shap_plot(df_balance_train)



IndexError: list index out of range

In [706]:
def run_iteration(df, times, cols=[]):
  accuracy, sensitivity, specificity = 0, 0, 0
  for _ in range(times):
    with SuppressPrints():
      df_local_train, df_local_test = shuffle_df(df)
      pre_process(df_local_train)
      pre_process(df_local_test)
      df_local_balance_train = balance_with_ratio(df_local_train, ratio).drop(columns=cols, axis=1)
      df_local_test.drop(columns=cols, axis=1, inplace=True)
      accuracy_get, sensitivity_get, specificity_get = random_forest(df_local_balance_train, df_local_test)
      accuracy += float(accuracy_get)
      sensitivity += float(sensitivity_get)
      specificity += float(specificity_get)
  print(f"Average over {times} runs: ", accuracy / times, sensitivity / times, specificity / times)
    

In [707]:
random_forest(df_process, None) # test with no oversample

Accuracy: 0.9197530864197531
Confusion Matrix:
[[1272   89]
 [  28   69]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1300
           1       0.71      0.44      0.54       158

    accuracy                           0.92      1458
   macro avg       0.82      0.71      0.75      1458
weighted avg       0.91      0.92      0.91      1458

f1 score: $0.5411764705882353
Sensitivity (Recall): 0.9784615384615385
Specificity: 0.43670886075949367


(0.9197530864197531, 0.9784615384615385, 0.43670886075949367)

In [708]:
random_forest(df_balance_train, df_test_from_train_dataset) # oversample

Accuracy: 0.9207606973058637
Confusion Matrix:
[[1085   87]
 [  13   77]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      1098
           1       0.86      0.47      0.61       164

    accuracy                           0.92      1262
   macro avg       0.89      0.73      0.78      1262
weighted avg       0.92      0.92      0.91      1262

f1 score: $0.6062992125984252
Sensitivity (Recall): 0.98816029143898
Specificity: 0.4695121951219512
----------------
Accuracy: 0.929
Confusion Matrix:
[[888  58]
 [ 13  41]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       901
           1       0.76      0.41      0.54        99

    accuracy                           0.93      1000
   macro avg       0.85      0.70      0.75      1000
weighted avg       0.92      0.93      0.92      1000

f1 score: $0.5359477124183006
Sensitivity (Recall):

(0.929, 0.9855715871254163, 0.41414141414141414)

In [709]:
run_iteration(df_train, 5)

Average over 5 runs:  0.9132 0.9809821778116813 0.4000223704610654


In [710]:
df_balance_train.columns

Index(['State', 'Response', 'Coverage Index', 'Education Index',
       'Employment Status Index', 'Gender', 'Marital Status Index',
       'Number of Open Complaints', 'Number of Policies', 'Policy Type Index',
       'Policy Index', 'Renew Offer Type', 'Sales Channel Index',
       'Vehicle Size Index', 'Effective To Month', 'Years Since Last Claim',
       'Years Since Policy Inception', 'Customer Lifetime Value_bin',
       'Income_bin', 'Accident Likelihood_bin', 'Claim over 1k'],
      dtype='object')

In [711]:
cols_to_drop = ["Response", "Number of Open Complaints", "Policy Type Index"]

def run_drop(df_train, df_test):
  df_shorten_train = df_train.drop(columns=cols_to_drop, axis=1)
  df_shorten_test = df_test.drop(columns=cols_to_drop, axis=1)
  random_forest(df_shorten_train, df_shorten_test)
run_drop(df_balance_train, df_test_from_train_dataset)

Accuracy: 0.9215530903328051
Confusion Matrix:
[[1080   81]
 [  18   83]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1098
           1       0.82      0.51      0.63       164

    accuracy                           0.92      1262
   macro avg       0.88      0.74      0.79      1262
weighted avg       0.92      0.92      0.91      1262

f1 score: $0.6264150943396226
Sensitivity (Recall): 0.9836065573770492
Specificity: 0.5060975609756098
----------------
Accuracy: 0.929
Confusion Matrix:
[[888  58]
 [ 13  41]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       901
           1       0.76      0.41      0.54        99

    accuracy                           0.93      1000
   macro avg       0.85      0.70      0.75      1000
weighted avg       0.92      0.93      0.92      1000

f1 score: $0.5359477124183006
Sensitivity (Recall

In [712]:
run_iteration(df_train, 5, cols_to_drop)

Average over 5 runs:  0.9204000000000001 0.9826915598918889 0.4741148662724548


In [713]:
def standard_run(df_train, test_size):
  df_train.value_counts()
  # Get the params
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, standard_scaler, test_size) # minimize test dataset to get the most out

  rf_classifier = run_random_forest_oversample(X_train_scaled, y_train)
  test_monitor(rf_classifier, X_test_scaled, y_test)
  return rf_classifier

In [714]:
def specificity(y_true, y_pred):
    tp, fp, fn, tn = confusion_matrix(y_true, y_pred).T.ravel()
    # print(tn / (tn + fp))
    return tn / (tn + fp)

specificity_scorer = make_scorer(specificity)

def hyper_tune_random_forest(df_train):
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, standard_scaler, 0.00001)
  # Step 2: Define the model
  rf_classifier = RandomForestClassifier(random_state=42)

  # Step 3: Specify hyperparameters to tune
  param_grid = {
    'n_estimators': [100, 200],          # Number of trees
    'max_depth': [20, 30, 40, 50],             # Maximum depth of the trees
    'min_samples_split': [5, 10, 15],         # Minimum samples to split an internal node
    'min_samples_leaf': [5, 10, 15],          # Minimum samples at a leaf node
    'bootstrap': [False]
  }

  # Step 4: Set up GridSearchCV
  grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid,
                 cv=5, n_jobs=-1, verbose=2, scoring='f1')

  # Step 5: Fit the model
  grid_search.fit(X_train_scaled, y_train)

  # Step 6: Evaluate the best model
  best_model = grid_search.best_estimator_
  
  return best_model

# model = hyper_tune_random_forest(df_balance_train)
df_balance_train_shorten = df_balance_train.drop(columns=cols_to_drop, axis=1)
model = standard_run(df_balance_train_shorten, 0.2)

Accuracy: 0.9215530903328051
Confusion Matrix:
[[1080   81]
 [  18   83]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1098
           1       0.82      0.51      0.63       164

    accuracy                           0.92      1262
   macro avg       0.88      0.74      0.79      1262
weighted avg       0.92      0.92      0.91      1262

f1 score: $0.6264150943396226
Sensitivity (Recall): 0.9836065573770492
Specificity: 0.5060975609756098


In [715]:
print(model)
def see_performance_hyper_tune_random_forest():
  df_test_from_train_dataset_shorten = df_test_from_train_dataset.drop(columns=cols_to_drop, axis=1)
  old_X = df_test_from_train_dataset_shorten.drop("Claim over 1k", axis=1)
  old_X_scaled = normalize(old_X, standard_scaler)
  old_y = df_test_from_train_dataset_shorten["Claim over 1k"]
  test_monitor(model, old_X_scaled, old_y)
see_performance_hyper_tune_random_forest()

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=22,
                       min_samples_leaf=5, min_samples_split=10,
                       n_estimators=200, random_state=42)
Accuracy: 0.929
Confusion Matrix:
[[888  58]
 [ 13  41]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       901
           1       0.76      0.41      0.54        99

    accuracy                           0.93      1000
   macro avg       0.85      0.70      0.75      1000
weighted avg       0.92      0.93      0.92      1000

f1 score: $0.5359477124183006
Sensitivity (Recall): 0.9855715871254163
Specificity: 0.41414141414141414


In [716]:
submission_folder = "/kaggle/input/asna-sep-2024-submissions/"
df_submission = pd.read_csv(submission_folder + "my_submission.csv")
df_submission["Claim over 1k"].value_counts()

Claim over 1k
0    1693
1     151
Name: count, dtype: int64

In [717]:
df_best_submission = pd.read_csv(submission_folder + "best_submission.csv")
df_best_submission["Claim over 1k"].value_counts()

Claim over 1k
0    1695
1     149
Name: count, dtype: int64

In [718]:
df_bad_submission = pd.read_csv(submission_folder + "bad_submission.csv")
df_bad_submission["Claim over 1k"].value_counts()

Claim over 1k
0    1662
1     182
Name: count, dtype: int64

In [719]:
df_test = pd.read_csv(folder + "test.csv").drop(columns=["Coverage", "Education", "Employment Status", "Marital Status", "Policy Type", "Policy", "Sales Channel", "Vehicle Size"])
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1844 entries, 0 to 1843
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   CustomerID                     1844 non-null   object 
 1   State                          1844 non-null   object 
 2   Customer Lifetime Value        1844 non-null   float64
 3   Response                       1844 non-null   object 
 4   Coverage Index                 1844 non-null   int64  
 5   Education Index                1844 non-null   int64  
 6   Effective To Date              1844 non-null   object 
 7   Employment Status Index        1844 non-null   int64  
 8   Gender                         1844 non-null   object 
 9   Income                         1844 non-null   int64  
 10  Marital Status Index           1844 non-null   int64  
 11  Months Since Last Claim        1844 non-null   int64  
 12  Months Since Policy Inception  1844 non-null   i

In [720]:
X_test = df_test.drop("CustomerID", axis=1)
pre_process(X_test)
X_test.drop(columns=cols_to_drop, axis=1, inplace=True)
X_test_scaled = normalize(X_test, standard_scaler)

In [721]:
def predict(model, X_test):
  y_test_pred = model.predict(X_test)
  customer_ID = np.array(df_test["CustomerID"])
  result = np.column_stack((customer_ID, y_test_pred))
  return pd.DataFrame(result).rename(columns={0: "CustomerID", 1: "Claim over 1k"})

In [722]:
df_run_test_ouput = predict(model, X_test_scaled)

In [732]:
df_run_test_ouput["Claim over 1k"].value_counts()

Claim over 1k
0    1765
1      79
Name: count, dtype: int64

In [733]:
# compare the two results
def compare(df_1, df_2):
  cnt = 0
  for i in range(len(df_1)):
    if (df_1["Claim over 1k"][i] != df_2["Claim over 1k"][i]):
      cnt += 1
    if (df_1["CustomerID"][i] != df_2["CustomerID"][i]):
      print("Wrong order")
  print(cnt)

In [734]:
compare(df_submission, df_run_test_ouput)

80


In [735]:
compare(df_best_submission, df_run_test_ouput)

82


In [736]:
compare(df_submission, df_best_submission)

66


In [741]:
df_balance_process = balance_with_ratio(df_process, ratio) # note that this data has not been splitted like above to create df_train_from_train_dataset, df_test_from_train_dataset
df_balance_process_shorten = df_balance_process.drop(columns=cols_to_drop, axis=1)
relation_to_claim_over_1k(df_balance_process_shorten)

<Figure size 3000x8000 with 0 Axes>

In [742]:
random_forest(df_balance_process_shorten) # using run_random_forest_simple

Accuracy: 0.9195637355146558
Confusion Matrix:
[[1265   89]
 [  29   84]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1294
           1       0.74      0.49      0.59       173

    accuracy                           0.92      1467
   macro avg       0.84      0.73      0.77      1467
weighted avg       0.91      0.92      0.91      1467

f1 score: $0.5874125874125874
Sensitivity (Recall): 0.9775888717156105
Specificity: 0.48554913294797686


(0.9195637355146558, 0.9775888717156105, 0.48554913294797686)

In [743]:
my_model = standard_run(df_balance_process_shorten, 0.2)

Accuracy: 0.9168370824812543
Confusion Matrix:
[[1269   97]
 [  25   76]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1294
           1       0.75      0.44      0.55       173

    accuracy                           0.92      1467
   macro avg       0.84      0.71      0.75      1467
weighted avg       0.91      0.92      0.91      1467

f1 score: $0.5547445255474452
Sensitivity (Recall): 0.9806800618238022
Specificity: 0.4393063583815029


In [744]:
df_new_run_test_output = predict(my_model, X_test_scaled)
df_new_run_test_output["Claim over 1k"].value_counts()

Claim over 1k
0    1751
1      93
Name: count, dtype: int64

In [745]:
compare(df_submission, df_new_run_test_output)

68


In [746]:
compare(df_best_submission, df_new_run_test_output)

80


In [747]:
compare(df_best_submission, df_bad_submission)

113


In [748]:
compare(df_bad_submission, df_new_run_test_output)

99


In [749]:
df_new_run_test_output.to_csv("submission.csv", index=False)