In [340]:
# for data manipulation
import pandas as pd
import numpy as np
# for plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go # note that github wont be able to display these plots because they are interactive
# for some processing
import math
from datetime import datetime, timedelta

In [341]:
import sys
import os

class SuppressPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout  # Save the original stdout
        sys.stdout = open(os.devnull, 'w')  # Redirect stdout to null

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()  # Close the null file
        sys.stdout = self._original_stdout  # Restore original stdout

In [342]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report, make_scorer
from sklearn.utils import resample
from sklearn.feature_selection import RFE

In [343]:
folder = "kaggle/input/"
df = pd.read_csv(folder + "train.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290 entries, 0 to 7289
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   CustomerID                     7290 non-null   object 
 1   State                          7290 non-null   object 
 2   Customer Lifetime Value        7290 non-null   float64
 3   Response                       7290 non-null   object 
 4   Coverage                       7290 non-null   object 
 5   Coverage Index                 7290 non-null   int64  
 6   Education                      7290 non-null   object 
 7   Education Index                7290 non-null   int64  
 8   Effective To Date              7290 non-null   object 
 9   Employment Status              7290 non-null   object 
 10  Employment Status Index        7290 non-null   int64  
 11  Gender                         7290 non-null   object 
 12  Income                         7290 non-null   i

In [344]:
# drop the unnecessary columns
df_train = df.drop(columns = ["CustomerID", "Coverage", "Education", "Employment Status", "Marital Status", "Policy Type", "Policy", "Sales Channel", "Vehicle Size"])
df_train.info()
df_train["Claim over 1k"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290 entries, 0 to 7289
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   State                          7290 non-null   object 
 1   Customer Lifetime Value        7290 non-null   float64
 2   Response                       7290 non-null   object 
 3   Coverage Index                 7290 non-null   int64  
 4   Education Index                7290 non-null   int64  
 5   Effective To Date              7290 non-null   object 
 6   Employment Status Index        7290 non-null   int64  
 7   Gender                         7290 non-null   object 
 8   Income                         7290 non-null   int64  
 9   Marital Status Index           7290 non-null   int64  
 10  Months Since Last Claim        7290 non-null   int64  
 11  Months Since Policy Inception  7290 non-null   int64  
 12  Number of Open Complaints      7290 non-null   i

Claim over 1k
0    6458
1     832
Name: count, dtype: int64

In [345]:
from plotly.subplots import make_subplots

def relation_to_claim_over_1k(df):
  plt.figure(figsize=(30, 80))
  numerical_cols = df.columns.tolist()
  not_include = ["Customer Lifetime Value", "Income", "Claim over 1k"]
  numerical_cols = [col for col in numerical_cols if col not in not_include]
  
  number_of_rows = math.ceil(len(numerical_cols) / 2)
  fig = make_subplots(rows = number_of_rows, cols=2, subplot_titles=numerical_cols, vertical_spacing=0.01, horizontal_spacing=0.05)
  for idx, col in enumerate(numerical_cols):
    df_count = df.groupby([col, "Claim over 1k"]).size().reset_index(name='count')
    unique_val = df_count[col].unique()
    i, j = (idx // 2) + 1, (idx % 2) + 1
    for val in unique_val:
      df_cur = df_count[df_count[col] == val]
      fig.add_trace(
        go.Bar(
          x = df_cur["Claim over 1k"],
          y = df_cur["count"],
          name = f"{val}"
        ),
        row=i, col=j
      )
  fig.update_layout(
      height=300 * number_of_rows,  # Adjust figure height dynamically based on the number of rows
      showlegend=False,
      title_text="How Each Column Affects 'Claim over 1k'"
  )
  # Show the figure with all subplots
  fig.show()

In [346]:
def numeric_info(df, col):
  print(f"Min of {col}: ", df[col].min())
  print(f"Max of {col}: ", df[col].max())
  print(f"Mean of {col}: ", df[col].mean())
  print(f"Mendian of {col}: ", df[col].median())
  print(f"Std of {col}: ", df[col].std())

In [347]:
def binning(df, col, size):
  df[col + "_bin"] = pd.qcut(df[col], q=size, labels=False, duplicates="drop")

def convert(n, range_max, range_min):
  return n / (range_max - range_min)

def pre_process(df):
  # State
  unique_states = df["State"].unique()
  dict_states = {}
  for i in range(len(unique_states)):
    dict_states[unique_states[i]] = i
  df["State"] = df["State"].apply(lambda x: dict_states[x])
  
  # Policy Index
  df["Policy Index"] = df["Policy Index"] // 3

  # Response
  # note that resonse should be either yes or no
  df["Response"] = df["Response"].apply(lambda x: 1 if x == "Yes" else 0)

  # Effective To Date
  df["Effective To Date"] = pd.to_datetime(df["Effective To Date"])
  df["Effective To Season"] = (df["Effective To Date"].dt.month - 1) // 3 # split into 4 seasons
  df.drop("Effective To Date", axis=1, inplace=True)

  # Months Since Last Claim, Months Since Last Inception
  binning(df, "Months Since Last Claim", 10)
  binning(df, "Months Since Policy Inception", 10)

  # Gender
  df["Gender"] = df["Gender"].apply(lambda x: 1 if x == "M" else 0)

  # CLV
  binning(df, "Customer Lifetime Value", 50)
  # Categorize CLV
  clv_bins = [df['Customer Lifetime Value'].min(), 5000, 10000, df['Customer Lifetime Value'].max()]
  clv_labels = [0, 1, 2]
  df['CLV_Category'] = pd.cut(df['Customer Lifetime Value'], bins=clv_bins, labels=clv_labels, include_lowest=True)
  
  # Income
  binning(df, "Income", 30)
  # Categorize Income
  income_bins = [df['Income'].min(), 40000, 80000, df['Income'].max()]
  income_labels = [0, 1, 2]
  df['Income_Category'] = pd.cut(df['Income'], bins=income_bins, labels=income_labels, include_lowest=True)

  # New col: CLV per number of policies and months since policy inception
  df["Money Spent per Policy"] = df["Customer Lifetime Value"] / ((df["Coverage Index"] + 1) * (df["Number of Policies"] + 1))

  binning(df, "Money Spent per Policy", 50)

  # New col: Accident Likelihood
  max_education_index = df["Education Index"].max()
  min_education_index = df["Education Index"].min()
  max_income_bin = df["Income_bin"].max()
  min_income_bin = df["Income_bin"].min()
  max_marital_status_index = df["Marital Status Index"].max()
  min_marital_status_index = df["Marital Status Index"].min()
  df["Accident Likelihood"] = np.exp( 4 * convert(max_education_index - df["Education Index"], max_education_index, min_education_index) \
    + 3 * convert(max_income_bin - df["Income_bin"], max_income_bin, min_income_bin) \
    + 3 * (df["Marital Status Index"].apply(lambda x: max_marital_status_index if x == 0 else 0)))

  binning(df, "Accident Likelihood", 10)

  # New col: Claim over 1k Likelihood based on State and Gender
  group_over = ["State", "Gender", "Employment Status Index", "Income_bin"]
  likelihood_df = df.groupby(group_over)["Accident Likelihood"].mean().reset_index()
  likelihood_df.rename(columns={"Accident Likelihood": "Location Based Likelihood"}, inplace=True)
  # Merge the likelihood values back to the original dataframe
  df = pd.merge(df, likelihood_df, on=group_over, how="left")
  
  binning(df, "Location Based Likelihood", 5)

  # New col: may relate to the Customer Lifetime Value
  df["Insurance"] = np.exp(convert(df["Number of Policies"], df["Number of Policies"].max(), df["Number of Policies"].min()) \
    + convert(df["Coverage Index"], df["Coverage Index"].max(), df["Coverage Index"].min()) \
    + convert(df["Number of Policies"], df["Number of Policies"].max(), df["Number of Policies"].min()) * convert(df["Coverage Index"], df["Coverage Index"].max(), df["Coverage Index"].min())
    # + 0.5 * convert(df["Policy Index"], df["Policy Index"].max(), df["Policy Index"].min()) \
    # + 0.5 * convert(df["Sales Channel Index"], df["Sales Channel Index"].max(), df["Sales Channel Index"].min()) \
    # + 0.5 * convert(df["Renew Offer Type"], df["Renew Offer Type"].max(), df["Renew Offer Type"].min())
    # + 0.5 * convert(df["Months Since Policy Inception"], df["Months Since Policy Inception"].max(), df["Months Since Policy Inception"].min())
    )

  binning(df, "Insurance", 40)

  # New col: may relate to Cusomter Lifetime Value
  df["Customer Interaction"] = np.exp(convert(df["Response"], df["Response"].max(), df["Response"].min()) \
    + convert(df["Number of Open Complaints"], df["Number of Open Complaints"].max(), df["Number of Open Complaints"].min()))
  
  binning(df, "Customer Interaction", 30)

  # New col: may realte to Accident Likelihood
  group_datetime = ["Months Since Last Claim", "Effective To Season"]
  datetime_df = df.groupby(group_datetime)["Accident Likelihood"].mean().reset_index()
  datetime_df.rename(columns={"Accident Likelihood": "Date Time"}, inplace=True)
  # Merge
  df = pd.merge(df, datetime_df, on=group_datetime, how="left")

  binning(df, "Date Time", 30)

  # cols_to_remove = ["Customer Lifetime Value", "Accident Likelihood", "Income", "Insurance", "Location Based Likelihood", "Date Time", \
  #   "Months Since Policy Inception", "Vehicle Size Index", "Gender", "Effective To Season", "Policy Index", "Customer Interaction", \
  #   "Education Index", "Months Since Last Claim", "Renew Offer Type", "State", "Number of Open Complaints", "Sales Channel Index", "Response", "Policy Type Index"
  #   ]
  # df.drop(columns=cols_to_remove, axis=1, inplace=True)
  # cols_to_use = ["Customer Lifetime Value", "Money Spent per Policy", "Accident Likelihood", "Number of Policies", "Income", "Coverage Index", \
  #   "Employment Status Index", "Insurance", "Marital Status Index", "Location Based Likelihood", "Claim over 1k", \
  #   "Date Time", "Education Index", "Months Since Last Claim", "Vehicle Size Index", "Gender", "Months Since Policy Inception"
  #   ]
  
  cols_to_use = ["Customer Lifetime Value", "Money Spent per Policy_bin", "Accident Likelihood_bin", "Number of Policies", "Income_bin", "Coverage Index", \
    "Employment Status Index", "Insurance_bin", "Marital Status Index", "Location Based Likelihood_bin", \
    "Date Time", "Months Since Last Claim", "Gender", "Months Since Policy Inception", \
    "CLV_Category", "Income_Category"
    ]
  if ("Claim over 1k" in df.columns.tolist()):
    cols_to_use.append("Claim over 1k")
  return df

In [348]:
df_process = pre_process(df_train.copy())

In [349]:
relation_to_claim_over_1k(df_process)







<Figure size 3000x8000 with 0 Axes>

In [350]:
import shap

def shap_plot(df_train):
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  test_size = 0.2
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)

  # Train a Random Forest model
  model = RandomForestClassifier()
  model.fit(X_train, y_train)
  # Initialize the SHAP explainer
  explainer = shap.Explainer(model, feature_names=X.columns)
  
  # Calculate SHAP values for the test dataset
  shap_values = explainer(X_test)

  print(shap_values.shape)
  shap_values_0 = shap_values[:, :, 0]
  shap_values_1 = shap_values[:, :, 1]
  shap.plots.bar(shap_values_0, max_display=400)
  shap.plots.bar(shap_values_1, max_display=400)
  print(shap_values_0.shape, shap_values_1.shape)
  return shap_values
# shap_values = shap_plot(df_process)

In [351]:
def test_monitor(model, X, y):
  threshold = 0.5
  # Predict the test set
  y_proba = model.predict_proba(X)
  y_pred = (y_proba[:, 1] >= threshold).astype(int)
  # Evaluate the model
  accuracy = accuracy_score(y, y_pred)
  conf_matrix = confusion_matrix(y, y_pred).T
  class_report = classification_report(y, y_pred)

  # Calculate accuracy and F1 score
  print(f"Accuracy: {accuracy}")
  print("Confusion Matrix:")
  print(conf_matrix)
  print("Classification Report:")
  print(class_report)
  f1 = f1_score(y, y_pred)
  print(f"f1 score: ${f1}")
  
  #            Actual
  #             0   1
  # Predict 0   TP  FP
  #         1   FN  TN
  # Evaluate the model
  tp, fp, fn, tn = conf_matrix.ravel()
  # Calculate sensitivity and specificity
  sensitivity = tp / (tp + fn)  # True Positive Rate
  specificity = tn / (tn + fp)  # True Negative Rate

  print("Sensitivity (Recall):", sensitivity)
  print("Specificity:", specificity)

  return accuracy, sensitivity, specificity

In [352]:
def normalize(X, scaler):
  return scaler.fit_transform(X)

def get_train_params(X, y, test_size):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)
  return X_train, X_test, y_train, y_test

In [353]:
def run_random_forest_normal(X_train, y_train):
  # Initialize the Random Forest Classifier
  rf_classifier = RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=30, min_samples_leaf=2, min_samples_split=5, bootstrap=False)

  rf_classifier.fit(X_train, y_train)
  return rf_classifier

def random_forest(df_train):
  df_train.value_counts()
  # Get the params
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, 0.2)

  rf_classifier = run_random_forest_normal(X_train_scaled, y_train)
  accuracy_from_train, sensitivity_from_train, specificity_from_train = test_monitor(rf_classifier, X_test_scaled, y_test)
  
  return accuracy_from_train, sensitivity_from_train, specificity_from_train

In [354]:
def run_iteration_simple(df_train, times, cols=[]):
  accuracy, sensitivity, specificity = 0, 0, 0
  df_shorten_train = df_train.drop(columns=cols, axis=1)
  for _ in range(times):
    with SuppressPrints():
      accuracy_get, sensitivity_get, specificity_get = random_forest(df_shorten_train)
      accuracy += float(accuracy_get)
      sensitivity += float(sensitivity_get)
      specificity += float(specificity_get)
  print(f"Average over {times} runs: ", accuracy / times, sensitivity / times, specificity / times)

In [355]:
random_forest(df_process) # test with no oversample

Accuracy: 0.9286694101508917
Confusion Matrix:
[[1276   80]
 [  24   78]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1300
           1       0.76      0.49      0.60       158

    accuracy                           0.93      1458
   macro avg       0.85      0.74      0.78      1458
weighted avg       0.92      0.93      0.92      1458

f1 score: $0.6
Sensitivity (Recall): 0.9815384615384616
Specificity: 0.4936708860759494


(0.9286694101508917, 0.9815384615384616, 0.4936708860759494)

In [356]:
run_iteration_simple(df_process, 5)

Average over 5 runs:  0.9310013717421125 0.9824615384615385 0.5075949367088608


In [357]:
def hyper_tune_random_forest(df_train):
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, 0.2)
  # Step 2: Define the model
  rf_classifier = RandomForestClassifier(random_state=42)

  # Step 3: Specify hyperparameters to tune
  param_grid = {
    'n_estimators': [100, 200, 250],          # Number of trees
    'max_depth': [10, 20, 30],             # Maximum depth of the trees
    'min_samples_split': [5, 10],         # Minimum samples to split an internal node
    'min_samples_leaf': [2, 5, 10],          # Minimum samples at a leaf node
    'bootstrap': [False, True],
    'class_weight': [None, 'balanced']
  }

  # Step 4: Set up GridSearchCV
  grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid,
                 cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

  # Step 5: Fit the model
  grid_search.fit(X_train_scaled, y_train)

  # Step 6: Evaluate the best model
  best_model = grid_search.best_estimator_
  
  return best_model
# best_model = hyper_tune_random_forest(df_process)
# print(best_model)

In [358]:
def train_simple(df_train, test_size):
  df_train.value_counts()
  # Get the params
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train_scaled, X_test_scaled, y_train, y_test = get_train_params(X, y, test_size)

  rf_classifier = run_random_forest_normal(X_train_scaled, y_train)
  test_monitor(rf_classifier, X_test_scaled, y_test)
  return rf_classifier

In [359]:
my_model = train_simple(df_process, 0.2)

Accuracy: 0.9307270233196159
Confusion Matrix:
[[1278   79]
 [  22   79]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1300
           1       0.78      0.50      0.61       158

    accuracy                           0.93      1458
   macro avg       0.86      0.74      0.79      1458
weighted avg       0.92      0.93      0.92      1458

f1 score: $0.61003861003861
Sensitivity (Recall): 0.9830769230769231
Specificity: 0.5


In [360]:
df_test = pd.read_csv(folder + "test.csv").drop(columns=["Coverage", "Education", "Employment Status", "Marital Status", "Policy Type", "Policy", "Sales Channel", "Vehicle Size"])
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1844 entries, 0 to 1843
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   CustomerID                     1844 non-null   object 
 1   State                          1844 non-null   object 
 2   Customer Lifetime Value        1844 non-null   float64
 3   Response                       1844 non-null   object 
 4   Coverage Index                 1844 non-null   int64  
 5   Education Index                1844 non-null   int64  
 6   Effective To Date              1844 non-null   object 
 7   Employment Status Index        1844 non-null   int64  
 8   Gender                         1844 non-null   object 
 9   Income                         1844 non-null   int64  
 10  Marital Status Index           1844 non-null   int64  
 11  Months Since Last Claim        1844 non-null   int64  
 12  Months Since Policy Inception  1844 non-null   i

In [364]:
X_test = pre_process(df_test.copy()).drop("CustomerID", axis=1)

In [365]:
def predict(model, X_test):
  y_test_pred = model.predict(X_test)
  customer_ID = np.array(df_test["CustomerID"])
  result = np.column_stack((customer_ID, y_test_pred))
  return pd.DataFrame(result).rename(columns={0: "CustomerID", 1: "Claim over 1k"})

In [366]:
df_run_test_ouput = predict(my_model, X_test)

In [367]:
df_run_test_ouput["Claim over 1k"].value_counts()

Claim over 1k
0    1700
1     144
Name: count, dtype: int64

In [368]:
df_run_test_ouput.to_csv("new_submission/submission.csv", index=False)

In [369]:
import lightgbm as lgb



def run_light_gbm(X_train, X_test, y_train, y_test):
  # Create LightGBM datasets
  train_data = lgb.Dataset(X_train, label=y_train)
  test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

  # Set parameters
  params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'max_depth': -1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1
  }

  # Train the model
  num_round = 100
  bst = lgb.train(params, train_data, num_round, valid_sets=test_data)

  # Predict on the test set
  y_pred = bst.predict(X_test)
  y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

  # Evaluate the model
  accuracy = accuracy_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred).T
  class_report = classification_report(y_test, y_pred)

  # Calculate accuracy and F1 score
  print(f"Accuracy: {accuracy}")
  print("Confusion Matrix:")
  print(conf_matrix)
  print("Classification Report:")
  print(class_report)
  f1 = f1_score(y_test, y_pred)
  print(f"f1 score: ${f1}")
  
  #            Actual
  #             0   1
  # Predict 0   TP  FP
  #         1   FN  TN
  # Evaluate the model
  tp, fp, fn, tn = conf_matrix.ravel()
  # Calculate sensitivity and specificity
  sensitivity = tp / (tp + fn)  # True Positive Rate
  specificity = tn / (tn + fp)  # True Negative Rate

  print("Sensitivity (Recall):", sensitivity)
  print("Specificity:", specificity)

  return bst, accuracy, sensitivity, specificity

def light_gbm(df_train):
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train, X_test, y_train, y_test = get_train_params(X, y, 0.2)

  bst, accuracy, sensitivity, specificity = run_light_gbm(X_train, X_test, y_train, y_test)
  return bst, accuracy, sensitivity, specificity

In [370]:
bst = light_gbm(df_process)[0]

[LightGBM] [Info] Number of positive: 674, number of negative: 5158
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1804
[LightGBM] [Info] Number of data points in the train set: 5832, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115569 -> initscore=-2.035074
[LightGBM] [Info] Start training from score -2.035074
Accuracy: 0.9300411522633745
Confusion Matrix:
[[1276   78]
 [  24   80]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1300
           1       0.77      0.51      0.61       158

    accuracy                           0.93      1458
   macro avg       0.86      0.74      0.79      1458
weighted avg       0.92      0.93      0.92      1458

f1 score: $0.610687022

In [371]:
def run_iteration_light_gbm(df_train, times, cols=[]):
  accuracy, sensitivity, specificity = 0, 0, 0
  df_shorten_train = df_train.drop(columns=cols, axis=1)
  for _ in range(times):
    with SuppressPrints():
      bst, accuracy_get, sensitivity_get, specificity_get = light_gbm(df_shorten_train)
      accuracy += float(accuracy_get)
      sensitivity += float(sensitivity_get)
      specificity += float(specificity_get)
  print(f"Average over {times} runs: ", accuracy / times, sensitivity / times, specificity / times)

In [372]:
run_iteration_light_gbm(df_process, 20)

Average over 20 runs:  0.9300411522633745 0.9815384615384616 0.5063291139240504


In [373]:
def hyper_tune_light_gbm(df_train):
  X = df_train.drop("Claim over 1k", axis=1)
  y = df_train["Claim over 1k"]
  X_train, X_test, y_train, y_test = get_train_params(X, y, 0.2)
  # Define the model
  lgb_model = lgb.LGBMClassifier()

  # Define the hyperparameter grid
  param_grid = {
      'num_leaves': [31, 50, 70],
      'max_depth': [-1, 5, 10],
      'learning_rate': [0.01, 0.1, 0.2],
      'n_estimators': [100, 200]
  }

  # Set up the Grid Search
  grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, 
                            scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

  # Fit the Grid Search
  grid_search.fit(X_train, y_train)

  best_model = grid_search.best_estimator_
  return best_model
# hyper_tune_light_gbm(df_process)

In [374]:
def predict_light_gbm(bst, X_test):
  y_pred = bst.predict(X_test)
  y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
  customer_ID = np.array(df_test["CustomerID"])
  result = np.column_stack((customer_ID, y_pred))
  return pd.DataFrame(result).rename(columns={0: "CustomerID", 1: "Claim over 1k"})

In [375]:
df_light_gbm_run_test_output = predict_light_gbm(bst, X_test)

In [376]:
df_light_gbm_run_test_output["Claim over 1k"].value_counts()

Claim over 1k
0    1695
1     149
Name: count, dtype: int64

In [377]:
df_run_test_ouput.to_csv("new_submission/light_gbm_submission.csv", index=False)

In [378]:
from sklearn.model_selection import RepeatedStratifiedKFold
from lightgbm import LGBMClassifier

In [379]:
def get_results(model, cv, X, y):
    metrics = {'accuracy' : make_scorer(accuracy_score), 
               'f1' : make_scorer(f1_score), 
               'precision' : make_scorer(precision_score),
               'recall' : make_scorer(recall_score)}
    
    scores =  cross_validate(model, X, y, scoring = metrics, cv = cv, n_jobs = -1)
    return scores
    
# def print_results(model, cv, X, y):
#     accuracy = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1)
#     f1 = cross_val_score(model, X, y, scoring = 'f1', cv = cv, n_jobs = -1)
#     print(f"Accuracy: {accuracy}")
#     print(f"F1 Score: {f1}")
#     print("--------------------------------------")
#     print(f"Mean Accuracy: {accuracy.mean()}")
#     print(f"F1 Accuracy: {f1.mean()}")
    
def print_results(res):
  for key,val in res.items():
    if key == 'fit_time' or key == 'score_time': continue 
    print(f'{key}: {val.mean()} ± {val.std()}')
    print(val)
    print('\n')

In [380]:
def run_k_fold_light_gbm(df_train):
  X = df_train.drop(columns=["Claim over 1k"], axis=1)
  y = df_train["Claim over 1k"]
  
  value_counts = df_train['Claim over 1k'].value_counts()
  weighting = value_counts[0] / value_counts[1]

  model = LGBMClassifier(verbose=-1)

  kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

  res = get_results(model, kf, X, y)
  print_results(res)
  

In [381]:
run_k_fold_light_gbm(df_process)

test_accuracy: 0.9323319615912208 ± 0.005296079392460915
[0.93209877 0.91906722 0.93415638 0.93347051 0.93964335 0.93209877
 0.92866941 0.92866941 0.93004115 0.93484225 0.92935528 0.93415638
 0.94718793 0.93072702 0.93552812 0.93484225 0.93141289 0.93689986
 0.92866941 0.93072702 0.92935528 0.93484225 0.93552812 0.92729767
 0.92729767 0.92866941 0.9266118  0.93964335 0.93347051 0.92866941
 0.91975309 0.93552812 0.93621399 0.93895748 0.93415638 0.93621399
 0.9266118  0.9266118  0.93895748 0.93141289 0.93415638 0.94170096
 0.92592593 0.92455418 0.93552812 0.93689986 0.93758573 0.92798354
 0.93552812 0.92866941]


test_f1: 0.657067003899264 ± 0.02993917731038307
[0.66211604 0.57857143 0.66666667 0.64981949 0.69863014 0.64516129
 0.63120567 0.64383562 0.66       0.6779661  0.6360424  0.66433566
 0.74587459 0.63799283 0.68243243 0.67128028 0.65277778 0.66666667
 0.63888889 0.65762712 0.64359862 0.67353952 0.66901408 0.6369863
 0.61313869 0.63380282 0.63481229 0.70666667 0.66435986 0.6338028

In [382]:
def train_k_fold_lgbm(df_train):
  X = df_train.drop(columns=["Claim over 1k"], axis=1)
  y = df_train["Claim over 1k"]
  X_train, X_test, y_train, y_test = get_train_params(X, y, 0.2)

  model = LGBMClassifier()
  model.fit(X_train, y_train)
  return model
lbmb_model = train_k_fold_lgbm(df_process)

[LightGBM] [Info] Number of positive: 674, number of negative: 5158
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1804
[LightGBM] [Info] Number of data points in the train set: 5832, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115569 -> initscore=-2.035074
[LightGBM] [Info] Start training from score -2.035074


In [383]:
df_k_fold_light_gbm_run_test = predict(lbmb_model, X_test)

In [384]:
df_k_fold_light_gbm_run_test["Claim over 1k"].value_counts()

Claim over 1k
0    1699
1     145
Name: count, dtype: int64

In [385]:
df_k_fold_light_gbm_run_test.to_csv("new_submission/submission.csv", index=False)