In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score, make_scorer, silhouette_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cluster import DBSCAN, KMeans
from sklearn.mixture import BayesianGaussianMixture
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA

from category_encoders import TargetEncoder
import lightgbm as lgb

In [71]:
#* I import different version of our dataset so we can see how different transformations effect the models predictions 


X_train = pd.read_csv("X_train_tran.csv")
y_train = pd.read_csv("y_train_tran.csv")
y_train = np.ravel(y_train)
X_val = pd.read_csv("X_val_tran.csv")
y_val = pd.read_csv("y_val_tran.csv")
myColumns = X_train.columns
print(X_val.shape)
print(y_val.shape)
print(X_val.shape)
print(y_val.shape)


X = pd.read_csv("X.csv")
X_train_cluster = pd.read_csv("X_train_cluster.csv")
X_val_cluster = pd.read_csv("X_val_cluster.csv")
X_test_cluster = pd.read_csv("X_test_cluster.csv")

(1000, 15)
(1000, 1)
(1000, 15)
(1000, 1)


In [9]:
#
#! GradientBoostingClassifier

continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']

# Define the parameter grid to search over
param_grid = {
    'classifier__n_estimators': [250, 300, 350],
    'classifier__learning_rate': [0.01, 0.05, 0.08, 0.1],
    'classifier__max_depth': [3, 4, 5]
}

prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), continuous_features_engine),
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])

gradientBoosting_cls = GradientBoostingClassifier()

# Create the pipeline
pipe_gbcls = Pipeline([
    ('preprocessor', prep),
    ('classifier', gradientBoosting_cls)
])

# Create the GridSearchCV object
grid_search_gbcls = GridSearchCV(pipe_gbcls, param_grid, cv=3, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_gbcls.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_gbcls.best_params_)
print("Best Score (F1 Macro):", grid_search_gbcls.best_score_)

# Get the best model from the grid search
best_model = grid_search_gbcls.best_estimator_

y_pred = best_model.predict(X_val_cluster)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Parameters: {'classifier__learning_rate': 0.08, 'classifier__max_depth': 4, 'classifier__n_estimators': 300}
Best Score (F1 Macro): 0.7363049516612827

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       796
           1       0.72      0.45      0.56       204

    accuracy                           0.85      1000
   macro avg       0.80      0.70      0.73      1000
weighted avg       0.84      0.85      0.84      1000



<font color='lightblue'> 
Ok, so the general approach is to first test a few models that are often recommended for binary classification problems. The based on how they preform un-optimized , i will select that model and try to fine tune it using gridsearchCV
</font>

<font color='Tangerine'> 
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Warning: No categorical columns found. Calling 'transform' will only return input data.
Best Parameters: {'classifier__learning_rate': 0.08, 'classifier__max_depth': 4, 'classifier__n_estimators': 300}
Best Score (F1 Macro): 0.7363049516612827

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       796
           1       0.72      0.45      0.56       204

    accuracy                           0.85      1000
   macro avg       0.80      0.70      0.73      1000
weighted avg       0.84      0.85      0.84      1000
</font>

In [11]:
#
#! RandomForestClassifier

continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']


prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), continuous_features_engine),
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])


# Define the parameter grid
param = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5, 10],
}


# Define the classifier
rf_cls = RandomForestClassifier()


# Create the pipeline
pipe_rf_cls = Pipeline(steps=[
    ('preprocessor', prep),
    ('classifier', rf_cls)
])


# Perform grid search
grid_search = GridSearchCV(pipe_rf_cls, param, cv=5, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Use the best model for prediction
y_pred = best_model.predict(X_val_cluster)

# Generate the classification report
report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       796
           1       0.71      0.43      0.54       204

    accuracy                           0.85      1000
   macro avg       0.79      0.69      0.72      1000
weighted avg       0.84      0.85      0.83      1000



<font color='Tangerine'> 
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Warning: No categorical columns found. Calling 'transform' will only return input data.
Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       796
           1       0.71      0.43      0.54       204

    accuracy                           0.85      1000
   macro avg       0.79      0.69      0.72      1000
weighted avg       0.84      0.85      0.83      1000
</font>

In [14]:
#
#! RandomForestClassifier

param_grid = {
    'preprocessor__target__smoothing': [1.0,2, 5, 7],
    'classifier__n_estimators': [250, 300, 350],
    'classifier__max_depth': [6,7,8,9]
}


continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']

all_features = continuous_features + ordinal_features + cluster_features + continuous_features_engine + categorical_features


prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), all_features),   
    ])


rf_cls = RandomForestClassifier()



# Create the pipeline
pipe_rf_cls_target = Pipeline(steps=[
    ('preprocessor', prep),
    ('classifier', rf_cls)
])


# pipe_target_enc
# Create the GridSearchCV object
grid_search_rf_target = GridSearchCV(pipe_rf_cls_target, param_grid, cv=3, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
# grid_search.fit(X_train, y_train)
grid_search_rf_target.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_rf_target.best_params_)
print("Best Score (F1 Macro):", grid_search_rf_target.best_score_)

# Get the best model from the grid search
best_model = grid_search_rf_target.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_val_cluster)

# Generate the classification report
report = classification_report(y_val, y_pred)
print("Classification Report:")
print(report)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'classifier__max_depth': 9, 'classifier__n_estimators': 250, 'preprocessor__target__smoothing': 5}
Best Score (F1 Macro): 0.725606372969776
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.91       796
           1       0.76      0.43      0.55       204

    accuracy                           0.86      1000
   macro avg       0.82      0.70      0.73      1000
weighted avg       0.85      0.86      0.84      1000



<font color='Tangerine'> 
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Warning: No categorical columns found. Calling 'transform' will only return input data.
Best Parameters: {'classifier__max_depth': 9, 'classifier__n_estimators': 250, 'preprocessor__target__smoothing': 5}
Best Score (F1 Macro): 0.725606372969776
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.91       796
           1       0.76      0.43      0.55       204

    accuracy                           0.86      1000
   macro avg       0.82      0.70      0.73      1000
weighted avg       0.85      0.86      0.84      1000
</font>

In [22]:
#
#! LGBMClassifier

continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']


prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), continuous_features_engine),
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])




param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__num_leaves': [10, 20, 30],
    'classifier__min_child_samples': [10, 20, 30],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__reg_alpha': [0.0, 0.1, 0.5],
    'classifier__scale_pos_weight': [1, 2, 5]
}


# Define the classifier
lgm_cls = lgb.LGBMClassifier()


# Create the pipeline
pipe_lgm_cls = Pipeline(steps=[
    ('preprocessor', prep),
    ('classifier', lgm_cls)
])


# Perform grid search
grid_search_lgm_cls = GridSearchCV(pipe_lgm_cls, param_grid, cv=5, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_lgm_cls.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_lgm_cls.best_params_)

# Get the best model from the grid search
best_model = grid_search_lgm_cls.best_estimator_

# Use the best model for prediction
y_pred = best_model.predict(X_val_cluster)

# Generate the classification report
report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


ValueError: Invalid parameter 'max_depth' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('target', TargetEncoder(),
                                                  ['BalanceCredit',
                                                   'AgeProducts',
                                                   'SatisfactionProducts',
                                                   'CardProducts',
                                                   'TenurePoints']),
                                                 ('scaler', StandardScaler(),
                                                  ['CreditScore', 'Age',
                                                   'Balance', 'NumOfProducts',
                                                   'EstimatedSalary',
                                                   'PointsEarned']),
                                                 ('ordinal', 'passthrough',
                                                  ['Tenure', 'CardTypeOrd',
                                                   'NumOfProducts',
                                                   'NumOfProducts']),
                                                 ('cluster', 'passthrough',
                                                  ['ClusterKMeans',
                                                   'ClusterBGM']),
                                                 ('categorical', 'passthrough',
                                                  ['HasCrCard',
                                                   'IsActiveMember',
                                                   'GenderBinary',
                                                   'GenderBinary'])])),
                ('classifier', LGBMClassifier())]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [23]:
#
#! LGBMClassifier

continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']


# Define the classifier
lgm_cls = lgb.LGBMClassifier()

# Define the parameter grid for the LGBMClassifier estimator
lgm_cls_params = {
    'learning_rate': [0.01, 0.05, 0.8],
    'max_depth': [3, 5, 7],
    'num_leaves': [10, 20, 30],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.6, 0.8, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5],
    'scale_pos_weight': [1, 2, 5]
}

prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), continuous_features_engine),
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])


# Update the parameter grid to include the parameters of the LGBMClassifier estimator
param_grid = {
    'classifier__' + k: v for k, v in lgm_cls_params.items()
}

# Create the pipeline
pipe_lgm_cls = Pipeline(steps=[
    ('preprocessor', prep),
    ('classifier', lgm_cls)
])

# Perform grid search
grid_search_lgm_cls = GridSearchCV(pipe_lgm_cls, param_grid, cv=5, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_lgm_cls.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_lgm_cls.best_params_)

# Get the best model from the grid search
best_model = grid_search_lgm_cls.best_estimator_

# Use the best model for prediction
y_pred = best_model.predict(X_val_cluster)

# Generate the classification report
report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 5 folds for each of 1458 candidates, totalling 7290 fits


Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_samples': 10, 'classifier__num_leaves': 10, 'classifier__reg_alpha': 0.0, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.6}

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       796
           1       0.62      0.60      0.61       204

    accuracy                           0.84      1000
   macro avg       0.76      0.75      0.76      1000
weighted avg       0.84      0.84      0.84      1000



<font color='Tangerine'> 
Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_samples': 10, 'classifier__num_leaves': 10, 'classifier__reg_alpha': 0.0, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.6}

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       796
           1       0.62      0.60      0.61       204

    accuracy                           0.84      1000
   macro avg       0.76      0.75      0.76      1000
weighted avg       0.84      0.84      0.84      1000
</font>

<font color='Tangerine'> 
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4}
Best Score (F1 Macro): 0.7037421002959607
Model name:  RandomForestClassifier
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.99      0.93       796
           1       0.89      0.44      0.59       204

    accuracy                           0.87      1000
   macro avg       0.88      0.71      0.76      1000
weighted avg       0.88      0.87      0.86      1000
</font>

In [26]:
continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']


# Increase the weight value in proportion to its frequency to balance a bit for the current undersampling
class_weights = dict(zip(np.unique(y_train), [len(y_train) / (len(np.unique(y_train)) * np.bincount(y_train)[i]) for i in np.unique(y_train)]))


# Create an instance of the LGBMClassifier
lgm = lgb.LGBMClassifier(class_weight=class_weights)

# Define the parameter grid
param_grid = {
    'classifier__num_leaves': [20,25,30],
    'classifier__max_depth': [7,8,9],
    'classifier__learning_rate': [0.05, 0.1,0.2,0.25]
}


prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), continuous_features_engine),
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])

# Create the pipeline
pipe_lgm_cls_222 = Pipeline([
    ('preprocessor', prep),
    ('classifier', lgm)
])

# Create the GridSearchCV object
grid_search_lgm_222 = GridSearchCV(pipe_lgm_cls_222, param_grid, cv=3, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_lgm_222.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_lgm_222.best_params_)
print("Best Score (F1 Macro):", grid_search_lgm_222.best_score_)

# Get the best model from the grid search
best_model = grid_search_lgm_222.best_estimator_


y_pred = best_model.predict(X_val_cluster)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Parameters: {'classifier__learning_rate': 0.25, 'classifier__max_depth': 7, 'classifier__num_leaves': 30}
Best Score (F1 Macro): 0.7354734319564414

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       796
           1       0.53      0.59      0.56       204

    accuracy                           0.81      1000
   macro avg       0.71      0.73      0.72      1000
weighted avg       0.82      0.81      0.81      1000



<font color='Tangerine'> 
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Warning: No categorical columns found. Calling 'transform' will only return input data.
Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__num_leaves': 20}
Best Score (F1 Macro): 0.7398664837604242

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       796
           1       0.78      0.54      0.64       204

    accuracy                           0.88      1000
   macro avg       0.84      0.75      0.78      1000
weighted avg       0.87      0.88      0.87      1000
</font>

In [14]:
target_enc_features = ['GenderBinary', 'CountryOrd', 'SurnameOrd', 'HasCrCard', 'IsActiveMember', 'Tenure', 'NumOfProducts',
                       'SatisfactionScore', 'PointsEarnedQuant', 'AgeOrd', 'TaxBracket', 'BalanceOrd']


# Increase the weight value in proportion to its frequency to balance a bit for the current undersampling
class_weights = dict(zip(np.unique(y_train), [len(y_train) / (len(np.unique(y_train)) * np.bincount(y_train)[i]) for i in np.unique(y_train)]))


# Create an instance of the LGBMClassifier
logreg = LogisticRegression(class_weight=class_weights) # class_weight=class_weights

# Define the parameter grid
param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l2']
}

preprocessor = ColumnTransformer([
    ('target_encoder', TargetEncoder(), target_enc_features),
])


# Create the pipeline
pipe_logReg = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', logreg)
])

# Create the GridSearchCV object
grid_search = GridSearchCV(pipe_logReg, param_grid, cv=5, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score (F1 Macro):", grid_search.best_score_)

# Get the best model from the grid search
best_model = grid_search.best_estimator_


y_pred = best_model.predict(X_val)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'classifier__C': 1.0, 'classifier__penalty': 'l2'}
Best Score (F1 Macro): 0.6389185818926236

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.69      0.78       796
           1       0.36      0.70      0.48       204

    accuracy                           0.69      1000
   macro avg       0.63      0.69      0.63      1000
weighted avg       0.79      0.69      0.72      1000



In [28]:
continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']


# Increase the weight value in proportion to its frequency to balance a bit for the current undersampling
class_weights = dict(zip(np.unique(y_train), [len(y_train) / (len(np.unique(y_train)) * np.bincount(y_train)[i]) for i in np.unique(y_train)]))


# Create an instance of the LGBMClassifier
# lgm = lgb.LGBMClassifier(class_weight=class_weights)
logreg = LogisticRegression(max_iter=1500 ,class_weight=class_weights) # class_weight=class_weights

# Define the parameter grid
param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l2']
}


prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), continuous_features_engine),
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])

# Create the pipeline
pipe_log_reg = Pipeline([
    ('preprocessor', prep),
    ('classifier', logreg)
])

# Create the GridSearchCV object
grid_search_log_reg = GridSearchCV(pipe_log_reg, param_grid, cv=3, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_log_reg.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_log_reg.best_params_)
print("Best Score (F1 Macro):", grid_search_log_reg.best_score_)

# Get the best model from the grid search
best_model = grid_search_log_reg.best_estimator_


y_pred = best_model.predict(X_val_cluster)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best Parameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2'}
Best Score (F1 Macro): 0.690685149293826

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.77      0.83       796
           1       0.43      0.69      0.53       204

    accuracy                           0.75      1000
   macro avg       0.67      0.73      0.68      1000
weighted avg       0.81      0.75      0.77      1000



<font color='Tangerine'> 
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Warning: No categorical columns found. Calling 'transform' will only return input data.
Best Parameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2'}
Best Score (F1 Macro): 0.690685149293826

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.77      0.83       796
           1       0.43      0.69      0.53       204

    accuracy                           0.75      1000
   macro avg       0.67      0.73      0.68      1000
weighted avg       0.81      0.75      0.77      1000
</font>

In [33]:
continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']


# Increase the weight value in proportion to its frequency to balance a bit for the current undersampling
class_weights = dict(zip(np.unique(y_train), [len(y_train) / (len(np.unique(y_train)) * np.bincount(y_train)[i]) for i in np.unique(y_train)]))


catboost_cls = CatBoostClassifier(class_weights=class_weights)


# Define the parameter grid
# param_grid = {
#     'classifier__learning_rate': [0.05, 0.1, 0.15],
#     'classifier__depth': [3, 5, 7, 9],
#     'classifier__n_estimators': [150, 200, 300]
# }


param_grid = {
    'classifier__learning_rate': [0.05, 0.1, 0.15],
    'classifier__depth': [3, 5, 7, 9],
    'classifier__n_estimators': [150, 200, 300],
    'classifier__l2_leaf_reg': [1, 3, 5],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bylevel': [0.8, 1.0],
}



prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), continuous_features_engine),
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])

# Create the pipeline
pipe_catboost_cls = Pipeline([
    ('preprocessor', prep),
    ('classifier', catboost_cls)
])

# Create the GridSearchCV object
grid_search_catboost_cls = GridSearchCV(pipe_catboost_cls, param_grid, cv=3, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_catboost_cls.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_catboost_cls.best_params_)
print("Best Score (F1 Macro):", grid_search_catboost_cls.best_score_)

# Get the best model from the grid search
best_model = grid_search_catboost_cls.best_estimator_


y_pred = best_model.predict(X_val_cluster)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits
0:	learn: 0.6644162	total: 18.8ms	remaining: 3.74s
1:	learn: 0.6431847	total: 36.9ms	remaining: 3.65s
2:	learn: 0.6239523	total: 56.7ms	remaining: 3.72s
3:	learn: 0.6073529	total: 75.9ms	remaining: 3.72s
4:	learn: 0.5924002	total: 91.5ms	remaining: 3.57s
5:	learn: 0.5823188	total: 107ms	remaining: 3.47s
6:	learn: 0.5653201	total: 132ms	remaining: 3.63s
7:	learn: 0.5526096	total: 165ms	remaining: 3.95s
8:	learn: 0.5432181	total: 223ms	remaining: 4.72s
9:	learn: 0.5325088	total: 247ms	remaining: 4.69s
10:	learn: 0.5255730	total: 285ms	remaining: 4.89s
11:	learn: 0.5180769	total: 314ms	remaining: 4.92s
12:	learn: 0.5127119	total: 335ms	remaining: 4.82s
13:	learn: 0.5081483	total: 356ms	remaining: 4.74s
14:	learn: 0.5031399	total: 384ms	remaining: 4.74s
15:	learn: 0.4976469	total: 410ms	remaining: 4.71s
16:	learn: 0.4948376	total: 436ms	remaining: 4.69s
17:	learn: 0.4891475	total: 482ms	remaining: 4.87s
18:	learn: 0.4836258	to

<font color='Tangerine'> 
Best Parameters: {'classifier__colsample_bylevel': 0.8, 'classifier__depth': 9, 'classifier__l2_leaf_reg': 1, 'classifier__learning_rate': 0.05, 'classifier__n_estimators': 200, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7402284236119909

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89       796
           1       0.56      0.64      0.60       204

    accuracy                           0.82      1000
   macro avg       0.73      0.76      0.74      1000
weighted avg       0.83      0.82      0.83      1000
</font>

In [32]:
#* With semi-duplicate features i.e. Balance AND BalanceCredit

continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned', 'TenurePoints', 'CardProducts', 'SatisfactionProducts', 'AgeProducts', 'BalanceCredit']

categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary']

cluster_features = ['ClusterKMeans', 'ClusterBGM']

ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']

target_enc_features = ['GenderBinary', 'CountryOrd', 'SurnameOrd', 'HasCrCard', 'IsActiveMember',  'Tenure', 'NumOfProducts','SatisfactionScore', 'TenurePoints', 'CardProducts', 'SatisfactionProducts', 'AgeProducts', 'BalanceCredit']

# Increase the weight value in proportion to its frequency to balance a bit for the current undersampling
class_weights = dict(zip(np.unique(y_train), [len(y_train) / (len(np.unique(y_train)) * np.bincount(y_train)[i]) for i in np.unique(y_train)]))

# Create an instance of the XGBClassifier
xgb_cls = XGBClassifier()

# Define the parameter grid
param_grid = {
    'classifier__learning_rate': [0.05, 0.1, 0.15],
    'classifier__max_depth': [3, 5, 7,9],
    'classifier__n_estimators': [150, 200, 300]
}

prep = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])



# Create the pipeline
pipe_xgb_cls = Pipeline([
    ('preprocessor', prep),
    ('classifier', xgb_cls)
])

# Create the GridSearchCV object
grid_search_xg = GridSearchCV(pipe_xgb_cls, param_grid, cv=3, scoring='f1_macro', verbose=4, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_xg.fit(X, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_xg.best_params_)
print("Best Score (F1 Macro):", grid_search_xg.best_score_)

# Get the best model from the grid search
best_model = grid_search_xg.best_estimator_

y_pred = best_model.predict(X_val_cluster)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Parameters: {'classifier__learning_rate': 0.05, 'classifier__max_depth': 5, 'classifier__n_estimators': 150}
Best Score (F1 Macro): 0.7315665204230263

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       796
           1       0.75      0.45      0.56       204

    accuracy                           0.86      1000
   macro avg       0.81      0.70      0.74      1000
weighted avg       0.85      0.86      0.84      1000



<font color='Tangerine'> 
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Warning: No categorical columns found. Calling 'transform' will only return input data.
Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 300}
Best Score (F1 Macro): 0.7385244777083596

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       796
           1       0.73      0.47      0.57       204

    accuracy                           0.86      1000
   macro avg       0.80      0.71      0.74      1000
weighted avg       0.85      0.86      0.84      1000
</font>

<font color='lightblue'> 
After testing some typical models that are used for binary classification. I think the best model is catboost. Though there are definitely more hyperparameters to test that i did not have the time for this time.

Lastly i will try some other configurations with catboost and then try the best scoring on the X_test set and save that model.  
</font>

In [21]:
import pandas as pd
from sklearn.utils import resample

# y_train_series = pd.Series(y_train)
y_train = pd.read_csv("y_train_tran.csv")

# Combine X_train and y_train into a single DataFrame
train_data = pd.concat([X, y_train], axis=1)

# Separate the samples of each class
class_0 = train_data[train_data['Exited'] == 0]
class_1 = train_data[train_data['Exited'] == 1]

# Resample the majority class (class_0) to match the number of samples in the minority class (class_1)
class_0_resampled = resample(class_0, replace=True, n_samples=len(class_1), random_state=42)

# Combine the resampled class_0 with the original class_1 to create a balanced dataset
balanced_data = pd.concat([class_0_resampled, class_1])

# Separate the features (X) and the target variable (y) from the balanced dataset
X_balanced = balanced_data.drop('Exited', axis=1)
y_balanced = balanced_data['Exited']


In [20]:
len(X_train)
print(len(X_balanced))
y_balanced.value_counts()

3260


Exited
0    1630
1    1630
Name: count, dtype: int64

In [None]:
continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned']
continuous_features_engine = ['BalanceCredit', 'AgeProducts', 'SatisfactionProducts', 'CardProducts', 'TenurePoints']
categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary', 'GenderBinary']
cluster_features = ['ClusterKMeans', 'ClusterBGM']
ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']


catboost_cls = CatBoostClassifier()


param_grid = {
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__depth': [6,7],
}


prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), continuous_features_engine),
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])

# Create the pipeline
pipe_catboost_cls = Pipeline([
    ('preprocessor', prep),
    ('classifier', catboost_cls)
])

# Create the GridSearchCV object
grid_search_catboost_cls_hyper = GridSearchCV(pipe_catboost_cls, param_grid, scoring='f1_macro', n_jobs=-1, verbose=2, cv=2)

# Fit the GridSearchCV object to the data
grid_search_catboost_cls_hyper.fit(X_balanced, y_balanced)

# Print the best parameters and best score
print("Best Parameters:", grid_search_catboost_cls_hyper.best_params_)
print("Best Score (F1 Macro):", grid_search_catboost_cls_hyper.best_score_)

# Get the best model from the grid search
best_model = grid_search_catboost_cls_hyper.best_estimator_


y_pred = best_model.predict(X_val_cluster)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

In [83]:
continuous_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'PointsEarned', 'TenurePoints', 'CardProducts', 'SatisfactionProducts', 'AgeProducts', 'BalanceCredit']

categorical_features = ['HasCrCard', 'IsActiveMember', 'GenderBinary']

cluster_features = ['ClusterKMeans', 'ClusterBGM']

ordinal_features = ['Tenure', 'CardTypeOrd', 'NumOfProducts', 'NumOfProducts']

target_enc_features = ['GenderBinary', 'CountryOrd', 'SurnameOrd', 'HasCrCard', 'IsActiveMember',  'Tenure', 'NumOfProducts','SatisfactionScore', 'TenurePoints', 'CardProducts', 'SatisfactionProducts', 'AgeProducts', 'BalanceCredit']


# Create an instance of the XGBClassifier
xgb_cls = XGBClassifier()


param_grid = {
    'classifier__learning_rate': [0.02],
    'classifier__max_depth': [2, 3, 4],
    'classifier__n_estimators': [200],
    'classifier__reg_alpha': [0.5],
    'classifier__min_child_weight': [1, 2, 3],
    'classifier__subsample': [0.8],
    'classifier__reg_lambda': [0.5],
    'classifier__scale_pos_weight': [2]
}

prep = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuous_features),
        ('ordinal', 'passthrough', ordinal_features),
        ('cluster', 'passthrough', cluster_features),
        ('categorical', 'passthrough', categorical_features)      
    ])



# Create the pipeline
pipe_xgb_cls = Pipeline([
    ('preprocessor', prep),
    ('classifier', xgb_cls)
])

# Create the GridSearchCV object
grid_search_xg_hyper = GridSearchCV(pipe_xgb_cls, param_grid, cv=2, scoring='f1_macro', verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_xg_hyper.fit(X , y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_xg_hyper.best_params_)
print("Best Score (F1 Macro):", grid_search_xg_hyper.best_score_)

# Get the best model from the grid search
best_model_final_trans = grid_search_xg_hyper.best_estimator_

y_pred = best_model_final_trans.predict(X_val_cluster)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
Best Parameters: {'classifier__learning_rate': 0.02, 'classifier__max_depth': 3, 'classifier__min_child_weight': 2, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7435654641714827

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       796
           1       0.62      0.59      0.60       204

    accuracy                           0.84      1000
   macro avg       0.76      0.75      0.75      1000
weighted avg       0.84      0.84      0.84      1000



<font color='Tangerine'> 
Fitting 2 folds for each of 36 candidates, totalling 72 fits
Best Parameters: {'classifier__learning_rate': 0.02, 'classifier__max_depth': 3, 'classifier__min_child_weight': 2, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7435654641714827

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       796
           1       0.62      0.59      0.60       204

    accuracy                           0.84      1000
   macro avg       0.76      0.75      0.75      1000
weighted avg       0.84      0.84      0.84      1000
</font>

<font color='lightblue'> 
Best hyperparameters i could find for X_train_cluster
</font>

In [63]:
X_train.head()

Unnamed: 0,Tenure,NumOfProducts,HasCrCard,IsActiveMember,SatisfactionScore,CreditScoreOrd,AgeOrd,TaxBracket,BalanceOrd,PointsEarnedQuant,CardTypeOrd,SurnameOrd,CountryOrd,CountryHappy,GenderBinary
0,2,2,1,0,2,3,1,4,0,1.0,1.0,0.0,1,6.48,0
1,1,1,1,1,1,4,1,3,1,2.0,2.0,0.0,2,6.69,0
2,2,1,0,1,1,4,4,4,3,2.0,0.0,0.0,2,6.69,0
3,4,1,1,0,1,2,1,1,2,0.0,2.0,2.0,3,7.03,1
4,8,2,0,1,3,3,3,3,0,2.0,0.0,0.0,2,6.69,1


In [67]:
#
#* Let us look at the features without the ones created in the feature engineering notebook 

all_features = ['Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'SatisfactionScore',
               'CreditScoreOrd', 'AgeOrd', 'TaxBracket', 'BalanceOrd', 'PointsEarnedQuant',
               'CardTypeOrd', 'SurnameOrd', 'CountryOrd', 'GenderBinary']

# Create an instance of the XGBClassifier
xgb_cls = XGBClassifier()


param_grid = {
    'classifier__learning_rate': [0.02],
    'classifier__max_depth': [2, 3, 4],
    'classifier__n_estimators': [200],
    'classifier__reg_alpha': [0.5],
    'classifier__min_child_weight': [1, 2, 3],
    'classifier__subsample': [0.8],
    'classifier__reg_lambda': [0.5],
    'classifier__scale_pos_weight': [2]
}

prep = ColumnTransformer(
    transformers=[
        ('all_features', 'passthrough', all_features)      
    ])



# Create the pipeline
pipe_xgb_cls = Pipeline([
    ('preprocessor', prep),
    ('classifier', xgb_cls)
])

# Create the GridSearchCV object
grid_search_xg_hyper = GridSearchCV(pipe_xgb_cls, param_grid, cv=7, scoring='f1_macro', verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_xg_hyper.fit(X_train , y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_xg_hyper.best_params_)
print("Best Score (F1 Macro):", grid_search_xg_hyper.best_score_)

# Get the best model from the grid search
best_model = grid_search_xg_hyper.best_estimator_

y_pred = best_model.predict(X_val)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

#* Again, no real difference.

Fitting 7 folds for each of 9 candidates, totalling 63 fits
Best Parameters: {'classifier__learning_rate': 0.02, 'classifier__max_depth': 4, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7556525898799006

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       796
           1       0.64      0.57      0.60       204

    accuracy                           0.85      1000
   macro avg       0.76      0.74      0.75      1000
weighted avg       0.84      0.85      0.84      1000



<font color='Tangerine'> 
Fitting 7 folds for each of 9 candidates, totalling 63 fits
Best Parameters: {'classifier__learning_rate': 0.02, 'classifier__max_depth': 4, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7556525898799006

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       796
           1       0.64      0.57      0.60       204

    accuracy                           0.85      1000
   macro avg       0.76      0.74      0.75      1000
weighted avg       0.84      0.85      0.84      1000
</font>

In [68]:
#
#* Let us look at the features without the ones created in the feature engineering notebook 

all_features = ['Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'SatisfactionScore',
               'CreditScoreOrd', 'AgeOrd', 'TaxBracket', 'BalanceOrd', 'PointsEarnedQuant',
               'CardTypeOrd', 'SurnameOrd', 'CountryOrd', 'GenderBinary']

# Create an instance of the XGBClassifier
xgb_cls = XGBClassifier()


param_grid = {
    'classifier__learning_rate': [0.02],
    'classifier__max_depth': [2, 3, 4],
    'classifier__n_estimators': [200],
    'classifier__reg_alpha': [0.5],
    'classifier__min_child_weight': [1, 2, 3],
    'classifier__subsample': [0.8],
    'classifier__reg_lambda': [0.5],
    'classifier__scale_pos_weight': [2]
}

prep = ColumnTransformer(
    transformers=[
        ('target', TargetEncoder(), all_features),      
    ])



# Create the pipeline
pipe_xgb_cls = Pipeline([
    ('preprocessor', prep),
    ('classifier', xgb_cls)
])

# Create the GridSearchCV object
grid_search_xg_hyper = GridSearchCV(pipe_xgb_cls, param_grid, cv=7, scoring='f1_macro', verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_xg_hyper.fit(X_train , y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search_xg_hyper.best_params_)
print("Best Score (F1 Macro):", grid_search_xg_hyper.best_score_)

# Get the best model from the grid search
best_model = grid_search_xg_hyper.best_estimator_

y_pred = best_model.predict(X_val)

report = classification_report(y_val, y_pred)
print()
print("Classification Report:")
print(report)

#* Again, no real difference.Target encoding seemed to have effect before

Fitting 7 folds for each of 9 candidates, totalling 63 fits
Best Parameters: {'classifier__learning_rate': 0.02, 'classifier__max_depth': 4, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7556525898799006

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       796
           1       0.64      0.57      0.60       204

    accuracy                           0.85      1000
   macro avg       0.76      0.74      0.75      1000
weighted avg       0.84      0.85      0.84      1000



<font color='Tangerine'> 
Fitting 7 folds for each of 9 candidates, totalling 63 fits
Warning: No categorical columns found. Calling 'transform' will only return input data.
Best Parameters: {'classifier__learning_rate': 0.02, 'classifier__max_depth': 4, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7556525898799006

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       796
           1       0.64      0.57      0.60       204

    accuracy                           0.85      1000
   macro avg       0.76      0.74      0.75      1000
weighted avg       0.84      0.85      0.84      1000
</font>

<font color='lightblue'> 
Lastly, lets just try the original dataset, before we try predicting the test set 
</font>

In [77]:
df = pd.read_csv("Customer-Churn-Records.csv")
df = df.drop(columns=['RowNumber','CustomerId'])
df = df.drop(columns=['Complain'])
df = df.rename(columns={'Card Type': 'CardType'})
df = df.rename(columns={'Point Earned': 'PointsEarned'})
df = df.rename(columns={'Geography': 'Country'})
df = df.rename(columns={'Satisfaction Score': 'SatisfactionScore'})
# save original columns
myColumns = df.columns
y = df['Exited']
df = df.drop(['Exited','Surname'], axis=1) # I drop Surname since we have not encoded it
# yy = np.ravel(y)


X_trainDF, X_tempDF, y_trainDF, y_tempDF = train_test_split(df, y, test_size=0.2, random_state=40)

# X_train, X_temp, y_train, y_temp = train_test_split(df.drop('Exited', axis=1), df['Exited'], test_size=0.2, stratify=df['Exited'], random_state=42)

# We use train_test_split twice so we can get a validation set as well
X_valDF, X_testDF, y_valDF, y_testDF = train_test_split(X_tempDF, y_tempDF, test_size=0.5, stratify=y_tempDF, random_state=42)

In [88]:
# could benefit from scaling
continuous_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 'PointsEarned']
# Need to be one hot encoded
categorical_features = ['Country', 'Gender', 'CardType']
# Are either in 0/1 boolean or is ordinal as default
passthrough_features = ['Tenure', 'HasCrCard', 'IsActiveMember', 'SatisfactionScore','Tenure', 'NumOfProducts']


xgb_cls = XGBClassifier()


param_grid = {
    'classifier__learning_rate': [0.02],
    'classifier__max_depth': [2, 3, 4],
    'classifier__n_estimators': [200],
    'classifier__reg_alpha': [0.5],
    'classifier__min_child_weight': [1, 2, 3],
    'classifier__subsample': [0.8],
    'classifier__reg_lambda': [0.5],
    'classifier__scale_pos_weight': [2]
}



prep = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_features),
        ('scaler', StandardScaler(), continuous_features),
        ('passthrough', 'passthrough', passthrough_features),
    ]
)


# Create the pipeline
pipe_xg_cls_default = Pipeline([
    ('preprocessor', prep),
    ('classifier', xgb_cls)
])

# Create the GridSearchCV object
grid_search_xg_default = GridSearchCV(pipe_xg_cls_default, param_grid, scoring='f1_macro', n_jobs=-1, verbose=2, cv=7)

# Fit the GridSearchCV object to the data
grid_search_xg_default.fit(X_trainDF, y_trainDF)

# Print the best parameters and best score
print("Best Parameters:", grid_search_xg_default.best_params_)
print("Best Score (F1 Macro):", grid_search_xg_default.best_score_)

# Get the best model from the grid search
best_model_default = grid_search_xg_default.best_estimator_

y_pred = best_model_default.predict(X_valDF)

report = classification_report(y_valDF, y_pred)
print()
print("Classification Report:")
print(report)

#* Wow, the original did better! So much work... Though this has different random rows compared to X_train & X_train_cluster

Fitting 7 folds for each of 9 candidates, totalling 63 fits
Best Parameters: {'classifier__learning_rate': 0.02, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7625244018295915

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       808
           1       0.67      0.61      0.64       192

    accuracy                           0.87      1000
   macro avg       0.79      0.77      0.78      1000
weighted avg       0.86      0.87      0.87      1000



<font color='Tangerine'> 
Fitting 7 folds for each of 9 candidates, totalling 63 fits
Best Parameters: {'classifier__learning_rate': 0.02, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5, 'classifier__scale_pos_weight': 2, 'classifier__subsample': 0.8}
Best Score (F1 Macro): 0.7625244018295915

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       808
           1       0.67      0.61      0.64       192

    accuracy                           0.87      1000
   macro avg       0.79      0.77      0.78      1000
weighted avg       0.86      0.87      0.87      1000
</font>

<font color='LightBlue'> 
Now let us try the test set on both and save the model
</font>

In [87]:
# testing the best model with the extensive feature engineering

y_test = pd.read_csv("y_test_tran.csv")

y_pred = best_model_final_trans.predict(X_test_cluster)

report = classification_report(y_test, y_pred)
print()
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       796
           1       0.66      0.65      0.65       204

    accuracy                           0.86      1000
   macro avg       0.78      0.78      0.78      1000
weighted avg       0.86      0.86      0.86      1000



<font color='Tangerine'> 

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       796
           1       0.66      0.65      0.65       204

    accuracy                           0.86      1000
   macro avg       0.78      0.78      0.78      1000
weighted avg       0.86      0.86      0.86      1000
</font>

In [90]:
# testing the model with basic encoding only
y_pred = best_model_default.predict(X_testDF)

report = classification_report(y_testDF, y_pred)
print()
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       808
           1       0.65      0.61      0.63       192

    accuracy                           0.86      1000
   macro avg       0.78      0.77      0.77      1000
weighted avg       0.86      0.86      0.86      1000



<font color='Tangerine'> 

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       808
           1       0.65      0.61      0.63       192

    accuracy                           0.86      1000
   macro avg       0.78      0.77      0.77      1000
weighted avg       0.86      0.86      0.86      1000
</font>

<font color='lightblue'> 
In the end both models performed equally well
</font>

In [92]:
from joblib import dump

# save the pipeline
dump(pipe_xgb_cls, 'pipeline_trans_xgboost.joblib')
dump(pipe_xg_cls_default, 'pipeline_default_xgboost.joblib')

['pipeline_trans_xgboost.joblib']

In [94]:

from joblib import load

myPipe = load('pipeline_trans_xgboost.joblib')

In [95]:
myPipe

<font color=''> 
Thank you for reading my notebooks!
</font>