# Churner Random Forest Classifier

In [1]:
%%html
<a href="https://www.kaggle.com/datasets/gauravtopre/bank-customer-churn-dataset/data?select=Bank+Customer+Churn+Prediction.csv">Bank Customer Churn Dataset</a>

In [2]:
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [3]:
df = pd.read_csv("../data/Bank Customer Churn Prediction.csv")

In [4]:
print(df["country"].unique())
print(df["gender"].unique())

['France' 'Spain' 'Germany']
['Female' 'Male']


In [5]:
df = pd.get_dummies(df, columns=['country', 'gender'], drop_first=True)
df.head()

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_Germany,country_Spain,gender_Male
0,15634602,619,42,2,0.0,1,1,1,101348.88,1,False,False,False
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0,False,True,False
2,15619304,502,42,8,159660.8,3,1,0,113931.57,1,False,False,False
3,15701354,699,39,1,0.0,2,0,0,93826.63,0,False,False,False
4,15737888,850,43,2,125510.82,1,1,1,79084.1,0,False,True,False


In [7]:
df['churn'].value_counts() 

churn
0    7963
1    2037
Name: count, dtype: int64

In [8]:
X = df.drop(["churn"], axis=1) 
y = df['churn'] 

X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) 
X_train.shape, X_test.shape 

((7500, 12), (2500, 12))

In [9]:
model = RandomForestClassifier(n_estimators = 2) 
model.fit(X_train, y_train) 

# predict the model 
y_pred = model.predict(X_test) 

# performance evaluation metrics 
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.96      0.85      0.90      2252
           1       0.34      0.68      0.45       248

    accuracy                           0.84      2500
   macro avg       0.65      0.77      0.68      2500
weighted avg       0.90      0.84      0.86      2500



In [11]:
RandomForestClassifier??

[0;31mInit signature:[0m
[0mRandomForestClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'gini'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;34m'sqrt'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbootstrap[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[

In [14]:
param_grid = {  
    'criterion' : ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [5, 10], 
    'max_leaf_nodes': [5, 10],
} 

# GridSearchCV

In the Grid Search technique, we systematically explore every possible combination from a predefined list of hyperparameter values. This process resembles a grid, where values are arranged in a matrix. Each unique parameter set is carefully assessed, and the model's accuracy is recorded. After evaluating all combinations, the model with the parameter set that yields the highest accuracy is identified as the optimal choice.

In [15]:
grid_search = GridSearchCV(model, param_grid=param_grid) 
grid_search.fit(X_train, y_train) 
print(grid_search.best_estimator_) 

ValueError: Invalid parameter 'zack_is_the_best' for estimator RandomForestClassifier(max_depth=5, max_leaf_nodes=5, n_estimators=2). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].

In [13]:
model_grid = grid_search.best_estimator_
y_pred_grid = model.predict(X_test) 
print(classification_report(y_pred_grid, y_test)) 

              precision    recall  f1-score   support

           0       0.96      0.85      0.90      2252
           1       0.34      0.68      0.45       248

    accuracy                           0.84      2500
   macro avg       0.65      0.77      0.68      2500
weighted avg       0.90      0.84      0.86      2500



# RandomizedSearchCV

Random search involves exploring various combinations of hyperparameters randomly to discover the optimal solution for a constructed model. This technique tests out random combinations within specified value ranges. To optimize through random search, the function is assessed at a certain number of randomly selected configurations within the parameter space.

In [12]:
random_search = RandomizedSearchCV(model,param_grid) 
random_search.fit(X_train, y_train) 
print(random_search.best_estimator_) 

RandomForestClassifier(criterion='entropy', max_depth=5, max_features=None,
                       max_leaf_nodes=10, n_estimators=2)


In [13]:
model_random = random_search.best_estimator_
y_pred_rand = model.predict(X_test) 
print(classification_report(y_pred_rand, y_test)) 

              precision    recall  f1-score   support

           0       0.96      0.85      0.90      2254
           1       0.34      0.69      0.45       246

    accuracy                           0.84      2500
   macro avg       0.65      0.77      0.68      2500
weighted avg       0.90      0.84      0.86      2500

