In [54]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [55]:
# Load the churn dataset into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks environment-specific
# (e.g. a hosted notebook) — confirm or make it configurable before running elsewhere.
df=pd.read_csv("/content/Churn_Modelling.csv")

In [56]:
# Peek at the first rows to check the columns that were read from the file.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [57]:
#Drop the unwanted columns
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so this cell can be re-run without raising KeyError.
df=df.drop(columns=['RowNumber','CustomerId','Surname'],errors='ignore')

In [58]:
# Re-check the frame after the column drop above.
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [59]:
# Separate the target column ('Exited') from the predictor columns
y = df['Exited']
X = df.drop(columns=['Exited'])

In [60]:
# Display the feature frame (every column except 'Exited').
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.00,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.80,3,1,0,113931.57
3,699,France,Female,39,1,0.00,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,709,France,Female,36,7,0.00,1,0,1,42085.58
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


In [61]:
# Display the target series ('Exited').
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [62]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [63]:
# Confirm the row/column counts of the two feature splits.
X_train.shape,X_test.shape

((8000, 10), (2000, 10))

In [64]:
# Numeric branch: impute -> rescale -> reduce dimensionality.
num_pipeline = Pipeline(
    steps=[
        # Replace missing values with the column mean.
        ('num_imputation', SimpleImputer(strategy='mean')),
        # Rescale every feature with MinMaxScaler's default range.
        ('feature_scaling', MinMaxScaler()),
        # A fractional n_components keeps enough components to reach that
        # share (here 90%) of the explained variance.
        ('pca', PCA(n_components=0.90)),
    ]
)


In [65]:
# Show the numeric pipeline's repr to verify its steps.
num_pipeline

In [66]:
#pipeline for processing categorical data
# OneHotEncoder's dense-output flag was renamed from `sparse` to `sparse_output`
# (scikit-learn 1.2) and the old keyword was later removed. Try the current
# spelling first and fall back to the old one so the cell runs on either version.
try:
    one_hot_encoder=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
except TypeError:
    one_hot_encoder=OneHotEncoder(sparse=False,handle_unknown='ignore')

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
cat_pipeline=Pipeline([
    ('cat_imputation',SimpleImputer(fill_value='missing',strategy='constant')),
    ('one_hot_encoding',one_hot_encoder)
])

In [67]:
# Show the categorical pipeline's repr to verify its steps.
cat_pipeline

In [68]:
# Names of the numeric feature columns (routed to the numeric pipeline).
num_cols = list(X.select_dtypes(include=[np.number]).columns)

In [69]:
# Names of the object-dtype feature columns (routed to the categorical pipeline).
cat_cols = list(X.select_dtypes(include=['object']).columns)

In [70]:
# List the columns detected as numeric.
num_cols

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [71]:
# List the columns detected as categorical (object dtype).
cat_cols

['Geography', 'Gender']

In [72]:
# Route each column group through its own preprocessing branch; the encoded
# categorical columns come first in the combined output, then the numeric ones.
preprocessor=ColumnTransformer([
    ('categorical',cat_pipeline,cat_cols),
    ('numerical',num_pipeline,num_cols)
])

# Baseline model: preprocessing followed by a random forest with default
# hyperparameters. random_state is pinned (same seed as the train/test split)
# so that refitting reproduces the same forest and the same score.
pipe=Pipeline( [
    ('preprocessor',preprocessor),
    ('estimator',RandomForestClassifier(random_state=42))
])

In [73]:
# Fit the preprocessing steps and the forest on the training split only.
pipe.fit(X_train,y_train)

In [74]:
# Predict labels for the held-out rows (result is displayed, not stored).
pipe.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

In [75]:
# Score the baseline on the held-out split; for classifiers .score() is mean accuracy.
pipe.score(X_test,y_test)

0.812

In [53]:
#Hyperparameter Tuning

# Candidate values for the random-forest step; the 'estimator__' prefix routes
# each setting to the pipeline step named 'estimator'.
parameters=dict(
    estimator__n_estimators=[100,150,200],
    estimator__max_depth=[5,7,10,15],
    estimator__min_samples_split=[2,3,4],
    estimator__max_features=[2,4,6,8,10],
)

# Exhaustive search over every combination above, run in a single process.
grid_search=GridSearchCV(estimator=pipe,param_grid=parameters,n_jobs=1)

grid_search.fit(X_train,y_train)

In [76]:
# Best hyperparameter combination found by the grid search.
grid_search.best_params_

{'estimator__max_depth': 10,
 'estimator__max_features': 10,
 'estimator__min_samples_split': 4,
 'estimator__n_estimators': 100}

In [79]:
# Tuned model: same preprocessing, with the forest settings reported by
# grid_search.best_params_ written out explicitly. random_state is pinned
# (same seed as the train/test split) so the fit and its score are repeatable.
pipe2=Pipeline( [
    ('preprocessor',preprocessor),
    ('estimator',RandomForestClassifier(n_estimators=100,
                                        max_features=10,
                                        min_samples_split=4,
                                        max_depth=10,
                                        random_state=42))
])

In [80]:
# Fit the tuned pipeline on the training split.
pipe2.fit(X_train,y_train)

In [81]:
# Predict labels for the held-out rows with the tuned pipeline (displayed, not stored).
pipe2.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

In [82]:
# Held-out accuracy of the tuned pipeline, for comparison with the baseline score.
pipe2.score(X_test,y_test)

0.823