### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

### Defining Dataset

In [2]:
df = pd.read_csv('Churn_Modelling.csv')

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
df.shape

(10000, 14)

In [5]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [6]:
del df['Surname']
del df['CustomerId']
df.isnull().sum()

RowNumber          0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
target_count = df['Exited'].value_counts()
print("\nExited count:")
print(target_count)


Exited count:
Exited
0    7963
1    2037
Name: count, dtype: int64


In [8]:
exited_rate = df['Exited'].mean()
print("\nExited rate:")
print(f"{exited_rate * 100:.2f}%")


Exited rate:
20.37%


In [9]:
X = df.iloc[:,:-1]
X.head(2)

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1,619,France,Female,42,2,0.0,1,1,1,101348.88
1,2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58


In [10]:
y = df['Exited']
y.head(2)

0    1
1    0
Name: Exited, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.35,random_state=25)

In [12]:
print("df : ",df.shape)
print("x_train : ",x_train.shape)
print("x_test : ",x_test.shape)
print("y_train : ",y_train.shape)
print("y_test : ",y_test.shape)

df :  (10000, 12)
x_train :  (6500, 11)
x_test :  (3500, 11)
y_train :  (6500,)
y_test :  (3500,)


In [13]:
# Get dummy variables for both training and test sets
x_train_encoded = pd.get_dummies(x_train)
x_test_encoded = pd.get_dummies(x_test)

# Align columns of x_test_encoded with x_train_encoded
common_columns = x_train_encoded.columns.intersection(x_test_encoded.columns)
x_test_encoded = x_test_encoded[common_columns]


# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train_encoded)
X_test_scaled = scaler.transform(x_test_encoded)


In [14]:
X_train_scaled

array([[-1.48947952,  0.53172828,  0.28563648, ...,  1.75509303,
         1.07845332, -1.07845332],
       [-0.20679476, -0.49672414,  0.38055172, ..., -0.56977037,
         1.07845332, -1.07845332],
       [ 0.98854495, -0.83954161, -0.66351597, ..., -0.56977037,
         1.07845332, -1.07845332],
       ...,
       [-0.71798953, -0.1954603 , -0.56860073, ..., -0.56977037,
        -0.92725386,  0.92725386],
       [ 0.56399926, -0.14351826, -0.94826171, ..., -0.56977037,
         1.07845332, -1.07845332],
       [ 1.22239306, -1.03692137,  0.09580599, ..., -0.56977037,
         1.07845332, -1.07845332]])

In [15]:
X_test_scaled

array([[-5.01888812e-01, -1.26546635e+00, -2.83854993e-01, ...,
        -5.69770368e-01,  1.07845332e+00, -1.07845332e+00],
       [-3.19890947e-01,  6.42499079e-02,  7.60212705e-01, ...,
         1.75509303e+00, -9.27253856e-01,  9.27253856e-01],
       [ 1.19977383e+00,  3.44736932e-01, -1.23300745e+00, ...,
        -5.69770368e-01,  1.07845332e+00, -1.07845332e+00],
       ...,
       [ 2.60553486e-01,  1.93416340e+00,  1.90721234e-01, ...,
         1.75509303e+00,  1.07845332e+00, -1.07845332e+00],
       [ 1.18411435e+00,  1.31085890e+00,  8.90743071e-04, ...,
        -5.69770368e-01, -9.27253856e-01,  9.27253856e-01],
       [ 7.94367492e-01,  1.60173433e+00,  1.51953467e+00, ...,
        -5.69770368e-01, -9.27253856e-01,  9.27253856e-01]])

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
lr = LogisticRegression(random_state=20)

In [18]:
lr.fit(X_train_scaled,y_train)

In [19]:
y_pred = lr.predict(X_test_scaled)

In [45]:
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score,accuracy_score,classification_report

In [21]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

Accuracy: 0.8188571428571428
Precision: 0.5893536121673004
Recall: 0.2276064610866373
F1-Score: 0.3283898305084746


In [22]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
from sklearn.model_selection import GridSearchCV

#In case of classifier like knn the parameter to be tuned is n_neighbors

n_neighbors= list(np.arange(1,20,2))
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'] 
metric =['minkowski','euclidean','cosine','manhattan']

In [24]:
parameters = {"n_neighbors" :n_neighbors,
             "algorithm":algorithm,
             "metric":metric}
parameters

{'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
 'metric': ['minkowski', 'euclidean', 'cosine', 'manhattan']}

In [25]:
knn = KNeighborsClassifier()
knn_cv= GridSearchCV(estimator  = knn,param_grid = parameters,cv=5,verbose=4, n_jobs =-1)
knn_cv.fit(X_train_scaled,y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


100 fits failed out of a total of 800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\neighbors\_classification.py", line 233, in fit
    return self._fit(X, y)
  File "C:\Users\User\AppData\Local\Programs\Python\

In [26]:
knn_cv.best_params_

{'algorithm': 'auto', 'metric': 'cosine', 'n_neighbors': 19}

In [27]:
knn = KNeighborsClassifier(n_neighbors=19,
    algorithm='auto',
    metric='cosine',
    n_jobs=-1)

In [28]:
knn.fit(x_train_encoded,y_train)

In [29]:
y_pred=knn.predict(X_test_scaled)



In [30]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

Accuracy: 0.8034285714285714
Precision: 0.0
Recall: 0.0
F1-Score: 0.0


In [31]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 4, 6, 8, 10, 11, 13],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rfc = RandomForestClassifier()
rf_grid = GridSearchCV(rfc,param_grid=param_grid,n_jobs=-1,cv=3,verbose=2)

In [36]:
rf_grid.fit(X_train_scaled,y_train)

Fitting 3 folds for each of 378 candidates, totalling 1134 fits


In [37]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': 13,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 50}

In [38]:
rfc = RandomForestClassifier( n_estimators=50,
    max_depth=13,
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=True)

In [40]:
rfc.fit(X_train_scaled,y_train)

In [42]:
y_pred = rfc.predict(X_test_scaled)

In [48]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate F1-score
f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

# Calculate Classification Report
clr = classification_report(y_test, y_pred)
print("Classification Report:\n",clr)

Accuracy: 0.872
Precision: 0.804177545691906
Recall: 0.4522760646108664
F1-Score: 0.5789473684210528
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92      2819
           1       0.80      0.45      0.58       681

    accuracy                           0.87      3500
   macro avg       0.84      0.71      0.75      3500
weighted avg       0.87      0.87      0.86      3500

