In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import *
from sklearn.model_selection import *

df = pd.read_csv('Churn_Modelling.csv')
df.head(5)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [3]:
# Drop kolom yang gak guna
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], axis = 1)

In [4]:
# Lakukan one hot encoding untuk geography dan gender
df = pd.get_dummies(df, columns=['Geography', 'Gender'], dtype = int)

In [8]:
df['Exited'].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [6]:
def check_outliers(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3-Q1

    outliers = df[(df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))]
    return outliers

def print_outliers(df):
    for i in df.columns:
        outliers = check_outliers(df[i])
        print(f'Outliers for column {i}: {len(outliers)}')

def handle_outliers(df): 
    for i in df.columns:
        if i not in ['Exited', 'Geography_Spain']:
            Q1 = df[i].quantile(0.25)
            Q3 = df[i].quantile(0.75)
            IQR = Q3 - Q1

            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            df[i] = np.where(df[i] < lower_bound, lower_bound, df[i])
            df[i] = np.where(df[i] > upper_bound, upper_bound, df[i])

            print(f'Column {i} has been processed')

    print('It is done UAHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH')

print_outliers(df)

Outliers for column CreditScore: 15
Outliers for column Age: 359
Outliers for column Tenure: 0
Outliers for column Balance: 0
Outliers for column NumOfProducts: 60
Outliers for column HasCrCard: 0
Outliers for column IsActiveMember: 0
Outliers for column EstimatedSalary: 0
Outliers for column Exited: 2037
Outliers for column Geography_France: 0
Outliers for column Geography_Germany: 0
Outliers for column Geography_Spain: 2477
Outliers for column Gender_Female: 0
Outliers for column Gender_Male: 0


In [7]:
handle_outliers(df)
print(print_outliers(df))

Column CreditScore has been processed
Column Age has been processed
Column Tenure has been processed
Column Balance has been processed
Column NumOfProducts has been processed
Column HasCrCard has been processed
Column IsActiveMember has been processed
Column EstimatedSalary has been processed
Column Geography_France has been processed
Column Geography_Germany has been processed
Column Gender_Female has been processed
Column Gender_Male has been processed
It is done UAHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
Outliers for column CreditScore: 0
Outliers for column Age: 0
Outliers for column Tenure: 0
Outliers for column Balance: 0
Outliers for column NumOfProducts: 0
Outliers for column HasCrCard: 0
Outliers for column IsActiveMember: 0
Outliers for column EstimatedSalary: 0
Outliers for column Exited: 2037
Outliers for column Geography_France: 0
Outliers for column Geography_Germany: 0
Outliers for column Geography_Spain: 2477
Outliers for column Gender_Female: 0
Outliers for column Gender_

In [9]:
# data normalization
minmax = MinMaxScaler()

X = df.drop(columns=['Exited'], axis = 1)
y = df['Exited']

X = minmax.fit_transform(X)

In [10]:
y.value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [11]:
# Model Selection
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
smote = SMOTE(random_state=0)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [13]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

Kfold = KFold(n_splits=10)
scores = cross_val_score(KNN, X, y, cv = Kfold)

print('K-Fold Score', scores)
print('K fold mean score', scores.mean())

K-Fold Score [0.814 0.805 0.825 0.835 0.83  0.812 0.831 0.832 0.825 0.832]
K fold mean score 0.8240999999999999


In [14]:
# prediction
from sklearn.metrics import *
y_pred = KNN.predict(X_test)
print('Accuracy:', accuracy_score(y_pred, y_test))
print('Mean Squared Error:', mean_squared_error(y_pred, y_test))
print('Classification report:', classification_report(y_pred, y_test))

Accuracy: 0.74
Mean Squared Error: 0.26
Classification report:               precision    recall  f1-score   support

           0       0.76      0.90      0.82      1361
           1       0.65      0.40      0.50       639

    accuracy                           0.74      2000
   macro avg       0.71      0.65      0.66      2000
weighted avg       0.73      0.74      0.72      2000



In [17]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
Kfold2 = StratifiedKFold(n_splits=10)
scores2 = cross_val_score(RF, X, y, cv = Kfold2)

print('K-Fold Score', scores2)
print('K fold mean score', scores2.mean())

K-Fold Score [0.865 0.855 0.87  0.86  0.859 0.851 0.86  0.869 0.858 0.862]
K fold mean score 0.8609


In [18]:
# prediction
y_pred2 = RF.predict(X_test)
print('Accuracy:', accuracy_score(y_pred2, y_test))
print('Mean Squared Error:', mean_squared_error(y_pred2, y_test))
print('Classification report:', classification_report(y_pred2, y_test))

Accuracy: 0.837
Mean Squared Error: 0.163
Classification report:               precision    recall  f1-score   support

           0       0.89      0.90      0.90      1585
           1       0.61      0.58      0.60       415

    accuracy                           0.84      2000
   macro avg       0.75      0.74      0.75      2000
weighted avg       0.83      0.84      0.84      2000

