In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
churn = pd.read_csv('Churn_Modelling.csv')
churn

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# Data Preprocessing

# Drop irrelevalant columns

In [5]:
churn.drop(columns=['RowNumber' , 'CustomerId','Surname'], inplace=True)

# Encode Categorical Features

In [7]:
#from sklearn.preprocessing import LabelEncoder

In [8]:
#le = LabelEncoder()

In [9]:
#churn['Geography']= le.fit_transform(churn['Geography'])

In [10]:
#churn['Gender'] =  le.fit_transform(churn['Gender'])

In [11]:
#churn.duplicated().any()

In [12]:
churn_encoded = pd.get_dummies(churn, columns=['Geography','Gender'], drop_first=True, dtype='int64')

# Seperate Features and Target Variables

In [14]:
X = churn_encoded.drop(columns='Exited')
y = churn_encoded['Exited']

# Feature Scaling


In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()

In [18]:
X = scaler.fit_transform(X)
X

array([[-0.32622142,  0.29351742, -1.04175968, ..., -0.57873591,
        -0.57380915, -1.09598752],
       [-0.44003595,  0.19816383, -1.38753759, ..., -0.57873591,
         1.74273971, -1.09598752],
       [-1.53679418,  0.29351742,  1.03290776, ..., -0.57873591,
        -0.57380915, -1.09598752],
       ...,
       [ 0.60498839, -0.27860412,  0.68712986, ..., -0.57873591,
        -0.57380915, -1.09598752],
       [ 1.25683526,  0.29351742, -0.69598177, ...,  1.72790383,
        -0.57380915,  0.91241915],
       [ 1.46377078, -1.04143285, -0.35020386, ..., -0.57873591,
        -0.57380915, -1.09598752]])

# Split The Data Into Training and Test Sets

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Importing Models

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Initialize models

In [25]:
lrc = LogisticRegression()
svc = SVC()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
knn = KNeighborsClassifier()

# Fit Data

In [27]:
lrc.fit(X_train, y_train)
svc.fit(X_train, y_train)
dtc.fit(X_train, y_train)
rfc.fit(X_train, y_train)
knn.fit(X_train, y_train)

In [28]:
lrc_predict = lrc.predict(X_test)
svc_predict = svc.predict(X_test)
dtc_predict = dtc.predict(X_test)
rfc_predict = rfc.predict(X_test)
knn_predict = knn.predict(X_test)

In [29]:
print(lrc_predict,svc_predict,dtc_predict,rfc_predict,knn_predict)

[0 0 0 ... 0 0 0] [0 0 0 ... 1 0 0] [0 0 0 ... 1 0 1] [0 0 0 ... 1 0 0] [0 0 0 ... 1 0 0]


# Evaluation Models

In [31]:
from sklearn import metrics

# Accuracy

In [33]:
lrc_acc = metrics.accuracy_score(y_test, lrc_predict)
svc_acc = metrics.accuracy_score(y_test, svc_predict)
dtc_acc = metrics.accuracy_score(y_test, dtc_predict)
rfc_acc = metrics.accuracy_score(y_test, rfc_predict)
knn_acc = metrics.accuracy_score(y_test, knn_predict)

In [34]:
print(lrc_acc,svc_acc,dtc_acc,rfc_acc,knn_acc)

0.811 0.856 0.782 0.868 0.8295


# F1 - Score

In [36]:
lrc_f1 = metrics.f1_score(y_test,lrc_predict)
svc_f1 = metrics.f1_score(y_test,svc_predict)
dtc_f1 = metrics.f1_score(y_test,dtc_predict)
rfc_f1 = metrics.f1_score(y_test,rfc_predict)
knn_f1 = metrics.f1_score(y_test,knn_predict)

In [37]:
print(lrc_f1,svc_f1,dtc_f1,rfc_f1,knn_f1)

0.2947761194029851 0.5102040816326531 0.46958637469586373 0.5913312693498451 0.4612954186413902


In [38]:
print(metrics.classification_report(y_test, lrc_predict))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000



# Saving Models

In [40]:
import joblib

In [41]:
rfc = RandomForestClassifier()
rfc.fit(X,y)

In [42]:
joblib.dump(rfc, 'ChurnModel')

['ChurnModel']

In [43]:
churnModel = joblib.load('ChurnModel')

# Prepare New Data

In [45]:
churn.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [46]:
def prepare_input(data):
    input_df = pd.DataFrame([data])


    # input_df['Geography']=le.transform(input_df['Geography'])
    # input_df['Gender']=le.transform(input_df['Gender'])

    # Process categorical variables for encoding
    for col in ['Geography', 'Gender']:
        for unique_val in churn[col].unique():
            col_name = f"{col}_{unique_val}"
            input_df[col_name]= 1 if (unique_val == eval(col)) else 0

    scaler_input = scaler.transform(input_df)
    return scaled_input

In [47]:
def predict_for_newUser(data, model):
    prepared_data = prepare_input(data)
    prediction = model.predict(prepared_data)
    return prediction[0]

In [48]:
new_user_data = {
    'CreditScore' : 600,
    'Geography' : 'Germany',
    'Gender' : 'Male',
    'Age' : 45,
    'Tenure' : 2,
    'Balance' : 60000,
    'NumOfProducts' : 2,
    'HasCrCard' : 1,
    'IsActiveMember' : 1,
    'EstimatedSalary' : 50000
}
predict_for_newUser(new_user_data, churnModel)

NameError: name 'Geography' is not defined