In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')
plt.style.use(['seaborn-v0_8-bright','dark_background'])

In [2]:
data=pd.read_csv('Customer_Data.csv')
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction
0,1,2101,66,Male,0.0,self_employed,187.0,2,755,1458.71,...,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0,2019-05-21
1,2,2348,35,Male,0.0,self_employed,,2,3214,5390.37,...,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0,2019-11-01
2,4,2194,31,Male,0.0,salaried,146.0,2,41,3913.16,...,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0,NaT
3,5,2329,90,,,self_employed,1020.0,2,582,2291.91,...,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1,2019-08-06
4,6,1579,42,Male,2.0,self_employed,1494.0,3,388,927.72,...,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1,2019-11-03


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28382 entries, 0 to 28381
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   customer_id                     28382 non-null  int64  
 1   vintage                         28382 non-null  int64  
 2   age                             28382 non-null  int64  
 3   gender                          27857 non-null  object 
 4   dependents                      25919 non-null  float64
 5   occupation                      28302 non-null  object 
 6   city                            27579 non-null  float64
 7   customer_nw_category            28382 non-null  int64  
 8   branch_code                     28382 non-null  int64  
 9   current_balance                 28382 non-null  float64
 10  previous_month_end_balance      28382 non-null  float64
 11  average_monthly_balance_prevQ   28382 non-null  float64
 12  average_monthly_balance_prevQ2  

In [4]:
columns=data.columns
columns

Index(['customer_id', 'vintage', 'age', 'gender', 'dependents', 'occupation',
       'city', 'customer_nw_category', 'branch_code', 'current_balance',
       'previous_month_end_balance', 'average_monthly_balance_prevQ',
       'average_monthly_balance_prevQ2', 'current_month_credit',
       'previous_month_credit', 'current_month_debit', 'previous_month_debit',
       'current_month_balance', 'previous_month_balance', 'churn',
       'last_transaction'],
      dtype='object')

# Data exploration and pre-processing

In [5]:
data.isnull().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
last_transaction                     0
dtype: int64

In [6]:
# Removing rows with missing values.

In [7]:
data.dropna(inplace=True,axis=0,subset=['city','gender','occupation','dependents'])

In [8]:
data['last_transaction']=pd.to_datetime(data['last_transaction'])
data.dropna(inplace=True,axis=0,subset=['last_transaction'])

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22067 entries, 0 to 28381
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   customer_id                     22067 non-null  int64         
 1   vintage                         22067 non-null  int64         
 2   age                             22067 non-null  int64         
 3   gender                          22067 non-null  object        
 4   dependents                      22067 non-null  float64       
 5   occupation                      22067 non-null  object        
 6   city                            22067 non-null  float64       
 7   customer_nw_category            22067 non-null  int64         
 8   branch_code                     22067 non-null  int64         
 9   current_balance                 22067 non-null  float64       
 10  previous_month_end_balance      22067 non-null  float64       
 11  average

In [10]:
# Creating a column for days since last transaction
from datetime import datetime
today=datetime.now()
data['datetime_now']=today
data['days_since_last_transaction']=data['datetime_now']-data['last_transaction']
data['days_since_last_transaction']=data['days_since_last_transaction'].dt.days
data['days_since_last_transaction'].head()

0    1937
4    1771
5    1773
6    1811
7    1885
Name: days_since_last_transaction, dtype: int64

In [11]:
data=data.drop(columns=['last_transaction','datetime_now'])

In [12]:
# converting 'gender' to int or float by mapping Male to 1 and Female to 0
data['gender']=data['gender'].replace({'Male': 1, 'Female': 0})
data['gender'].head()

0    1
4    1
5    0
6    1
7    1
Name: gender, dtype: int64

In [13]:
# converting 'occupation' to int or float by mapping values
data['occupation'].unique()

array(['self_employed', 'retired', 'salaried', 'student', 'company'],
      dtype=object)

In [14]:
data['occupation']=data['occupation'].replace({'self_employed': 1, 'salaried': 2, 'retired': 3, 'student': 4, 'company': 5})
data['occupation'].head()

0    1
4    1
5    1
6    3
7    1
Name: occupation, dtype: int64

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22067 entries, 0 to 28381
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   customer_id                     22067 non-null  int64  
 1   vintage                         22067 non-null  int64  
 2   age                             22067 non-null  int64  
 3   gender                          22067 non-null  int64  
 4   dependents                      22067 non-null  float64
 5   occupation                      22067 non-null  int64  
 6   city                            22067 non-null  float64
 7   customer_nw_category            22067 non-null  int64  
 8   branch_code                     22067 non-null  int64  
 9   current_balance                 22067 non-null  float64
 10  previous_month_end_balance      22067 non-null  float64
 11  average_monthly_balance_prevQ   22067 non-null  float64
 12  average_monthly_balance_prevQ2  22067

In [16]:
# Checking the target variable 'churn' distribution
data['churn'].value_counts()/len(data)

churn
0    0.806317
1    0.193683
Name: count, dtype: float64

# Implementing model

In [17]:
# separating dependant and independent variables
X=data.drop(columns=['customer_id','churn'])
Y=data['churn']

In [18]:
# scaling the dataset
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_X=scaler.fit_transform(X)

In [19]:
# splitting dataset
from sklearn.model_selection import train_test_split as tts
train_x, test_x, train_y, test_y=tts(scaled_X,Y,train_size=0.8,stratify=Y)
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((17653, 19), (4414, 19), (17653,), (4414,))

In [20]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression(class_weight='balanced')

In [21]:
classifier.fit(train_x,train_y)
predicted_values=classifier.predict(test_x)
predicted_probabilities=classifier.predict_proba(test_x)
predicted_values, predicted_probabilities

(array([1, 0, 0, ..., 1, 1, 0], dtype=int64),
 array([[0.45968138, 0.54031862],
        [0.69290083, 0.30709917],
        [0.50117501, 0.49882499],
        ...,
        [0.47638573, 0.52361427],
        [0.25093826, 0.74906174],
        [0.60254352, 0.39745648]]))

In [22]:
from sklearn.metrics import confusion_matrix
cf=confusion_matrix(test_y,predicted_values)
cf

array([[2747,  812],
       [ 319,  536]], dtype=int64)

In [23]:
# Accuracy
classifier.score(test_x,test_y)

0.7437698232895333

In [24]:
# Precision
from sklearn.metrics import precision_score
precision=precision_score(test_y,predicted_values)
precision

0.39762611275964393

In [25]:
# Recall
from sklearn.metrics import recall_score
recall=recall_score(test_y,predicted_values)
recall

0.6269005847953216

In [26]:
# f1-score
from sklearn.metrics import f1_score
f1=f1_score(test_y,predicted_values)
f1

0.48660916931457104

In [27]:
from sklearn.metrics import classification_report
k=classification_report(test_y,predicted_values)
k

'              precision    recall  f1-score   support\n\n           0       0.90      0.77      0.83      3559\n           1       0.40      0.63      0.49       855\n\n    accuracy                           0.74      4414\n   macro avg       0.65      0.70      0.66      4414\nweighted avg       0.80      0.74      0.76      4414\n'

### Precision-recall curve

In [28]:
from sklearn.metrics import precision_recall_curve
precision_points, recall_points, threshold_points=precision_recall_curve(test_y,predicted_probabilites[:,1])
precision_points.shape, recall_points.shape, threshold_points.shape

NameError: name 'predicted_probabilites' is not defined

In [None]:
plt.figure(figsize=(7,5),dpi=100)
plt.plot(threshold_points, precision_points[:-1],color='red', label='Precision')
plt.plot(threshold_points, recall_points[:-1],color='green',label='Recall')
plt.xlabel('Thresholds')
plt.ylabel('Score')
plt.title('Precision-Recall tradeoff')
plt.legend()
plt.show()

In [None]:
# Threshold near 0.55

### AUC-ROC curve

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
fpr, tpr, threshold=roc_curve(test_y, predicted_probabilities[:,1])

In [None]:
plt.figure(figsize=(7,5),dpi=100)
plt.plot(fpr,tpr,color='green')
plt.plot([0,1],[0,1],label='baseline',color='red')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('AUC-ROC curve')
plt.show()
roc_auc_score(test_y,predicted_probabilities[:,1])

### Coefficients plot

In [None]:
# arranging the data
c=classifier.coef_.reshape(-1)
x=X.columns

coef_table=pd.DataFrame({'coefficients': c, 'variables': x})

coef_table=coef_table.sort_values(by='coefficients')
coef_table.head()

In [None]:
plt.figure(figsize=(8,6), dpi=120)
plt.barh(coef_table['variables'], coef_table['coefficients'])
plt.xlabel('coefficients magnitude')
plt.ylabel('variables')
plt.title('coefficients plot')
plt.show()