In [33]:
# Balancing ------------>
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV

from collections import Counter


In [34]:
df = pd.read_csv('churn_prediction.csv')
df.head(3)

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction
0,1,2101,66,Male,0.0,self_employed,187.0,2,755,1458.71,...,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0,2019-05-21
1,2,2348,35,Male,0.0,self_employed,,2,3214,5390.37,...,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0,2019-11-01
2,4,2194,31,Male,0.0,salaried,146.0,2,41,3913.16,...,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0,NaT


In [35]:
df['churn'].value_counts()

churn
0    23122
1     5260
Name: count, dtype: int64

In [36]:
# Highly imbalanced data.
df.isnull().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
last_transaction                     0
dtype: int64

In [37]:
# Filling null values

# Convert Gender
df['gender'] = df['gender'].map({'Male':0, 'Female':1})

# Replace with -1 for missing gender
df['gender'] = df['gender'].fillna(-1)

# Replacing with max. occurence values

df['dependents'] = df['dependents'].fillna(0)
df['occupation'] = df['occupation'].fillna('self_employed')

df['city'] = df['city'].fillna(1020)

In [38]:
df['occupation'].value_counts()

occupation
self_employed    17556
salaried          6704
student           2058
retired           2024
company             40
Name: count, dtype: int64

In [39]:
df.isnull().sum()

customer_id                       0
vintage                           0
age                               0
gender                            0
dependents                        0
occupation                        0
city                              0
customer_nw_category              0
branch_code                       0
current_balance                   0
previous_month_end_balance        0
average_monthly_balance_prevQ     0
average_monthly_balance_prevQ2    0
current_month_credit              0
previous_month_credit             0
current_month_debit               0
previous_month_debit              0
current_month_balance             0
previous_month_balance            0
churn                             0
last_transaction                  0
dtype: int64

In [40]:
df['gender'].value_counts()

gender
 0.0    16548
 1.0    11309
-1.0      525
Name: count, dtype: int64

In [41]:
df['gender'].unique()

array([ 0., -1.,  1.])

In [42]:
categorical = df.select_dtypes(include = 'object')

In [43]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in categorical:
    df[i] = le.fit_transform(df[i])

In [44]:
df.head(3)

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction
0,1,2101,66,0.0,0.0,3,187.0,2,755,1458.71,...,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0,137
1,2,2348,35,0.0,0.0,3,1020.0,2,3214,5390.37,...,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0,299
2,4,2194,31,0.0,0.0,2,146.0,2,41,3913.16,...,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0,360


In [45]:
df.shape

(28382, 21)

In [46]:
from scipy.stats import zscore
z = np.abs(zscore(df))
df1 = df[(z<3).all(axis = 1)]

df = df1.copy()

In [47]:
df.head(3)

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction
0,1,2101,66,0.0,0.0,3,187.0,2,755,1458.71,...,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0,137
1,2,2348,35,0.0,0.0,3,1020.0,2,3214,5390.37,...,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0,299
2,4,2194,31,0.0,0.0,2,146.0,2,41,3913.16,...,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0,360


In [55]:
df.shape

(26304, 21)

In [56]:
# x = df.drop('Attrition', axis = 1)

from sklearn.model_selection import train_test_split
X = df.drop(['churn', 'customer_id', 'last_transaction'], axis = 1)
y = df['churn']


In [57]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_sc = Scaler_X.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, test_size = 0.2, random_state = 100)

1. SMOTE Technique --> handling class imbalance using SMOTE based techniques

In [58]:
# SMOTE Technique

from imblearn.over_sampling import SMOTE
from collections import Counter

counter = Counter(y_train)
print('Before', counter)

# Oversampling the train dataset using SMOTE
smt = SMOTE()

# X_train, y_train = smt.fit_resample(X_train, y_train)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

counter = Counter(y_train_sm)
print('After', counter)

Before Counter({0: 17139, 1: 3904})
After Counter({0: 17139, 1: 17139})


2. ADASYN Technique

In [59]:
# 2. ADASYN Technique

from imblearn.over_sampling import ADASYN

counter = Counter(y_train)
print('Before', counter)
# Oversampling the train dataset using ADASYN

ada = ADASYN(random_state = 130)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

counter = Counter(y_train_ada)
print('After', counter)

Before Counter({0: 17139, 1: 3904})
After Counter({1: 17516, 0: 17139})


3. Hybrid Technique

 
    3.1) SMOTE + Tomek Links

In [61]:
# 3.1. SMOTE + Tomek Links

from imblearn.combine import SMOTETomek

counter = Counter(y_train)
print('Before', counter)

# Oversampling the train dataset using Smote + Tomek

smtom = SMOTETomek(random_state = 139)
X_train_smtom, y_train_smtom = smtom.fit_resample(X_train, y_train)

counter = Counter(y_train_smtom)
print('After', counter)

Before Counter({0: 17139, 1: 3904})
After Counter({0: 16979, 1: 16979})


3. Hybrid Technique

 
    3.2) SMOTE + ENN

In [62]:
# 3.2. SMOTE + ENN

from imblearn.combine import SMOTEENN
counter = Counter(y_train)
print('Before', counter)

# Oversampling the train dataset using SMOTE + ENN

smenn = SMOTEENN()
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)
counter = Counter(y_train_smenn)

print('After', counter)

Before Counter({0: 17139, 1: 3904})
After Counter({1: 15080, 0: 9895})


Model Building --->> Imbalanced data

In [63]:
# Not needed.
model = list()
resample = list()
preicision = list()
recall = list()
F1score = list()
AUCROC = list()

In [65]:
# Model with the imbalanced data  
 # This data is binary classification so, we can use here Logistic, DecisionTree, RandomForest

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier