In [2]:
import numpy as np
import pandas as pd

import xgboost as xgb 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.feature_selection import RFE

from sklearn import metrics
from sklearn.model_selection import (train_test_split, GridSearchCV, RandomizedSearchCV)
from sklearn.metrics import (confusion_matrix, RocCurveDisplay, classification_report)
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Reading the data

In [3]:
X_train = np.loadtxt('X_train_PCA_output.csv', delimiter=',')
X_train[:1]

array([[-3.220393, -0.267416, -0.768893, -0.180897, -0.720181, -0.138849,
        -0.10309 , -0.257033, -0.149509, -0.140965,  0.19278 ,  0.436008,
         0.022754, -0.363063,  0.386826,  0.265992, -0.304641, -0.540855,
         0.093602,  0.197309, -0.013214,  0.08058 ,  0.165219,  0.165249,
         0.688659]])

In [4]:
X_test = np.loadtxt("X_test_PCA_output.csv", delimiter=',')
X_test[:1,:]

array([[-1.15335 ,  2.985367, -0.404837, -0.954784,  0.329782, -0.148697,
         0.880579,  1.200678, -0.285082, -0.355571, -0.422965, -0.067902,
         0.67266 , -1.056494,  0.139933, -0.464362, -0.416842, -0.538887,
         0.246619, -0.180824, -0.08942 ,  0.335719,  0.253902, -0.082015,
         0.282404]])

In [5]:
y_train = pd.read_csv('y_train_PCA_output.csv')
y_train.head()

Unnamed: 0,churn_probability
0,1
1,0
2,0
3,0
4,0


In [6]:
y_test = pd.read_csv('y_test_PCA_output.csv')
y_test.head()

Unnamed: 0,churn_probability
0,1
1,1
2,0
3,0
4,1


In [7]:
y_train.shape

(55999, 1)

In [8]:
y_train = y_train.values.reshape(-1,)
y_train

array([1, 0, 0, ..., 0, 0, 0])

------
## Model Building

### Logistic Regression 

In [9]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [10]:
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

In [11]:
# Evaluation metrics
def evaluation_metric(y_train, y_test, y_train_pred, y_test_pred, estimator_name=None):
    print("="*50)
    print("=================== {} =========================".format(estimator_name))
    print("="*50)
    print("Train data :")
    print("Confusion Metrix :")
    confusion = confusion_matrix(y_train, y_train_pred)
    print(confusion)
    print("Accuracy Score = ", metrics.accuracy_score(y_train, y_train_pred))
    tp = confusion[1,1]
    tn = confusion[0,0]
    fp = confusion[0,1]
    fn = confusion[1,0]
    sensitivity = tp / float(tp + fn)
    print('sensitivity = ', sensitivity)
    
    specificity = tn / float(tn + fp)
    print("Specificity = ", specificity)
    print("*" * 30)
    print()
    
    print("Test data :")
    print("Confusion Metrix :")
    confusion = confusion_matrix(y_test, y_test_pred)
    print(confusion)
    print("Accuracy Score = ", metrics.accuracy_score(y_test, y_test_pred))
    tp = confusion[1,1]
    tn = confusion[0,0]
    fp = confusion[0,1]
    fn = confusion[1,0]
    sensitivity = tp / float(tp + fn)
    print('sensitivity = ', sensitivity)
    
    specificity = tn / float(tn + fp)
    print("Specificity = ", specificity)
        

In [12]:
evaluation_metric(y_train, y_test, y_train_pred, y_test_pred)

Train data :
Confusion Metrix :
[[50064   195]
 [ 5556   184]]
Accuracy Score =  0.8973017375310274
sensitivity =  0.03205574912891986
Specificity =  0.9961200978929147
******************************

Test data :
Confusion Metrix :
[[12558    50]
 [ 1356    36]]
Accuracy Score =  0.8995714285714286
sensitivity =  0.02586206896551724
Specificity =  0.9960342639593909


#### if we dont need sensitivity and specificity, we can delete it

### Insight:
- Train and test accuracy is good/high around 90 %

-----
### Logistic Regression using RFE

In [13]:
logrfe = LogisticRegression()
rfe = RFE(logrfe, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [14]:
rfe.support_

array([ True,  True, False, False,  True,  True, False, False, False,
        True, False,  True,  True, False,  True, False, False, False,
       False, False, False, False,  True, False,  True])

In [15]:
list(zip(rfe.support_, rfe.ranking_))

[(True, 1),
 (True, 1),
 (False, 13),
 (False, 12),
 (True, 1),
 (True, 1),
 (False, 15),
 (False, 10),
 (False, 8),
 (True, 1),
 (False, 6),
 (True, 1),
 (True, 1),
 (False, 4),
 (True, 1),
 (False, 3),
 (False, 7),
 (False, 2),
 (False, 16),
 (False, 11),
 (False, 14),
 (False, 5),
 (True, 1),
 (False, 9),
 (True, 1)]

In [16]:
selected_columns = []
for i in range(X_train.shape[1]):
    if rfe.support_[i] == True:
        selected_columns.append(i)

X_train_rfe = X_train[:,selected_columns]
X_test_rfe = X_test[:, selected_columns]

lr_rfe = LogisticRegression()
lr_rfe.fit(X_train_rfe, y_train)

In [17]:
y_train_pred = lr_rfe.predict(X_train_rfe)
y_test_pred = lr_rfe.predict(X_test_rfe)
evaluation_metric(y_train, y_test, y_train_pred, y_test_pred)

Train data :
Confusion Metrix :
[[50055   204]
 [ 5548   192]]
Accuracy Score =  0.8972838800692869
sensitivity =  0.033449477351916376
Specificity =  0.9959410254879723
******************************

Test data :
Confusion Metrix :
[[12556    52]
 [ 1354    38]]
Accuracy Score =  0.8995714285714286
sensitivity =  0.027298850574712645
Specificity =  0.9958756345177665


### Insight:
- Accuracy did not change using RFE.  
- It is same as logic regression accuracy.   
- If we select 2 or 3 features in rfe, then also we get around 90% accuracy but confusion metrics is different. most of the data is considered as not churned. and Not churned is percentage is high, thats why we are getting almost same accuracy even with 2 or 3 features. 

--------

### Regularization

### Ridge Regularization

In [18]:
params = {'C': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000]}

cv = 5
ridge_logreg = LogisticRegression(penalty='l2')

grid_search = GridSearchCV(estimator=ridge_logreg, 
                           param_grid=params,
                           cv=cv,
                           scoring='accuracy')

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(best_model)

LogisticRegression(C=0.01)


In [19]:
c = 0.01
ridge_logreg = LogisticRegression(penalty='l2', C=c)
ridge_logreg.fit(X_train, y_train)

In [20]:
y_train_pred = ridge_logreg.predict(X_train)
y_test_pred = ridge_logreg.predict(X_test)
evaluation_metric(y_train, y_test, y_train_pred, y_test_pred)

Train data :
Confusion Metrix :
[[50097   162]
 [ 5572   168]]
Accuracy Score =  0.897605314380614
sensitivity =  0.02926829268292683
Specificity =  0.9967766967110369
******************************

Test data :
Confusion Metrix :
[[12564    44]
 [ 1360    32]]
Accuracy Score =  0.8997142857142857
sensitivity =  0.022988505747126436
Specificity =  0.996510152284264


In [21]:
ridge_logreg.coef_

array([[-0.3418862 ,  0.33044098, -0.0237924 ,  0.03555987, -0.1723573 ,
         0.18485554, -0.00940886, -0.04017483,  0.07769601, -0.21843596,
         0.10772074,  0.22994248,  0.19739957, -0.13714892,  0.29921831,
         0.16172853, -0.09206669, -0.12920152,  0.01320818,  0.04857718,
         0.01627356,  0.11068139,  0.14566478,  0.06948265,  0.81107818]])

### Insight:
- Regularization applies penalty term to loss function.  
- Regularization helps prevent overfitting.  
- But in this case rige regularization is not helping us.  
- Ridge regularization accuracy is similar to logistic regression accuracy.

----
### Lasso Regression

In [22]:
params = {'C': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000]}

cv = 5
lasso_logreg = LogisticRegression(penalty='l1', solver='saga')

grid_search = GridSearchCV(estimator=lasso_logreg, 
                           param_grid=params,
                           cv=cv,
                           scoring='accuracy')

grid_search.fit(X_train, y_train)

best_lr_model = grid_search.best_estimator_
print(best_lr_model)

LogisticRegression(C=0.01, penalty='l1', solver='saga')


In [23]:
c = 0.01
lasso_logreg = LogisticRegression(penalty='l1', C=c, solver='saga')
lasso_logreg.fit(X_train, y_train)

In [24]:
lasso_y_train = lasso_logreg.predict(X_train)
lasso_y_test = lasso_logreg.predict(X_test)
evaluation_metric(y_train, y_test, lasso_y_train, lasso_y_test)

Train data :
Confusion Metrix :
[[50128   131]
 [ 5596   144]]
Accuracy Score =  0.8977303166127967
sensitivity =  0.025087108013937282
Specificity =  0.997393501661394
******************************

Test data :
Confusion Metrix :
[[12571    37]
 [ 1367    25]]
Accuracy Score =  0.8997142857142857
sensitivity =  0.017959770114942528
Specificity =  0.9970653553299492


In [25]:
lasso_logreg.coef_

array([[-0.3340189 ,  0.31164455, -0.0043861 ,  0.0343232 , -0.16293386,
         0.16982544,  0.        , -0.01381992,  0.04976698, -0.18786947,
         0.07663957,  0.19990407,  0.17799187, -0.10090105,  0.27215725,
         0.11263697, -0.04538323, -0.09671819,  0.        ,  0.        ,
         0.        ,  0.06129856,  0.08114365,  0.        ,  0.78209729]])

### Insight:  
- Lasso regularization helps prevent overfitting as well as feature selection.  
- Few coefficients are 0. That mean those featurs are unnecessary in model building.  
- In this case lasso regression is giving very good result on train data and test data also.  
- Accuracy is 99%.
- Specificity and sensitivity is also high.
----- 

## Decision Tree

In [26]:
# Init a decision tree model
dt_base_model = DecisionTreeClassifier(random_state=100)

params = {
    "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    "min_samples_leaf": [5, 10, 20, 50, 100]
}


dt_grid_search = GridSearchCV(estimator=dt_base_model,
                           param_grid=params,
                           cv=4,
                           n_jobs=-1, verbose=1, scoring="accuracy")



dt_grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 50 candidates, totalling 200 fits


In [27]:
# Check the accuracy of the best model
dt_grid_search.best_score_

0.9029982498749911

In [28]:
# Get the best tree
dt_best = dt_grid_search.best_estimator_

In [29]:
# Decision Tree Evaluation
dt_y_train_pred = dt_best.predict(X_train)
dt_y_test_pred = dt_best.predict(X_test)
evaluation_metric(y_train, y_test, dt_y_train_pred, dt_y_test_pred, "Decision Tree")

Train data :
Confusion Metrix :
[[49546   713]
 [ 4607  1133]]
Accuracy Score =  0.9049983035411346
sensitivity =  0.19738675958188154
Specificity =  0.9858134861417855
******************************

Test data :
Confusion Metrix :
[[12411   197]
 [ 1113   279]]
Accuracy Score =  0.9064285714285715
sensitivity =  0.20043103448275862
Specificity =  0.984375


## Random Forest

In [30]:
rf_base = RandomForestClassifier(random_state=100, n_jobs=-1)

params = {
    'max_depth': [5, 10, 20, 30],
    'min_samples_leaf': [5, 10, 20, 50],
    'n_estimators': [10, 20, 50, 100]
}

rf_grid_search = GridSearchCV(estimator=rf_base,
                              param_grid=params,
                              cv = 4,
                              n_jobs=-1,
                              scoring="accuracy")

rf_grid_search.fit(X_train, y_train)

In [31]:
rf_grid_search.best_score_

0.9073912077903523

In [32]:
rf_best = grid_search.best_estimator_
rf_best

In [33]:
# Decision Tree Evaluation
rf_y_train_pred = dt_best.predict(X_train)
rf_y_test_pred = dt_best.predict(X_test)
evaluation_metric(y_train, y_test, rf_y_train_pred, rf_y_test_pred, "Random Forest")

Train data :
Confusion Metrix :
[[49546   713]
 [ 4607  1133]]
Accuracy Score =  0.9049983035411346
sensitivity =  0.19738675958188154
Specificity =  0.9858134861417855
******************************

Test data :
Confusion Metrix :
[[12411   197]
 [ 1113   279]]
Accuracy Score =  0.9064285714285715
sensitivity =  0.20043103448275862
Specificity =  0.984375


## XGBoost

In [34]:
xgb_base = xgb.XGBClassifier(n_jobs = -1,objective = 'binary:logistic')

params = {
        'n_estimators' : [10, 20, 50, 100, 200],
        'learning_rate' : [0.01, 0.02, 0.05, 0.1, 0.25],
        'min_child_weight': [1, 5, 7, 10],
        'gamma': [0.1, 0.5, 1, 1.5, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 10, 12]
        }

param_comb = 800

random_search = RandomizedSearchCV(xgb_base, 
                                   param_distributions=params, 
                                   n_iter=param_comb, scoring='accuracy', 
                                   n_jobs=-1, cv=4, verbose=0, random_state=100)

random_search.fit(X_train, y_train)

In [35]:
random_search.best_score_

0.9084269412100865

In [36]:
xgb_best = random_search.best_estimator_

In [37]:
# Decision Tree Evaluation
xgb_y_train_pred = xgb_best.predict(X_train)
xgb_y_test_pred = xgb_best.predict(X_test)
evaluation_metric(y_train, y_test, xgb_y_train_pred, xgb_y_test_pred, "XGBoost")

Train data :
Confusion Metrix :
[[50018   241]
 [ 4077  1663]]
Accuracy Score =  0.9228914802050037
sensitivity =  0.2897212543554007
Specificity =  0.9952048389343202
******************************

Test data :
Confusion Metrix :
[[12483   125]
 [ 1130   262]]
Accuracy Score =  0.9103571428571429
sensitivity =  0.1882183908045977
Specificity =  0.9900856598984772


-------------------
## Validate Model With Unseen Data


In [42]:
X_unseen = np.loadtxt('X_unseen_PCA_output.csv', delimiter=',')
X_unseen[:1,:]

array([[-3.119042,  0.006179, -0.940812, -0.320406,  0.129718, -0.474852,
         0.234046, -0.250148,  0.967874, -0.989921, -0.102018, -2.061132,
        -3.845834,  2.843013, -1.790694, -2.147914,  0.55072 ,  0.054401,
         0.399007, -0.446444, -0.780929,  0.064753, -0.032409, -0.538241,
         0.467631]])

In [43]:
xgb_y_unseen = xgb_best.predict(X_unseen)


In [50]:
xgb_y_unseen_df = pd.DataFrame(xgb_y_unseen, columns=['churn_probability'])
xgb_y_unseen_df

Unnamed: 0,churn_probability
0,0
1,0
2,0
3,0
4,0
...,...
29995,0
29996,0
29997,0
29998,0


In [51]:
xgb_y_unseen_df.value_counts()

churn_probability
0                    29189
1                      811
Name: count, dtype: int64

In [52]:
xgb_y_unseen_df.to_csv("y_unseen_PCA_output.csv")