In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.metrics import confusion_matrix

from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Reading the data

In [121]:
X_train = np.loadtxt('X_train_PCA_output.csv', delimiter=',')
X_train[:1]

array([[-3.220393, -0.267416, -0.768893, -0.180897, -0.720181, -0.138849,
        -0.10309 , -0.257033, -0.149509, -0.140965,  0.19278 ,  0.436008,
         0.022754, -0.363063,  0.386826,  0.265992, -0.304641, -0.540855,
         0.093602,  0.197309, -0.013214,  0.08058 ,  0.165219,  0.165249,
         0.688659]])

In [122]:
X_test = np.loadtxt("X_test_PCA_output.csv", delimiter=',')
X_test[:1,:]

array([[-1.15335 ,  2.985367, -0.404837, -0.954784,  0.329782, -0.148697,
         0.880579,  1.200678, -0.285082, -0.355571, -0.422965, -0.067902,
         0.67266 , -1.056494,  0.139933, -0.464362, -0.416842, -0.538887,
         0.246619, -0.180824, -0.08942 ,  0.335719,  0.253902, -0.082015,
         0.282404]])

In [123]:
y_train = pd.read_csv('y_train_PCA_output.csv', squeeze=True)
y_train.head()

0    1
1    0
2    0
3    0
4    0
Name: churn_probability, dtype: int64

In [124]:
y_test = pd.read_csv('y_test_PCA_output.csv', squeeze=True)
y_test.head()

0    1
1    1
2    0
3    0
4    1
Name: churn_probability, dtype: int64

In [125]:
y_train.shape

(55999,)

In [126]:
y_train = y_train.values.reshape(-1,)
y_train

array([1, 0, 0, ..., 0, 0, 0])

------
------
## Model Building

### Logistic Regression 

In [127]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [128]:
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

In [129]:
# Evaluation metrics
def evaluation_metric(y_train, y_test, y_train_pred, y_test_pred):
    print("Train data :")
    print("Confusion Metrix :")
    confusion = confusion_matrix(y_train, y_train_pred)
    print(confusion)
    print("Accuracy Score = ", metrics.accuracy_score(y_train, y_train_pred))
    tp = confusion[1,1]
    tn = confusion[0,0]
    fp = confusion[0,1]
    fn = confusion[1,0]
    sensitivity = tp / float(tp + fn)
    print('sensitivity = ', sensitivity)
    
    specificity = tn / float(tn + fp)
    print("Specificity = ", specificity)
    print("*" * 30)
    print()
    
    print("Test data :")
    print("Confusion Metrix :")
    confusion = confusion_matrix(y_test, y_test_pred)
    print(confusion)
    print("Accuracy Score = ", metrics.accuracy_score(y_test, y_test_pred))
    tp = confusion[1,1]
    tn = confusion[0,0]
    fp = confusion[0,1]
    fn = confusion[1,0]
    sensitivity = tp / float(tp + fn)
    print('sensitivity = ', sensitivity)
    
    specificity = tn / float(tn + fp)
    print("Specificity = ", specificity)
        

In [130]:
evaluation_metric(y_train, y_test, y_train_pred, y_test_pred)

Train data :
Confusion Metrix :
[[50064   195]
 [ 5556   184]]
Accuracy Score =  0.8973017375310274
sensitivity =  0.03205574912891986
Specificity =  0.9961200978929147
******************************

Test data :
Confusion Metrix :
[[12558    50]
 [ 1356    36]]
Accuracy Score =  0.8995714285714286
sensitivity =  0.02586206896551724
Specificity =  0.9960342639593909


#### if we dont need sensitivity and specificity, we can delete it

### Insight:
- Train and test accuracy is good/high around 90 %

-----
### Logistic Regression using RFE

In [131]:
logrfe = LogisticRegression()
rfe = RFE(logrfe, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [132]:
rfe.support_

array([ True,  True, False, False,  True,  True, False, False, False,
        True, False,  True,  True, False,  True, False, False, False,
       False, False, False, False,  True, False,  True])

In [133]:
list(zip(rfe.support_, rfe.ranking_))

[(True, 1),
 (True, 1),
 (False, 13),
 (False, 12),
 (True, 1),
 (True, 1),
 (False, 15),
 (False, 10),
 (False, 8),
 (True, 1),
 (False, 6),
 (True, 1),
 (True, 1),
 (False, 4),
 (True, 1),
 (False, 3),
 (False, 7),
 (False, 2),
 (False, 16),
 (False, 11),
 (False, 14),
 (False, 5),
 (True, 1),
 (False, 9),
 (True, 1)]

In [134]:
selected_columns = []
for i in range(X_train.shape[1]):
    if rfe.support_[i] == True:
        selected_columns.append(i)

X_train_rfe = X_train[:,selected_columns]
X_test_rfe = X_test[:, selected_columns]

lr_rfe = LogisticRegression()
lr_rfe.fit(X_train_rfe, y_train)

In [135]:
y_train_pred = lr_rfe.predict(X_train_rfe)
y_test_pred = lr_rfe.predict(X_test_rfe)
evaluation_metric(y_train, y_test, y_train_pred, y_test_pred)

Train data :
Confusion Metrix :
[[50055   204]
 [ 5548   192]]
Accuracy Score =  0.8972838800692869
sensitivity =  0.033449477351916376
Specificity =  0.9959410254879723
******************************

Test data :
Confusion Metrix :
[[12556    52]
 [ 1354    38]]
Accuracy Score =  0.8995714285714286
sensitivity =  0.027298850574712645
Specificity =  0.9958756345177665


### Insight:
- Accuracy did not change using RFE.  
- It is same as logic regression accuracy.   
- If we select 2 or 3 features in rfe, then also we get around 90% accuracy but confusion metrics is different. most of the data is considered as not churned. and Not churned is percentage is high, thats why we are getting almost same accuracy even with 2 or 3 features. 

--------

### Regularization

### Ridge Regularization

In [140]:
params = {'C': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000]}

cv = 5
ridge_logreg = LogisticRegression(penalty='l2')

grid_search = GridSearchCV(estimator=ridge_logreg, 
                           param_grid=params,
                           cv=cv,
                           scoring='accuracy')

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(best_model)

LogisticRegression(C=0.01)


In [141]:
c = 0.01
ridge_logreg = LogisticRegression(penalty='l2', C=c)
ridge_logreg.fit(X_train, y_train)

In [142]:
y_train_pred = ridge_logreg.predict(X_train)
y_test_pred = ridge_logreg.predict(X_test)
evaluation_metric(y_train, y_test, y_train_pred, y_test_pred)

Train data :
Confusion Metrix :
[[50097   162]
 [ 5572   168]]
Accuracy Score =  0.897605314380614
sensitivity =  0.02926829268292683
Specificity =  0.9967766967110369
******************************

Test data :
Confusion Metrix :
[[12564    44]
 [ 1360    32]]
Accuracy Score =  0.8997142857142857
sensitivity =  0.022988505747126436
Specificity =  0.996510152284264


In [143]:
ridge_logreg.coef_

array([[-0.3418862 ,  0.33044098, -0.0237924 ,  0.03555987, -0.1723573 ,
         0.18485554, -0.00940886, -0.04017483,  0.07769601, -0.21843596,
         0.10772074,  0.22994247,  0.19739958, -0.13714892,  0.29921831,
         0.16172853, -0.09206669, -0.12920152,  0.01320818,  0.04857718,
         0.01627356,  0.11068141,  0.14566479,  0.06948268,  0.81107814]])

### Insight:
- Regularization applies penalty term to loss function.  
- Regularization helps prevent overfitting.  
- But in this case rige regularization is not helping us.  
- Ridge regularization accuracy is similar to logistic regression accuracy.

----
### Lasso Regression

In [146]:
params = {'C': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000]}

cv = 5
lasso_logreg = LogisticRegression(penalty='l1', solver='saga')

grid_search = GridSearchCV(estimator=lasso_logreg, 
                           param_grid=params,
                           cv=cv,
                           scoring='accuracy')

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(best_model)

LogisticRegression(C=0.01, penalty='l1', solver='saga')


In [147]:
c = 0.01
lasso_logreg = LogisticRegression(penalty='l1', C=c, solver='saga')
lasso_logreg.fit(X_train, y_train)

In [148]:
y_train = lasso_logreg.predict(X_train)
y_test = lasso_logreg.predict(X_test)
evaluation_metric(y_train, y_test, y_train_pred, y_test_pred)

Train data :
Confusion Metrix :
[[55667    57]
 [    2   273]]
Accuracy Score =  0.9989464097573171
sensitivity =  0.9927272727272727
Specificity =  0.9989771014284688
******************************

Test data :
Confusion Metrix :
[[13924    14]
 [    0    62]]
Accuracy Score =  0.999
sensitivity =  1.0
Specificity =  0.9989955517290859


In [149]:
lasso_logreg.coef_

array([[-0.33403975,  0.31164731, -0.00440235,  0.03434129, -0.16293132,
         0.16985374,  0.        , -0.01381954,  0.04977613, -0.18787281,
         0.07664192,  0.19986474,  0.17799398, -0.10088965,  0.27217686,
         0.1126606 , -0.04537641, -0.09674933,  0.        ,  0.        ,
         0.        ,  0.06130691,  0.08112735,  0.        ,  0.7820824 ]])

### Insight:  
- Lasso regularization helps prevent overfitting as well as feature selection.  
- Few coefficients are 0. That mean those featurs are unnecessary in model building.  
- In this case lasso regression is giving very good result on train data and test data also.  
- Accuracy is 99%.
- Specificity and sensitivity is also high.
----- 