In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.metrics import confusion_matrix

from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Reading the data

In [2]:
# Load the PCA-transformed training features from disk and preview the first row.
X_train = np.loadtxt(fname="X_train_PCA_output.csv", delimiter=",")
X_train[0:1]

array([[-3.220393, -0.267416, -0.768893, -0.180897, -0.720181, -0.138849,
        -0.10309 , -0.257033, -0.149509, -0.140965,  0.19278 ,  0.436008,
         0.022754, -0.363063,  0.386826,  0.265992, -0.304641, -0.540855,
         0.093602,  0.197309, -0.013214,  0.08058 ,  0.165219,  0.165249,
         0.688659]])

In [3]:
# Load the PCA-transformed test features from disk and preview the first row.
X_test = np.loadtxt(fname="X_test_PCA_output.csv", delimiter=",")
X_test[0:1, :]

array([[-1.15335 ,  2.985367, -0.404837, -0.954784,  0.329782, -0.148697,
         0.880579,  1.200678, -0.285082, -0.355571, -0.422965, -0.067902,
         0.67266 , -1.056494,  0.139933, -0.464362, -0.416842, -0.538887,
         0.246619, -0.180824, -0.08942 ,  0.335719,  0.253902, -0.082015,
         0.282404]])

In [4]:
# Load the training labels as a Series.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the column axis after the read is the documented replacement.
y_train = pd.read_csv('y_train_PCA_output.csv').squeeze("columns")
y_train.head()

0    1
1    0
2    0
3    0
4    0
Name: churn_probability, dtype: int64

In [5]:
# Load the test labels as a Series.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the column axis after the read is the documented replacement.
y_test = pd.read_csv('y_test_PCA_output.csv').squeeze("columns")
y_test.head()

0    1
1    1
2    0
3    0
4    1
Name: churn_probability, dtype: int64

In [8]:
# Inspect the dimensions of the training labels.
y_train.shape

(55999,)

In [10]:
# Convert the training labels to a flat 1-D NumPy array.
# np.asarray accepts both a Series and an ndarray, so this cell can be re-run
# safely (the previous `.values` access failed once y_train was already an array).
y_train = np.asarray(y_train).reshape(-1)
y_train

array([1, 0, 0, ..., 0, 0, 0])

## Model Building

### Logistic Regression 

In [38]:
# Baseline model: logistic regression with default hyperparameters, trained on
# the PCA features. `fit` returns the estimator itself, so it can be chained.
lr = LogisticRegression().fit(X_train, y_train)
lr

In [39]:
# Class predictions from the fitted model for the training and test features.
y_train_pred, y_test_pred = lr.predict(X_train), lr.predict(X_test)

In [46]:
# Evaluation metrics
def _report_split(split_name, y_true, y_pred, labels):
    """Print confusion matrix, accuracy, sensitivity and specificity for one split."""

    def _safe_ratio(numerator, denominator):
        # A split with no positives (or no negatives) has an undefined rate;
        # report NaN instead of dividing by zero.
        return numerator / denominator if denominator else float("nan")

    print(f"{split_name} data :")
    print("Confusion Metrix :")
    # Pinning the labels guarantees a 2x2 matrix even when a class is missing
    # from this split, so the cell lookups below cannot go out of bounds.
    confusion = confusion_matrix(y_true, y_pred, labels=list(labels))
    print(confusion)
    print("Accuracy Score = ", metrics.accuracy_score(y_true, y_pred))

    # Row = actual class, column = predicted class, ordered as `labels`.
    tn, fp, fn, tp = (int(count) for count in confusion.ravel())

    print('sensitivity = ', _safe_ratio(tp, tp + fn))
    print("Specificity = ", _safe_ratio(tn, tn + fp))


def evaluation_metric(y_train, y_test, y_train_pred, y_test_pred, labels=(0, 1)):
    """Print a binary-classification report for the train and test splits.

    For each split the confusion matrix, accuracy, sensitivity
    (TP / (TP + FN)) and specificity (TN / (TN + FP)) are printed.

    Parameters
    ----------
    y_train, y_test : array-like
        True labels of the train and test splits.
    y_train_pred, y_test_pred : array-like
        Predicted labels for the corresponding splits.
    labels : sequence of two labels, default (0, 1)
        (negative, positive) class labels, in that order. They fix the layout
        of the confusion matrix so the report also works when a split does not
        contain both classes.

    Returns
    -------
    None
        The report is written to standard output.
    """
    _report_split("Train", y_train, y_train_pred, labels)
    print("*" * 30)
    print()

    _report_split("Test", y_test, y_test_pred, labels)
        

In [47]:
evaluation_metric(y_train, y_test, y_train_pred, y_test_pred)

Train data :
Confusion Metrix :
[[50064   195]
 [ 5556   184]]
Accuracy Score =  0.8973017375310274
sensitivity =  0.03205574912891986
Specificity =  0.9961200978929147
******************************

Test data :
Confusion Metrix :
[[12558    50]
 [ 1356    36]]
Accuracy Score =  0.8995714285714286
sensitivity =  0.02586206896551724
Specificity =  0.9960342639593909


### If sensitivity and specificity are not needed, those calculations can be removed from `evaluation_metric`.