In [182]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [183]:
data = pd.read_excel('data.xlsx', sheet_name='Sheet1')

In [184]:
data.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los_Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New_York,1,48.76,172,0
2,3,Customer_3,24,Female,Los_Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [185]:
from statsmodels.formula.api import logit

In [186]:
data.drop(columns=['CustomerID', 'Name'], inplace=True)

In [187]:
data.head()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,Male,Los_Angeles,17,73.36,236,0
1,62,Female,New_York,1,48.76,172,0
2,24,Female,Los_Angeles,5,85.47,460,0
3,36,Female,Miami,3,97.94,297,1
4,46,Female,Miami,19,58.14,266,0


In [188]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Age                         100000 non-null  int64  
 1   Gender                      100000 non-null  object 
 2   Location                    100000 non-null  object 
 3   Subscription_Length_Months  100000 non-null  int64  
 4   Monthly_Bill                100000 non-null  float64
 5   Total_Usage_GB              100000 non-null  int64  
 6   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 5.3+ MB


In [189]:
data['Lifetime_Bill_Amt'] = data['Monthly_Bill']*data['Subscription_Length_Months']

In [190]:
data_encoded = pd.get_dummies(data, columns=['Gender', 'Location'], dtype=float)

In [191]:
data_encoded.head()

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Lifetime_Bill_Amt,Gender_Female,Gender_Male,Location_Chicago,Location_Houston,Location_Los_Angeles,Location_Miami,Location_New_York
0,63,17,73.36,236,0,1247.12,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,62,1,48.76,172,0,48.76,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,24,5,85.47,460,0,427.35,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,36,3,97.94,297,1,293.82,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,46,19,58.14,266,0,1104.66,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [192]:
formula = ''
for i in range(len(data_encoded.columns)):
    formula+=data_encoded.columns[i]+'+'

In [193]:
formula

'Age+Subscription_Length_Months+Monthly_Bill+Total_Usage_GB+Churn+Lifetime_Bill_Amt+Gender_Female+Gender_Male+Location_Chicago+Location_Houston+Location_Los_Angeles+Location_Miami+Location_New_York+'

In [194]:
m1 = logit('Churn~Location_Houston+Location_Los_Angeles', data=data_encoded).fit()

Optimization terminated successfully.
         Current function value: 0.693092
         Iterations 3


In [195]:
m1.summary()

0,1,2,3
Dep. Variable:,Churn,No. Observations:,100000.0
Model:,Logit,Df Residuals:,99997.0
Method:,MLE,Df Model:,2.0
Date:,"Fri, 22 Sep 2023",Pseudo R-squ.:,6.518e-05
Time:,16:06:36,Log-Likelihood:,-69309.0
converged:,True,LL-Null:,-69314.0
Covariance Type:,nonrobust,LLR p-value:,0.01091

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0066,0.008,0.810,0.418,-0.009,0.023
Location_Houston,-0.0422,0.016,-2.593,0.010,-0.074,-0.010
Location_Los_Angeles,-0.0347,0.016,-2.123,0.034,-0.067,-0.003


In [196]:
m1.predict()

array([0.49298937, 0.50165546, 0.49298937, ..., 0.50165546, 0.50165546,
       0.49298937])

In [197]:
predicted = np.round(m1.predict())

In [198]:
predicted

array([0., 1., 0., ..., 1., 1., 0.])

In [199]:
from sklearn.metrics import confusion_matrix

In [200]:
confusionMat = confusion_matrix(predicted, data['Churn'])

In [201]:
TP = confusionMat[1][1]
TN = confusionMat[0][0]
FP = confusionMat[0][1]
FN = confusionMat[1][0]

In [202]:
recall = TP/(TP+FN)
recall

0.5016554630279924

In [203]:
precision = TP/(TP+FP)
precision

0.602663773880552

In [204]:
specificity = TN/(TN+FP)
specificity

0.5079605950544803

In [205]:
2*((precision*recall)/(precision+recall))

0.5475401757603964