In [2]:

from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer as s
from sklearn.impute import KNNImputer as knn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw Telco customer-churn CSV into the notebook-global `data` frame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere until the file is placed there or the path is made configurable.
data=pd.read_csv(r"D:\\Telco-Customer-Churn.csv")

In [6]:
class Utils_Suite():
    """Feature-screening helpers (correlation, mutual information, VIF) for one DataFrame.

    The frame is stored by reference in ``self.data``; none of the methods
    below modify it.
    """

    def __init__(self,data):
        self.data=data

    def compute_correlation(self,threshold=0.3,target="Churn"):
        """Return the columns whose correlation with ``target`` is weaker than ``threshold``.

        Parameters
        ----------
        threshold : float
            Columns with ``-threshold < corr < threshold`` are returned.
        target : str
            Column to correlate against (defaults to ``"Churn"``).

        Returns
        -------
        pandas.Series
            Correlation with ``target``, indexed by column name, restricted to
            the weakly correlated columns.

        Raises
        ------
        KeyError
            If ``target`` is not a column of the correlation matrix.
        """
        # numeric_only=False is kept from the original: every column takes part,
        # so the frame is expected to be fully numeric (already encoded) here.
        target_corr=self.data.corr(numeric_only=False)[target]
        return target_corr[target_corr.abs()<threshold]

    def compute_mutual_information(self,thresh=0.1,target="Churn"):
        """Return the features whose mutual information with ``target`` is below ``thresh``.

        Every column is ordinal-encoded first. The target column is then
        separated from the features, so it is no longer scored against itself,
        and the *encoded* target is used so that string labels do not break the
        estimator.

        Returns
        -------
        pandas.DataFrame
            One ``'Score'`` column, indexed by feature name, containing only the
            rows with ``Score < thresh``.
        """
        enc = OrdinalEncoder()
        encoded = pd.DataFrame(
            enc.fit_transform(self.data),
            index=self.data.index,
            columns=self.data.columns,
        )
        features = encoded.drop(columns=[target])
        # NOTE(review): the target looks like a class label, so
        # mutual_info_classif may be the better estimator here - confirm.
        mi_scores = mutual_info_regression(features, encoded[target])
        mi_scores_df = pd.DataFrame(mi_scores, index=features.columns, columns=['Score'])
        return mi_scores_df[mi_scores_df['Score']<thresh]

    def compute_vif(self):
        """Compute the variance inflation factor of every column except the last.

        A constant ``'intercept'`` column is appended before the computation and
        is reported alongside the real columns.

        Returns
        -------
        pandas.DataFrame
            Columns ``'variable'`` (column name) and ``'vif'``.
        """
        # astype(float) yields an independent frame, so adding 'intercept' cannot
        # touch self.data, and bool dummy columns no longer produce an object
        # array that the regression inside variance_inflation_factor rejects.
        x=self.data.iloc[:,:-1].astype(float)
        x['intercept']=1.0
        values=x.values  # hoisted: the same matrix is reused for every column
        return pd.DataFrame({
            'variable':x.columns,
            'vif':[variance_inflation_factor(values,i) for i in range(x.shape[1])],
        })
    


In [7]:
class PreProcess():
    """Churn preprocessing pipeline that runs as soon as the object is created.

    The constructor copies the incoming frame into ``self.data`` and calls
    ``run()``; every step replaces or updates ``self.data``. Fetch the result
    with ``write_df()``.
    """

    # Auto-runs upon instantiation.
    def __init__(self,data):
        # Work on a private copy so the caller's frame is never modified.
        self.data=data.copy()

        self.run()

    # Preprocessing schedule
    def run(self):
        """Execute the preprocessing steps in their fixed order.

        Expects ``TotalCharges`` and ``MonthlyCharges`` columns to be present.
        """
        self.ClearNull(threshold=0.5)
        # Rows whose TotalCharges is a single blank cannot be cast to float.
        # The mask is built from self.data, not from a notebook-level global.
        blank_rows=self.data[self.data['TotalCharges'] == ' '].index
        self.data=self.data.drop(blank_rows)
        self.data['TotalCharges']=self.data['TotalCharges'].astype(float)
        # Only reports (prints) float columns that still contain nulls.
        self.get_all_Null(dtype='float64')

        self.outlier_remove('TotalCharges')
        self.outlier_remove('MonthlyCharges')

        self.StdScale()

        self.drop_uniq_thresh(thresh=5)

        # Optional / alternative steps, currently disabled:
        #self.Label_Encoding()
        self.one_hot_encoding()
        #self.drop_correlation()
        #self.drop_vif(thresh=4)

    # Drop columns that are only weakly correlated with Churn
    def drop_correlation(self):
        """Drop every column whose correlation with ``Churn`` lies strictly inside (-0.15, 0.15)."""
        weak=Utils_Suite(self.data).compute_correlation(0.3)
        to_drop=list(weak[(weak<0.15) & (weak>-0.15)].index)
        self.data=self.data.drop(columns=to_drop)

    def ClearNull(self,threshold):
        """Drop each column whose fraction of null values exceeds ``threshold``.

        Prints the name and null count of every dropped column.
        """
        null_counts=self.data.isna().sum()
        n_rows=len(self.data)
        for col in list(null_counts.index):
            count=null_counts[col]
            # `count>0` is checked first, so an empty frame never divides by zero.
            if count>0 and count/n_rows>threshold:
                print(col,count)
                self.data=self.data.drop(col,axis=1)

    def knn_impute(self,n_neighbors,col_list):
        """Fill missing values in ``col_list`` with a KNN imputer.

        ``knn`` is the file-level alias for ``sklearn.impute.KNNImputer``.
        The columns are imputed together so neighbours are found using the other
        listed columns. The previous version wrote the single value ``[0][0]``
        of the imputed array into every row of each column.
        """
        cols=list(col_list)
        if not cols:
            return
        imputer=knn(n_neighbors=n_neighbors)
        # NOTE(review): a column that is entirely null is dropped by the imputer
        # by default, which would make this assignment fail - confirm such
        # columns are removed (e.g. by ClearNull) beforehand.
        self.data[cols]=imputer.fit_transform(self.data[cols])

    def get_all_Null(self,dtype=""):
        """Return (and print, with null counts) the columns of ``dtype`` that contain nulls."""
        null_counts=self.data.isna().sum()
        null_cols=[]
        for col in list(null_counts.index):
            # Uses self.data throughout; the previous version read a global `data`.
            if null_counts[col]>0 and self.data[col].dtypes==dtype:
                print(col,null_counts[col])
                null_cols.append(col)
        return null_cols

    # Drop outlier rows
    def outlier_remove(self,col):
        """Keep only the rows whose ``col`` value lies within 1.5 * IQR of the quartiles."""
        q1=self.data[col].quantile(0.25)
        q3=self.data[col].quantile(0.75)
        iqr=q3-q1
        l_whis=q1-1.5*iqr
        u_whis=q3+1.5*iqr
        in_range=(self.data[col]>=l_whis) & (self.data[col]<=u_whis)
        # .copy() detaches the filtered frame, so later column assignments
        # (StdScale, encoders) do not write into a slice of the old frame.
        self.data=self.data[in_range].copy()

    # One-hot encoding using get_dummies
    def one_hot_encoding(self):
        """Replace every object-dtype column with its dummy columns (first level dropped).

        Prints the list of encoded columns. The dummy columns are appended after
        the remaining columns, in the order of the encoded columns.
        """
        is_object=self.data.dtypes=='object'
        obj_list=list(is_object[is_object].index)
        print(obj_list)
        if obj_list:
            self.data=pd.get_dummies(self.data,columns=obj_list,drop_first=True)

    # Drop high-cardinality object columns
    def drop_uniq_thresh(self,thresh=5):
        """Drop object-dtype columns that have more than ``thresh`` distinct values.

        Prints each inspected column name and then the list of dropped columns.
        """
        is_object=self.data.dtypes=="object"
        droplist=[]
        for col in list(is_object[is_object].index):
            print(col)
            if len(self.data[col].unique())>thresh:
                droplist.append(col)
        print(droplist)
        self.data=self.data.drop(columns=droplist)

    def Label_Encoding(self):
        """Replace every object-dtype column with integer labels from ``LabelEncoder``."""
        label_encoder = preprocessing.LabelEncoder()
        is_object=self.data.dtypes=="object"
        for col in list(is_object[is_object].index):
            # The encoder is refit per column, so codes are independent per column.
            self.data[col]= label_encoder.fit_transform(self.data[col])

    # Standardize data
    def StdScale(self):
        """Standardize every non-object column except ``Churn`` with ``StandardScaler``."""
        for col in self.data.columns:
            if self.data[col].dtypes!='object' and col!='Churn':
                # fit_transform returns an (n, 1) array; take its only column.
                self.data[col]=StandardScaler().fit_transform(self.data[[col]])[:,0]

    ## DANGER ZONE: col_Spare is needed to keep the target column. RAM-hogging function.
    # Use wisely! Possibly parallelize the operation for better efficiency.
    def drop_vif(self,thresh=5,col_Spare=['Churn','intercept']):
        """Repeatedly drop the column with the highest VIF until none exceeds ``thresh``.

        Parameters
        ----------
        thresh : float
            VIF above which a column is considered for removal.
        col_Spare : list of str
            Column names that are never dropped. The list is only read.

        The VIF table is recomputed after every drop. The synthetic
        ``'intercept'`` row added by ``compute_vif`` is always skipped, because
        it is not a column of ``self.data``.
        """
        spared=set(col_Spare) | {'intercept'}
        while True:
            vif=Utils_Suite(self.data).compute_vif()
            offenders=vif[(vif["vif"]>thresh) & ~vif["variable"].isin(spared)]
            if offenders.empty:
                break
            # A stable sort keeps the original column order among equal VIFs.
            ranked=offenders.sort_values(by='vif', kind='mergesort',ascending=False)
            self.data=self.data.drop(ranked['variable'].iloc[0],axis=1)

    # Wrapper function to hand the processed frame back to the caller
    def write_df(self):
        """Return the processed DataFrame."""
        return self.data
    

In [8]:
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [9]:
# Drop the rows whose TotalCharges is a single space so the column can be cast to float.
# This rebinds `data`; PreProcess.run() performs the same drop again on its own frame.
data=data.drop(data[data['TotalCharges'] == ' '].index)

In [483]:
# Cast TotalCharges to float in place on the notebook-global frame.
# Raises if any non-numeric strings remain in the column.
data['TotalCharges']=data['TotalCharges'].astype(float)

In [10]:
# Constructing PreProcess runs the whole pipeline (its __init__ calls run()).
x=PreProcess(data)
# Rebind `data` to the processed frame; the raw frame is no longer reachable under this name.
# Note that `x` is reused for the feature matrix in a later cell.
data=x.write_df()

customerID
gender
Partner
Dependents
PhoneService
MultipleLines
InternetService
OnlineSecurity
OnlineBackup
DeviceProtection
TechSupport
StreamingTV
StreamingMovies
Contract
PaperlessBilling
PaymentMethod
Churn
['customerID']
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


In [485]:
# Display the processed frame.
# NOTE(review): the recorded output shows integer-coded category columns and a plain
# `Churn` column, which is not what one_hot_encoding() produces - it looks like output
# from an earlier run that used Label_Encoding(); re-run the notebook to confirm.
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,-0.44,1,0,-1.28,0,1,0,0,2,0,0,0,0,0,1,2,-1.16,-0.99,0
1,1,-0.44,0,0,0.06,1,0,0,2,0,2,0,0,0,1,0,3,-0.26,-0.17,0
2,1,-0.44,0,0,-1.24,1,0,0,2,2,0,0,0,0,0,1,3,-0.36,-0.96,1
3,1,-0.44,0,0,0.51,0,1,0,2,0,2,2,0,0,1,0,0,-0.75,-0.20,0
4,0,-0.44,0,0,-1.24,1,0,1,0,0,0,0,0,0,0,1,2,0.20,-0.94,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,-0.44,1,1,-0.34,1,2,0,2,0,2,2,2,2,1,1,3,0.66,-0.13,0
7039,0,-0.44,1,1,1.61,1,2,1,0,2,2,0,2,2,1,1,1,1.28,2.24,0
7040,0,-0.44,1,1,-0.87,0,1,0,2,0,0,0,0,0,0,1,2,-1.17,-0.85,0
7041,1,2.27,1,0,-1.16,1,2,1,0,0,0,0,0,0,0,1,3,0.32,-0.87,1


In [486]:
data['Churn']

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7032, dtype: int32

In [487]:
# NOTE(review): `data1` is not defined in any visible cell of this notebook, so this
# cell presumably relies on leftover kernel state and would raise NameError on a
# fresh "Restart & Run All" - confirm, then define `data1` or remove the cell.
data1['Customer Status']

0       1
1       1
2       0
3       0
4       0
       ..
6584    0
6585    1
6586    0
6587    1
6588    1
Name: Customer Status, Length: 6589, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

In [12]:


# Features are every column except the last; this rebinds `x`, which held the PreProcess object.
x=data.iloc[:,:-1].values
# The target is the last column of the processed frame.
y=data.iloc[:,-1].values
# 70/30 train/test split with a fixed seed so the split is reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=2024)



## Using Logistic Regression

In [46]:
from sklearn.linear_model import LogisticRegression
# L2-penalised logistic regression fitted with the 'sag' solver, capped at 1000 iterations.
reg=LogisticRegression(solver='sag',max_iter=1000,penalty='l2')
# Fit on the training split only.
reg.fit(x_train,y_train)

LogisticRegression(max_iter=1000, solver='sag')

In [47]:

# Predict labels for the held-out test rows with the fitted logistic regression.
y_pred=reg.predict(x_test)
# Bare tuple expression: it is not the cell's last line, so nothing is displayed and the value is discarded.
(y_test,y_pred)
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

# Confusion matrix of true vs. predicted labels; stored in `conf` but not displayed by this cell.
conf=confusion_matrix(y_test,y_pred)

In [48]:
accuracy_score(y_test,y_pred)

0.8018957345971564

## Using Decision Tree Classifier


In [493]:

from sklearn.tree import DecisionTreeClassifier
# Entropy-criterion decision tree limited to depth 4.
tree=DecisionTreeClassifier(criterion="entropy",max_depth=4)
tree.fit(x_train,y_train)
# Overwrites the `y_pred` produced by the previous model.
y_pred=tree.predict(x_test)
# Bare tuple expression: discarded, nothing is displayed.
(y_test,y_pred)

# Not the last expression in the cell, so this confusion matrix is computed but never shown.
confusion_matrix(y_test,y_pred)
# Last expression: the accuracy is the only value this cell displays.
accuracy_score(y_test,y_pred)



0.7829383886255924

In [494]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86      1552
           1       0.66      0.37      0.47       558

    accuracy                           0.78      2110
   macro avg       0.73      0.65      0.67      2110
weighted avg       0.77      0.78      0.76      2110



## Using  Naive Bayes 

In [495]:
from sklearn.naive_bayes  import GaussianNB
# Rebinds `reg`, which previously held the fitted logistic regression model.
reg=GaussianNB()
reg.fit(x_train,y_train)
# Overwrites the `y_pred` produced by the previous model.
y_pred=reg.predict(x_test)
# Bare tuple expression: discarded, nothing is displayed.
(y_test,y_pred)

# Last expression: the confusion matrix is what this cell displays.
confusion_matrix(y_test,y_pred)

array([[1183,  369],
       [ 160,  398]], dtype=int64)

In [496]:
accuracy_score(y_test,y_pred)


0.7492890995260664

In [497]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.76      0.82      1552
           1       0.52      0.71      0.60       558

    accuracy                           0.75      2110
   macro avg       0.70      0.74      0.71      2110
weighted avg       0.79      0.75      0.76      2110



## Using KNN  Classifier

In [498]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with k = 5.
classifier = KNeighborsClassifier(n_neighbors=5)
# Fit on the training split only.
classifier.fit(x_train, y_train)

KNeighborsClassifier()

In [499]:
# Predict the test rows with the fitted KNN classifier (overwrites the previous `y_pred`).
y_pred = classifier.predict(x_test)
# Print the confusion matrix and the per-class precision/recall/F1 report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1304  248]
 [ 288  270]]
              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1552
           1       0.52      0.48      0.50       558

    accuracy                           0.75      2110
   macro avg       0.67      0.66      0.67      2110
weighted avg       0.74      0.75      0.74      2110



In [500]:
accuracy_score(y_test,y_pred)


0.7459715639810427

## Using Support Vector Classifier

In [23]:
from sklearn import svm

# Create an SVM classifier
clf = svm.SVC(kernel='poly') # Polynomial kernel

# Train the model using the training split
clf.fit(x_train, y_train)

# Predict the response for the test split (overwrites the previous `y_pred`)
y_pred = clf.predict(x_test)
# Last expression: the accuracy is the value this cell displays.
accuracy_score(y_test, y_pred)

0.8023696682464455

In [502]:
# Print the confusion matrix and classification report for the current `y_pred`.
# NOTE(review): the recorded report shows accuracy 0.79, while the SVC cell records
# 0.8024 - the two outputs appear to come from different runs (possibly a different
# kernel); re-run both cells to confirm.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1371  181]
 [ 267  291]]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1552
           1       0.62      0.52      0.57       558

    accuracy                           0.79      2110
   macro avg       0.73      0.70      0.71      2110
weighted avg       0.78      0.79      0.78      2110



### Inference 

##### Logistic Regression :

                            Accuracy: 80%

##### Naive Bayes:

                            Accuracy: 74.9%

##### SVM :

                            Accuracy: 78.7% (linear kernel); 80.2% (polynomial kernel)

##### Decision Tree Classifier :

                            Accuracy: 78.3%

##### KNN Classifier :

                            Accuracy: 74.6%


                            

### Interpretations 
From all of the above, we see that Logistic Regression performed best (~80% accuracy) and the KNN classifier worst (~75%), with all the other algorithms close to each other in terms of accuracy. The polynomial-kernel SVM is on par with Logistic Regression at ~80%.


The decision tree is not the most accurate model, possibly because it overfitted the training data, which would explain the dip in its accuracy score. Decision trees can also be biased towards features with more levels, which may be the case here.

KNN does not perform well in higher dimensions, and here we have roughly 21 features.

On the other hand

Naive Bayes does not perform well with continuous data (e.g. TotalCharges), and its accuracy may also be lower because some features are weakly correlated, which violates its independence assumption.

SVM needs proper hyperparameter tuning and kernel selection. I first chose a linear kernel, which did not suit this data, so it performed comparatively poorly; switching to a polynomial kernel improved the result:

- SVM, linear kernel: ~78%
- SVM, polynomial kernel: ~80%

Logistic Regression, among the best-performing models, has a comparatively high accuracy of 80%. This might be because the churn target is a binary classification problem, for which it provides a probabilistic likelihood of churning learned directly from the features.
 
One interesting observation with respect to overfitting: if even one part of the prediction target is left among the features (say, the one-hot column `Churn_Yes`), the model learns that specific feature and clearly overfits to it, reaching an accuracy of 100%, which is in no way a meaningful result (this is target leakage).


