In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder ,OneHotEncoder 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 

In [2]:
data= pd.read_csv('CustomerData.csv')
data=data.drop('customerID',axis=1)
data.isna().sum()
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Now lets start with preprocessing the Data

In [3]:
LabelEncodingColumns=['gender','Partner','Dependents','PhoneService','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
for cols in LabelEncodingColumns:
    le=LabelEncoder()
    data[cols]=le.fit_transform(data[cols])
data.head()
ohe_cols=['MultipleLines','InternetService','OnlineSecurity','Contract','PaymentMethod']
hotencoder=OneHotEncoder(sparse_output=False, drop='first')
ohe_data=hotencoder.fit_transform(data[ohe_cols])
ohe_data=pd.DataFrame(ohe_data,columns=hotencoder.get_feature_names_out(ohe_cols),
    index=data.index)

In [4]:
data=pd.concat([data,ohe_data],axis=1)
data=data.drop(ohe_cols,axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 7043 non-null   int64  
 1   SeniorCitizen                          7043 non-null   int64  
 2   Partner                                7043 non-null   int64  
 3   Dependents                             7043 non-null   int64  
 4   tenure                                 7043 non-null   int64  
 5   PhoneService                           7043 non-null   int64  
 6   OnlineBackup                           7043 non-null   int64  
 7   DeviceProtection                       7043 non-null   int64  
 8   TechSupport                            7043 non-null   int64  
 9   StreamingTV                            7043 non-null   int64  
 10  StreamingMovies                        7043 non-null   int64  
 11  Pape

In [5]:
data['TotalCharges'] = pd.to_numeric(
    data['TotalCharges'],
    errors='coerce'
)
data['TotalCharges']=data['TotalCharges'].fillna(data['TotalCharges'].mean())

In [6]:
data.isna().sum()

gender                                   0
SeniorCitizen                            0
Partner                                  0
Dependents                               0
tenure                                   0
PhoneService                             0
OnlineBackup                             0
DeviceProtection                         0
TechSupport                              0
StreamingTV                              0
StreamingMovies                          0
PaperlessBilling                         0
MonthlyCharges                           0
TotalCharges                             0
Churn                                    0
MultipleLines_No phone service           0
MultipleLines_Yes                        0
InternetService_Fiber optic              0
InternetService_No                       0
OnlineSecurity_No internet service       0
OnlineSecurity_Yes                       0
Contract_One year                        0
Contract_Two year                        0
PaymentMeth

Finally the data has been pre processed now out next step is to split the data for training and testing  

In [7]:
y=data['Churn']
x=data.drop('Churn',axis=1)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

lets start with training the models to find the best one

Lets Start with baseline model 

In [8]:
LogisticModel=LogisticRegression(max_iter=9999)
LogisticModel.fit(x_train,y_train)
prediction=LogisticModel.predict(x_test)
print(accuracy_score(y_test,prediction))
Logistic_model_accuracy=accuracy_score(y_test,prediction)

0.8218594748048261


In [9]:
Gaussianmodel=GaussianNB()
Gaussianmodel.fit(x_train,y_train)
prediction=Gaussianmodel.predict(x_test)
print(accuracy_score(y_test,prediction))
Gaussian_accuracy=accuracy_score(y_test,prediction)

0.7359829666430092


In [10]:
best_neighbor=0
best_accuracy=0
for i in range(10,31):
    KNNmodel=KNeighborsClassifier(n_neighbors=i)
    KNNmodel.fit(x_train,y_train)
    prediction=KNNmodel.predict(x_test)
    if accuracy_score(y_test,prediction)>best_accuracy:
        best_accuracy=accuracy_score(y_test,prediction)
        best_neighbor=i
    print(accuracy_score(y_test,prediction))
print(f'best number of neighbor is {best_neighbor} with accuracy of {best_accuracy}')
KNN_model_accuracy=best_accuracy

0.7927608232789212
0.7927608232789212
0.794180269694819
0.7963094393186657
0.7877927608232789
0.7920511000709723
0.7920511000709723
0.7920511000709723
0.794180269694819
0.7927608232789212
0.7899219304471257
0.7920511000709723
0.794180269694819
0.7955997161107168
0.801277501774308
0.7970191625266146
0.7984386089425124
0.794889992902768
0.7934705464868701
0.7934705464868701
0.7934705464868701
best number of neighbor is 24 with accuracy of 0.801277501774308


In [11]:
DTmodel=DecisionTreeClassifier()
DTmodel.fit(x_train,y_train)
prediction=DTmodel.predict(x_test)
print(accuracy_score(y_test,prediction))

0.7444996451383961


In [12]:
max_depth=0
best_accuracy=0
for i in range(3,15):
    DTmodel=DecisionTreeClassifier(max_depth=i)
    DTmodel.fit(x_train,y_train)
    prediction=DTmodel.predict(x_test)
    if accuracy_score(y_test,prediction)>best_accuracy:
        best_accuracy=accuracy_score(y_test,prediction)
        max_depth=i
    print(accuracy_score(y_test,prediction))
print(f'best number of depth is {max_depth} with accuracy of {best_accuracy}')

0.7906316536550745
0.7920511000709723
0.794889992902768
0.8019872249822569
0.7977288857345636
0.7970191625266146
0.7821149751596878
0.7714691270404542
0.7650816181689141
0.7608232789212207
0.7501774308019872
0.7508871540099361
best number of depth is 6 with accuracy of 0.8019872249822569


In [13]:
best_accuracy=0
best_split=0
for i in range(3,20):
    DTmodel=DecisionTreeClassifier(max_depth=max_depth,min_samples_split=i)
    DTmodel.fit(x_train,y_train)
    prediction=DTmodel.predict(x_test)
    if accuracy_score(y_test,prediction)>best_accuracy:
        best_accuracy=accuracy_score(y_test,prediction)
        best_split=i
    print(accuracy_score(y_test,prediction))
print(f'best number of minimum split is {best_split} with accuracy of {best_accuracy}')
Tree_model_accuracy=best_accuracy

0.8019872249822569
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
0.8026969481902059
best number of minimum split is 4 with accuracy of 0.8026969481902059


so now we can see that the accuracy has started to decrease hence we should stop here.

In [14]:
print(f'Logistic Regression models Accuracy is {Logistic_model_accuracy}')
print(f'KNN models Accuracy is {KNN_model_accuracy}')
print(f'Gaussian Naive Bayes models Accuracy is {Gaussian_accuracy}')
print(f'Decision Tree models Accuracy is {Tree_model_accuracy}')

Logistic Regression models Accuracy is 0.8218594748048261
KNN models Accuracy is 0.801277501774308
Gaussian Naive Bayes models Accuracy is 0.7359829666430092
Decision Tree models Accuracy is 0.8026969481902059


hence we can see that Logistic Regression model is performing best among all other models so lets do cross validation of logistic regression model.

In [16]:
from sklearn.model_selection import GridSearchCV
LR=LogisticRegression(max_iter=10000)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}
grid = GridSearchCV(
    estimator=LR,
    param_grid=param_grid,
    cv=5,                
    scoring='accuracy',
)
grid.fit(x_train, y_train)
prediction=grid.predict(x_test)

In [18]:
print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

Best parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV accuracy: 0.8028036204828677


So technically baseLine model has the best accuracy with accuracy of 0.8218594748048261