In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [3]:
# Defining the path to raw data
file_path = r"C:\Users\ayush\Desktop\vs\Customer_churn\WA_Fn-UseC_-Telco-Customer-Churn.csv"

In [4]:
# Generating a dataframe
df = pd.read_csv(file_path)

In [5]:
# Taking a look at the data
df.head(5)

<bound method NDFrame.head of       customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL   

In [6]:
# Understanding the information of the data
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
# Understading the analytics of the data
print(df.describe())

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


In [9]:
# Checking for categorical and numerical values
print(df.dtypes)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [10]:
# Convert 'TotalCharges' column to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
mean_total_charges = df['TotalCharges'].mean()
df.fillna({'TotalCharges': mean_total_charges}, inplace=True)


In [11]:
print(df.dtypes)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [13]:
# Converting categorical columns to numerical using one hot encoding
categorical_Columns = ['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn']
df = pd.get_dummies(df, columns=categorical_Columns, drop_first=False)

In [14]:
print(df.head(5))

   customerID  SeniorCitizen  tenure  MonthlyCharges  TotalCharges  \
0  7590-VHVEG              0       1           29.85         29.85   
1  5575-GNVDE              0      34           56.95       1889.50   
2  3668-QPYBK              0       2           53.85        108.15   
3  7795-CFOCW              0      45           42.30       1840.75   
4  9237-HQITU              0       2           70.70        151.65   

   gender_Female  gender_Male  Partner_No  Partner_Yes  Dependents_No  ...  \
0           True        False       False         True           True  ...   
1          False         True        True        False           True  ...   
2          False         True        True        False           True  ...   
3          False         True        True        False           True  ...   
4           True        False        True        False           True  ...   

   Contract_One year  Contract_Two year  PaperlessBilling_No  \
0              False              False       

In [15]:
df['CustomerIndex'] = df.index

# Drop the original 'customerID' column because it didn't get converted to bool
df.drop(columns=['customerID'], inplace=True)

In [16]:
# Checking the correlation of features with the label 
corr_matrix = df.corr()
corr_matrix['Churn_Yes'].sort_values(ascending=False) 

Churn_Yes                                  1.000000
Contract_Month-to-month                    0.405103
OnlineSecurity_No                          0.342637
TechSupport_No                             0.337281
InternetService_Fiber optic                0.308020
PaymentMethod_Electronic check             0.301919
OnlineBackup_No                            0.268005
DeviceProtection_No                        0.252481
MonthlyCharges                             0.193356
PaperlessBilling_Yes                       0.191825
Dependents_No                              0.164221
SeniorCitizen                              0.150889
Partner_No                                 0.150448
StreamingMovies_No                         0.130845
StreamingTV_No                             0.128916
StreamingTV_Yes                            0.063228
StreamingMovies_Yes                        0.061382
MultipleLines_Yes                          0.040102
PhoneService_Yes                           0.011942
CustomerInde

In [17]:
# Droping the less impactfull columns
droping = ['Churn_No','Contract_One year', 'OnlineSecurity_Yes', 'Dependents_Yes','Partner_Yes','PaymentMethod_Credit card (automatic)', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Mailed check','OnlineBackup_Yes','DeviceProtection_Yes','MultipleLines_No','PhoneService_No','MultipleLines_No phone service', 'gender_Male','gender_Female','CustomerIndex','PhoneService_Yes','MultipleLines_Yes','StreamingMovies_Yes','StreamingTV_Yes','StreamingTV_No','StreamingMovies_No','Partner_No','SeniorCitizen','Dependents_No','PaperlessBilling_Yes','MonthlyCharges','DeviceProtection_No','OnlineBackup_No']
df = df.drop(columns=droping, inplace=False)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   tenure                                7043 non-null   int64  
 1   TotalCharges                          7043 non-null   float64
 2   InternetService_DSL                   7043 non-null   bool   
 3   InternetService_Fiber optic           7043 non-null   bool   
 4   InternetService_No                    7043 non-null   bool   
 5   OnlineSecurity_No                     7043 non-null   bool   
 6   OnlineSecurity_No internet service    7043 non-null   bool   
 7   OnlineBackup_No internet service      7043 non-null   bool   
 8   DeviceProtection_No internet service  7043 non-null   bool   
 9   TechSupport_No                        7043 non-null   bool   
 10  TechSupport_No internet service       7043 non-null   bool   
 11  TechSupport_Yes  

In [19]:
df = df.drop(columns=['TechSupport_Yes','InternetService_DSL' ])

In [20]:
corr_matrix = df.corr()
corr_matrix['Churn_Yes'].sort_values(ascending=False)

Churn_Yes                               1.000000
Contract_Month-to-month                 0.405103
OnlineSecurity_No                       0.342637
TechSupport_No                          0.337281
InternetService_Fiber optic             0.308020
PaymentMethod_Electronic check          0.301919
PaperlessBilling_No                    -0.191825
TotalCharges                           -0.199428
InternetService_No                     -0.227890
OnlineSecurity_No internet service     -0.227890
OnlineBackup_No internet service       -0.227890
DeviceProtection_No internet service   -0.227890
TechSupport_No internet service        -0.227890
StreamingTV_No internet service        -0.227890
StreamingMovies_No internet service    -0.227890
Contract_Two year                      -0.302253
tenure                                 -0.352229
Name: Churn_Yes, dtype: float64

In [21]:
# Combining some attributes for better relation with label
df['TechSupport_InternetService'] = ((df['InternetService_Fiber optic'].astype(int)/df['InternetService_Fiber optic'].astype(int).max())+(df['PaymentMethod_Electronic check'].astype(int)/df['PaymentMethod_Electronic check'].astype(int).max()))
df['Internet_Service'] = (1/((df['InternetService_No'].astype(int)/df['InternetService_No'].astype(int).max())+(df['OnlineSecurity_No internet service'].astype(int)/df['OnlineSecurity_No internet service'].astype(int).max())+(df['OnlineBackup_No internet service'].astype(int)/df['OnlineBackup_No internet service'].astype(int).max())+(df['DeviceProtection_No internet service'].astype(int)/df['DeviceProtection_No internet service'].astype(int).max())+(df['TechSupport_No internet service'].astype(int)/df['TechSupport_No internet service'].astype(int).max())+(df['StreamingTV_No internet service'].astype(int)/df['StreamingTV_No internet service'].astype(int).max())+(df['StreamingMovies_No internet service'].astype(int)/df['StreamingMovies_No internet service'].astype(int).max())))


In [23]:
corr_matrix = df.corr()
corr_matrix['Churn_Yes'].sort_values(ascending=False)

Churn_Yes                               1.000000
Contract_Month-to-month                 0.405103
TechSupport_InternetService             0.373115
OnlineSecurity_No                       0.342637
TechSupport_No                          0.337281
InternetService_Fiber optic             0.308020
PaymentMethod_Electronic check          0.301919
PaperlessBilling_No                    -0.191825
TotalCharges                           -0.199428
StreamingMovies_No internet service    -0.227890
TechSupport_No internet service        -0.227890
StreamingTV_No internet service        -0.227890
DeviceProtection_No internet service   -0.227890
OnlineBackup_No internet service       -0.227890
OnlineSecurity_No internet service     -0.227890
InternetService_No                     -0.227890
Contract_Two year                      -0.302253
tenure                                 -0.352229
Internet_Service                             NaN
Name: Churn_Yes, dtype: float64

In [24]:
# Keeping only the most impactfull features 
keep = ['Contract_Month-to-month','TechSupport_InternetService','OnlineSecurity_No','TechSupport_No','tenure','Contract_Two year', 'Churn_Yes']
df = df[keep]
df = df.rename(columns={'TechSupport_InternetService':'FiberOptic_PaymentMeth'})
df.head()

Unnamed: 0,Contract_Month-to-month,FiberOptic_PaymentMeth,OnlineSecurity_No,TechSupport_No,tenure,Contract_Two year,Churn_Yes
0,True,1.0,True,True,1,False,False
1,False,0.0,False,True,34,False,False
2,True,0.0,False,True,2,False,True
3,False,0.0,False,False,45,False,False
4,True,2.0,True,True,2,False,True


In [25]:
# Checking for null values 
df.isnull().sum()

Contract_Month-to-month    0
FiberOptic_PaymentMeth     0
OnlineSecurity_No          0
TechSupport_No             0
tenure                     0
Contract_Two year          0
Churn_Yes                  0
dtype: int64

In [26]:
# Standardizing Data
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
scaled_data = scalar.fit_transform(df)
df = pd.DataFrame(scaled_data, columns=df.columns)

In [29]:
# Shuffle splitting the data
x = df.drop('Churn_Yes', axis=1)
y = df['Churn_Yes']
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(x, y):
    shuf_train = df.loc[train_index]
    shuf_test = df.loc[test_index]

In [32]:
# Splitting the data in train and test sets 
X_train = shuf_train.drop('Churn_Yes', axis=1)
Y_train = shuf_test['Churn_Yes']
X_test = shuf_train.drop('Churn_Yes', axis=1)
Y_test = shuf_test['Churn_Yes']

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize the model
model1 = LogisticRegression()

# Train the model on the training data
model1.fit(X_train, Y_train)

# Make predictions on the testing data
y_pred = model1.predict(X_test)

# Evaluate the model's performance
print("Logistic Regression")
print(classification_report(Y_test, y_pred))


Logistic Regression
              precision    recall  f1-score   support

       False       0.86      0.90      0.88      1036
        True       0.69      0.59      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.81      0.82      0.82      1409



In [23]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Initialize the model
model2 = DecisionTreeClassifier()

# Train the model on the training data
model2.fit(X_train, Y_train)

# Make predictions on the testing data
y_pred = model2.predict(X_test)

# Evaluate the model's performance
print("Decicion Tree")
print(classification_report(Y_test, y_pred))

Decicion Tree
              precision    recall  f1-score   support

       False       0.82      0.80      0.81      1036
        True       0.48      0.50      0.49       373

    accuracy                           0.72      1409
   macro avg       0.65      0.65      0.65      1409
weighted avg       0.73      0.72      0.73      1409



In [24]:

from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier()

# Train the model on the training data
model.fit(X_train, Y_train)

from sklearn.metrics import classification_report

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Random Forest")
print(classification_report(Y_test, y_pred))

Random Forest
              precision    recall  f1-score   support

       False       0.83      0.91      0.87      1036
        True       0.67      0.50      0.57       373

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [25]:

from sklearn.svm import SVC

# Initialize the model
model = SVC()

# Train the model on the training data
model.fit(X_train, Y_train)

from sklearn.metrics import classification_report

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Support Vector Machine")
print(classification_report(Y_test, y_pred))

Support Vector Machine
              precision    recall  f1-score   support

       False       0.84      0.92      0.88      1036
        True       0.69      0.52      0.59       373

    accuracy                           0.81      1409
   macro avg       0.77      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [26]:

from sklearn.ensemble import GradientBoostingClassifier

# Initialize the model
model = GradientBoostingClassifier()

# Train the model on the training data
model.fit(X_train, Y_train)

from sklearn.metrics import classification_report

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Gradient Boost")
print(classification_report(Y_test, y_pred))

Gradient Boost
              precision    recall  f1-score   support

       False       0.85      0.90      0.87      1036
        True       0.67      0.55      0.60       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [27]:

from sklearn.neighbors import KNeighborsClassifier

# Initialize the model
model = KNeighborsClassifier()

# Train the model on the training data
model.fit(X_train, Y_train)

from sklearn.metrics import classification_report

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("KNN")
print(classification_report(Y_test, y_pred))

KNN
              precision    recall  f1-score   support

       False       0.83      0.85      0.84      1036
        True       0.56      0.52      0.54       373

    accuracy                           0.77      1409
   macro avg       0.70      0.69      0.69      1409
weighted avg       0.76      0.77      0.76      1409



In [28]:
# Saving the model for further use 
import pickle

with open ('model.pkl','wb') as f:
    pickle.dump(model1,f)