# Understanding the dataset

In [1]:
import pandas as pd

dataframe = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
dataframe.head(10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [2]:
dataframe.shape
# 21 columns, 7043 rows

dataframe.columns
dataframe.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [3]:
dataframe.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [4]:
dataframe.isnull().sum()

# No missing values

dataframe.duplicated().sum()

# No duplicates

dataframe.isna().sum()

# No NaN values

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

# Numerical vs Categorical Features

In [5]:
features_numerical = dataframe.select_dtypes(include=['int64', 'float64'])
features_numerical.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges'], dtype='object')

In [6]:
features_categorical = dataframe.select_dtypes(include=['object'])
features_categorical.columns

Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges',
       'Churn'],
      dtype='object')

In [7]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# fig, axes = plt.subplots(6, 3, figsize=(20, 15))
# for i, col in enumerate(features_categorical):
#     sns.countplot(x=col, data=dataframe, ax=axes[i//3, i%3])
# plt.subplots_adjust(hspace=0.5)

In [8]:
# Since graph shows "TotalCharges" is numerical, there must be some non-numeric values in the column
dataframe['TotalCharges'].value_counts()

# Find the values that are not numeric in the column
dataframe["TotalCharges"].str.contains('[^0-9.]', regex=True).sum()

# There are 11 rows with non-numeric values in the column

# Find the rows with non-numeric values
dataframe[dataframe["TotalCharges"].str.contains('[^0-9.]', regex=True)]["TotalCharges"]


488      
753      
936      
1082     
1340     
3331     
3826     
4380     
5218     
6670     
6754     
Name: TotalCharges, dtype: object

# Observations 

1. TotalChanges column has 11 non-numeric or empty values, so those rows need to be imputed
2. No missing values in any other columns
3. Need to drop CustomerID, because that adds no value
4. Many columns have No_internet_service, No_phone_service other than No which can be replaced by No

# Preprocessing

In [9]:
# Imputation

import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
dataframe['TotalCharges'].replace(' ', np.nan, inplace=True)
dataframe['TotalCharges'] = imputer.fit_transform(dataframe['TotalCharges'].values.reshape(-1, 1))

# dataframe['TotalCharges'].str.contains('[^0-9.]', regex=True).sum()


In [10]:
# CustomerID drop

dataframe.drop('customerID', axis=1, inplace=True)
dataframe.head(10)

# Remove duplicates if any
dataframe.drop_duplicates(inplace=True)

In [11]:
# Replacing No_internet_service with No, and No_phone_service with No

# Replace 'No phone service' with 'No' in MultipleLines column
dataframe['MultipleLines'].replace('No phone service', 'No', inplace=True)
dataframe['MultipleLines'].value_counts()

# Replace 'No internet service' with 'No' in 6 columns
columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
for col in columns:
    dataframe[col].replace('No internet service', 'No', inplace=True)
    print(dataframe[col].value_counts())



OnlineSecurity
No     5002
Yes    2019
Name: count, dtype: int64
OnlineBackup
No     4592
Yes    2429
Name: count, dtype: int64
DeviceProtection
No     4599
Yes    2422
Name: count, dtype: int64
TechSupport
No     4977
Yes    2044
Name: count, dtype: int64
StreamingTV
No     4314
Yes    2707
Name: count, dtype: int64
StreamingMovies
No     4289
Yes    2732
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe['MultipleLines'].replace('No phone service', 'No', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe[col].replace('No internet service', 'No', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int

In [12]:
# Encoding categorical columns and then dividing the dataset into training and testing sets 

features = dataframe.drop('Churn', axis=1)
target = dataframe['Churn']

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
target = encoder.fit_transform(target)

# One hot encoding of categorical columns

features = pd.get_dummies(features, columns=['PaymentMethod', 'Contract', 'InternetService']) # More than 2 values
features = pd.get_dummies(features, drop_first=True).astype('float64') # Binary columns
features


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Contract_Month-to-month,Contract_One year,...,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,PaperlessBilling_Yes
0,0.0,1.0,29.85,29.85,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,34.0,56.95,1889.50,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,2.0,53.85,108.15,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,45.0,42.30,1840.75,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,2.0,70.70,151.65,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,24.0,84.80,1990.50,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
7039,0.0,72.0,103.20,7362.90,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
7040,0.0,11.0,29.60,346.45,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7041,1.0,4.0,74.40,306.60,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Scaling the features

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
features[columns] = scaler.fit_transform(features[columns])


In [14]:
# Splitting the dataset into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7921708185053381

In [15]:
# Feature selection using information gain

from sklearn.feature_selection import mutual_info_classif

mutual_info = mutual_info_classif(X_train, y_train)
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
mutual_info.sort_values(ascending=False)

# Selecting the top 20 features

top_features = mutual_info.sort_values(ascending=False).head(20)
top_features.index

Index(['Contract_Month-to-month', 'tenure', 'Contract_Two year',
       'MonthlyCharges', 'InternetService_Fiber optic',
       'PaymentMethod_Electronic check', 'TotalCharges', 'InternetService_No',
       'TechSupport_Yes', 'OnlineSecurity_Yes', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)', 'Contract_One year',
       'PaymentMethod_Mailed check', 'DeviceProtection_Yes', 'SeniorCitizen',
       'Partner_Yes', 'PaymentMethod_Bank transfer (automatic)',
       'InternetService_DSL', 'gender_Male'],
      dtype='object')