Churn Prediction

In [78]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [79]:
# Location of the Telco customer-churn CSV used throughout this notebook.
data = (
    "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/"
    "chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
)

In [80]:
!wget $data

--2025-10-15 11:39:25--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘WA_Fn-UseC_-Telco-Customer-Churn.csv.1’


2025-10-15 11:39:25 (107 MB/s) - ‘WA_Fn-UseC_-Telco-Customer-Churn.csv.1’ saved [977501/977501]



In [81]:
# Load the downloaded CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [82]:
# Number of rows (one per customer record) in the raw data.
df.shape[0]

7043

Initial Data Preparation

In [83]:
# Peek at the first five rows.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [84]:
# Wide frame: flip rows and columns so every column name is visible.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [85]:
# Column dtypes as inferred by read_csv.
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [86]:
# Normalise column names and text values to lower snake_case so that later
# comparisons (e.g. churn == "yes") do not depend on the source capitalisation.
# regex=False makes it explicit that " " is replaced literally.
df.columns = df.columns.str.lower().str.replace(" ", "_", regex=False)

# Columns loaded with dtype `object` are the ones holding text.
string_columns = list(df.dtypes[df.dtypes == "object"].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(" ", "_", regex=False)

In [87]:
# Same transposed preview, now showing the normalised names and values.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [88]:
df.dtypes
# totalcharges is still reported as `object` (text) although it holds amounts;
# it has to be converted to a numeric dtype (float/int).

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [89]:
# Inspect the raw (still text-typed) total charges.
df.totalcharges

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: totalcharges, Length: 7043, dtype: object

In [97]:
# `totalcharges` is text because a few rows do not hold a number: a strict
# pd.to_numeric(df["totalcharges"]) fails on "_", which is what a blank turns
# into after the earlier space -> underscore replacement.
# errors="coerce" converts every unparseable entry to NaN instead of raising.
df["totalcharges"] = pd.to_numeric(df["totalcharges"], errors="coerce")

# Replace the NaNs produced by the coercion with 0.
df["totalcharges"] = df["totalcharges"].fillna(0)

# Show the rows whose total is 0 (the formerly unparseable ones, e.g. row 488).
# Single .loc lookup instead of chained indexing.
df.loc[df["totalcharges"] == 0, ["customerid", "totalcharges"]]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,0.0
753,3115-czmzd,0.0
936,5709-lvoeq,0.0
1082,4367-nuyao,0.0
1340,1371-dwpaz,0.0
3331,7644-omvmy,0.0
3826,3213-vvolg,0.0
4380,2520-sgtta,0.0
5218,2923-arzlg,0.0
6670,4075-wkniu,0.0


In [102]:
# Encode the target as integers: "yes" -> 1 (churned), any other value -> 0.
# The guard keeps the cell safe to re-run: once the column is numeric, comparing
# it with "yes" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df["churn"]):
    df["churn"] = (df["churn"] == "yes").astype(int)

In [103]:
# Confirm the target now holds 0/1 integers.
df["churn"].head(5)

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

Setting up the Validation Framework with scikit-learn

In [104]:
from sklearn.model_selection import train_test_split

In [None]:
# Interactive help: IPython's trailing `?` prints the signature and docstring of
# train_test_split. Exploration aid only; this cell defines nothing.
train_test_split?

Signature:
train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    random_state=None,
    shuffle=True,
    stratify=None,
)
Docstring:
Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation,
``next(ShuffleSplit().split(X, y))``, and application to input data
into a single call for splitting (and optionally subsampling) data into a
one-liner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the proportion
    of the dataset to include in the test split. If int, represents the
    absolute number of test samples. If None

In [110]:
# Hold out 20% of all rows as the final test set; the fixed seed makes the
# split reproducible.
df_full_train, df_test = train_test_split(df, random_state=1, test_size=0.2)
# Sizes of both parts next to the total, as a sanity check.
tuple(map(len, (df_full_train, df_test, df)))

(5634, 1409, 7043)

In [113]:
# Carve the validation set out of the remaining 80%: a quarter of it is 20% of
# the whole data, giving a 60/20/20 train/validation/test layout.
# NOTE(review): the seed here is 11 while the first split uses 1 — confirm this
# is intentional.
df_train, df_val = train_test_split(df_full_train, random_state=11, test_size=0.25)
tuple(map(len, (df_train, df_val, df_test, df_full_train, df)))

(4225, 1409, 1409, 5634, 7043)

In [119]:
# Give every split a fresh 0..n-1 index; drop=True discards the old labels
# instead of keeping them as a column.
df_full_train, df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_full_train, df_train, df_val, df_test)
)

In [117]:
# Pull the target out of each split as a plain NumPy array.
y_train, y_val, y_test = (
    split["churn"].values for split in (df_train, df_val, df_test)
)

In [118]:
# Drop the target column from the three feature frames (the labels were
# extracted into y_train / y_val / y_test beforehand).
# drop(..., errors="ignore") instead of `del` keeps the cell re-runnable: a second
# run finds no "churn" column and is a no-op rather than a KeyError.
# df_full_train is not touched here and keeps its "churn" column.
df_train = df_train.drop(columns=["churn"], errors="ignore")
df_val = df_val.drop(columns=["churn"], errors="ignore")
df_test = df_test.drop(columns=["churn"], errors="ignore")

# Exploratory Data Analysis
- Check missing values
- Look at the target values (churn)
- Look at the numerical and categorical variables

In [122]:
# Count missing values per column.
df_full_train.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [None]:
# Share of each target class in the full training set: 0 = stayed, 1 = churned,
# so the value shown for 1 is the churn rate.
# (normalize=True returns proportions; leave it out to get raw counts.)
df_full_train["churn"].value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [129]:
# The mean of a 0/1 column is the fraction of 1s, so it equals the churn rate
# obtained from the normalised value counts.
df_full_train["churn"].mean()

np.float64(0.26996805111821087)

In [None]:
# Keep the overall churn rate of the full training set for later reference.
global_churn_rate = df_full_train["churn"].mean()
# Roughly 27% of the customers churn.
round(global_churn_rate, 3)

np.float64(0.27)

In [None]:
# Decide which feature variables are categorical and which are numerical.
df_full_train.nunique()
# The number of distinct values per column reveals categorical variables even
# when their dtype is numeric (e.g. seniorcitizen is int64 but has only 2 values).

customerid          5634
gender                 2
seniorcitizen          2
partner                2
dependents             2
tenure                73
phoneservice           2
multiplelines          3
internetservice        3
onlinesecurity         3
onlinebackup           3
deviceprotection       3
techsupport            3
streamingtv            3
streamingmovies        3
contract               3
paperlessbilling       2
paymentmethod          4
monthlycharges      1494
totalcharges        5291
churn                  2
dtype: int64

In [135]:
# Feature groups used in the rest of the analysis: the continuous measurements
# on one side, the low-cardinality features on the other.
numerical = ["tenure", "monthlycharges", "totalcharges"]
categorical = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

In [None]:
# Distinct values per categorical feature; a count of 2 marks a binary variable.
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64