In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [26]:
# Load the Telco customer-churn CSV from the working directory.
churn_csv = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(churn_csv)

# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Data Processing & Feature Selection
Based on key insights, the following steps are recommended for the modeling phase:

**Categorical Feature Selection**: To optimize the prediction model, focus on encoding the following high-impact categorical variables:

- InternetService (Priority: Fiber optic)

- Contract (Priority: Month-to-month)

- PaymentMethod (Priority: Electronic check)

- OnlineSecurity / TechSupport (Binary indicators for "No" service)

- Partner / Dependents (Social tie indicators)

**Encoding Strategy**:

- Apply Label Encoding or Mapping for binary features (e.g. Partner, Dependents, Churn).

- Apply One-Hot Encoding for multi-class categorical features like InternetService, Contract and PaymentMethod to capture the specific risk of Fiber Optic and Electronic Checks without implying a numerical order.

**Numerical Handling**: Tenure and MonthlyCharges should be kept as continuous variables, though binning Tenure into "Early Stage" (0-20 months) could provide additional predictive power.

In [27]:
# Print each column's dtype and non-null count.
df.info()

# Per-column count of missing values; displayed as the cell's output.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

`TotalCharges` is stored as an object (string) column. It needs to be converted to a numeric format (float64), and any empty strings that prevent numeric parsing need to be handled.

In [28]:
# Convert TotalCharges to a numeric dtype. With errors='coerce', entries that
# cannot be parsed as numbers become NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Count the NaN entries left after coercion. These are missing/unparseable
# values, not "false" values, so the label says so.
n_missing_total_charges = df['TotalCharges'].isna().sum()
print(f"Number of missing values in TotalCharges column: {n_missing_total_charges}")

Number of false values in TotalCharges column: 11


In [29]:
# Replace the NaN entries in TotalCharges with 0, then confirm the column dtype.
total_charges_filled = df['TotalCharges'].fillna(value=0)
df['TotalCharges'] = total_charges_filled
print(df['TotalCharges'].dtype)

float64


In [30]:
# Encode the target: 1 where Churn is 'Yes', 0 otherwise.
# The dtype guard makes this cell safe to re-run: once Churn is numeric,
# comparing it with 'Yes' again would silently set every row to 0.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = (df['Churn'] == 'Yes').astype(int)
print(df['Churn'].head())

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64


In [31]:
# Yes/No columns to encode as 1 ('Yes') / 0 (anything else).
demo_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in demo_cols:
    # Skip columns that are already numeric so re-running this cell does not
    # overwrite previously encoded values with all zeros.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = (df[col] == 'Yes').astype(int)
print(df[demo_cols].head())

   Partner  Dependents  PhoneService  PaperlessBilling
0        1           0             0                 1
1        0           0             1                 0
2        0           0             1                 1
3        0           0             0                 0
4        0           0             1                 1


In [35]:
# Multi-class categorical features to one-hot encode.
multi_feature_cols = [
    'Contract', 'PaymentMethod', 'InternetService', 'MultipleLines',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies'
]

# drop_first=True omits the first level of each feature (it becomes the
# implicit baseline); dtype=int yields 0/1 integers instead of booleans.
df_encoded = pd.get_dummies(df, columns=multi_feature_cols, drop_first=True, dtype=int)

# List the generated dummy columns. get_dummies names them "<feature>_<level>",
# so match on that prefix; a plain substring test could also pick up unrelated
# columns whose names merely contain a feature name.
dummy_prefixes = tuple(f"{feature}_" for feature in multi_feature_cols)
for col in df_encoded.columns:
    if col.startswith(dummy_prefixes):
        print(col)
df_encoded.head()

Contract_One year
Contract_Two year
PaymentMethod_Credit card (automatic)
PaymentMethod_Electronic check
PaymentMethod_Mailed check
InternetService_Fiber optic
InternetService_No
MultipleLines_No phone service
MultipleLines_Yes
OnlineSecurity_No internet service
OnlineSecurity_Yes
OnlineBackup_No internet service
OnlineBackup_Yes
DeviceProtection_No internet service
DeviceProtection_Yes
TechSupport_No internet service
TechSupport_Yes
StreamingTV_No internet service
StreamingTV_Yes
StreamingMovies_No internet service
StreamingMovies_Yes


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,...,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes
0,7590-VHVEG,Female,0,1,0,1,0,1,29.85,29.85,...,0,1,0,0,0,0,0,0,0,0
1,5575-GNVDE,Male,0,0,0,34,1,0,56.95,1889.5,...,0,0,0,1,0,0,0,0,0,0
2,3668-QPYBK,Male,0,0,0,2,1,1,53.85,108.15,...,0,1,0,0,0,0,0,0,0,0
3,7795-CFOCW,Male,0,0,0,45,0,0,42.3,1840.75,...,0,0,0,1,0,1,0,0,0,0
4,9237-HQITU,Female,0,0,0,2,1,1,70.7,151.65,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Flag rows whose tenure is at most 20 as "early stage" (1); all others get 0.
is_early_stage = df_encoded['tenure'].le(20)
df_encoded['EarlyStage'] = is_early_stage.astype(int)
print(df_encoded[['tenure', 'EarlyStage']].head())

   tenure  EarlyStage
0       1           1
1      34           0
2       2           1
3      45           0
4       2           1


In [37]:
# Columns removed before modeling. errors='ignore' means a column that is
# already absent is skipped rather than raising.
cols_to_drop = ['customerID', 'gender']
df_final = df_encoded.drop(labels=cols_to_drop, axis='columns', errors='ignore')
df_final.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,Contract_One year,...,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,EarlyStage
0,0,1,0,1,0,1,29.85,29.85,0,0,...,1,0,0,0,0,0,0,0,0,1
1,0,0,0,34,1,0,56.95,1889.5,0,1,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,2,1,1,53.85,108.15,1,0,...,1,0,0,0,0,0,0,0,0,1
3,0,0,0,45,0,0,42.3,1840.75,0,1,...,0,0,1,0,1,0,0,0,0,0
4,0,0,0,2,1,1,70.7,151.65,1,0,...,0,0,0,0,0,0,0,0,0,1


In [43]:
# Pairwise Pearson correlation between all columns of the modeling frame
# (Pearson is the default method; spelled out here for clarity).
df_final.corr(method='pearson')

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,Contract_One year,...,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,EarlyStage
SeniorCitizen,1.0,0.016479,-0.211185,0.016567,0.008576,0.15653,0.220173,0.103006,0.150889,-0.046262,...,0.066572,-0.182742,0.059428,-0.182742,-0.060625,-0.182742,0.105378,-0.182742,0.120176,-0.022458
Partner,0.016479,1.0,0.452676,0.379697,0.017706,-0.014877,0.096848,0.317504,-0.150448,0.082783,...,0.141498,0.000615,0.153786,0.000615,0.119999,0.000615,0.124666,0.000615,0.117412,-0.317997
Dependents,-0.211185,0.452676,1.0,0.159712,-0.001762,-0.111377,-0.11389,0.062078,-0.164221,0.068368,...,0.023671,0.139812,0.013963,0.139812,0.063268,0.139812,-0.016558,0.139812,-0.039741,-0.140744
tenure,0.016567,0.379697,0.159712,1.0,0.008448,0.006152,0.2479,0.826178,-0.352229,0.20257,...,0.360277,-0.039062,0.360653,-0.039062,0.324221,-0.039062,0.279756,-0.039062,0.286111,-0.841447
PhoneService,0.008576,0.017706,-0.001762,0.008448,1.0,0.016505,0.247398,0.113214,0.011942,-0.002791,...,-0.052312,0.172209,-0.071227,0.172209,-0.09634,0.172209,-0.022574,0.172209,-0.032959,-0.000305
PaperlessBilling,0.15653,-0.014877,-0.111377,0.006152,0.016505,1.0,0.35215,0.158574,0.191825,-0.051391,...,0.126735,-0.321013,0.103797,-0.321013,0.03788,-0.321013,0.223841,-0.321013,0.211716,-0.004941
MonthlyCharges,0.220173,0.096848,-0.11389,0.2479,0.247398,0.35215,1.0,0.651174,0.193356,0.004904,...,0.44178,-0.763557,0.482692,-0.763557,0.338304,-0.763557,0.629603,-0.763557,0.627429,-0.207946
TotalCharges,0.103006,0.317504,0.062078,0.826178,0.113214,0.158574,0.651174,1.0,-0.198324,0.170814,...,0.509226,-0.375223,0.521983,-0.375223,0.431883,-0.375223,0.514973,-0.375223,0.520122,-0.672188
Churn,0.150889,-0.150448,-0.164221,-0.352229,0.011942,0.191825,0.193356,-0.198324,1.0,-0.17782,...,-0.082255,-0.22789,-0.06616,-0.22789,-0.164674,-0.22789,0.063228,-0.22789,0.061382,0.318752
Contract_One year,-0.046262,0.082783,0.068368,0.20257,-0.002791,-0.051391,0.004904,0.170814,-0.17782,1.0,...,0.083722,0.038004,0.102495,0.038004,0.095775,0.038004,0.061612,0.038004,0.064926,-0.25703
