In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Preprocessing

In [2]:
df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.sample(1)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1391,9885-AIBVB,Male,0,Yes,No,29,Yes,Yes,Fiber optic,No,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,85.8,2440.25,No


In [3]:
df = df.drop("customerID", axis = 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [4]:
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
df[df.isin([" "]).any(axis=1)].shape

(11, 20)

In [6]:
df = df[df["TotalCharges"] != " "].reset_index(drop=True)
df.shape

(7032, 20)

In [7]:
df[df.isin([" "]).any(axis=1)].shape

(0, 20)

In [8]:
df['Churn'].sample(5)

312     No
2833    No
2216    No
630     No
536     No
Name: Churn, dtype: object

In [9]:
df['TotalCharges'] = df['TotalCharges'].astype(float)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [10]:
df['Churn'].unique()

array([0, 1], dtype=int64)

In [11]:
y = df['Churn']
X = df.drop('Churn', axis = 1)

In [12]:
num_col = X.select_dtypes(include='number').columns 
print(num_col)
cat_col = X.select_dtypes(include='object').columns
print(cat_col)

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')
Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')


In [13]:
oh = OneHotEncoder(drop='first', sparse_output=False)

In [14]:
num_oh = oh.fit_transform(X[cat_col])
num_oh_df = pd.DataFrame(num_oh, columns=oh.get_feature_names_out())
num_oh_df.head()

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [15]:
X_oh = pd.concat([num_oh_df, X[num_col]], axis = 1)
X_oh.head()

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0,1,29.85,29.85
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0,34,56.95,1889.5
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0,2,53.85,108.15
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0,45,42.3,1840.75
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0,2,70.7,151.65


In [16]:
X_oh.shape

(7032, 30)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_oh, y, test_size = 0.15, shuffle = True, random_state = 1)

In [18]:
sc = StandardScaler()
num_scaled = sc.fit_transform(X_train[num_col])
num_scaled_df = pd.DataFrame(num_scaled, columns=sc.get_feature_names_out())
num_scaled_df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,-0.442581,-0.379073,1.065023,-0.085695
1,-0.442581,1.618521,-1.472425,-0.38693
2,-0.442581,-1.031348,-0.478646,-0.847411
3,-0.442581,-0.745978,0.147434,-0.587886
4,-0.442581,-0.052935,0.478694,0.095561


In [22]:
num_scaled_df.shape

(5977, 4)

In [20]:
X_train.shape

(5977, 30)

In [24]:
num_scaled_df.reset_index(drop=True, inplace=True)
X_train_non_num = X_train.drop(columns=num_col).reset_index(drop=True)
X_train = pd.concat([num_scaled_df, X_train_non_num], axis=1)

In [25]:
X_train.shape

(5977, 30)

# Model Training 

## 1. Decisions Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
D_tree = DecisionTreeClassifier(random_state = 1)

In [28]:
D_tree.fit(X_train, y_train)

In [29]:
num_scaled_test = sc.transform(X_test[num_col])
num_scaled_test_df = pd.DataFrame(num_scaled_test, columns=sc.get_feature_names_out())
num_scaled_test_df.reset_index(drop=True, inplace=True)
X_test_non_num = X_test.drop(columns=num_col).reset_index(drop=True)
X_test = pd.concat([num_scaled_test_df, X_test_non_num], axis=1)
X_oh.head()

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0,1,29.85,29.85
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0,34,56.95,1889.5
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0,2,53.85,108.15
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0,45,42.3,1840.75
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0,2,70.7,151.65


In [30]:
print(f"Test score: {D_tree.score(X_test, y_test)}")
print(f"Train score: {D_tree.score(X_train, y_train)}")

Test score: 0.7232227488151659
Train score: 0.9976576878032458


Overfitting

###  Hyperparameter Tuning 

In [43]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

In [44]:
gsv = GridSearchCV(D_tree, param_grid= param_grid, scoring='accuracy', n_jobs=-1, cv=10, verbose=2 )

In [45]:
gsv.fit(X_train, y_train)

Fitting 10 folds for each of 126 candidates, totalling 1260 fits


In [46]:
gsv.best_score_

0.7908606578040704

In [47]:
gsv.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'min_samples_leaf': 5,
 'min_samples_split': 2}

In [48]:
D_tree_gsc = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=5, random_state=1)
D_tree_gsc.fit(X_train, y_train)
print(f"Test score: {D_tree_gsc.score(X_test, y_test)}")
print(f"Train score: {D_tree_gsc.score(X_train, y_train)}")

Test score: 0.790521327014218
Train score: 0.7908649824326586


Low accuracy 