In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

from sklearn.metrics import (
    accuracy_score,
    f1_score,
)

from feature_engine.encoding import (
    OrdinalEncoder,
    OneHotEncoder,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', 25)

In [2]:
data = pd.read_csv("../data/telco_customer_churn_1.csv")
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,2193-SFWQW,Male,0,Yes,Yes,72,Yes,No,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Bank transfer (automatic),111.95,8033.1,No
5996,5656-JAMLX,Male,0,No,No,62,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),19.85,1253.65,No
5997,3462-BJQQA,Female,0,No,No,6,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Electronic check,89.75,552.65,No
5998,0442-TDYUO,Male,0,Yes,No,48,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Mailed check,20.05,1036,No


In [3]:
# replace NaNs of TotalCharges with '-1' and covert col to float (from string)
data['TotalCharges'] = data['TotalCharges'].str.replace(' ', '-1').astype(float)

# Train-Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['customerID', 'Churn'], axis=1),
    data['Churn'],
    test_size=0.2,
    random_state=0,
)

# Categorical and Numerical Features

In [5]:
cat_vars = [var for var in X_train.columns if X_train[var].dtype == 'O']
num_vars = [var for var in X_train.columns if var not in cat_vars]

In [6]:
cat_vars

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [7]:
num_vars

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

# Categorical Features

In [8]:
cat_vars_onehot = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
cat_vars_ordinal_arbitrary = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

ordinal_encoder_arbitrary = OrdinalEncoder(encoding_method='arbitrary', variables=cat_vars_ordinal_arbitrary)
ordinal_encoder_arbitrary.fit(X_train, y_train)

onehot_encoder = OneHotEncoder(variables=cat_vars_onehot)
onehot_encoder.fit(X_train)

X_train = ordinal_encoder_arbitrary.transform(X_train)
X_test = ordinal_encoder_arbitrary.transform(X_test)

X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)

# Numerical Features

In [9]:
num_vars_yeo_johnson = ['TotalCharges']

yeo_transformer = YeoJohnsonTransformer(variables=num_vars_yeo_johnson)

X_train = yeo_transformer.fit_transform(X_train)
X_test = yeo_transformer.transform(X_test)

# Target

In [10]:
y_train

3381     No
31       No
1596     No
1386     No
4237    Yes
       ... 
4931     No
3264     No
1653     No
2607    Yes
2732     No
Name: Churn, Length: 4800, dtype: object

In [11]:
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Scaling

In [12]:
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train) 

X_train = pd.DataFrame(min_max_scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(min_max_scaler.transform(X_test), columns=X_train.columns)

# ML

In [13]:
param_C = 0.8
param_max_iter = 100
clf = LogisticRegression(C=param_C, max_iter=param_max_iter, random_state=0)

# Train on all set and evaluate on test
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Cross validation on train set
cv_accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
cv_f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro').mean()

print(f"CV accuracy: {cv_accuracy:.2f}, Test accuracy: {test_accuracy:.2f}\n"
      f"CV f1: {cv_f1:.2f}, Test f1: {test_f1:.2f}")


CV accuracy: 0.80, Test accuracy: 0.79
CV f1: 0.72, Test f1: 0.56


In [14]:
clf.get_params()

{'C': 0.8,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}