## Setup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "Churn"
COLAB = 'google.colab' in sys.modules
DEBUG = False
SEED = 666

In [4]:
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)
else:
  ROOT = "./"

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Load Dataset

In [5]:
df = pd.read_pickle(f"{ROOT}/data/data.pkl")
print(df.shape)
df.head(2)

(7032, 20)


Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Gender            7032 non-null   category
 1   SeniorCitizen     7032 non-null   category
 2   Partner           7032 non-null   category
 3   Dependents        7032 non-null   category
 4   Tenure            7032 non-null   int64   
 5   PhoneService      7032 non-null   category
 6   MultipleLines     7032 non-null   category
 7   InternetService   7032 non-null   category
 8   OnlineSecurity    7032 non-null   category
 9   OnlineBackup      7032 non-null   category
 10  DeviceProtection  7032 non-null   category
 11  TechSupport       7032 non-null   category
 12  StreamingTV       7032 non-null   category
 13  StreamingMovies   7032 non-null   category
 14  Contract          7032 non-null   category
 15  PaperlessBilling  7032 non-null   category
 16  PaymentMethod     7032 n

## Preprocessing Dataset

### Identify target and features

In [7]:
target = "Churn"
print(f"target = {target}")

cat_features = [c for c in df.select_dtypes("category").columns if c!= target]
print(f"\nCategorical features ({len(cat_features)}): {cat_features}")

num_features = [c for c in df.select_dtypes(["int","float"]).columns if c!= target]
print(f"\nNumerical features ({len(num_features)}): {num_features}")



target = Churn

Categorical features (16): ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

Numerical features (3): ['Tenure', 'MonthlyCharges', 'TotalCharges']


### Train/Test split

* Split data train/test -> this time (60% train/40% test)

In [8]:
df.Churn.value_counts(normalize=True)

No     0.734215
Yes    0.265785
Name: Churn, dtype: float64

* `train_size` -> % must use float / num_of_rows must use int
* `randon_state` -> 
* `stratify` -> divide most equally the rate of yes/no between train and test

In [9]:
from sklearn.model_selection import train_test_split

df_train,df_test = train_test_split(df, train_size=0.60, stratify=df[target], random_state=SEED)

print(df_train.shape,df_test.shape)

(4219, 20) (2813, 20)


In [10]:
df_train.Churn.value_counts(normalize=True)

No     0.734297
Yes    0.265703
Name: Churn, dtype: float64

In [11]:
df_test.Churn.value_counts(normalize=True)

No     0.734092
Yes    0.265908
Name: Churn, dtype: float64

### Encode target

* `LabelEncoder` takes a column with strings (categorical) and change to numerical by a Map

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(df_train[target])

y_train = le.transform(df_train[target])
y_test = le.transform(df_test[target])

In [13]:
df_train[target].head()

193      No
6108     No
3580    Yes
2195     No
3809     No
Name: Churn, dtype: category
Categories (2, object): ['No', 'Yes']

In [14]:
y_train[:5]

array([0, 0, 1, 0, 0])

In [15]:
le.inverse_transform(y_train[:5])

array(['No', 'No', 'Yes', 'No', 'No'], dtype=object)

### Encode Categorical features

In [16]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

ohe.fit(df_train[cat_features])

X_cat_train = ohe.transform(df_train[cat_features])
X_cat_test = ohe.transform(df_test[cat_features])

df_cat_train = pd.DataFrame(X_cat_train.toarray(), columns=ohe.get_feature_names_out())
df_cat_test = pd.DataFrame(X_cat_test.toarray(), columns=ohe.get_feature_names_out())
print(df_cat_train.shape, df_cat_test.shape)

df_cat_train.head(2)

(4219, 36) (2813, 36)


Unnamed: 0,Gender_Female,Gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_Yes,TechSupport_No,TechSupport_Yes,StreamingTV_No,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Scale / Transform numerical features

In [17]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

ss.fit(df_train[num_features])

X_num_train = ss.transform(df_train[num_features])
X_num_test = ss.transform(df_test[num_features])

df_num_train = pd.DataFrame(X_num_train, columns=ss.get_feature_names_out())
df_num_test = pd.DataFrame(X_num_test, columns=ss.get_feature_names_out())
print(df_num_train.shape, df_num_test.shape)

df_num_train.head(2)



(4219, 3) (2813, 3)


Unnamed: 0,Tenure,MonthlyCharges,TotalCharges
0,1.597275,1.487874,2.557942
1,-0.596193,-1.32382,-0.804777


### Construct dataframe for model features

In [18]:
df_model_train = pd.concat([df_cat_train, df_num_train], axis=1)
df_model_test = pd.concat([df_cat_test, df_num_test], axis=1)
print(df_model_train.shape, df_model_test.shape)

(4219, 39) (2813, 39)


## Model Selection

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

classifiers = {
    "KNN" : KNeighborsClassifier(),
    "KNN(3)" : KNeighborsClassifier(3),
    "DT" : DecisionTreeClassifier(),
    "DT(max_depth=5)" : DecisionTreeClassifier(max_depth=5),
    "LR" : LogisticRegression(max_iter=1000),
    "RF" : RandomForestClassifier(),
    "AdaBoost" : AdaBoostClassifier(),
}

In [20]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score

metrics = {
    'accuracy' : accuracy_score,
    'precision' : precision_score,
    'recall' : recall_score,
    'f1' : f1_score,
    'roc_auc' : roc_auc_score,
}

In [21]:
def generate_metrics():
    
    data = []
    
    for model_name, model in classifiers.items():
        
        print (f"{model_name} ...")
        
        row = {'Model': model_name}
        model.fit(df_model_train, y_train)
        
        for metric_name, metric in metrics.items():
            # Scoring on SEEN data - effectively "useless"
            y_pred = model.predict(df_model_train)
            row['train_'+metric_name] = metric(y_train, y_pred)
        
            # Scoring on UNSEEN data - important
            y_pred = model.predict(df_model_test)
            row['test_'+metric_name] = metric(y_test, y_pred)
            
        data.append(row)
        #print(f"{name:20s} accuracy\ttrain = {train_accuracy:.2%} \ttest = {test_accuracy:.2%}")
    return pd.DataFrame(data)

df_results = generate_metrics()
print(df_results.shape)
df_results.head()

KNN ...
KNN(3) ...
DT ...
DT(max_depth=5) ...
LR ...
RF ...
AdaBoost ...
(7, 11)


Unnamed: 0,Model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,KNN,0.841432,0.757554,0.720273,0.544,0.659233,0.545455,0.688402,0.544726,0.783296,0.689919
1,KNN(3),0.871059,0.7476,0.780369,0.525333,0.716325,0.526738,0.746977,0.526035,0.821687,0.67717
2,DT,0.997867,0.724493,0.999102,0.482581,0.992864,0.5,0.995973,0.491136,0.99627,0.652906
3,DT(max_depth=5),0.802797,0.787416,0.622561,0.589286,0.654773,0.661765,0.638261,0.623426,0.755566,0.747347
4,LR,0.804693,0.803768,0.660888,0.649847,0.544157,0.568182,0.596869,0.606277,0.721562,0.728643


In [22]:
len(df_results.columns)

11

In [23]:
def highlight_col(x):
    model_color = 'background-color: lightgreen'
    alt_color = ['background-color: lightblue','background-color: lightyellow']
    
    df1 = pd.DataFrame('', index=x.index, columns=x.columns)  
    df1.iloc[:, 0] = model_color
    for k in range (1,df.shape[1],2):
        df1.iloc[:,k:k+2] = alt_color[(k//2)%2] 
    return df1 
   
df_results.style.apply(highlight_col, axis=None)

Unnamed: 0,Model,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc
0,KNN,0.841432,0.757554,0.720273,0.544,0.659233,0.545455,0.688402,0.544726,0.783296,0.689919
1,KNN(3),0.871059,0.7476,0.780369,0.525333,0.716325,0.526738,0.746977,0.526035,0.821687,0.67717
2,DT,0.997867,0.724493,0.999102,0.482581,0.992864,0.5,0.995973,0.491136,0.99627,0.652906
3,DT(max_depth=5),0.802797,0.787416,0.622561,0.589286,0.654773,0.661765,0.638261,0.623426,0.755566,0.747347
4,LR,0.804693,0.803768,0.660888,0.649847,0.544157,0.568182,0.596869,0.606277,0.721562,0.728643
5,RF,0.997867,0.788482,0.994662,0.629881,0.997324,0.495989,0.995991,0.554974,0.997694,0.69521
6,AdaBoost,0.808011,0.798791,0.672204,0.646302,0.541481,0.537433,0.599802,0.586861,0.722968,0.715448


In [24]:
import numpy as np
from sklearn.metrics import make_scorer
def business_benefit_score(y_true, y_pred, **kwargs):
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_true, y_pred)
    # entries order to match confusion matrix (tn, fp, fn, tp)
    # profit for each type of customer
    # tn - loyal customer and predicted correctly so customer just stays and company gets 1000
    # fp - loyal customer but predicted to churn so is offered scheme, some will take it up, all stay with company.
    # fn - churning customer but not predicted so customer just leaves and company gets nothing from them
    # tp - churning customer and predicted correctly so offered scheme etc
    profit = np.array([
    [1000, -0.6*25+1000],
    # 985.0
    [0, -0.9*25 + 0.9*0.7*1000]])
    # (0.9 * 0.7 * 975) + (0.9 * 0.3 * -25) + (0.1 * 0) = 607.5
    #print (cm, profit, (profit*cm).sum(), cm.sum() )
    return (profit*cm).sum() / cm.sum()

business_benefit_score([1,1,0],[1,1,0])

738.3333333333334

In [25]:
metrics['business_benefit'] = business_benefit_score

In [26]:
model = classifiers['LR']
model.fit(df_model_train, y_train)
y_pred = model.predict(df_model_train)
business_benefit_score(y_train, y_pred)

821.0191988622896