**Remarks**

In this session, we discuss how to implement cost-sensitive classification for churn prediction. What is churn? Customer churn predictive modeling deals with predicting the probability that a customer will defect, using historical, behavioral and socio-economic information. Such a model is of great benefit to subscription-based companies, allowing them to maximize the results of retention campaigns.

In [26]:
# install new libraries
!pip install costcla

Installing collected packages: pyea, costcla
Successfully installed costcla-0.6 pyea-0.2


In [1]:
# prepare library
import pandas as pd
import numpy as np
import zipfile

# prepare dataset: read the CSV member directly out of the zip archive
with zipfile.ZipFile('../content/cost_sensitive_classification_churn.zip', 'r') as z:
  # open the member in its own `with` so the file handle is closed after parsing
  with z.open('cost_sensitive_classification_churn.csv') as f:
    data = pd.read_csv(f, index_col=False)

In [2]:
# preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,C_FP,C_FN,C_TP,C_TN,target
0,0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.122696,-0.053953,-0.054962,-0.42021,-0.121782,-0.203241,0.510704,-0.052071,-0.11185,-0.158451,-0.122449,-0.128396,-0.106616,-0.371085,-0.146501,-0.208086,-0.066014,-0.075314,-0.435398,-0.04151,1.0,1.0,5.0,2.0,2.0,74.0,1028.571429,121.828571,0.0,0.0
1,1,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.122696,-0.053953,-0.054962,-0.42021,-0.121782,-0.203241,-0.69078,-0.052071,-0.11185,-0.158451,-0.122449,-0.128396,-0.106616,-0.371085,-0.146501,-0.208086,-0.066014,-0.075314,-0.435398,-0.04151,3.0,1.0,5.0,2.0,4.0,53.428571,1028.571429,82.742857,0.0,0.0
2,2,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.122696,-0.053953,-0.054962,0.77083,-0.121782,-0.203241,-0.788197,-0.052071,-0.11185,-0.158451,-0.122449,-0.128396,-0.106616,-0.371085,-0.146501,-0.208086,-0.066014,-0.075314,0.836751,-0.04151,1.0,8.0,3.0,1.0,4.0,66.285714,1285.714286,102.928571,0.0,0.0
3,3,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,-0.122696,-0.053953,-0.054962,0.77083,-0.121782,-0.203241,0.185979,-0.052071,-0.11185,-0.158451,-0.122449,-0.128396,-0.106616,-1.514826,-0.146501,-0.208086,-0.066014,-0.075314,0.836751,-0.04151,1.0,8.0,4.0,3.0,2.0,92.0,1285.714286,151.785714,0.0,0.0
4,4,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.122696,-0.053953,-0.054962,-0.42021,-0.121782,-0.203241,-0.788197,-0.052071,-0.11185,-0.158451,-0.122449,-0.128396,-0.106616,-0.371085,-0.146501,-0.208086,-0.066014,-0.075314,-0.435398,-0.04151,1.0,7.0,5.0,2.0,4.0,53.428571,1028.571429,82.742857,0.0,0.0


In [48]:
# share of each target class, expressed as a percentage of all rows
target_share_pct = data["target"].value_counts(normalize=True) * 100
print("Counting target: \n{}".format(target_share_pct))

Counting target: 
0.0    95.212709
1.0     4.787291
Name: target, dtype: float64


In [49]:
# separate the frame into features, label and the per-row cost columns
feature_cols = [f"x{i}" for i in range(1, 47)]  # x1 .. x46
cost_cols = ['C_FP', 'C_FN', 'C_TP', 'C_TN']

X = data[feature_cols]
y = data["target"]
cost_mat = data[cost_cols].values

In [50]:
# hold out 30% of the rows for testing; features, labels and cost rows are split together
from sklearn.model_selection import train_test_split

(X_train, X_test,
 y_train, y_test,
 cost_mat_train, cost_mat_test) = train_test_split(
    X, y, cost_mat,
    test_size=0.3,
    random_state=42,
)

In [69]:
# 1.1 baseline models (Logistic Regression and XGBoost) fitted on the full training split
## NO UNDERSAMPLING
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# one entry per model: the estimator lives under "f", its predictions are added below
clf = {
    "LR": {"f": LogisticRegression(max_iter=400)},
    "XGB": {"f": xgb.XGBClassifier(booster="gblinear")},
}

for entry in clf.values():
  estimator = entry['f']
  estimator.fit(X_train, y_train)
  # "c": predict() on the test split, "p"/"p_train": predict_proba() on test/train
  entry['c'] = estimator.predict(X_test)
  entry['p'] = estimator.predict_proba(X_test)
  entry['p_train'] = estimator.predict_proba(X_train)

# calculate F1 score, precision, recall, accuracy and cost savings for each fitted model
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from costcla.metrics import savings_score, cost_loss
measures = {"f1": f1_score, "pre": precision_score,
            "rec": recall_score, "acc": accuracy_score}

# one row per model, one column per measure
result = pd.DataFrame(columns=list(measures))

# evaluate each model on the test-split predictions stored under "c"
for model in clf:
  result.loc[model] = [measure_fn(y_test, clf[model]["c"]) for measure_fn in measures.values()]

result["sav"] = np.zeros(result.shape[0])
for model in clf:
  # write through a single .loc[row, col] call: the chained form result["sav"].loc[model] = ...
  # assigns into an intermediate Series and is not guaranteed to update `result`
  result.loc[model, "sav"] = savings_score(y_test, clf[model]["c"], cost_mat_test)

result

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,f1,pre,rec,acc,sav
LR,0.0,0.0,0.0,0.948472,0.0
XGB,0.0,0.0,0.0,0.948472,0.0


In [67]:
## UNDERSAMPLING
# rebuild a single training frame so rows can be filtered by target value
df_train = pd.concat([X_train, y_train], axis=1)

df_class_0 = df_train[df_train['target'] == 0]
df_class_1 = df_train[df_train['target'] == 1]
# take the counts from the subsets themselves instead of unpacking value_counts(),
# whose order depends on which class happens to be more frequent
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# draw as many class-0 rows as there are class-1 rows; seeded so re-runs pick the same rows
df_class_0_under = df_class_0.sample(count_class_1, random_state=42)
df_train_under = pd.concat([df_class_0_under, df_class_1], axis=0)
print('Random under-sampling:')
print(df_train_under.target.value_counts())
X_train_under = df_train_under.drop(['target'], axis=1)
y_train_under = df_train_under.target

# same two estimators as the baseline, this time fitted on the under-sampled training data
clf = {
    "LR-under": {"f": LogisticRegression(max_iter=350)},
    "XGB-under": {"f": xgb.XGBClassifier(booster="gblinear")},
}

for entry in clf.values():
  estimator = entry['f']
  estimator.fit(X_train_under, y_train_under)
  # predictions are made on the untouched test split and on the full (not under-sampled) training split
  entry['c'] = estimator.predict(X_test)
  entry['p'] = estimator.predict_proba(X_test)
  entry['p_train'] = estimator.predict_proba(X_train)

# calculate F1 score, precision, recall, accuracy and cost savings for the under-sampled models
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from costcla.metrics import savings_score, cost_loss
measures = {"f1": f1_score, "pre": precision_score,
            "rec": recall_score, "acc": accuracy_score}

# one row per model, one column per measure
result = pd.DataFrame(columns=list(measures))

# evaluate each model on the test-split predictions stored under "c"
for model in clf:
  result.loc[model] = [measure_fn(y_test, clf[model]["c"]) for measure_fn in measures.values()]
result["sav"] = np.zeros(result.shape[0])
for model in clf:
  # write through a single .loc[row, col] call: the chained form result["sav"].loc[model] = ...
  # assigns into an intermediate Series and is not guaranteed to update `result`
  result.loc[model, "sav"] = savings_score(y_test, clf[model]["c"], cost_mat_test)

result

Random under-sampling:
1.0    304
0.0    304
Name: target, dtype: int64


Unnamed: 0,f1,pre,rec,acc,sav
LR-under,0.144951,0.082179,0.613793,0.626866,0.08191
XGB-under,0.122605,0.075235,0.331034,0.755864,0.005595
