In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

import itertools


from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [4]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=1)

In [6]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [7]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [8]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [9]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [10]:
def get_metrics(y_test, preds, name, P):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)

    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)

    results['classifier'].append(name)
    results['P'].append(P)
    results['F-Score'].append(fscore[ix])
    results['Precision'].append(precision[ix])
    results['Recall'].append(recall[ix])
    
    

In [11]:
results = {
    'classifier': [],
    'P': [],
    'F-Score': [],
    'Precision': [],
    'Recall': [],
}


## <center>XGBoost<a class="anchor" id="XGBoost"></a><center>

In [12]:
from xgboost import XGBClassifier

In [13]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', XGBClassifier(verbosity=0, random_state = 42)),
])

In [14]:
# params = {"classifier__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
#  "classifier__max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
#  "classifier__min_child_weight" : [ 1, 3, 5, 7 ],
#  "classifier__gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
#  "classifier__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

In [15]:
# grid = RandomizedSearchCV(pipeline,
#                     param_distributions=params,
#                     cv=6,
#                     refit=False)

# search = grid.fit(X_train, y_train)
# search.best_params_

In [16]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', XGBClassifier(min_child_weight=3,
                                 max_depth=4,
                                 learning_rate=0.15,
                                 gamma=0.4,
                                 colsample_bytree=0.3, 
                                 verbosity=0,
                                 random_state = 42)),
])

In [17]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [18]:
preds_xgb = pipeline.predict_proba(X_test)[:, 1]
preds_xgb[:10]

array([0.05689038, 0.10632784, 0.0567154 , 0.03288413, 0.0683275 ,
       0.0189064 , 0.21734163, 0.04061214, 0.06918447, 0.02630709],
      dtype=float32)

In [19]:
get_metrics(y_test, preds_xgb, 'XGBoost', '-')

## PU learning

In [20]:
for P in np.linspace(0.1, 1, num=9, endpoint=False):
    pu_data = df.copy()
    pos_ind = np.where(pu_data.iloc[:,-1].values == 1)[0]
    np.random.shuffle(pos_ind)
    pos_sample_len = int(np.ceil(P * len(pos_ind)))
    pos_sample = pos_ind[:pos_sample_len]


    pu_data['class_test'] = -1
    pu_data.loc[pos_sample,'class_test'] = 1

    pu_data = pu_data.sample(frac=1)
    neg_sample = pu_data[pu_data['class_test']==-1][:len(pu_data[pu_data['class_test']==1])]
    sample_test = pu_data[pu_data['class_test']==-1][len(pu_data[pu_data['class_test']==1]):]
    pos_sample = pu_data[pu_data['class_test']==1]
    print(neg_sample.shape, pos_sample.shape)
    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

    pipeline = Pipeline([
        ('features',feats),
        ('classifier', XGBClassifier(min_child_weight=3,
                                     max_depth=4,
                                     learning_rate=0.15,
                                     gamma=0.4,
                                     colsample_bytree=0.3, 
                                     verbosity=0,
                                     random_state = 42)),
    ])
    pipeline.fit(sample_train.iloc[:,:-2], sample_train.iloc[:,-2])

    y_predict = pipeline.predict(sample_test.iloc[:,:-2])
    get_metrics(sample_test.iloc[:,-2], y_predict, 'XGBoost', P)

(204, 15) (204, 15)
(408, 15) (408, 15)
(612, 15) (612, 15)
(815, 15) (815, 15)
(1019, 15) (1019, 15)
(1223, 15) (1223, 15)
(1426, 15) (1426, 15)
(1630, 15) (1630, 15)
(1834, 15) (1834, 15)


In [21]:
pd.DataFrame(results)

Unnamed: 0,classifier,P,F-Score,Precision,Recall
0,XGBoost,-,0.654683,0.64432,0.665385
1,XGBoost,0.1,0.492591,0.355108,0.80378
2,XGBoost,0.2,0.468715,0.331742,0.798341
3,XGBoost,0.3,0.485175,0.346598,0.808383
4,XGBoost,0.4,0.465682,0.328822,0.797693
5,XGBoost,0.5,0.43088,0.295931,0.792079
6,XGBoost,0.6,0.380952,0.250112,0.798859
7,XGBoost,0.7,0.323224,0.203878,0.779559
8,XGBoost,0.8,0.249004,0.148987,0.757576
9,XGBoost,0.9,0.164456,0.092537,0.738095
