---
- [RandomForestClassifier](#RandomForestClassifier)
- [XGBoost](#XGBoost)
- [LogisticRegression](#LogisticRegression)
- [Итоги](#Results)


---

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

import itertools


from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [4]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=0)

In [6]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [7]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [8]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [9]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [10]:
def get_metrics(y_test, preds, name):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)

    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    tn, fp, fn, tp = confusion_matrix(y_test, preds>thresholds[ix]).ravel()
    
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                            fscore[ix],
                                                                            precision[ix],
                                                                            recall[ix]))

    results['classifier'].append(name)
    results['Threshold'].append(thresholds[ix])
    results['F-Score'].append(fscore[ix])
    results['Precision'].append(precision[ix])
    results['Recall'].append(recall[ix])
    results['TN'].append(tn)
    results['FP'].append(fp)
    results['FN'].append(fn)
    results['TP'].append(tp)
    
    

In [11]:
results = {
    'classifier': [],
    'Threshold': [],
    'F-Score': [],
    'Precision': [],
    'Recall': [],
    'TN': [],
    'FP': [],
    'FN': [],
    'TP': []    
}


## <center>RandomForestClassifier<a class="anchor" id="RandomForestClassifier"></a><center>

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
params = {'classifier__bootstrap': [True, False],
 'classifier__max_features':[0.3, 0.5, 0.7, 1],
 'classifier__max_depth': [None],
 'classifier__min_samples_leaf': [1, 2, 4],
 'classifier__min_samples_split': [2, 5, 10]}

In [14]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

In [15]:
# grid = RandomizedSearchCV(pipeline,
#                     param_distributions=params,
#                     cv=6,
#                     refit=False)

# search = grid.fit(X_train, y_train)
# search.best_params_

In [16]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(max_depth=None,
                                          max_features=0.5,
                                          min_samples_leaf=3,
                                          min_samples_split=2,
                                          bootstrap=True)),
])

In [17]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [18]:
preds_rfc = pipeline.predict_proba(X_test)[:, 1]
preds_rfc[:10]

array([0.39065476, 0.18663889, 0.11068651, 0.0187619 , 0.01672161,
       0.79934848, 0.03610714, 0.06265476, 0.11809524, 0.82071219])

In [19]:
get_metrics(y_test, preds_rfc, 'RandomForestClassifier')

Best Threshold=0.424333, F-Score=0.649, Precision=0.686, Recall=0.617


## <center>XGBoost<a class="anchor" id="XGBoost"></a><center>

In [20]:
from xgboost import XGBClassifier

In [21]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', XGBClassifier(verbosity=0, random_state = 42)),
])

In [22]:
params = {"classifier__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "classifier__max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "classifier__min_child_weight" : [ 1, 3, 5, 7 ],
 "classifier__gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "classifier__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

In [23]:
# grid = RandomizedSearchCV(pipeline,
#                     param_distributions=params,
#                     cv=6,
#                     refit=False)

# search = grid.fit(X_train, y_train)
# search.best_params_

In [24]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', XGBClassifier(min_child_weight=3,
                                          max_depth=4,
                                          learning_rate=0.15,
                                          gamma=0.4,
                                          colsample_bytree=0.3)),
])

In [25]:
pipeline.fit(X_train, y_train)



Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [26]:
preds_xgb = pipeline.predict_proba(X_test)[:, 1]
preds_xgb[:10]

array([0.34531116, 0.28531826, 0.1849831 , 0.09145842, 0.03780986,
       0.6439046 , 0.03958668, 0.05979925, 0.30944738, 0.62123966],
      dtype=float32)

In [27]:
get_metrics(y_test, preds_xgb, 'XGBoost')

Best Threshold=0.321245, F-Score=0.642, Precision=0.608, Recall=0.680


## <center>LogisticRegression<a class="anchor" id="LogisticRegression"></a><center>

In [28]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    final_transformers.append((cont_col, cont_transformer))

In [29]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [30]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(random_state = 42)),
])

In [31]:
params = {'classifier__penalty': ['l1','l2'],
          'classifier__C': [0.001,0.01,0.1,1,10,100,1000]}

In [32]:
grid = GridSearchCV(pipeline,
                    param_grid=params,
                    cv=6,
                    refit=False)

search = grid.fit(X_train, y_train)
search.best_params_

{'classifier__C': 0.1, 'classifier__penalty': 'l2'}

In [33]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(penalty='l2',
                                      C=0.1)),
])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [34]:
preds_lr = pipeline.predict_proba(X_test)[:, 1]
preds_lr[:10]

array([0.22855166, 0.33314466, 0.15635778, 0.1252202 , 0.15637723,
       0.63812437, 0.06310282, 0.08021382, 0.36888648, 0.76117602])

In [35]:
get_metrics(y_test, preds_lr, 'LogisticRegression')

Best Threshold=0.290711, F-Score=0.510, Precision=0.467, Recall=0.562


## <center>Итоги<a class="anchor" id="Results"></a><center>

In [36]:
total = pd.DataFrame(data=results)

In [37]:
total['Total_profit'] = total['TP']*2 - (total['TP']+total['FP'])*1

In [38]:
total

Unnamed: 0,classifier,Threshold,F-Score,Precision,Recall,TN,FP,FN,TP,Total_profit
0,RandomForestClassifier,0.424333,0.649431,0.68559,0.616896,1847,144,196,313,169
1,XGBoost,0.321245,0.641929,0.608084,0.679764,1768,223,164,345,122
2,LogisticRegression,0.290711,0.509804,0.466558,0.561886,1664,327,224,285,-42
