In [6]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import time

import warnings
warnings.filterwarnings('ignore')


from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split


# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [4]:
pip install hyperopt

Collecting hyperopt
  Using cached hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
Collecting py4j
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Collecting networkx>=2.2
  Downloading networkx-2.6.3-py3-none-any.whl (1.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 3.7 MB/s eta 0:00:00
Collecting cloudpickle
  Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.1/77.1 kB 4.2 MB/s eta 0:00:00
Collecting future
  Downloading future-0.18.3.tar.gz (840 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 840.9/840.9 kB 4.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: future
  Building wheel for future (setup.py) ...

In [2]:
# Load the raw customer-churn table.
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from a configurable data directory so the notebook runs elsewhere.
tcc = pd.read_csv('/Users/pro/Desktop/【特征工程】电信用户流失案例/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Categorical (discrete) feature columns
category_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                'PaymentMethod']

# Continuous feature columns
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Label column
target = 'Churn'

# ID column
ID_col = 'customerID'

# Sanity check: categorical + continuous + (ID, label) must account for every column
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# Continuous-field conversion: a single-space string in TotalCharges is treated
# as missing before the column is cast to float.
tcc['TotalCharges'] = tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Missing-value imputation: missing TotalCharges become 0
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# Label encoding: Yes -> 1, No -> 0.
# The result is assigned back to the frame instead of calling
# `.replace(..., inplace=True)` on the column selection: that form is chained
# assignment, which does not modify `tcc` under pandas Copy-on-Write and may
# leave the column with object dtype. Any label other than Yes/No now maps to
# NaN and makes `astype(int)` raise, rather than silently staying a string.
tcc[target] = tcc[target].map({'Yes': 1, 'No': 0}).astype(int)

# Feature matrix (all columns except ID and label) and label vector
features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc[target].copy()

# Split the cleaned table into training and test partitions (fixed seed)
train, test = train_test_split(tcc, random_state=22)

# Columns that are not model inputs: the customer ID and the label
_non_feature_cols = [ID_col, target]

# Feature matrices and label vectors, copied so later edits never touch `tcc`
X_train, y_train = train.drop(columns=_non_feature_cols).copy(), train['Churn'].copy()
X_test, y_test = test.drop(columns=_non_feature_cols).copy(), test['Churn'].copy()

# Offset of each customer from the reference point, computed as 72 - tenure.
# NOTE(review): presumably 72 is the maximum tenure in months and 2014 the
# earliest sign-up year — confirm against the data description.
_train_offset = 72 - X_train['tenure']
_test_offset = 72 - X_test['tenure']

# Derived year (offset // 12 + 2014) and month (offset % 12 + 1)
X_train_seq = pd.DataFrame({
    'tenure_year': (_train_offset // 12) + 2014,
    'tenure_month': _train_offset % 12 + 1,
})
X_test_seq = pd.DataFrame({
    'tenure_year': (_test_offset // 12) + 2014,
    'tenure_month': _test_offset % 12 + 1,
})

# Derived quarter: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
X_train_seq['tenure_quarter'] = ((X_train_seq['tenure_month'] - 1) // 3) + 1
X_test_seq['tenure_quarter'] = ((X_test_seq['tenure_month'] - 1) // 3) + 1

# One-hot encoder, fitted on the training-set derived features only
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

# Names of the derived columns, in the order they were fed to the encoder
seq_new = list(X_train_seq.columns)


def cate_colName(Transformer, category_cols, drop='if_binary'):
    """
    Build the output column names of a fitted one-hot encoder.

    :param Transformer: fitted encoder exposing ``categories_`` (one sequence
        of category values per input column)
    :param category_cols: names of the categorical columns, in the order they
        were passed to the encoder
    :param drop: the encoder's ``drop`` setting; with ``'if_binary'`` a column
        with exactly two categories keeps its original name
    :return: list of column names; non-collapsed columns expand to
        ``<column>_<category>`` for every category
    """
    levels_per_col = Transformer.categories_
    collapse_binary = (drop == 'if_binary')

    new_names = []
    for position, col in enumerate(category_cols):
        levels = levels_per_col[position]
        if collapse_binary and len(levels) == 2:
            # Binary column encoded as a single indicator: keep the name as-is
            new_names.append(col)
            continue
        new_names.extend(str(col) + '_' + str(level) for level in levels)
    return new_names


# Column names for the one-hot output (drop=None: every category gets a column)
_seq_onehot_cols = cate_colName(enc, seq_new, drop=None)

# Replace the derived-feature frames with their dense one-hot encoding,
# carrying over the row index of the matching feature matrix.
X_train_seq = pd.DataFrame(
    enc.transform(X_train_seq).toarray(),
    columns=_seq_onehot_cols,
    index=X_train.index,
)
X_test_seq = pd.DataFrame(
    enc.transform(X_test_seq).toarray(),
    columns=_seq_onehot_cols,
    index=X_test.index,
)

# Ordinal encoder for the categorical columns, fitted on the training set only
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

# Training features: ordinal-encoded categoricals followed by the raw continuous columns
X_train_OE = pd.concat(
    [
        pd.DataFrame(ord_enc.transform(X_train[category_cols]),
                     columns=category_cols,
                     index=X_train.index),
        X_train[numeric_cols],
    ],
    axis=1,
)

# Test features: same layout, encoded with the training-set encoder
X_test_OE = pd.concat(
    [
        pd.DataFrame(ord_enc.transform(X_test[category_cols]),
                     columns=category_cols,
                     index=X_test.index),
        X_test[numeric_cols],
    ],
    axis=1,
)

In [13]:
class logit_threshold(BaseEstimator, ClassifierMixin, TransformerMixin):
    """
    Logistic regression whose decision threshold is a tunable hyperparameter.

    Wraps ``LogisticRegression`` (fixed ``random_state=12``) and predicts 1
    when the probability in column 1 of ``predict_proba`` is >= ``thr``.

    :param penalty: forwarded to ``LogisticRegression``
    :param C: forwarded to ``LogisticRegression``
    :param max_iter: iteration cap; may be given as a float such as ``1e8``,
        it is converted to ``int`` when the model is fitted
    :param solver: forwarded to ``LogisticRegression``
    :param l1_ratio: forwarded to ``LogisticRegression``
    :param class_weight: forwarded to ``LogisticRegression``
    :param thr: probability threshold used by ``predict``

    NOTE(review): ``TransformerMixin`` is inherited but no ``transform`` method
    is defined in this class.
    """

    def __init__(self, penalty='l2', C=1.0, max_iter=1e8, solver='lbfgs', l1_ratio=None, class_weight=None, thr=0.5):
        # Parameters are stored exactly as passed, so get_params/clone
        # round-trip them unchanged.
        self.penalty = penalty
        self.C = C
        self.max_iter = max_iter
        self.solver = solver
        self.l1_ratio = l1_ratio
        self.thr = thr
        self.class_weight = class_weight

    def fit(self, X, y):
        """
        Fit the wrapped logistic regression.

        :param X: training features
        :param y: training labels
        :return: self
        """
        clf = LogisticRegression(penalty=self.penalty,
                                 C=self.C,
                                 solver=self.solver,
                                 l1_ratio=self.l1_ratio,
                                 class_weight=self.class_weight,
                                 # The default (1e8) is a float; LogisticRegression
                                 # expects an integer iteration count, so cast here
                                 # and leave the stored parameter untouched.
                                 max_iter=int(self.max_iter),
                                 random_state=12)
        clf.fit(X, y)
        self.coef_ = clf.coef_
        # Expose the label set like other scikit-learn classifiers do
        self.classes_ = clf.classes_
        self.clf = clf
        return self

    def predict_proba(self, X):
        """
        Return the class probabilities of the wrapped model.

        :param X: features
        :return: the wrapped model's ``predict_proba`` output
        """
        return self.clf.predict_proba(X)

    def predict(self, X):
        """
        Predict 0/1 by thresholding the column-1 probability at ``thr``.

        :param X: features
        :return: integer array, 1 where the probability is >= ``thr``, else 0
        """
        # Reads only self.clf and self.thr, so instances fitted and pickled
        # with an earlier version of this class keep working.
        res = (self.clf.predict_proba(X)[:, 1] >= self.thr) * 1
        return res

    
# Restore the three previously trained models from disk, in this order.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
logistic_search, tree_model, RF_0 = (
    load(model_file)
    for model_file in ('logistic_search.joblib', 'tree_model.joblib', 'RF_0.joblib')
)

## Voting

In [14]:
# The three base models, in the order used for the numbered prediction variables
_base_models = (logistic_search.best_estimator_, tree_model, RF_0)

# Class predictions on the training set
train_prediction1, train_prediction2, train_prediction3 = (
    model.predict(X_train_OE) for model in _base_models
)

# Predicted probabilities on the training set (column 1 of predict_proba)
train_prediction1_proba, train_prediction2_proba, train_prediction3_proba = (
    model.predict_proba(X_train_OE)[:, 1] for model in _base_models
)

# Class predictions on the test set
test_prediction1, test_prediction2, test_prediction3 = (
    model.predict(X_test_OE) for model in _base_models
)

# Predicted probabilities on the test set (column 1 of predict_proba)
test_prediction1_proba, test_prediction2_proba, test_prediction3_proba = (
    model.predict_proba(X_test_OE)[:, 1] for model in _base_models
)

In [15]:
# Inspect the random forest's per-class probabilities on the training set;
# the cells above keep only column 1 of this output.
RF_0.predict_proba(X_train_OE)

array([[0.99327033, 0.00672967],
       [0.48125643, 0.51874357],
       [0.89506715, 0.10493285],
       ...,
       [0.39270528, 0.60729472],
       [0.9843395 , 0.0156605 ],
       [0.99464364, 0.00535636]])

In [16]:
from sklearn.ensemble import VotingClassifier

In [17]:
# (name, model) pairs handed to VotingClassifier
estimators = [
    ('lr', logistic_search.best_estimator_),
    ('tree', tree_model),
    ('rf', RF_0),
]

In [18]:
# Hard (majority) voting is VotingClassifier's default; spelled out here
VC_hard = VotingClassifier(estimators=estimators, voting='hard')
VC_hard = VC_hard.fit(X_train_OE, y_train)

In [19]:
# (train score, test score) of the hard-voting ensemble
VC_hard.score(X_train_OE, y_train), VC_hard.score(X_test_OE, y_test)

(0.8345323741007195, 0.7910278250993753)

In [20]:
# Soft voting: the ensemble combines the base models' predicted probabilities
VC_soft = VotingClassifier(estimators=estimators, voting='soft')
VC_soft = VC_soft.fit(X_train_OE, y_train)

In [21]:
# (train score, test score) of the soft-voting ensemble
VC_soft.score(X_train_OE, y_train), VC_soft.score(X_test_OE, y_test)

(0.8258235516849678, 0.787052810902896)

## Advanced Voting & Advanced Averaging

In [22]:
# Element-wise product of the three models' predicted probabilities
_train_proba_product = (train_prediction1_proba *
                        train_prediction2_proba *
                        train_prediction3_proba)
_test_proba_product = (test_prediction1_proba *
                       test_prediction2_proba *
                       test_prediction3_proba)

# Geometric-mean voting: predict 1 when the cube root of the product is >= 0.5
Voting_train_GN = (np.power(_train_proba_product, 1/3) >= 0.5) * 1
Voting_test_GN = (np.power(_test_proba_product, 1/3) >= 0.5) * 1

In [23]:
# (train accuracy, test accuracy) of geometric-mean voting.
# accuracy_score takes (y_true, y_pred); the true labels go first so the call
# stays correct if it is ever swapped for an asymmetric metric.
accuracy_score(y_train, Voting_train_GN), accuracy_score(y_test, Voting_test_GN)

(0.8237410071942446, 0.7842135150482681)

In [24]:
# Mean predicted probability of the three base models. It does not depend on
# the threshold, so it is computed once instead of on every loop iteration.
_train_mean_proba = (train_prediction1_proba +
                     train_prediction2_proba +
                     train_prediction3_proba) / 3
_test_mean_proba = (test_prediction1_proba +
                    test_prediction2_proba +
                    test_prediction3_proba) / 3

# Compare soft-voting accuracy at a few thresholds around 0.5
for thr in [0.48, 0.5, 0.52]:

    Voting_train_soft_thr = (_train_mean_proba >= thr) * 1
    # accuracy_score takes (y_true, y_pred)
    train_acc = accuracy_score(y_train, Voting_train_soft_thr)

    Voting_test_soft_thr = (_test_mean_proba >= thr) * 1
    test_acc = accuracy_score(y_test, Voting_test_soft_thr)

    print("threshold %0.2f:" % thr)
    print("train_Accuracy %0.10f" % train_acc)
    print("test_Accuracy %0.10f" % test_acc)

threshold 0.48:
train_Accuracy 0.8263915184
test_Accuracy 0.7904599659
threshold 0.50:
train_Accuracy 0.8258235517
test_Accuracy 0.7870528109
threshold 0.52:
train_Accuracy 0.8226050738
test_Accuracy 0.7813742192


In [25]:
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState

In [26]:
# Search space: the voting threshold, sampled uniformly from [0.4, 0.6]
params_space = {
    'thr': hp.uniform('thr', 0.4, 0.6),
}

In [27]:
# Objective function
def hyperopt_objective(params):
    """
    Negative training accuracy of thresholded soft voting.

    :param params: dict with key ``'thr'``, the probability threshold
    :return: ``-accuracy`` on the training set (negated because the search
        minimises the objective)
    """
    thr = params['thr']

    # Average the three base models' predicted probabilities, then threshold
    mean_proba = (train_prediction1_proba +
                  train_prediction2_proba +
                  train_prediction3_proba) / 3
    Voting_train_soft_thr = (mean_proba >= thr) * 1

    # accuracy_score takes (y_true, y_pred)
    train_acc = accuracy_score(y_train, Voting_train_soft_thr)

    return -train_acc

In [28]:
def param_hyperopt(max_evals, objective=None):
    """
    Run a TPE search over ``params_space`` and return the best parameters.

    :param max_evals: number of objective evaluations
    :param objective: function to minimise; defaults to ``hyperopt_objective``
        (looked up at call time). Passing another objective avoids having to
        redefine this function for each target.
    :return: dict of the best parameters found by ``fmin``
    """
    if objective is None:
        objective = hyperopt_objective
    params_best = fmin(objective,
                       space=params_space,
                       algo=tpe.suggest,
                       max_evals=max_evals,
                       # Fixed generator seed so the search is repeatable
                       rstate=np.random.default_rng(17))
    return params_best

In [29]:
# Run the threshold search with 3000 evaluations and keep the best parameters
params_best = param_hyperopt(max_evals=3000) 

100%|███| 3000/3000 [00:24<00:00, 124.33trial/s, best loss: -0.8271488072699735]


In [30]:
# Best threshold found by the search
params_best['thr']

0.4787137835346912

In [31]:
# Apply the tuned threshold to the averaged test-set probabilities
Voting_test_soft_thr = (((test_prediction1_proba +
                          test_prediction2_proba +
                          test_prediction3_proba) / 3) >= params_best['thr']) * 1

# accuracy_score takes (y_true, y_pred)
test_acc = accuracy_score(y_test, Voting_test_soft_thr)

test_acc

0.7904599659284497

In [32]:
# Hyperparameter search space: threshold sampled uniformly from [0.4, 0.6]
params_space = {
    'thr': hp.uniform('thr', 0.4, 0.6),
}

# Objective function
def hyperopt_objective_test(params):
    """
    Negative test accuracy of thresholded soft voting.

    NOTE(review): this objective is evaluated on the test set, so a threshold
    tuned with it has seen the test labels; the resulting test accuracy is an
    optimistic bound rather than an unbiased estimate.

    :param params: dict with key ``'thr'``, the probability threshold
    :return: ``-accuracy`` on the test set (negated because the search
        minimises the objective)
    """
    thr = params['thr']

    # Average the three base models' predicted probabilities, then threshold
    mean_proba = (test_prediction1_proba +
                  test_prediction2_proba +
                  test_prediction3_proba) / 3
    Voting_test_soft_thr = (mean_proba >= thr) * 1

    # accuracy_score takes (y_true, y_pred)
    test_acc = accuracy_score(y_test, Voting_test_soft_thr)

    return -test_acc

# Optimisation function
def param_hyperopt(max_evals, objective=None):
    """
    Run a TPE search over ``params_space`` and return the best parameters.

    NOTE: this definition shadows the ``param_hyperopt`` defined earlier in the
    notebook. From this cell on, the default objective is
    ``hyperopt_objective_test``, which scores on the test set.

    :param max_evals: number of objective evaluations
    :param objective: function to minimise; defaults to
        ``hyperopt_objective_test`` (looked up at call time)
    :return: dict of the best parameters found by ``fmin``
    """
    if objective is None:
        objective = hyperopt_objective_test
    params_best = fmin(objective,
                       space=params_space,
                       algo=tpe.suggest,
                       max_evals=max_evals,
                       # Fixed generator seed so the search is repeatable
                       rstate=np.random.default_rng(17))
    return params_best

In [33]:
# Re-run the search with 3000 evaluations; this calls whichever param_hyperopt
# was defined most recently and overwrites the earlier params_best.
params_best = param_hyperopt(max_evals=3000) 

100%|███| 3000/3000 [00:23<00:00, 127.22trial/s, best loss: -0.7915956842703009]


In [34]:
# Best parameters found by the search above
params_best

{'thr': 0.4714998631942625}

In [35]:
# Training-set score at the current best threshold
Voting_train_soft_thr = (((train_prediction1_proba +
                           train_prediction2_proba +
                           train_prediction3_proba) / 3) >= params_best['thr']) * 1

# accuracy_score takes (y_true, y_pred)
train_acc = accuracy_score(y_train, Voting_train_soft_thr)

train_acc

0.8254449072321091

In [36]:
from sklearn.model_selection import cross_val_score