In [1]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings
warnings.filterwarnings('ignore')

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin

# 自定义模块
from telcoFunc import *
# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc

In [2]:
# Load the raw Telco customer-churn data
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Declare the role of every column: categorical vs. continuous features
# Categorical features
category_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                'PaymentMethod']

# Continuous features
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
 
# Label
target = 'Churn'

# ID column
ID_col = 'customerID'

# Sanity check: categorical + continuous + (label, ID) must account for every column
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# Continuous-field conversion: a single blank string ' ' is treated as a
# missing TotalCharges value and becomes NaN before the float cast.
tcc['TotalCharges'] = tcc['TotalCharges'].mask(tcc['TotalCharges'] == ' ').astype(float)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Missing-value imputation
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# Encode the label as 1 ('Yes') / 0 ('No').
# The result is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on tcc['Churn']: an inplace method on a
# chained selection operates on a temporary under pandas copy-on-write and
# would leave `tcc` unchanged.
# NOTE: any value other than 'Yes'/'No' becomes NaN here.
tcc['Churn'] = tcc['Churn'].map({'Yes': 1, 'No': 0})

In [31]:
# Read the training and test tables from their CSV files in one pass
train, test = (pd.read_csv(csv_path) for csv_path in ('train_new.csv', 'test_new.csv'))

In [32]:
# Dimensions of the training frame as (n_rows, n_columns)
train.shape

(5282, 259)

In [33]:
# Split each frame positionally: every column except the last is a feature,
# the last column is used as the label.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

In [None]:
# Round 1: coarse grid over the main random-forest hyperparameters
parameter_space = {
    "min_samples_leaf": range(1, 10, 3),       # 1, 4, 7
    # min_samples_split must be an int >= 2 (or a float in (0, 1]); the value 1
    # is rejected at fit time and GridSearchCV scores such candidates as NaN.
    # Starting the range at 4 searches only values that can actually be fitted.
    "min_samples_split": range(4, 10, 3),      # 4, 7
    "max_depth": range(5, 16, 5),              # 5, 10, 15
    "n_estimators": range(10, 160, 70),        # 10, 80, 150
    "max_features": ['sqrt', 'log2', *range(8, 17, 2)],  # plus 8, 10, 12, 14, 16
    "max_samples": [None, 0.4, 0.5, 0.6]}

# Instantiate the model and the grid-search wrapper
RF_0 = RandomForestClassifier(random_state=12)
grid_RF_0 = GridSearchCV(RF_0, parameter_space, n_jobs=-1)

# Fit the grid search on the training data
grid_RF_0.fit(X_train, y_train)

In [40]:
# Mean cross-validated score of the best parameter combination
grid_RF_0.best_score_

0.8084053639517215

In [41]:
# (training-set score, test-set score) of the fitted grid search; no `scoring`
# was passed, so the estimator's default scorer is used
grid_RF_0.score(X_train, y_train), grid_RF_0.score(X_test, y_test)

(0.8492995077622113, 0.7932992617830777)

In [42]:
# Parameter combination that achieved the best cross-validated score
grid_RF_0.best_params_

{'max_depth': 10,
 'max_features': 'log2',
 'max_samples': None,
 'min_samples_leaf': 7,
 'min_samples_split': 4,
 'n_estimators': 150}

In [None]:
# Round 2: a narrower grid for the same random-forest hyperparameters
parameter_space = dict(
    min_samples_leaf=range(6, 9),               # 6, 7, 8
    min_samples_split=range(2, 7, 2),           # 2, 4, 6
    max_depth=range(9, 12),                     # 9, 10, 11
    n_estimators=range(130, 170, 10),           # 130, 140, 150, 160
    max_features=['sqrt', 'log2', *range(7, 9)],  # plus 7, 8
    max_samples=[None, 0.4, 0.5, 0.6],
)

# Base estimator with a fixed seed, wrapped in an exhaustive grid search
RF_0 = RandomForestClassifier(random_state=12)
grid_RF_0 = GridSearchCV(estimator=RF_0, param_grid=parameter_space, n_jobs=-1)

# Run the search on the training data
grid_RF_0.fit(X_train, y_train)

In [44]:
# Mean cross-validated score of the best parameter combination
grid_RF_0.best_score_

0.8093516169261201

In [45]:
# (training-set score, test-set score) of the fitted grid search; no `scoring`
# was passed, so the estimator's default scorer is used
grid_RF_0.score(X_train, y_train), grid_RF_0.score(X_test, y_test)

(0.8536539189700871, 0.7910278250993753)

In [46]:
# Parameter combination that achieved the best cross-validated score
grid_RF_0.best_params_

{'max_depth': 11,
 'max_features': 'log2',
 'max_samples': None,
 'min_samples_leaf': 7,
 'min_samples_split': 2,
 'n_estimators': 150}

In [None]:
# Round 3: fine-grained grid; max_samples is fixed to None
parameter_space = dict(
    min_samples_leaf=range(6, 9),          # 6, 7, 8
    min_samples_split=range(2, 4),         # 2, 3
    max_depth=range(8, 12),                # 8, 9, 10, 11
    n_estimators=range(147, 153),          # 147 .. 152
    max_features=['log2', *range(7, 9)],   # plus 7, 8
    max_samples=[None],
)

# Base estimator with a fixed seed, wrapped in an exhaustive grid search
RF_0 = RandomForestClassifier(random_state=12)
grid_RF_0 = GridSearchCV(estimator=RF_0, param_grid=parameter_space, n_jobs=-1)

# Run the search on the training data
grid_RF_0.fit(X_train, y_train)

In [60]:
# Mean cross-validated score of the best parameter combination
grid_RF_0.best_score_

0.8093516169261201

In [61]:
# (training-set score, test-set score) of the fitted grid search; no `scoring`
# was passed, so the estimator's default scorer is used
grid_RF_0.score(X_train, y_train), grid_RF_0.score(X_test, y_test)

(0.8536539189700871, 0.7910278250993753)

In [62]:
# Parameter combination that achieved the best cross-validated score
grid_RF_0.best_params_

{'max_depth': 11,
 'max_features': 'log2',
 'max_samples': None,
 'min_samples_leaf': 7,
 'min_samples_split': 2,
 'n_estimators': 150}

|Models|CV.best_score_|train_score|test_score|
|:--:|:--:|:--:|:--:|
|Logistic+grid|0.8045|0.8075|0.7956|
|RF+grid_R1|0.8084|0.8493|0.7933|
|RF+grid_R2|0.8094|0.8537|0.7910|
|RF+grid_R3|0.8094|0.8537|0.7910|