# <center>【Kaggle】Telco Customer Churn</center>

In [None]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer

# 实用函数
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin

# 自定义模块
from telcoFunc import *

# re模块相关
import inspect, re

from tqdm import tqdm
import gc

In [None]:
# Load the raw Telco churn data
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Column roles
# Categorical (discrete) features
category_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                'PaymentMethod']

# Continuous features
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Label column
target = 'Churn'

# ID column
ID_col = 'customerID'

# Every column must be accounted for: features + ID column + label column
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# Continuous columns: a single-space entry in TotalCharges is treated as missing
# so that the column can be cast to float
tcc['TotalCharges'] = tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Fill the missing TotalCharges values with 0
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# Encode the label as 1 (Yes) / 0 (No).
# The result is assigned back to the frame: calling replace(..., inplace=True)
# on tcc['Churn'] acts on a column selection and does not update tcc once
# pandas Copy-on-Write is active.
tcc[target] = tcc[target].map({'Yes': 1, 'No': 0})

# Any value other than 'Yes'/'No' would have become NaN above; fail loudly here
assert tcc[target].isin([0, 1]).all(), "unexpected value in the 'Churn' column"

In [None]:
features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc['Churn'].copy()

In [None]:
# Mount Google Drive *before* changing into it: this cell runs ahead of the
# later drive.mount cell, so on a fresh kernel the directory would not exist yet.
# NOTE(review): a second drive.mount call later in the notebook is expected to
# just report that the drive is already mounted — confirm on Colab.
import os
from google.colab import drive

drive.mount('/content/drive')

# Explicit chdir instead of the bare `cd` automagic, which only works in a
# single-line cell with automagic enabled.
os.chdir('/content/drive/MyDrive/feature engineering')
print(os.getcwd())

/content/drive/MyDrive/feature engineering


In [None]:
# 导入特征衍生模块
import features_creation as fc
from features_creation import *

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
fc?

In [None]:
train, test = train_test_split(tcc, random_state=22)

In [None]:
# Feature matrices: every column except the customer ID and the label
X_train, X_test = (part.drop(columns=[ID_col, target]).copy() for part in (train, test))

# Label vectors
y_train, y_test = (part['Churn'].copy() for part in (train, test))

In [None]:
X_train.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
4067,Female,0,Yes,No,68,Yes,No,DSL,Yes,Yes,No,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),79.6,5515.8
3306,Female,0,Yes,Yes,3,Yes,No,Fiber optic,No,Yes,No,Yes,No,No,Month-to-month,Yes,Electronic check,80.0,241.3
3391,Male,0,No,No,4,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.0,73.45
3249,Female,0,Yes,Yes,10,Yes,No,DSL,No,Yes,Yes,No,No,No,Month-to-month,Yes,Mailed check,55.55,551.3
2674,Female,1,No,No,4,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.05,91.45


In [None]:
X_train.shape

(5282, 19)

In [None]:
y_train

4067    0
3306    0
3391    0
3249    0
2674    0
       ..
5478    0
356     0
4908    1
6276    0
2933    0
Name: Churn, Length: 5282, dtype: int64

然后围绕tenure列，进行分训练集和测试集的时序特征衍生：

In [None]:
X_train_seq = pd.DataFrame()
X_test_seq = pd.DataFrame()

In [None]:
# Year feature: (72 - tenure) counts months from the start of the window,
# 12 months per year, with 2014 as the base year
X_train_seq['tenure_year'] = X_train['tenure'].rsub(72).floordiv(12).add(2014)
X_test_seq['tenure_year'] = X_test['tenure'].rsub(72).floordiv(12).add(2014)

In [None]:
X_train_seq.head()

Unnamed: 0,tenure_year
4067,2014
3306,2019
3391,2019
3249,2019
2674,2019


In [None]:
# Month feature: position inside the year, shifted to the range 1..12
X_train_seq['tenure_month'] = X_train['tenure'].rsub(72).mod(12).add(1)
X_test_seq['tenure_month'] = X_test['tenure'].rsub(72).mod(12).add(1)

In [None]:
X_train_seq.head()

Unnamed: 0,tenure_year,tenure_month
4067,2014,5
3306,2019,10
3391,2019,9
3249,2019,3
2674,2019,9


In [None]:
# Quarter feature: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
X_train_seq['tenure_quarter'] = X_train_seq['tenure_month'].sub(1).floordiv(3).add(1)
X_test_seq['tenure_quarter'] = X_test_seq['tenure_month'].sub(1).floordiv(3).add(1)

In [None]:
X_train_seq.head()

Unnamed: 0,tenure_year,tenure_month,tenure_quarter
4067,2014,5,2
3306,2019,10,4
3391,2019,9,3
3249,2019,3,1
2674,2019,9,3


In [None]:
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

OneHotEncoder()

In [None]:
seq_new = list(X_train_seq.columns)

In [None]:
# Rebuild both frames from the fitted encoder's dense output and label the
# one-hot columns with the names returned by cate_colName.
# The new frames carry a default 0..n-1 index (no index is passed here).
X_train_seq, X_test_seq = (
    pd.DataFrame(enc.transform(seq_part).toarray(),
                 columns=cate_colName(enc, seq_new, drop=None))
    for seq_part in (X_train_seq, X_test_seq)
)

X_train_seq.head(5)

Unnamed: 0,tenure_year_2014,tenure_year_2015,tenure_year_2016,tenure_year_2017,tenure_year_2018,tenure_year_2019,tenure_year_2020,tenure_month_1,tenure_month_2,tenure_month_3,...,tenure_month_7,tenure_month_8,tenure_month_9,tenure_month_10,tenure_month_11,tenure_month_12,tenure_quarter_1,tenure_quarter_2,tenure_quarter_3,tenure_quarter_4
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Restore the row labels of the original split on the encoded frames
X_train_seq.index, X_test_seq.index = X_train.index, X_test.index

# Place the training label next to the encoded features, column-wise
df_temp = pd.concat(objs=[X_train_seq, y_train], axis='columns')

In [None]:
df_temp.head()

Unnamed: 0,tenure_year_2014,tenure_year_2015,tenure_year_2016,tenure_year_2017,tenure_year_2018,tenure_year_2019,tenure_year_2020,tenure_month_1,tenure_month_2,tenure_month_3,...,tenure_month_8,tenure_month_9,tenure_month_10,tenure_month_11,tenure_month_12,tenure_quarter_1,tenure_quarter_2,tenure_quarter_3,tenure_quarter_4,Churn
4067,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3306,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3391,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3249,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2674,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [None]:
df_corr = df_temp.corr()['Churn'].sort_values(ascending = False)

In [None]:
df_corr

Churn               1.000000
tenure_year_2019    0.320558
tenure_quarter_4    0.201279
tenure_month_12     0.196918
tenure_month_11     0.048933
tenure_month_10     0.036697
tenure_year_2018    0.025471
tenure_month_9      0.017106
tenure_month_8      0.002575
tenure_quarter_3    0.000518
tenure_month_6     -0.018544
tenure_month_3     -0.019293
tenure_month_7     -0.019341
tenure_year_2020   -0.025913
tenure_month_4     -0.035117
tenure_year_2017   -0.038261
tenure_month_5     -0.047308
tenure_quarter_2   -0.062907
tenure_year_2016   -0.065928
tenure_month_2     -0.082142
tenure_year_2015   -0.100355
tenure_month_1     -0.131205
tenure_quarter_1   -0.155000
tenure_year_2014   -0.227663
Name: Churn, dtype: float64

In [None]:
def _as_feature_frame(new_features, index, default_name='features_new'):
    """
    Return derived features as a new DataFrame whose index is ``index``.

    A new object is always built, so replacing the index never touches the
    object owned by the caller.

    :param new_features: 1-D/2-D np.ndarray, pd.Series or pd.DataFrame
    :param index: index to put on the returned frame
    :param default_name: column name (or prefix, for 2-D arrays) used when the
                         input carries no name of its own
    :return: pd.DataFrame
    :raises TypeError: if ``new_features`` is none of the supported types
    """
    if isinstance(new_features, np.ndarray):
        if new_features.ndim == 1:
            frame = pd.DataFrame({default_name: new_features})
        else:
            generated = [f'{default_name}_{i}' for i in range(new_features.shape[1])]
            frame = pd.DataFrame(new_features, columns=generated)
    elif isinstance(new_features, pd.Series):
        col = default_name if new_features.name is None else new_features.name
        frame = new_features.to_frame(name=col)
    elif isinstance(new_features, pd.DataFrame):
        frame = new_features.copy()
    else:
        raise TypeError('derived features must be an ndarray, Series or DataFrame, '
                        f'got {type(new_features).__name__}')

    frame.index = index
    return frame


def features_test(features_train_new,
                  features_test_new,
                  X_train,
                  X_test,
                  y_train,
                  y_test,
                  category_cols,
                  numeric_cols,
                  n_jobs=12):
    """
    Evaluate derived features by appending them to the original features and
    running a cross-validated grid search over a logistic-regression pipeline.

    :param features_train_new: derived features for the training set
                               (np.ndarray, pd.Series or pd.DataFrame)
    :param features_test_new: derived features for the test set (same kinds)
    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels (accepted for interface compatibility; not used)
    :param category_cols: names of the categorical columns in X_train
    :param numeric_cols: names of the continuous columns in X_train
    :param n_jobs: number of parallel jobs handed to GridSearchCV
    :return: tuple ``(best_score_, best_params_)`` of the fitted grid search,
             scored with accuracy
    """

    # Data preparation
    # Normalise both inputs to DataFrames aligned with X_train / X_test.
    # ndarray inputs get the same default column name on both sides, and
    # Series inputs keep their own name, so every derived column is addressable.
    features_train_new = _as_feature_frame(features_train_new, X_train.index)
    features_test_new = _as_feature_frame(features_test_new, X_test.index)

    # Work on copies of the column-name lists so the caller's lists stay unchanged
    category_cols = list(category_cols)
    numeric_cols = list(numeric_cols)

    # Append the derived features to the original ones.
    # The test matrix is assembled as well, but it is not scored in this function.
    X_train = pd.concat([X_train, features_train_new], axis=1)
    X_test = pd.concat([X_test, features_test_new], axis=1)

    # A derived column with at least 15 distinct training values is treated as
    # continuous, otherwise as categorical
    for col in features_train_new.columns:
        if features_train_new[col].nunique() >= 15:
            numeric_cols.append(col)
        else:
            category_cols.append(col)

    # Every column of the combined training matrix must have a role
    assert len(category_cols) + len(numeric_cols) == X_train.shape[1]

    # Training
    # Preprocessing: one-hot encode categorical columns, pass numeric ones through
    logistic_pre = ColumnTransformer([
        ('cat', preprocessing.OneHotEncoder(drop='if_binary'), category_cols),
        ('num', 'passthrough', numeric_cols)
    ])

    # Alternatives for the 'num' step that the grid search chooses between
    num_pre = ['passthrough', preprocessing.StandardScaler(), preprocessing.KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')]

    # NOTE(review): logit_threshold is not defined in this notebook; presumably
    # it is the custom logistic-regression estimator pulled in by a star import — confirm.
    logistic_model = logit_threshold(max_iter=int(1e8))

    # Full pipeline: preprocessing followed by the estimator
    logistic_pipe = make_pipeline(logistic_pre, logistic_model)

    # Hyper-parameter space: C in 0.1..1.0, l1 with saga, l2 with four solvers
    c_grid = np.arange(0.1, 1.1, 0.1).tolist()
    logistic_param = [
        {'columntransformer__num': num_pre, 'logit_threshold__penalty': ['l1'], 'logit_threshold__C': c_grid, 'logit_threshold__solver': ['saga']},
        {'columntransformer__num': num_pre, 'logit_threshold__penalty': ['l2'], 'logit_threshold__C': c_grid, 'logit_threshold__solver': ['lbfgs', 'newton-cg', 'sag', 'saga']},
    ]

    # Grid search scored by accuracy
    logistic_search = GridSearchCV(estimator=logistic_pipe,
                                   param_grid=logistic_param,
                                   scoring='accuracy',
                                   n_jobs=n_jobs)

    # Fit and report the elapsed wall-clock time
    s = time.time()
    logistic_search.fit(X_train, y_train)
    print(time.time()-s, "s")

    # Best score and the parameters that achieved it
    return (logistic_search.best_score_, logistic_search.best_params_)

In [None]:
# Rank columns by |correlation with Churn| and keep positions 1-2;
# position 0 is expected to be 'Churn' itself (correlation 1 with itself)
new_col = df_corr.abs().sort_values(ascending=False).index[1:3].tolist()
print(new_col)

train_new_temp = X_train_seq.loc[:, new_col]
test_new_temp = X_test_seq.loc[:, new_col]

features_test(features_train_new=train_new_temp,
              features_test_new=test_new_temp,
              X_train=X_train,
              X_test=X_test,
              y_train=y_train,
              y_test=y_test,
              category_cols=category_cols,
              numeric_cols=numeric_cols)

['tenure_year_2019', 'tenure_year_2014']
46.666375398635864 s


(0.8097323757919785,
 {'columntransformer__num': StandardScaler(),
  'logit_threshold__C': 0.2,
  'logit_threshold__penalty': 'l1',
  'logit_threshold__solver': 'saga'})

In [None]:
numeric_cols

['tenure', 'MonthlyCharges', 'TotalCharges']

In [None]:
colNames = ['MonthlyCharges', 'TotalCharges']
colNames

['MonthlyCharges', 'TotalCharges']

In [None]:
Polynomial_Features?

[1;31mSignature:[0m [0mPolynomial_Features[0m[1;33m([0m[0mcolNames[0m[1;33m,[0m [0mdegree[0m[1;33m,[0m [0mX_train[0m[1;33m,[0m [0mX_test[0m[1;33m,[0m [0mmulti[0m[1;33m=[0m[1;32mFalse[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
多项式特征衍生函数

:param colNames: 参与交叉衍生的列名称
:param degree: 多项式最高阶
:param X_train: 训练集特征
:param X_test: 测试集特征
:param multi: 是否进行多变量多项式组衍生

:return：多项式衍生后的新特征和新列名称
[1;31mFile:[0m      d:\work\jupyter\telco\正式课程\features_creation.py
[1;31mType:[0m      function


In [None]:
X_train_ply, X_test_ply, colNames_train_new, colNames_test_new = Polynomial_Features(colNames=colNames,
                                                                                     degree=3,
                                                                                     X_train=X_train,
                                                                                     X_test=X_test)

In [None]:
X_train_ply

Unnamed: 0,MonthlyCharges**2*TotalCharges**0,MonthlyCharges**1*TotalCharges**1,MonthlyCharges**0*TotalCharges**2,MonthlyCharges**3*TotalCharges**0,MonthlyCharges**2*TotalCharges**1,MonthlyCharges**1*TotalCharges**2,MonthlyCharges**0*TotalCharges**3
0,6336.1600,439057.6800,3.042405e+07,5.043583e+05,3.494899e+07,2.421754e+09,1.678130e+11
1,6400.0000,19304.0000,5.822569e+04,5.120000e+05,1.544320e+06,4.658055e+06,1.404986e+07
2,361.0000,1395.5500,5.394903e+03,6.859000e+03,2.651545e+04,1.025031e+05,3.962556e+05
3,3085.8025,30624.7150,3.039317e+05,1.714163e+05,1.701203e+06,1.688341e+07,1.675575e+08
4,402.0025,1833.5725,8.363103e+03,8.060150e+03,3.676313e+04,1.676802e+05,7.648057e+05
...,...,...,...,...,...,...,...
5277,11299.6900,583268.1000,3.010717e+07,1.201157e+06,6.200140e+07,3.200392e+09,1.651980e+11
5278,2926.8100,48094.9000,7.903210e+05,1.583404e+05,2.601934e+06,4.275637e+07,7.025954e+08
5279,11267.8225,334637.8750,9.938256e+06,1.196079e+06,3.552181e+07,1.054946e+09,3.133035e+10
5280,414.1225,6836.5825,1.128624e+05,8.427393e+03,1.391245e+05,2.296750e+06,3.791612e+07


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
scaler.fit(X_train_ply)

StandardScaler()

In [None]:
scaler.transform(X_train_ply)

array([[ 0.33464874,  1.03533026,  1.24455267, ...,  0.68739317,
         0.85860063,  0.94153185],
       [ 0.35164345, -0.72233392, -0.63923349, ..., -0.6317181 ,
        -0.56737633, -0.50565539],
       [-1.25598529, -0.79732324, -0.64251092, ..., -0.69165442,
        -0.5700639 , -0.50577314],
       ...,
       [ 1.64749563,  0.59808585, -0.02631204, ...,  0.71001313,
         0.05224576, -0.23556692],
       [-1.24184366, -0.77453962, -0.63584403, ..., -0.68720762,
        -0.5687694 , -0.50544955],
       [-1.19875062, -0.63590995, -0.47101134, ..., -0.65484604,
        -0.53090569, -0.46601802]])

In [None]:
# Standardise the polynomial training features with the fitted scaler and
# carry over X_train's row labels in the same step
X_train_ply = pd.DataFrame(
    data=scaler.transform(X_train_ply),
    index=X_train.index,
    columns=colNames_train_new,
)
X_train_ply.head()

Unnamed: 0,MonthlyCharges**2*TotalCharges**0,MonthlyCharges**1*TotalCharges**1,MonthlyCharges**0*TotalCharges**2,MonthlyCharges**3*TotalCharges**0,MonthlyCharges**2*TotalCharges**1,MonthlyCharges**1*TotalCharges**2,MonthlyCharges**0*TotalCharges**3
4067,0.334649,1.03533,1.244553,0.15919,0.687393,0.858601,0.941532
3306,0.351643,-0.722334,-0.639233,0.177696,-0.631718,-0.567376,-0.505655
3391,-1.255985,-0.797323,-0.642511,-1.045667,-0.691654,-0.570064,-0.505773
3249,-0.530622,-0.67493,-0.623991,-0.647138,-0.625523,-0.560164,-0.504331
2674,-1.24507,-0.795489,-0.642327,-1.042758,-0.69125,-0.570025,-0.50577


In [None]:
# Apply the scaler fitted on the training set to the polynomial test features
# and carry over X_test's row labels in the same step
X_test_ply = pd.DataFrame(
    data=scaler.transform(X_test_ply),
    index=X_test.index,
    columns=colNames_test_new,
)
X_test_ply.head()

Unnamed: 0,MonthlyCharges**2*TotalCharges**0,MonthlyCharges**1*TotalCharges**1,MonthlyCharges**0*TotalCharges**2,MonthlyCharges**3*TotalCharges**0,MonthlyCharges**2*TotalCharges**1,MonthlyCharges**1*TotalCharges**2,MonthlyCharges**0*TotalCharges**3
6187,-1.244,-0.789203,-0.641146,-1.042464,-0.690048,-0.569799,-0.505737
6448,1.563321,1.922314,1.756934,1.713345,1.997069,1.818142,1.569246
5492,-1.181689,-0.617925,-0.453174,-1.023058,-0.648504,-0.52449,-0.459669
2028,-1.24982,-0.775952,-0.636024,-1.044043,-0.687671,-0.568853,-0.505462
5376,-1.201928,-0.76095,-0.631667,-1.029834,-0.683246,-0.5676,-0.505117


In [None]:
# 然后进行数据集拼接
df_temp = pd.concat([X_train_ply, y_train], axis=1)

df_temp.head()

Unnamed: 0,MonthlyCharges**2*TotalCharges**0,MonthlyCharges**1*TotalCharges**1,MonthlyCharges**0*TotalCharges**2,MonthlyCharges**3*TotalCharges**0,MonthlyCharges**2*TotalCharges**1,MonthlyCharges**1*TotalCharges**2,MonthlyCharges**0*TotalCharges**3,Churn
4067,0.334649,1.03533,1.244553,0.15919,0.687393,0.858601,0.941532,0
3306,0.351643,-0.722334,-0.639233,0.177696,-0.631718,-0.567376,-0.505655,0
3391,-1.255985,-0.797323,-0.642511,-1.045667,-0.691654,-0.570064,-0.505773,0
3249,-0.530622,-0.67493,-0.623991,-0.647138,-0.625523,-0.560164,-0.504331,0
2674,-1.24507,-0.795489,-0.642327,-1.042758,-0.69125,-0.570025,-0.50577,0


In [None]:
df_corr = df_temp.corr()['Churn'].sort_values(ascending = False)

In [None]:
df_corr

Churn                                1.000000
MonthlyCharges**2*TotalCharges**0    0.151934
MonthlyCharges**3*TotalCharges**0    0.120895
MonthlyCharges**2*TotalCharges**1   -0.121555
MonthlyCharges**1*TotalCharges**1   -0.148175
MonthlyCharges**1*TotalCharges**2   -0.152267
MonthlyCharges**0*TotalCharges**3   -0.158389
MonthlyCharges**0*TotalCharges**2   -0.178061
Name: Churn, dtype: float64

In [None]:
# Rank columns by |correlation with Churn| and keep positions 1-2;
# position 0 is expected to be 'Churn' itself (correlation 1 with itself)
new_col = df_corr.abs().sort_values(ascending=False).index[1:3].tolist()
print(new_col)

train_new_temp = X_train_ply.loc[:, new_col]
test_new_temp = X_test_ply.loc[:, new_col]

features_test(features_train_new=train_new_temp,
              features_test_new=test_new_temp,
              X_train=X_train,
              X_test=X_test,
              y_train=y_train,
              y_test=y_test,
              category_cols=category_cols,
              numeric_cols=numeric_cols)

['MonthlyCharges**0*TotalCharges**2', 'MonthlyCharges**0*TotalCharges**3']
46.75545644760132 s


(0.810678628766377,
 {'columntransformer__num': 'passthrough',
  'logit_threshold__C': 0.1,
  'logit_threshold__penalty': 'l2',
  'logit_threshold__solver': 'newton-cg'})

In [None]:
# 查看分类变量
category_cols

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [None]:
# Show the distinct levels of every categorical column
for feature in category_cols:
    print(f'{feature}: {tcc[feature].unique()}')

gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']


In [None]:
Cross_Combination?

[1;31mSignature:[0m [0mCross_Combination[0m[1;33m([0m[0mcolNames[0m[1;33m,[0m [0mX_train[0m[1;33m,[0m [0mX_test[0m[1;33m,[0m [0mmulti[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0mOneHot[0m[1;33m=[0m[1;32mTrue[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
交叉组合特征衍生函数

:param colNames: 参与交叉衍生的列名称
:param X_train: 训练集特征
:param X_test: 测试集特征
:param multi: 是否进行多变量交叉组合
:param OneHot: 是否进行独热编码

:return：交叉衍生后的新特征和特征名称
[1;31mFile:[0m      d:\work\jupyter\telco\正式课程\features_creation.py
[1;31mType:[0m      function


In [None]:
features_train_new, features_test_new, colNames_train_new, colNames_test_new = Cross_Combination(category_cols, X_train, X_test)

In [None]:
features_train_new.head()

Unnamed: 0,gender&SeniorCitizen_Female&0,gender&SeniorCitizen_Female&1,gender&SeniorCitizen_Male&0,gender&SeniorCitizen_Male&1,gender&Partner_Female&No,gender&Partner_Female&Yes,gender&Partner_Male&No,gender&Partner_Male&Yes,gender&Dependents_Female&No,gender&Dependents_Female&Yes,...,Contract&PaymentMethod_Two year&Electronic check,Contract&PaymentMethod_Two year&Mailed check,PaperlessBilling&PaymentMethod_No&Bank transfer (automatic),PaperlessBilling&PaymentMethod_No&Credit card (automatic),PaperlessBilling&PaymentMethod_No&Electronic check,PaperlessBilling&PaymentMethod_No&Mailed check,PaperlessBilling&PaymentMethod_Yes&Bank transfer (automatic),PaperlessBilling&PaymentMethod_Yes&Credit card (automatic),PaperlessBilling&PaymentMethod_Yes&Electronic check,PaperlessBilling&PaymentMethod_Yes&Mailed check
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
features_train_new.index

RangeIndex(start=0, stop=5282, step=1)

In [None]:
X_train.index

Int64Index([4067, 3306, 3391, 3249, 2674, 2757, 5237, 5856, 3069, 6631,
            ...
            6628, 2527, 2952, 4587, 6646, 5478,  356, 4908, 6276, 2933],
           dtype='int64', length=5282)

In [None]:
features_train_new.index = X_train.index

In [None]:
df_temp = pd.concat([features_train_new, y_train], axis=1)

In [None]:
df_temp.head()

Unnamed: 0,gender&SeniorCitizen_Female&0,gender&SeniorCitizen_Female&1,gender&SeniorCitizen_Male&0,gender&SeniorCitizen_Male&1,gender&Partner_Female&No,gender&Partner_Female&Yes,gender&Partner_Male&No,gender&Partner_Male&Yes,gender&Dependents_Female&No,gender&Dependents_Female&Yes,...,Contract&PaymentMethod_Two year&Mailed check,PaperlessBilling&PaymentMethod_No&Bank transfer (automatic),PaperlessBilling&PaymentMethod_No&Credit card (automatic),PaperlessBilling&PaymentMethod_No&Electronic check,PaperlessBilling&PaymentMethod_No&Mailed check,PaperlessBilling&PaymentMethod_Yes&Bank transfer (automatic),PaperlessBilling&PaymentMethod_Yes&Credit card (automatic),PaperlessBilling&PaymentMethod_Yes&Electronic check,PaperlessBilling&PaymentMethod_Yes&Mailed check,Churn
4067,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3306,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3391,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
3249,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2674,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0


In [None]:
df_corr = df_temp.corr()['Churn'].sort_values(ascending = False)

In [None]:
df_corr

Churn                                                                 1.000000
OnlineSecurity&Contract_No&Month-to-month                             0.441598
TechSupport&Contract_No&Month-to-month                                0.430260
InternetService&Contract_Fiber optic&Month-to-month                   0.415189
OnlineSecurity&TechSupport_No&No                                      0.398994
                                                                        ...   
PhoneService&InternetService_Yes&No                                  -0.230200
OnlineSecurity&TechSupport_No internet service&No internet service   -0.230200
Partner&Contract_Yes&Two year                                        -0.248640
PhoneService&Contract_Yes&Two year                                   -0.283557
SeniorCitizen&Contract_0&Two year                                    -0.289069
Name: Churn, Length: 762, dtype: float64

In [None]:
np.abs(df_corr).sort_values(ascending = False)[1: 11]

OnlineSecurity&Contract_No&Month-to-month                 0.441598
TechSupport&Contract_No&Month-to-month                    0.430260
InternetService&Contract_Fiber optic&Month-to-month       0.415189
OnlineSecurity&TechSupport_No&No                          0.398994
OnlineBackup&Contract_No&Month-to-month                   0.381821
Contract&PaperlessBilling_Month-to-month&Yes              0.373529
Contract&PaymentMethod_Month-to-month&Electronic check    0.373363
Dependents&Contract_No&Month-to-month                     0.364952
InternetService&OnlineSecurity_Fiber optic&No             0.363039
OnlineSecurity&PaymentMethod_No&Electronic check          0.359524
Name: Churn, dtype: float64

In [None]:
new_col = list(np.abs(df_corr).sort_values(ascending = False)[1: 4].index)
new_col

['OnlineSecurity&Contract_No&Month-to-month',
 'TechSupport&Contract_No&Month-to-month',
 'InternetService&Contract_Fiber optic&Month-to-month']

In [None]:
train_new_temp = features_train_new[new_col]
test_new_temp = features_test_new[new_col]

In [None]:
train_new_temp.head()

Unnamed: 0,OnlineSecurity&Contract_No&Month-to-month,TechSupport&Contract_No&Month-to-month,InternetService&Contract_Fiber optic&Month-to-month
4067,0.0,0.0,0.0
3306,1.0,0.0,1.0
3391,0.0,0.0,0.0
3249,1.0,1.0,0.0
2674,0.0,0.0,0.0


In [None]:
features_test(train_new_temp,
              test_new_temp,
              X_train,
              X_test,
              y_train,
              y_test,
              category_cols,
              numeric_cols)

44.58748745918274 s


(0.8097323757919785,
 {'columntransformer__num': 'passthrough',
  'logit_threshold__C': 0.1,
  'logit_threshold__penalty': 'l2',
  'logit_threshold__solver': 'lbfgs'})

In [None]:
colNames = ['OnlineSecurity', 'Contract', 'TechSupport']
colNames

['OnlineSecurity', 'Contract', 'TechSupport']

尝试对其进行交叉组合特征衍生：

In [None]:
features_train_new, features_test_new, colNames_train_new, colNames_test_new = Cross_Combination(colNames,
                                                                                                 X_train,
                                                                                                 X_test,
                                                                                                 multi=True)

In [None]:
features_train_new.head()

Unnamed: 0,OnlineSecurity&Contract&TechSupport_No internet service&Month-to-month&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&One year&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&Two year&No internet service,OnlineSecurity&Contract&TechSupport_No&Month-to-month&No,OnlineSecurity&Contract&TechSupport_No&Month-to-month&Yes,OnlineSecurity&Contract&TechSupport_No&One year&No,OnlineSecurity&Contract&TechSupport_No&One year&Yes,OnlineSecurity&Contract&TechSupport_No&Two year&No,OnlineSecurity&Contract&TechSupport_No&Two year&Yes,OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&No,OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&Yes,OnlineSecurity&Contract&TechSupport_Yes&One year&No,OnlineSecurity&Contract&TechSupport_Yes&One year&Yes,OnlineSecurity&Contract&TechSupport_Yes&Two year&No,OnlineSecurity&Contract&TechSupport_Yes&Two year&Yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
features_train_new.shape

(5282, 15)

In [None]:
# Share of zeros per derived column. The mean of the boolean mask divides by the
# frame's actual row count, so no hard-coded sample size is needed.
features_train_new.eq(0).mean()

OnlineSecurity&Contract&TechSupport_No internet service&Month-to-month&No internet service    0.926543
OnlineSecurity&Contract&TechSupport_No internet service&One year&No internet service          0.948315
OnlineSecurity&Contract&TechSupport_No internet service&Two year&No internet service          0.912533
OnlineSecurity&Contract&TechSupport_No&Month-to-month&No                                      0.689133
OnlineSecurity&Contract&TechSupport_No&Month-to-month&Yes                                     0.931655
OnlineSecurity&Contract&TechSupport_No&One year&No                                            0.958728
OnlineSecurity&Contract&TechSupport_No&One year&Yes                                           0.965165
OnlineSecurity&Contract&TechSupport_No&Two year&No                                            0.987694
OnlineSecurity&Contract&TechSupport_No&Two year&Yes                                           0.967815
OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&No                

In [None]:
features_train_new.index = X_train.index

In [None]:
df_temp = pd.concat([features_train_new, y_train], axis=1)

In [None]:
df_temp.head()

Unnamed: 0,OnlineSecurity&Contract&TechSupport_No internet service&Month-to-month&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&One year&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&Two year&No internet service,OnlineSecurity&Contract&TechSupport_No&Month-to-month&No,OnlineSecurity&Contract&TechSupport_No&Month-to-month&Yes,OnlineSecurity&Contract&TechSupport_No&One year&No,OnlineSecurity&Contract&TechSupport_No&One year&Yes,OnlineSecurity&Contract&TechSupport_No&Two year&No,OnlineSecurity&Contract&TechSupport_No&Two year&Yes,OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&No,OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&Yes,OnlineSecurity&Contract&TechSupport_Yes&One year&No,OnlineSecurity&Contract&TechSupport_Yes&One year&Yes,OnlineSecurity&Contract&TechSupport_Yes&Two year&No,OnlineSecurity&Contract&TechSupport_Yes&Two year&Yes,Churn
4067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3306,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3391,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3249,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2674,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
df_corr = df_temp.corr()['Churn'].sort_values(ascending = False)

In [None]:
df_corr

Churn                                                                                         1.000000
OnlineSecurity&Contract&TechSupport_No&Month-to-month&No                                      0.440002
OnlineSecurity&Contract&TechSupport_No&Month-to-month&Yes                                     0.042028
OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&No                                     0.021370
OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&Yes                                   -0.029948
OnlineSecurity&Contract&TechSupport_No&One year&No                                           -0.034648
OnlineSecurity&Contract&TechSupport_No&Two year&No                                           -0.042966
OnlineSecurity&Contract&TechSupport_No&One year&Yes                                          -0.044896
OnlineSecurity&Contract&TechSupport_No internet service&Month-to-month&No internet service   -0.060186
OnlineSecurity&Contract&TechSupport_Yes&One year&Yes                     

In [None]:
new_col = list(np.abs(df_corr).sort_values(ascending = False)[1: 2].index)
new_col

['OnlineSecurity&Contract&TechSupport_No&Month-to-month&No']

In [None]:
train_new_temp = features_train_new[new_col]
test_new_temp = features_test_new[new_col]

In [None]:
train_new_temp.head()

Unnamed: 0,OnlineSecurity&Contract&TechSupport_No&Month-to-month&No
4067,0.0
3306,0.0
3391,0.0
3249,1.0
2674,0.0


In [None]:
features_test(train_new_temp,
              test_new_temp,
              X_train,
              X_test,
              y_train,
              y_test,
              category_cols,
              numeric_cols)

42.55664372444153 s


(0.8082181201800406,
 {'columntransformer__num': 'passthrough',
  'logit_threshold__C': 0.6,
  'logit_threshold__penalty': 'l2',
  'logit_threshold__solver': 'lbfgs'})

In [None]:
# Rank columns by |correlation with Churn| and keep positions 1-2;
# position 0 is expected to be 'Churn' itself (correlation 1 with itself)
new_col = df_corr.abs().sort_values(ascending=False).index[1:3].tolist()
print(new_col)

train_new_temp = features_train_new.loc[:, new_col]
test_new_temp = features_test_new.loc[:, new_col]

features_test(features_train_new=train_new_temp,
              features_test_new=test_new_temp,
              X_train=X_train,
              X_test=X_test,
              y_train=y_train,
              y_test=y_test,
              category_cols=category_cols,
              numeric_cols=numeric_cols)

['OnlineSecurity&Contract&TechSupport_No&Month-to-month&No', 'OnlineSecurity&Contract&TechSupport_No internet service&Two year&No internet service']
43.986942291259766 s


(0.8097325549726213,
 {'columntransformer__num': 'passthrough',
  'logit_threshold__C': 0.4,
  'logit_threshold__penalty': 'l2',
  'logit_threshold__solver': 'lbfgs'})

In [None]:
# Preview the tenure-derived year/month/quarter columns for the first training rows
X_train_seq.head()

Unnamed: 0,tenure_year_2014,tenure_year_2015,tenure_year_2016,tenure_year_2017,tenure_year_2018,tenure_year_2019,tenure_year_2020,tenure_month_1,tenure_month_2,tenure_month_3,...,tenure_month_7,tenure_month_8,tenure_month_9,tenure_month_10,tenure_month_11,tenure_month_12,tenure_quarter_1,tenure_quarter_2,tenure_quarter_3,tenure_quarter_4
4067,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3306,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3391,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3249,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2674,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Give the tenure-derived frames the same row labels as the original splits
X_train_seq.index, X_test_seq.index = X_train.index, X_test.index

# Put the categorical columns and the tenure-derived columns side by side
train_temp = pd.concat((X_train[category_cols], X_train_seq), axis=1)
test_temp = pd.concat((X_test[category_cols], X_test_seq), axis=1)

In [None]:
# Preview the combined categorical + tenure-derived training frame
train_temp.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,tenure_month_7,tenure_month_8,tenure_month_9,tenure_month_10,tenure_month_11,tenure_month_12,tenure_quarter_1,tenure_quarter_2,tenure_quarter_3,tenure_quarter_4
4067,Female,0,Yes,No,Yes,No,DSL,Yes,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3306,Female,0,Yes,Yes,Yes,No,Fiber optic,No,Yes,No,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3391,Male,0,No,No,Yes,No,No,No internet service,No internet service,No internet service,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3249,Female,0,Yes,Yes,Yes,No,DSL,No,Yes,Yes,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2674,Female,1,No,No,Yes,No,No,No internet service,No internet service,No internet service,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Compare the (rows, columns) of the combined train and test frames
train_temp.shape, test_temp.shape

((5282, 39), (1761, 39))

In [None]:
# List every column name of the combined training frame
list(train_temp.columns)

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'tenure_year_2014',
 'tenure_year_2015',
 'tenure_year_2016',
 'tenure_year_2017',
 'tenure_year_2018',
 'tenure_year_2019',
 'tenure_year_2020',
 'tenure_month_1',
 'tenure_month_2',
 'tenure_month_3',
 'tenure_month_4',
 'tenure_month_5',
 'tenure_month_6',
 'tenure_month_7',
 'tenure_month_8',
 'tenure_month_9',
 'tenure_month_10',
 'tenure_month_11',
 'tenure_month_12',
 'tenure_quarter_1',
 'tenure_quarter_2',
 'tenure_quarter_3',
 'tenure_quarter_4']

In [None]:
# Number of columns in the combined training frame
len(train_temp.columns)

39

In [None]:
# Build cross-combination features over all columns of the combined frames.
# NOTE(review): Cross_Combination is not defined in this section (presumably from telcoFunc);
# here it is unpacked into train features, test features, and the two lists of derived column names.
features_train_new, features_test_new, colNames_train_new, colNames_test_new = Cross_Combination(list(train_temp.columns),
                                                                                                 train_temp,
                                                                                                 test_temp)

In [None]:
# Preview the derived cross-combination features for the first training rows
features_train_new.head()

Unnamed: 0,gender&SeniorCitizen_Female&0,gender&SeniorCitizen_Female&1,gender&SeniorCitizen_Male&0,gender&SeniorCitizen_Male&1,gender&Partner_Female&No,gender&Partner_Female&Yes,gender&Partner_Male&No,gender&Partner_Male&Yes,gender&Dependents_Female&No,gender&Dependents_Female&Yes,...,tenure_quarter_1&tenure_quarter_4_1.0&0.0,tenure_quarter_2&tenure_quarter_3_0.0&0.0,tenure_quarter_2&tenure_quarter_3_0.0&1.0,tenure_quarter_2&tenure_quarter_3_1.0&0.0,tenure_quarter_2&tenure_quarter_4_0.0&0.0,tenure_quarter_2&tenure_quarter_4_0.0&1.0,tenure_quarter_2&tenure_quarter_4_1.0&0.0,tenure_quarter_3&tenure_quarter_4_0.0&0.0,tenure_quarter_3&tenure_quarter_4_0.0&1.0,tenure_quarter_3&tenure_quarter_4_1.0&0.0
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Re-label the derived training features with the row index of X_train
# NOTE(review): only the training frame is re-indexed here; features_test_new keeps its own index — confirm that is intended
features_train_new.index = X_train.index

# Attach the label to the derived features (column-wise)
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Preview the combined frame
df_temp.head()

Unnamed: 0,gender&SeniorCitizen_Female&0,gender&SeniorCitizen_Female&1,gender&SeniorCitizen_Male&0,gender&SeniorCitizen_Male&1,gender&Partner_Female&No,gender&Partner_Female&Yes,gender&Partner_Male&No,gender&Partner_Male&Yes,gender&Dependents_Female&No,gender&Dependents_Female&Yes,...,tenure_quarter_2&tenure_quarter_3_0.0&0.0,tenure_quarter_2&tenure_quarter_3_0.0&1.0,tenure_quarter_2&tenure_quarter_3_1.0&0.0,tenure_quarter_2&tenure_quarter_4_0.0&0.0,tenure_quarter_2&tenure_quarter_4_0.0&1.0,tenure_quarter_2&tenure_quarter_4_1.0&0.0,tenure_quarter_3&tenure_quarter_4_0.0&0.0,tenure_quarter_3&tenure_quarter_4_0.0&1.0,tenure_quarter_3&tenure_quarter_4_1.0&0.0,Churn
4067,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
3306,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3391,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
3249,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
2674,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0


In [None]:
# Correlation of every column with the label, sorted from most positive to most negative.
# corrwith computes one coefficient per column, avoiding the full column-by-column
# matrix that df_temp.corr() would build just to read off its 'Churn' column
# (expensive with thousands of derived columns).
df_corr = (
    df_temp.corrwith(df_temp['Churn'])
           .rename('Churn')  # keep the Series name the original expression produced
           .sort_values(ascending=False)
)
df_corr

Churn                                                  1.000000
OnlineSecurity&Contract_No&Month-to-month              0.441598
TechSupport&Contract_No&Month-to-month                 0.430260
InternetService&Contract_Fiber optic&Month-to-month    0.415189
Contract&tenure_year_2014_Month-to-month&0.0           0.402879
                                                         ...   
Contract&tenure_year_2020_Two year&0.0                -0.301604
tenure_year_2019&tenure_month_12_0.0&0.0              -0.303862
tenure_year_2018&tenure_year_2019_0.0&0.0             -0.315952
tenure_year_2019&tenure_year_2020_0.0&0.0             -0.317597
SeniorCitizen&tenure_year_2019_0&0.0                  -0.319902
Name: Churn, Length: 3590, dtype: float64

In [None]:
# Show the 20 largest absolute correlations (the label's own entry is included, as the output shows)
np.abs(df_corr).sort_values(ascending = False)[: 20]

Churn                                                  1.000000
OnlineSecurity&Contract_No&Month-to-month              0.441598
TechSupport&Contract_No&Month-to-month                 0.430260
InternetService&Contract_Fiber optic&Month-to-month    0.415189
Contract&tenure_year_2014_Month-to-month&0.0           0.402879
Contract&tenure_year_2020_Month-to-month&0.0           0.400606
OnlineSecurity&tenure_year_2019_No&1.0                 0.399682
OnlineSecurity&TechSupport_No&No                       0.398994
Contract&tenure_year_2015_Month-to-month&0.0           0.398467
Contract&tenure_month_2_Month-to-month&0.0             0.395384
Contract&tenure_month_1_Month-to-month&0.0             0.392526
OnlineSecurity&tenure_year_2014_No&0.0                 0.391337
Contract&tenure_month_5_Month-to-month&0.0             0.391318
InternetService&tenure_year_2019_Fiber optic&1.0       0.391223
TechSupport&tenure_year_2019_No&1.0                    0.390156
Contract&tenure_month_4_Month-to-month&0

In [None]:
# Rank entries by absolute correlation and keep positions 1-3;
# position 0 is skipped because it is the label's own entry.
abs_corr_ranked = np.abs(df_corr).sort_values(ascending=False)
new_col = abs_corr_ranked.index[1:4].tolist()
new_col

['OnlineSecurity&Contract_No&Month-to-month',
 'TechSupport&Contract_No&Month-to-month',
 'InternetService&Contract_Fiber optic&Month-to-month']

In [None]:
# Restrict both derived-feature frames to the selected columns
train_new_temp, test_new_temp = features_train_new.loc[:, new_col], features_test_new.loc[:, new_col]

In [None]:
# Evaluate the three selected pairwise-cross features together with the original splits.
# NOTE(review): features_test is not defined in this section (presumably provided by telcoFunc);
# judging by the cell output it prints the elapsed time and returns a score plus the best parameters.
features_test(train_new_temp,
              test_new_temp,
              X_train,
              X_test,
              y_train,
              y_test,
              category_cols,
              numeric_cols)

44.43435311317444 s


(0.8097323757919785,
 {'columntransformer__num': 'passthrough',
  'logit_threshold__C': 0.1,
  'logit_threshold__penalty': 'l2',
  'logit_threshold__solver': 'lbfgs'})

In [None]:
# Preview the tenure-derived year/month/quarter columns again before picking a subset
X_train_seq.head()

Unnamed: 0,tenure_year_2014,tenure_year_2015,tenure_year_2016,tenure_year_2017,tenure_year_2018,tenure_year_2019,tenure_year_2020,tenure_month_1,tenure_month_2,tenure_month_3,...,tenure_month_7,tenure_month_8,tenure_month_9,tenure_month_10,tenure_month_11,tenure_month_12,tenure_quarter_1,tenure_quarter_2,tenure_quarter_3,tenure_quarter_4
4067,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3306,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3391,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3249,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2674,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Columns chosen for the three-way crossing experiment:
# col1 holds original categorical fields, col2 holds tenure-year columns from X_train_seq
col1 = ['OnlineSecurity', 'Contract', 'TechSupport']
col2 = ['tenure_year_2014', 'tenure_year_2019', 'tenure_year_2020']

In [None]:
# Join the chosen original columns with the chosen tenure-year columns (column-wise)
train_temp = pd.concat([X_train[col1], X_train_seq[col2]], axis=1)
test_temp = pd.concat([X_test[col1], X_test_seq[col2]], axis=1)

In [None]:
# Preview the six-column frame used for three-way crossing
train_temp.head()

Unnamed: 0,OnlineSecurity,Contract,TechSupport,tenure_year_2014,tenure_year_2019,tenure_year_2020
4067,Yes,Two year,Yes,1.0,0.0,0.0
3306,No,Month-to-month,Yes,0.0,1.0,0.0
3391,No internet service,Month-to-month,No internet service,0.0,1.0,0.0
3249,No,Month-to-month,No,0.0,1.0,0.0
2674,No internet service,Month-to-month,No internet service,0.0,1.0,0.0


In [None]:
# Compare the (rows, columns) of the reduced train and test frames
train_temp.shape, test_temp.shape

((5282, 6), (1761, 6))

In [None]:
# Column names of the reduced frame, as a plain list for positional lookup
cl = train_temp.columns.tolist()
cl

['OnlineSecurity',
 'Contract',
 'TechSupport',
 'tenure_year_2014',
 'tenure_year_2019',
 'tenure_year_2020']

In [None]:
# Number of candidate columns available for crossing
n = len(train_temp.columns)
n

6

In [None]:
# Enumerate every index triple with i < j < k, so each 3-column combination appears once
index_triples = [(i, j, k)
                 for i in range(n)
                 for j in range(i + 1, n)
                 for k in range(j + 1, n)]

for triple in index_triples:
    print(*triple)

0 1 2
0 1 3
0 1 4
0 1 5
0 2 3
0 2 4
0 2 5
0 3 4
0 3 5
0 4 5
1 2 3
1 2 4
1 2 5
1 3 4
1 3 5
1 4 5
2 3 4
2 3 5
2 4 5
3 4 5


In [None]:
# Scratch list holding the three column names of the current combination
col_temp = []

# Walk every index triple i < j < k and print the matching column names
index_triples = ((i, j, k)
                 for i in range(n)
                 for j in range(i + 1, n)
                 for k in range(j + 1, n))

for i, j, k in index_triples:
    col_temp += [cl[i], cl[j], cl[k]]
    print(col_temp)
    # Start from an empty list again for the next combination
    col_temp = []

['OnlineSecurity', 'Contract', 'TechSupport']
['OnlineSecurity', 'Contract', 'tenure_year_2014']
['OnlineSecurity', 'Contract', 'tenure_year_2019']
['OnlineSecurity', 'Contract', 'tenure_year_2020']
['OnlineSecurity', 'TechSupport', 'tenure_year_2014']
['OnlineSecurity', 'TechSupport', 'tenure_year_2019']
['OnlineSecurity', 'TechSupport', 'tenure_year_2020']
['OnlineSecurity', 'tenure_year_2014', 'tenure_year_2019']
['OnlineSecurity', 'tenure_year_2014', 'tenure_year_2020']
['OnlineSecurity', 'tenure_year_2019', 'tenure_year_2020']
['Contract', 'TechSupport', 'tenure_year_2014']
['Contract', 'TechSupport', 'tenure_year_2019']
['Contract', 'TechSupport', 'tenure_year_2020']
['Contract', 'tenure_year_2014', 'tenure_year_2019']
['Contract', 'tenure_year_2014', 'tenure_year_2020']
['Contract', 'tenure_year_2019', 'tenure_year_2020']
['TechSupport', 'tenure_year_2014', 'tenure_year_2019']
['TechSupport', 'tenure_year_2014', 'tenure_year_2020']
['TechSupport', 'tenure_year_2019', 'tenure_yea

In [None]:
# Accumulators: scratch list for the current triple, derived column names,
# and one derived-feature block per column triple
col_temp = []
colNames_train_new, colNames_test_new = [], []
features_train_new, features_test_new = [], []

# Visit each unordered triple of columns (i < j < k) exactly once
for i in range(n):
    for j in range(i + 1, n):
        for k in range(j + 1, n):
            col_temp = [cl[i], cl[j], cl[k]]

            # multi=True is passed so the three columns are crossed together
            # (semantics defined by Cross_Combination, which lives outside this section)
            crossed = Cross_Combination(col_temp, train_temp, test_temp, multi=True)
            features_train1, features_test1, colNames_train, colNames_test = crossed

            colNames_train_new += colNames_train
            colNames_test_new += colNames_test
            features_train_new += [features_train1]
            features_test_new += [features_test1]

            # Leave the scratch list empty between combinations
            col_temp = []

In [None]:
# Preview the derived block produced for the first column triple
features_train_new[0].head()

Unnamed: 0,OnlineSecurity&Contract&TechSupport_No internet service&Month-to-month&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&One year&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&Two year&No internet service,OnlineSecurity&Contract&TechSupport_No&Month-to-month&No,OnlineSecurity&Contract&TechSupport_No&Month-to-month&Yes,OnlineSecurity&Contract&TechSupport_No&One year&No,OnlineSecurity&Contract&TechSupport_No&One year&Yes,OnlineSecurity&Contract&TechSupport_No&Two year&No,OnlineSecurity&Contract&TechSupport_No&Two year&Yes,OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&No,OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&Yes,OnlineSecurity&Contract&TechSupport_Yes&One year&No,OnlineSecurity&Contract&TechSupport_Yes&One year&Yes,OnlineSecurity&Contract&TechSupport_Yes&Two year&No,OnlineSecurity&Contract&TechSupport_Yes&Two year&Yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Preview the derived block produced for the second column triple
features_train_new[1].head()

Unnamed: 0,OnlineSecurity&Contract&tenure_year_2014_No internet service&Month-to-month&0.0,OnlineSecurity&Contract&tenure_year_2014_No internet service&Month-to-month&1.0,OnlineSecurity&Contract&tenure_year_2014_No internet service&One year&0.0,OnlineSecurity&Contract&tenure_year_2014_No internet service&One year&1.0,OnlineSecurity&Contract&tenure_year_2014_No internet service&Two year&0.0,OnlineSecurity&Contract&tenure_year_2014_No internet service&Two year&1.0,OnlineSecurity&Contract&tenure_year_2014_No&Month-to-month&0.0,OnlineSecurity&Contract&tenure_year_2014_No&Month-to-month&1.0,OnlineSecurity&Contract&tenure_year_2014_No&One year&0.0,OnlineSecurity&Contract&tenure_year_2014_No&One year&1.0,OnlineSecurity&Contract&tenure_year_2014_No&Two year&0.0,OnlineSecurity&Contract&tenure_year_2014_No&Two year&1.0,OnlineSecurity&Contract&tenure_year_2014_Yes&Month-to-month&0.0,OnlineSecurity&Contract&tenure_year_2014_Yes&Month-to-month&1.0,OnlineSecurity&Contract&tenure_year_2014_Yes&One year&0.0,OnlineSecurity&Contract&tenure_year_2014_Yes&One year&1.0,OnlineSecurity&Contract&tenure_year_2014_Yes&Two year&0.0,OnlineSecurity&Contract&tenure_year_2014_Yes&Two year&1.0
0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Stitch the per-triple blocks into one wide frame (column-wise); the names are rebound from list to DataFrame
features_train_new = pd.concat(features_train_new, axis=1)
features_test_new = pd.concat(features_test_new, axis=1)

In [None]:
# Preview the combined three-way derived features
features_train_new.head()

Unnamed: 0,OnlineSecurity&Contract&TechSupport_No internet service&Month-to-month&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&One year&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&Two year&No internet service,OnlineSecurity&Contract&TechSupport_No&Month-to-month&No,OnlineSecurity&Contract&TechSupport_No&Month-to-month&Yes,OnlineSecurity&Contract&TechSupport_No&One year&No,OnlineSecurity&Contract&TechSupport_No&One year&Yes,OnlineSecurity&Contract&TechSupport_No&Two year&No,OnlineSecurity&Contract&TechSupport_No&Two year&Yes,OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&No,...,TechSupport&tenure_year_2019&tenure_year_2020_No&0.0&0.0,TechSupport&tenure_year_2019&tenure_year_2020_No&0.0&1.0,TechSupport&tenure_year_2019&tenure_year_2020_No&1.0&0.0,TechSupport&tenure_year_2019&tenure_year_2020_Yes&0.0&0.0,TechSupport&tenure_year_2019&tenure_year_2020_Yes&0.0&1.0,TechSupport&tenure_year_2019&tenure_year_2020_Yes&1.0&0.0,tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&0.0&0.0,tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&0.0&1.0,tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&1.0&0.0,tenure_year_2014&tenure_year_2019&tenure_year_2020_1.0&0.0&0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


同样，可以简单检验下衍生特征矩阵的稀疏性：

In [None]:
# (rows, derived columns) of the combined three-way feature frame
features_train_new.shape

(5282, 225)

In [None]:
# Share of zero entries in each derived column (sparsity check).
# The mean of the boolean mask divides by the frame's actual row count,
# so the result stays correct if the size of the training split changes.
(features_train_new == 0).mean()

OnlineSecurity&Contract&TechSupport_No internet service&Month-to-month&No internet service    0.926543
OnlineSecurity&Contract&TechSupport_No internet service&One year&No internet service          0.948315
OnlineSecurity&Contract&TechSupport_No internet service&Two year&No internet service          0.912533
OnlineSecurity&Contract&TechSupport_No&Month-to-month&No                                      0.689133
OnlineSecurity&Contract&TechSupport_No&Month-to-month&Yes                                     0.931655
                                                                                                ...   
TechSupport&tenure_year_2019&tenure_year_2020_Yes&1.0&0.0                                     0.960242
tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&0.0&0.0                                0.515146
tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&0.0&1.0                                0.998107
tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&1.0&0.0           

In [None]:
# Share of zero entries per derived column, sparsest columns first.
# The mean of the boolean mask divides by the frame's actual row count,
# so the result stays correct if the size of the training split changes.
(features_train_new == 0).mean().sort_values(ascending=False)

OnlineSecurity&Contract&tenure_year_2019_No&Two year&1.0                           1.000000
Contract&TechSupport&tenure_year_2014_Month-to-month&No internet service&1.0       1.000000
OnlineSecurity&Contract&tenure_year_2014_No internet service&Month-to-month&1.0    1.000000
TechSupport&tenure_year_2019&tenure_year_2020_No&0.0&1.0                           0.999811
OnlineSecurity&TechSupport&tenure_year_2020_Yes&No&1.0                             0.999811
                                                                                     ...   
Contract&TechSupport&tenure_year_2020_Month-to-month&No&0.0                        0.616622
TechSupport&tenure_year_2014&tenure_year_2020_No&0.0&0.0                           0.560394
OnlineSecurity&tenure_year_2014&tenure_year_2020_No&0.0&0.0                        0.557175
tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&0.0&0.0                     0.515146
Contract&tenure_year_2014&tenure_year_2020_Month-to-month&0.0&0.0               

In [None]:
# Re-label the derived training features with the row index of X_train
# NOTE(review): only the training frame is re-indexed here; features_test_new keeps its own index — confirm that is intended
features_train_new.index = X_train.index

# Attach the label to the derived features (column-wise)
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Preview the combined frame
df_temp.head()

Unnamed: 0,OnlineSecurity&Contract&TechSupport_No internet service&Month-to-month&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&One year&No internet service,OnlineSecurity&Contract&TechSupport_No internet service&Two year&No internet service,OnlineSecurity&Contract&TechSupport_No&Month-to-month&No,OnlineSecurity&Contract&TechSupport_No&Month-to-month&Yes,OnlineSecurity&Contract&TechSupport_No&One year&No,OnlineSecurity&Contract&TechSupport_No&One year&Yes,OnlineSecurity&Contract&TechSupport_No&Two year&No,OnlineSecurity&Contract&TechSupport_No&Two year&Yes,OnlineSecurity&Contract&TechSupport_Yes&Month-to-month&No,...,TechSupport&tenure_year_2019&tenure_year_2020_No&0.0&1.0,TechSupport&tenure_year_2019&tenure_year_2020_No&1.0&0.0,TechSupport&tenure_year_2019&tenure_year_2020_Yes&0.0&0.0,TechSupport&tenure_year_2019&tenure_year_2020_Yes&0.0&1.0,TechSupport&tenure_year_2019&tenure_year_2020_Yes&1.0&0.0,tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&0.0&0.0,tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&0.0&1.0,tenure_year_2014&tenure_year_2019&tenure_year_2020_0.0&1.0&0.0,tenure_year_2014&tenure_year_2019&tenure_year_2020_1.0&0.0&0.0,Churn
4067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3306,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3391,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3249,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2674,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [None]:
# Correlation of every column with the label, sorted from most positive to most negative.
# corrwith computes one coefficient per column, avoiding the full column-by-column
# matrix that df_temp.corr() would build just to read off its 'Churn' column.
# Columns without variation yield NaN (visible at the tail of the output).
df_corr = (
    df_temp.corrwith(df_temp['Churn'])
           .rename('Churn')  # keep the Series name the original expression produced
           .sort_values(ascending=False)
)
df_corr

Churn                                                                                      1.000000
OnlineSecurity&Contract&tenure_year_2014_No&Month-to-month&0.0                             0.443017
OnlineSecurity&Contract&tenure_year_2020_No&Month-to-month&0.0                             0.441598
OnlineSecurity&Contract&TechSupport_No&Month-to-month&No                                   0.440002
Contract&TechSupport&tenure_year_2014_Month-to-month&No&0.0                                0.433924
                                                                                             ...   
OnlineSecurity&TechSupport&tenure_year_2020_No internet service&No internet service&0.0   -0.229197
Contract&tenure_year_2019&tenure_year_2020_Two year&0.0&0.0                               -0.294312
OnlineSecurity&Contract&tenure_year_2014_No internet service&Month-to-month&1.0                 NaN
OnlineSecurity&Contract&tenure_year_2019_No&Two year&1.0                                        NaN


In [None]:
# Inspect the 20 largest absolute correlations (the label's own entry is included, as the output shows)
np.abs(df_corr).sort_values(ascending = False)[: 20]

Churn                                                                1.000000
OnlineSecurity&Contract&tenure_year_2014_No&Month-to-month&0.0       0.443017
OnlineSecurity&Contract&tenure_year_2020_No&Month-to-month&0.0       0.441598
OnlineSecurity&Contract&TechSupport_No&Month-to-month&No             0.440002
Contract&TechSupport&tenure_year_2014_Month-to-month&No&0.0          0.433924
Contract&TechSupport&tenure_year_2020_Month-to-month&No&0.0          0.430260
OnlineSecurity&TechSupport&tenure_year_2014_No&No&0.0                0.411071
Contract&tenure_year_2014&tenure_year_2020_Month-to-month&0.0&0.0    0.402879
OnlineSecurity&Contract&tenure_year_2019_No&Month-to-month&1.0       0.402734
OnlineSecurity&TechSupport&tenure_year_2019_No&No&1.0                0.401730
OnlineSecurity&tenure_year_2014&tenure_year_2019_No&0.0&1.0          0.399682
OnlineSecurity&tenure_year_2019&tenure_year_2020_No&1.0&0.0          0.399682
OnlineSecurity&TechSupport&tenure_year_2020_No&No&0.0           

In [None]:
# Rank entries by absolute correlation and keep positions 1-3;
# position 0 is skipped because it is the label's own entry.
abs_corr_ranked = np.abs(df_corr).sort_values(ascending=False)
new_col = abs_corr_ranked.index[1:4].tolist()
new_col

['OnlineSecurity&Contract&tenure_year_2014_No&Month-to-month&0.0',
 'OnlineSecurity&Contract&tenure_year_2020_No&Month-to-month&0.0',
 'OnlineSecurity&Contract&TechSupport_No&Month-to-month&No']

In [None]:
# Restrict both derived-feature frames to the selected columns
train_new_temp, test_new_temp = features_train_new.loc[:, new_col], features_test_new.loc[:, new_col]

In [None]:
# Evaluate the three selected three-way cross features together with the original splits.
# NOTE(review): features_test is not defined in this section (presumably provided by telcoFunc);
# judging by the cell output it prints the elapsed time and returns a score plus the best parameters.
features_test(train_new_temp,
              test_new_temp,
              X_train,
              X_test,
              y_train,
              y_test,
              category_cols,
              numeric_cols)

44.57247447967529 s


(0.8085967288781859,
 {'columntransformer__num': 'passthrough',
  'logit_threshold__C': 0.5,
  'logit_threshold__penalty': 'l2',
  'logit_threshold__solver': 'lbfgs'})

In [None]:
# IPython introspection: display the signature/docstring of Group_Statistics (interactive aid only)
Group_Statistics?

[1;31mSignature:[0m
[0mGroup_Statistics[0m[1;33m([0m[1;33m
[0m    [0mkeyCol[0m[1;33m,[0m[1;33m
[0m    [0mX_train[0m[1;33m,[0m[1;33m
[0m    [0mX_test[0m[1;33m,[0m[1;33m
[0m    [0mcol_num[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcol_cat[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mnum_stat[0m[1;33m=[0m[1;33m[[0m[1;34m'mean'[0m[1;33m,[0m [1;34m'var'[0m[1;33m,[0m [1;34m'max'[0m[1;33m,[0m [1;34m'min'[0m[1;33m,[0m [1;34m'skew'[0m[1;33m,[0m [1;34m'median'[0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0mcat_stat[0m[1;33m=[0m[1;33m[[0m[1;34m'mean'[0m[1;33m,[0m [1;34m'var'[0m[1;33m,[0m [1;34m'max'[0m[1;33m,[0m [1;34m'min'[0m[1;33m,[0m [1;34m'median'[0m[1;33m,[0m [1;34m'count'[0m[1;33m,[0m [1;34m'nunique'[0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0mquant[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mmulti[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m

In [None]:
# Work on a copy so in-place edits to col_temp leave category_cols untouched
col_temp = category_cols.copy()
col_temp

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [None]:
# Remove the element at position 2 from col_temp (in place) and keep it as keyCol
keyCol = col_temp.pop(2)
keyCol

'Partner'

In [None]:
# col_temp now lacks the popped element
col_temp

['gender',
 'SeniorCitizen',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [None]:
# Restore col_temp to a fresh full copy of category_cols
col_temp = category_cols.copy()
col_temp

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [None]:
# Take each categorical column in turn as keyCol; col_temp is refreshed to a
# full copy of category_cols after every step, so each pick starts from the complete list.
for i in range(len(col_temp)):
    keyCol = col_temp[i]
    col_temp = category_cols.copy()
    print(keyCol)

gender
SeniorCitizen
Partner
Dependents
PhoneService
MultipleLines
InternetService
OnlineSecurity
OnlineBackup
DeviceProtection
TechSupport
StreamingTV
StreamingMovies
Contract
PaperlessBilling
PaymentMethod


In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Fit the ordinal encoder on the discrete columns of the training split only.
# fit() returns the estimator itself, so ord_enc is bound to the fitted encoder.
ord_enc = OrdinalEncoder().fit(X_train[category_cols])
ord_enc

OrdinalEncoder()

In [None]:
# Encode the training split's discrete columns; the result is a plain float
# ndarray (no column names, no index).
ord_enc.transform(X_train[category_cols])

array([[0., 0., 1., ..., 2., 1., 0.],
       [0., 0., 1., ..., 0., 1., 2.],
       [1., 0., 0., ..., 0., 0., 3.],
       ...,
       [0., 1., 0., ..., 0., 1., 2.],
       [0., 0., 1., ..., 1., 0., 1.],
       [1., 0., 1., ..., 2., 1., 0.]])

In [None]:
# Encoded training frame: ordinal-encoded discrete columns (re-labelled with the
# original row index and column names) followed by the untouched numeric columns.
X_train_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_train[category_cols]),
            index=X_train.index,
            columns=category_cols,
        ),
        X_train[numeric_cols],
    ],
    axis=1,
)

# Same construction for the test split, reusing the encoder fitted on the training split.
X_test_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_test[category_cols]),
            index=X_test.index,
            columns=category_cols,
        ),
        X_test[numeric_cols],
    ],
    axis=1,
)

In [None]:
# Preview the encoded training frame: encoded discrete columns first, numeric columns last.
X_train_OE.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,tenure,MonthlyCharges,TotalCharges
4067,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,2.0,2.0,2.0,2.0,1.0,0.0,68,79.6,5515.8
3306,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,1.0,2.0,3,80.0,241.3
3391,1.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,3.0,4,19.0,73.45
3249,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,3.0,10,55.55,551.3
2674,0.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,3.0,4,20.05,91.45


In [None]:
# Sanity check: encoding must not change the shape of the training split.
X_train_OE.shape, X_train.shape

((5282, 19), (5282, 19))

In [None]:
# Sanity check: encoding must not change the shape of the test split.
X_test_OE.shape, X_test.shape

((1761, 19), (1761, 19))

In [None]:
# Categories learned per column, in the order of category_cols; a value's
# position inside its array is the code it is encoded to.
ord_enc.categories_

[array(['Female', 'Male'], dtype=object),
 array([0, 1], dtype=int64),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'No phone service', 'Yes'], dtype=object),
 array(['DSL', 'Fiber optic', 'No'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['Month-to-month', 'One year', 'Two year'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Bank transfer (automatic)', 'Credit card (automatic)',
        'Electronic check', 'Mailed check'], dtype=object)]

In [None]:
# Containers for the derived features and their column names
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Use every discrete column in turn as the grouping key. Iterating over
# category_cols directly replaces the earlier pop-by-position / re-copy pattern,
# which mutated the list being stepped through.
for keyCol in category_cols:
    # All discrete columns except the current key
    col_temp = [col for col in category_cols if col != keyCol]

    features_train1, features_test1, colNames_train, colNames_test = Group_Statistics(
        keyCol,
        X_train_OE,
        X_test_OE,
        col_num=numeric_cols,
        col_cat=col_temp,
        extension=True,
    )

    colNames_train_new.extend(colNames_train)
    colNames_test_new.extend(colNames_test)
    features_train_new.append(features_train1)
    features_test_new.append(features_test1)

In [None]:
# Derived features for the first grouping key. features_train_new is still a
# list of DataFrames at this point, so [0] is a list lookup.
features_train_new[0]

Unnamed: 0,tenure_gender_mean,tenure_gender_var,tenure_gender_max,tenure_gender_min,tenure_gender_skew,tenure_gender_median,MonthlyCharges_gender_mean,MonthlyCharges_gender_var,MonthlyCharges_gender_max,MonthlyCharges_gender_min,...,InternetService_cv_gender,OnlineSecurity_cv_gender,OnlineBackup_cv_gender,DeviceProtection_cv_gender,TechSupport_cv_gender,StreamingTV_cv_gender,StreamingMovies_cv_gender,Contract_cv_gender,PaperlessBilling_cv_gender,PaymentMethod_cv_gender
0,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.40,...,0.852147,1.093349,0.990747,0.978976,1.088451,0.891776,0.899249,1.228772,0.818448,0.682600
1,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.40,...,0.852147,1.093349,0.990747,0.978976,1.088451,0.891776,0.899249,1.228772,0.818448,0.682600
2,32.597584,614.757184,72,0,0.234222,29,64.016969,899.633055,118.35,18.25,...,0.857846,1.094405,0.989697,0.978766,1.083917,0.916737,0.905134,1.199651,0.843495,0.669323
3,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.40,...,0.852147,1.093349,0.990747,0.978976,1.088451,0.891776,0.899249,1.228772,0.818448,0.682600
4,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.40,...,0.852147,1.093349,0.990747,0.978976,1.088451,0.891776,0.899249,1.228772,0.818448,0.682600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,32.597584,614.757184,72,0,0.234222,29,64.016969,899.633055,118.35,18.25,...,0.857846,1.094405,0.989697,0.978766,1.083917,0.916737,0.905134,1.199651,0.843495,0.669323
5278,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.40,...,0.852147,1.093349,0.990747,0.978976,1.088451,0.891776,0.899249,1.228772,0.818448,0.682600
5279,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.40,...,0.852147,1.093349,0.990747,0.978976,1.088451,0.891776,0.899249,1.228772,0.818448,0.682600
5280,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.40,...,0.852147,1.093349,0.990747,0.978976,1.088451,0.891776,0.899249,1.228772,0.818448,0.682600


In [None]:
# Stitch the per-key frames together column-wise.
# NOTE: this rebinds both names from "list of DataFrames" to a single DataFrame,
# so the cell is meant to be run exactly once after the loop that fills the lists.
features_train_new = pd.concat(features_train_new, axis="columns")
features_test_new = pd.concat(features_test_new, axis="columns")

In [None]:
# Preview the combined derived-feature frame (all grouping keys side by side).
features_train_new.head()

Unnamed: 0,tenure_gender_mean,tenure_gender_var,tenure_gender_max,tenure_gender_min,tenure_gender_skew,tenure_gender_median,MonthlyCharges_gender_mean,MonthlyCharges_gender_var,MonthlyCharges_gender_max,MonthlyCharges_gender_min,...,MultipleLines_cv_PaymentMethod,InternetService_cv_PaymentMethod,OnlineSecurity_cv_PaymentMethod,OnlineBackup_cv_PaymentMethod,DeviceProtection_cv_PaymentMethod,TechSupport_cv_PaymentMethod,StreamingTV_cv_PaymentMethod,StreamingMovies_cv_PaymentMethod,Contract_cv_PaymentMethod,PaperlessBilling_cv_PaymentMethod
0,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.4,...,0.869666,0.891326,0.919172,0.826531,0.836828,0.889565,0.827997,0.801837,0.852233,0.864201
1,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.4,...,0.909129,0.692528,1.702213,1.278833,1.266881,1.704683,0.956058,0.963521,2.042807,0.608718
2,32.597584,614.757184,72,0,0.234222,29,64.016969,899.633055,118.35,18.25,...,1.576416,0.865125,0.828221,0.893478,0.893478,0.834575,0.911518,0.923124,1.238971,1.190282
3,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.4,...,1.576416,0.865125,0.828221,0.893478,0.893478,0.834575,0.911518,0.923124,1.238971,1.190282
4,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.4,...,1.576416,0.865125,0.828221,0.893478,0.893478,0.834575,0.911518,0.923124,1.238971,1.190282


In [None]:
# Train and test derived-feature frames should have the same number of columns.
features_train_new.shape, features_test_new.shape

((5282, 5136), (1761, 5136))

In [None]:
# Re-label the rows with X_train's index so the concat below aligns the
# derived features with y_train row by row (mutates features_train_new in place).
features_train_new.index = X_train.index

# Attach the label column to the derived features
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Inspect the combined frame
df_temp.head()

Unnamed: 0,tenure_gender_mean,tenure_gender_var,tenure_gender_max,tenure_gender_min,tenure_gender_skew,tenure_gender_median,MonthlyCharges_gender_mean,MonthlyCharges_gender_var,MonthlyCharges_gender_max,MonthlyCharges_gender_min,...,InternetService_cv_PaymentMethod,OnlineSecurity_cv_PaymentMethod,OnlineBackup_cv_PaymentMethod,DeviceProtection_cv_PaymentMethod,TechSupport_cv_PaymentMethod,StreamingTV_cv_PaymentMethod,StreamingMovies_cv_PaymentMethod,Contract_cv_PaymentMethod,PaperlessBilling_cv_PaymentMethod,Churn
4067,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.4,...,0.891326,0.919172,0.826531,0.836828,0.889565,0.827997,0.801837,0.852233,0.864201,0
3306,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.4,...,0.692528,1.702213,1.278833,1.266881,1.704683,0.956058,0.963521,2.042807,0.608718,0
3391,32.597584,614.757184,72,0,0.234222,29,64.016969,899.633055,118.35,18.25,...,0.865125,0.828221,0.893478,0.893478,0.834575,0.911518,0.923124,1.238971,1.190282,0
3249,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.4,...,0.865125,0.828221,0.893478,0.893478,0.834575,0.911518,0.923124,1.238971,1.190282,0
2674,31.947588,601.095276,72,0,0.257136,28,65.345765,891.427517,118.75,18.4,...,0.865125,0.828221,0.893478,0.893478,0.834575,0.911518,0.923124,1.238971,1.190282,0


In [None]:
# Correlation of every derived feature with the label.
# corrwith() correlates each column against the label only, so the full
# feature-by-feature matrix that DataFrame.corr() would build (thousands of
# columns squared) is never computed.
# Constant columns have zero variance and yield NaN; errstate silences the
# floating-point warnings NumPy may emit for them.
with np.errstate(divide='ignore', invalid='ignore'):
    df_corr = (
        df_temp.corrwith(df_temp['Churn'])
        .rename('Churn')  # keep the Series name that corr()['Churn'] produced
        .sort_values(ascending=False)
    )
df_corr

Churn                                 1.000000
tenure_cv_Contract                    0.406388
OnlineSecurity_cv_Contract            0.406261
Dependents_cv_Contract                0.406230
TotalCharges_norm_Contract            0.406185
                                        ...   
Partner_gap_PaymentMethod                  NaN
PhoneService_gap_PaymentMethod             NaN
PaperlessBilling_gap_PaymentMethod         NaN
SeniorCitizen_mag2_PaymentMethod           NaN
Dependents_mag2_PaymentMethod              NaN
Name: Churn, Length: 5137, dtype: float64

In [None]:
# Look at the 20 entries with the largest absolute correlation
df_corr.abs().sort_values(ascending=False).head(20)

Churn                                              1.000000
TotalCharges_mag2_Contract                         0.406394
tenure_cv_Contract                                 0.406388
StreamingTV_Contract_mean                          0.406276
OnlineSecurity_cv_Contract                         0.406261
tenure_dive2_tenure_Contract_median                0.406239
Dependents_cv_Contract                             0.406230
Dependents_Contract_var                            0.406203
TotalCharges_norm_Contract                         0.406185
DeviceProtection_cv_Contract                       0.406177
OnlineBackup_cv_Contract                           0.406168
OnlineBackup_Contract_mean                         0.406082
tenure_mag2_Contract                               0.406042
TotalCharges_Contract_skew                         0.406039
StreamingTV_cv_Contract                            0.406021
DeviceProtection_Contract_mean                     0.405749
PaperlessBilling_Contract_mean          

In [None]:
# Ranks 20-39 by absolute correlation
df_corr.abs().sort_values(ascending=False).iloc[20:40]

TotalCharges_Contract_q2                          0.405671
tenure_Contract_q2                                0.405599
TotalCharges_Contract_q1                          0.405563
StreamingMovies_cv_Contract                       0.405519
TotalCharges_minus2_TotalCharges_Contract_mean    0.405406
TotalCharges_minus1_TotalCharges_Contract_mean    0.405406
TotalCharges_Contract_mean                        0.405402
tenure_dive1_tenure_Contract_mean                 0.405313
TechSupport_dive1_TechSupport_Contract_mean       0.405298
OnlineSecurity_Contract_mean                      0.405243
TechSupport_cv_Contract                           0.405048
PaperlessBilling_cv_Contract                      0.404911
TotalCharges_cv_Contract                          0.404896
PaymentMethod_Contract_var                        0.404894
Partner_cv_Contract                               0.404877
MonthlyCharges_Contract_q1                        0.404671
TotalCharges_Contract_median                      0.4041

In [None]:
# Ranks 200-219 by absolute correlation
df_corr.abs().sort_values(ascending=False).iloc[200:220]

MonthlyCharges_minus2_MonthlyCharges_Contract_mean       0.371829
InternetService_mag2_Contract                            0.371437
PaymentMethod_mag2_Contract                              0.369155
MultipleLines_Contract_mean                              0.368319
InternetService_Contract_mean                            0.363831
InternetService_mag1_Contract                            0.363831
PaymentMethod_mag1_Contract                              0.362901
Dependents_OnlineSecurity_var                            0.362594
MultipleLines_dive1_MultipleLines_OnlineSecurity_mean    0.362330
Dependents_cv_OnlineSecurity                             0.361842
SeniorCitizen_OnlineSecurity_count                       0.361838
Partner_OnlineSecurity_count                             0.361838
Dependents_OnlineSecurity_count                          0.361838
PhoneService_OnlineSecurity_count                        0.361838
MultipleLines_OnlineSecurity_count                       0.361838
OnlineBack

In [None]:
# Names of the three features with the largest absolute correlation.
# Position 0 is skipped because it is the label itself (correlation 1.0).
ranked_abs_corr = df_corr.abs().sort_values(ascending=False)
new_col = ranked_abs_corr.index[1:4].tolist()
new_col

['TotalCharges_mag2_Contract',
 'tenure_cv_Contract',
 'StreamingTV_Contract_mean']

In [None]:
# Keep only the selected derived features, for both splits.
train_new_temp, test_new_temp = (
    frame[new_col] for frame in (features_train_new, features_test_new)
)

In [None]:
# Preview the selected derived features for the training split.
train_new_temp.head()

Unnamed: 0,TotalCharges_mag2_Contract,tenure_cv_Contract,StreamingTV_Contract_mean
4067,0.986651,0.312622,1.268409
3306,0.495887,0.986208,0.794933
3391,0.495887,0.986208,0.794933
3249,0.495887,0.986208,0.794933
2674,0.495887,0.986208,0.794933


In [None]:
# Evaluate the selected derived features together with the original features.
# NOTE(review): features_test is not defined in this notebook section — presumably
# it comes from the telcoFunc star import; confirm the positional argument order
# there. Judging by the recorded output it prints the elapsed time and returns a
# (score, best-parameters) pair.
features_test(train_new_temp,
              test_new_temp,
              X_train,
              X_test,
              y_train,
              y_test,
              category_cols,
              numeric_cols)

50.71905517578125 s


(0.810489414007626,
 {'columntransformer__num': StandardScaler(),
  'logit_threshold__C': 0.30000000000000004,
  'logit_threshold__penalty': 'l1',
  'logit_threshold__solver': 'saga'})

<center><img src="https://s2.loli.net/2022/03/16/1UWuFzTK3dLHQMR.png" alt="image-20220316171913807" style="zoom:50%;" /></center>

In [None]:
# Put the grouping key next to two of its derived features to show that each
# derived value is constant within a Contract level (first 10 rows).
pd.concat(
    [
        X_train_OE['Contract'],
        features_train_new['StreamingTV_Contract_mean'],
        features_train_new['TotalCharges_mag2_Contract'],
    ],
    axis=1,
).iloc[:10]

Unnamed: 0,Contract,StreamingTV_Contract_mean,TotalCharges_mag2_Contract
4067,2.0,1.268409,0.986651
3306,0.0,0.794933,0.495887
3391,0.0,0.794933,0.495887
3249,0.0,0.794933,0.495887
2674,0.0,0.794933,0.495887
2757,1.0,1.14663,0.877504
5237,0.0,0.794933,0.495887
5856,0.0,0.794933,0.495887
3069,0.0,0.794933,0.495887
6631,0.0,0.794933,0.495887


In [None]:
# Preview the time-series indicator features (year / month / quarter columns
# derived from tenure). X_train_seq is built outside this notebook section.
X_train_seq.head()

Unnamed: 0,tenure_year_2014,tenure_year_2015,tenure_year_2016,tenure_year_2017,tenure_year_2018,tenure_year_2019,tenure_year_2020,tenure_month_1,tenure_month_2,tenure_month_3,...,tenure_month_7,tenure_month_8,tenure_month_9,tenure_month_10,tenure_month_11,tenure_month_12,tenure_quarter_1,tenure_quarter_2,tenure_quarter_3,tenure_quarter_4
4067,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3306,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3391,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3249,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2674,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Align the time-series frames with the original row labels (in place)
X_train_seq.index, X_test_seq.index = X_train.index, X_test.index

# Combine the encoded features with the time-series features, column-wise
train_temp = pd.concat([X_train_OE, X_train_seq], axis="columns")
test_temp = pd.concat([X_test_OE, X_test_seq], axis="columns")

In [None]:
# Shape after appending the time-series columns to the encoded frame.
train_temp.shape

(5282, 42)

In [None]:
# Names of all discrete variables, including the time-series derived ones
cat_temp = [*category_cols, *X_train_seq.columns]
cat_temp

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'tenure_year_2014',
 'tenure_year_2015',
 'tenure_year_2016',
 'tenure_year_2017',
 'tenure_year_2018',
 'tenure_year_2019',
 'tenure_year_2020',
 'tenure_month_1',
 'tenure_month_2',
 'tenure_month_3',
 'tenure_month_4',
 'tenure_month_5',
 'tenure_month_6',
 'tenure_month_7',
 'tenure_month_8',
 'tenure_month_9',
 'tenure_month_10',
 'tenure_month_11',
 'tenure_month_12',
 'tenure_quarter_1',
 'tenure_quarter_2',
 'tenure_quarter_3',
 'tenure_quarter_4']

In [None]:
# Restrict the grouping keys to the time-series derived columns only
cat_temp = X_train_seq.columns.tolist()
cat_temp

['tenure_year_2014',
 'tenure_year_2015',
 'tenure_year_2016',
 'tenure_year_2017',
 'tenure_year_2018',
 'tenure_year_2019',
 'tenure_year_2020',
 'tenure_month_1',
 'tenure_month_2',
 'tenure_month_3',
 'tenure_month_4',
 'tenure_month_5',
 'tenure_month_6',
 'tenure_month_7',
 'tenure_month_8',
 'tenure_month_9',
 'tenure_month_10',
 'tenure_month_11',
 'tenure_month_12',
 'tenure_quarter_1',
 'tenure_quarter_2',
 'tenure_quarter_3',
 'tenure_quarter_4']

In [None]:
# Containers for the derived features and their column names
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Use every time-series column in turn as the grouping key. Iterating over
# cat_temp directly replaces the earlier pop-by-position / re-copy pattern,
# which mutated the list being stepped through.
for keyCol in cat_temp:
    # Remaining time-series columns (everything in cat_temp except the key)
    col_temp = [col for col in cat_temp if col != keyCol]

    features_train1, features_test1, colNames_train, colNames_test = Group_Statistics(
        keyCol,
        train_temp,
        test_temp,
        col_num=numeric_cols,
        col_cat=col_temp + category_cols,
        extension=True,
    )

    colNames_train_new.extend(colNames_train)
    colNames_test_new.extend(colNames_test)
    features_train_new.append(features_train1)
    features_test_new.append(features_test1)

In [None]:
# Derived features for the first time-series grouping key. features_train_new
# is still a list of DataFrames at this point, so [0] is a list lookup.
features_train_new[0]

Unnamed: 0,tenure_tenure_year_2014_mean,tenure_tenure_year_2014_var,tenure_tenure_year_2014_max,tenure_tenure_year_2014_min,tenure_tenure_year_2014_skew,tenure_tenure_year_2014_median,MonthlyCharges_tenure_year_2014_mean,MonthlyCharges_tenure_year_2014_var,MonthlyCharges_tenure_year_2014_max,MonthlyCharges_tenure_year_2014_min,...,InternetService_cv_tenure_year_2014,OnlineSecurity_cv_tenure_year_2014,OnlineBackup_cv_tenure_year_2014,DeviceProtection_cv_tenure_year_2014,TechSupport_cv_tenure_year_2014,StreamingTV_cv_tenure_year_2014,StreamingMovies_cv_tenure_year_2014,Contract_cv_tenure_year_2014,PaperlessBilling_cv_tenure_year_2014,PaymentMethod_cv_tenure_year_2014
0,68.127341,13.127162,72,61,-0.549207,69,76.213530,1022.759716,118.75,19.10,...,0.848957,0.694922,0.572021,0.566283,0.691981,0.620829,0.594970,0.380092,0.821334,0.974531
1,23.186758,350.262051,60,0,0.439575,19,61.756122,821.604310,116.60,18.25,...,0.856458,1.222631,1.121632,1.107576,1.212794,0.985965,0.991487,1.565422,0.833398,0.601744
2,23.186758,350.262051,60,0,0.439575,19,61.756122,821.604310,116.60,18.25,...,0.856458,1.222631,1.121632,1.107576,1.212794,0.985965,0.991487,1.565422,0.833398,0.601744
3,23.186758,350.262051,60,0,0.439575,19,61.756122,821.604310,116.60,18.25,...,0.856458,1.222631,1.121632,1.107576,1.212794,0.985965,0.991487,1.565422,0.833398,0.601744
4,23.186758,350.262051,60,0,0.439575,19,61.756122,821.604310,116.60,18.25,...,0.856458,1.222631,1.121632,1.107576,1.212794,0.985965,0.991487,1.565422,0.833398,0.601744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,23.186758,350.262051,60,0,0.439575,19,61.756122,821.604310,116.60,18.25,...,0.856458,1.222631,1.121632,1.107576,1.212794,0.985965,0.991487,1.565422,0.833398,0.601744
5278,23.186758,350.262051,60,0,0.439575,19,61.756122,821.604310,116.60,18.25,...,0.856458,1.222631,1.121632,1.107576,1.212794,0.985965,0.991487,1.565422,0.833398,0.601744
5279,23.186758,350.262051,60,0,0.439575,19,61.756122,821.604310,116.60,18.25,...,0.856458,1.222631,1.121632,1.107576,1.212794,0.985965,0.991487,1.565422,0.833398,0.601744
5280,23.186758,350.262051,60,0,0.439575,19,61.756122,821.604310,116.60,18.25,...,0.856458,1.222631,1.121632,1.107576,1.212794,0.985965,0.991487,1.565422,0.833398,0.601744


In [None]:
# Stitch the per-key frames together column-wise.
# NOTE: this rebinds both names from "list of DataFrames" to a single DataFrame,
# so the cell is meant to be run exactly once after the loop that fills the lists.
features_train_new = pd.concat(features_train_new, axis="columns")
features_test_new = pd.concat(features_test_new, axis="columns")

In [None]:
# Preview the combined derived-feature frame for the time-series grouping keys.
features_train_new.head()

Unnamed: 0,tenure_tenure_year_2014_mean,tenure_tenure_year_2014_var,tenure_tenure_year_2014_max,tenure_tenure_year_2014_min,tenure_tenure_year_2014_skew,tenure_tenure_year_2014_median,MonthlyCharges_tenure_year_2014_mean,MonthlyCharges_tenure_year_2014_var,MonthlyCharges_tenure_year_2014_max,MonthlyCharges_tenure_year_2014_min,...,InternetService_cv_tenure_quarter_4,OnlineSecurity_cv_tenure_quarter_4,OnlineBackup_cv_tenure_quarter_4,DeviceProtection_cv_tenure_quarter_4,TechSupport_cv_tenure_quarter_4,StreamingTV_cv_tenure_quarter_4,StreamingMovies_cv_tenure_quarter_4,Contract_cv_tenure_quarter_4,PaperlessBilling_cv_tenure_quarter_4,PaymentMethod_cv_tenure_quarter_4
0,68.127341,13.127162,72,61,-0.549207,69,76.21353,1022.759716,118.75,19.1,...,0.858849,1.015211,0.911021,0.892671,1.001012,0.84299,0.836854,1.041919,0.819885,0.736798
1,23.186758,350.262051,60,0,0.439575,19,61.756122,821.60431,116.6,18.25,...,0.846682,1.290253,1.186932,1.195498,1.301364,1.050394,1.059107,1.773409,0.855493,0.544229
2,23.186758,350.262051,60,0,0.439575,19,61.756122,821.60431,116.6,18.25,...,0.858849,1.015211,0.911021,0.892671,1.001012,0.84299,0.836854,1.041919,0.819885,0.736798
3,23.186758,350.262051,60,0,0.439575,19,61.756122,821.60431,116.6,18.25,...,0.858849,1.015211,0.911021,0.892671,1.001012,0.84299,0.836854,1.041919,0.819885,0.736798
4,23.186758,350.262051,60,0,0.439575,19,61.756122,821.60431,116.6,18.25,...,0.858849,1.015211,0.911021,0.892671,1.001012,0.84299,0.836854,1.041919,0.819885,0.736798


In [None]:
# Re-label the rows with X_train's index so the concat below aligns the
# derived features with y_train row by row (mutates features_train_new in place).
features_train_new.index = X_train.index

# Attach the label column to the derived features
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Inspect the combined frame
df_temp.head()

Unnamed: 0,tenure_tenure_year_2014_mean,tenure_tenure_year_2014_var,tenure_tenure_year_2014_max,tenure_tenure_year_2014_min,tenure_tenure_year_2014_skew,tenure_tenure_year_2014_median,MonthlyCharges_tenure_year_2014_mean,MonthlyCharges_tenure_year_2014_var,MonthlyCharges_tenure_year_2014_max,MonthlyCharges_tenure_year_2014_min,...,OnlineSecurity_cv_tenure_quarter_4,OnlineBackup_cv_tenure_quarter_4,DeviceProtection_cv_tenure_quarter_4,TechSupport_cv_tenure_quarter_4,StreamingTV_cv_tenure_quarter_4,StreamingMovies_cv_tenure_quarter_4,Contract_cv_tenure_quarter_4,PaperlessBilling_cv_tenure_quarter_4,PaymentMethod_cv_tenure_quarter_4,Churn
4067,68.127341,13.127162,72,61,-0.549207,69,76.21353,1022.759716,118.75,19.1,...,1.015211,0.911021,0.892671,1.001012,0.84299,0.836854,1.041919,0.819885,0.736798,0
3306,23.186758,350.262051,60,0,0.439575,19,61.756122,821.60431,116.6,18.25,...,1.290253,1.186932,1.195498,1.301364,1.050394,1.059107,1.773409,0.855493,0.544229,0
3391,23.186758,350.262051,60,0,0.439575,19,61.756122,821.60431,116.6,18.25,...,1.015211,0.911021,0.892671,1.001012,0.84299,0.836854,1.041919,0.819885,0.736798,0
3249,23.186758,350.262051,60,0,0.439575,19,61.756122,821.60431,116.6,18.25,...,1.015211,0.911021,0.892671,1.001012,0.84299,0.836854,1.041919,0.819885,0.736798,0
2674,23.186758,350.262051,60,0,0.439575,19,61.756122,821.60431,116.6,18.25,...,1.015211,0.911021,0.892671,1.001012,0.84299,0.836854,1.041919,0.819885,0.736798,0


In [None]:
# Pearson correlation of every column (label included) with the label,
# computed one column at a time with np.corrcoef.
# All values are collected first and the Series is built once:
# Series.append inside a loop copies the whole Series on every iteration and
# was removed in pandas 2.0.
# Constant columns have zero variance, so np.corrcoef returns NaN for them
# (with a RuntimeWarning).
label_values = df_temp['Churn']
df_corr = pd.Series(
    [np.corrcoef(df_temp[col], label_values)[0, 1] for col in df_temp],
    index=df_temp.columns,
    dtype=np.float64,
)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
# Correlations in column order (not yet sorted); the last entry is the label itself.
df_corr

tenure_tenure_year_2014_mean           -0.227663
tenure_tenure_year_2014_var             0.227663
tenure_tenure_year_2014_max            -0.227663
tenure_tenure_year_2014_min            -0.227663
tenure_tenure_year_2014_skew            0.227663
                                          ...   
StreamingMovies_cv_tenure_quarter_4     0.201279
Contract_cv_tenure_quarter_4            0.201279
PaperlessBilling_cv_tenure_quarter_4    0.201279
PaymentMethod_cv_tenure_quarter_4      -0.201279
Churn                                   1.000000
Length: 16906, dtype: float64

In [None]:
# Look at the 20 entries with the largest absolute correlation
df_corr.abs().sort_values(ascending=False).head(20)

Churn                                                            1.000000
tenure_month_6_norm_tenure_year_2019                             0.320558
tenure_tenure_year_2019_q2                                       0.320558
StreamingMovies_tenure_year_2019_var                             0.320558
MonthlyCharges_tenure_year_2019_mean                             0.320558
tenure_year_2018_tenure_year_2019_var                            0.320558
PaperlessBilling_cv_tenure_year_2019                             0.320558
TotalCharges_tenure_year_2019_max                                0.320558
TotalCharges_tenure_year_2019_var                                0.320558
tenure_month_2_norm_tenure_year_2019                             0.320558
tenure_month_8_norm_tenure_year_2019                             0.320558
DeviceProtection_cv_tenure_year_2019                             0.320558
tenure_month_9_tenure_year_2019_var                              0.320558
MonthlyCharges_tenure_year_2019_min   

In [None]:
# Ranks 500-519 by absolute correlation
df_corr.abs().sort_values(ascending=False).iloc[500:520]

TechSupport_tenure_year_2014_var                                  0.227663
StreamingMovies_dive1_StreamingMovies_tenure_year_2014_mean       0.227663
TotalCharges_tenure_year_2014_min                                 0.227663
PhoneService_mag2_tenure_year_2014                                0.227663
InternetService_tenure_year_2014_mean                             0.227663
tenure_quarter_3_minus1_tenure_quarter_3_tenure_year_2014_mean    0.227663
tenure_month_3_cv_tenure_year_2014                                0.227663
tenure_quarter_3_minus2_tenure_quarter_3_tenure_year_2014_mean    0.227663
PhoneService_tenure_year_2014_var                                 0.227663
PhoneService_norm_tenure_year_2014                                0.227663
PaperlessBilling_mag2_tenure_year_2014                            0.227663
PaymentMethod_norm_tenure_year_2014                               0.227663
tenure_month_9_tenure_year_2014_var                               0.227663
tenure_year_2019_cv_tenur

In [None]:
# Align the time-series frames with the encoded frames (in place)
X_train_seq.index, X_test_seq.index = X_train_OE.index, X_test_OE.index

# Discrete columns only: encoded categorical features plus the time-series indicators
train_temp = pd.concat([X_train_OE[category_cols], X_train_seq], axis="columns")
test_temp = pd.concat([X_test_OE[category_cols], X_test_seq], axis="columns")

# Pairwise cross-combination features over every column of the frame
cross_result = Cross_Combination(train_temp.columns.tolist(), train_temp, test_temp)
features_train_new, features_test_new, colNames_train_new, colNames_test_new = cross_result

# Restore the original row labels, then attach the label column
features_train_new.index = X_train.index
features_test_new.index = X_test.index
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Pick the 5 derived features with the largest absolute correlation;
# position 0 is skipped because it is the label itself.
df_corr = df_temp.corr()['Churn'].sort_values(ascending=False)
ranked_abs_corr = df_corr.abs().sort_values(ascending=False)
new_col = ranked_abs_corr.index[1:6].tolist()
print(new_col)

# Frames holding just those features
train_new_BC = features_train_new[new_col]
test_new_BC = features_test_new[new_col]

['OnlineSecurity&Contract_0.0&0.0', 'TechSupport&Contract_0.0&0.0', 'InternetService&Contract_1.0&0.0', 'Contract&tenure_year_2014_0.0&0.0', 'Contract&tenure_year_2020_0.0&0.0']


In [None]:
# Preview the selected cross-combination features (0/1 indicator columns).
train_new_BC.head()

Unnamed: 0,OnlineSecurity&Contract_0.0&0.0,TechSupport&Contract_0.0&0.0,InternetService&Contract_1.0&0.0,Contract&tenure_year_2014_0.0&0.0,Contract&tenure_year_2020_0.0&0.0
4067,0.0,0.0,0.0,0.0,0.0
3306,1.0,0.0,1.0,1.0,1.0
3391,0.0,0.0,0.0,1.0,1.0
3249,1.0,1.0,0.0,1.0,1.0
2674,0.0,0.0,0.0,1.0,1.0


In [None]:
# Combine the encoded features with the selected cross-combination features
train_temp = pd.concat([X_train_OE, train_new_BC], axis=1)
test_temp = pd.concat([X_test_OE, test_new_BC], axis=1)

# Grouping keys: the cross-combination feature names held in train_new_BC
cat_temp = list(train_new_BC.columns)
print(cat_temp)

# Containers for the derived features and their column names
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Use every cross-combination feature in turn as the grouping key.
# The statistics are always taken over numeric_cols and category_cols, so no
# "remaining columns" list is needed here; the keys are iterated directly.
for keyCol in cat_temp:
    features_train1, features_test1, colNames_train, colNames_test = Group_Statistics(
        keyCol,
        train_temp,
        test_temp,
        col_num=numeric_cols,
        col_cat=category_cols,
    )

    colNames_train_new.extend(colNames_train)
    colNames_test_new.extend(colNames_test)
    features_train_new.append(features_train1)
    features_test_new.append(features_test1)

# Stitch the per-key frames together column-wise
features_train_new = pd.concat(features_train_new, axis=1)
features_test_new = pd.concat(features_test_new, axis=1)

# Report the size of the derived feature sets
print(features_train_new.shape)
print(features_test_new.shape)

['OnlineSecurity&Contract_0.0&0.0', 'TechSupport&Contract_0.0&0.0', 'InternetService&Contract_1.0&0.0', 'Contract&tenure_year_2014_0.0&0.0', 'Contract&tenure_year_2020_0.0&0.0']
(5282, 840)
(1761, 840)


In [None]:
# Re-label the rows with X_train's index so the concat below aligns the
# derived features with y_train row by row (mutates features_train_new in place).
features_train_new.index = X_train.index

# Attach the label column to the derived features
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Pearson correlation of every column (label included) with the label,
# computed one column at a time with np.corrcoef.
# All values are collected first and the Series is built once:
# Series.append inside a loop copies the whole Series on every iteration and
# was removed in pandas 2.0.
# Constant columns have zero variance, so np.corrcoef returns NaN for them
# (with a RuntimeWarning).
label_values = df_temp['Churn']
df_corr = pd.Series(
    [np.corrcoef(df_temp[col], label_values)[0, 1] for col in df_temp],
    index=df_temp.columns,
    dtype=np.float64,
)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
# Look at the 20 entries with the largest absolute correlation
df_corr.abs().sort_values(ascending=False).head(20)

Churn                                                    1.000000
TotalCharges_OnlineSecurity&Contract_0.0&0.0_min         0.441598
StreamingTV_OnlineSecurity&Contract_0.0&0.0_var          0.441598
MonthlyCharges_OnlineSecurity&Contract_0.0&0.0_q1        0.441598
TechSupport_OnlineSecurity&Contract_0.0&0.0_mean         0.441598
InternetService_OnlineSecurity&Contract_0.0&0.0_mean     0.441598
OnlineSecurity_OnlineSecurity&Contract_0.0&0.0_var       0.441598
MonthlyCharges_OnlineSecurity&Contract_0.0&0.0_median    0.441598
MonthlyCharges_OnlineSecurity&Contract_0.0&0.0_var       0.441598
MonthlyCharges_OnlineSecurity&Contract_0.0&0.0_skew      0.441598
TotalCharges_OnlineSecurity&Contract_0.0&0.0_q2          0.441598
StreamingTV_OnlineSecurity&Contract_0.0&0.0_mean         0.441598
Contract_OnlineSecurity&Contract_0.0&0.0_var             0.441598
StreamingMovies_OnlineSecurity&Contract_0.0&0.0_mean     0.441598
OnlineBackup_OnlineSecurity&Contract_0.0&0.0_mean        0.441598
gender_Onl

In [None]:
# Column order of the encoded frame, used to map a column to its encoder categories.
X_train_OE.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [None]:
# Categories learned per column, in the order of category_cols; a value's
# position inside its array is the code it is encoded to.
ord_enc.categories_

[array(['Female', 'Male'], dtype=object),
 array([0, 1], dtype=int64),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'No phone service', 'Yes'], dtype=object),
 array(['DSL', 'Fiber optic', 'No'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['No', 'No internet service', 'Yes'], dtype=object),
 array(['Month-to-month', 'One year', 'Two year'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Bank transfer (automatic)', 'Credit card (automatic)',
        'Electronic check', 'Mailed check'], dtype=object)]

In [None]:
# Column at position 7, to be matched against ord_enc.categories_[7]
X_train_OE.columns[7]

'OnlineSecurity'

In [None]:
# Category values recorded for the column at position 7
ord_enc.categories_[7]

array(['No', 'No internet service', 'Yes'], dtype=object)

In [None]:
# Original categorical fields taking part in the cross combination
col1 = ['OnlineSecurity', 'Contract', 'TechSupport']
# Tenure-derived columns taking part in the cross combination
col2 = ['tenure_year_2014', 'tenure_year_2019', 'tenure_year_2020']
# Full candidate list: categorical fields first, tenure-derived columns after
cl = [*col1, *col2]

In [None]:
# Concatenate the encoded features with the selected tenure-derived columns
train_temp = pd.concat([X_train_OE, X_train_seq[col2]], axis=1)
test_temp = pd.concat([X_test_OE, X_test_seq[col2]], axis=1)
print(train_temp.shape)
print(test_temp.shape)

# Containers for the derived features and their names
col_temp = []
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Number of candidate columns. Computed here instead of being inherited from
# an earlier cell, so the loops always cover exactly the entries of `cl`
# (a stale `n` would silently drop triples or raise IndexError).
n = len(cl)

# Visit every unordered triple (i < j < k) of candidate columns
for i in range(n):
    for j in range(i+1, n):
        for k in range(j+1, n):
            col_temp = [cl[i], cl[j], cl[k]]
            # Cross_Combination is defined outside this chunk; multi=True
            # presumably requests one joint combination of all listed columns
            # rather than pairwise ones - TODO confirm.
            features_train1, features_test1, colNames_train, colNames_test = Cross_Combination(
                col_temp,
                train_temp,
                test_temp,
                multi=True)

            colNames_train_new.extend(colNames_train)
            colNames_test_new.extend(colNames_test)
            features_train_new.append(features_train1)
            features_test_new.append(features_test1)

# Leave the scratch list empty, as the cell always did
col_temp = []

# Assemble the three-way cross-combination data sets
features_train_new = pd.concat(features_train_new, axis=1)
features_test_new = pd.concat(features_test_new, axis=1)

# Size of the derived data sets
print(features_train_new.shape)
print(features_test_new.shape)

# Re-align with the original indices so the label can be attached
features_train_new.index = X_train.index
features_test_new.index = X_test.index
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Keep the 5 derived features with the largest absolute correlation with
# Churn; position 0 is skipped because it is Churn itself (|corr| = 1).
df_corr = df_temp.corr()['Churn'].sort_values(ascending = False)
new_col = list(np.abs(df_corr).sort_values(ascending = False)[1: 6].index)
print(new_col)

# Matching train / test frames restricted to the selected features
train_new_MC = features_train_new[new_col]
test_new_MC = features_test_new[new_col]

(5282, 22)
(1761, 22)
(5282, 225)
(1761, 225)
['OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0', 'OnlineSecurity&Contract&tenure_year_2020_0.0&0.0&0.0', 'OnlineSecurity&Contract&TechSupport_0.0&0.0&0.0', 'Contract&TechSupport&tenure_year_2014_0.0&0.0&0.0', 'Contract&TechSupport&tenure_year_2020_0.0&0.0&0.0']


In [None]:
# Append the selected cross-combination features to the encoded data
train_temp = pd.concat([X_train_OE, train_new_MC], axis=1)
test_temp = pd.concat([X_test_OE, test_new_MC], axis=1)

# Names of the cross-combination columns; each serves as the grouping key in turn
cat_temp = list(train_new_MC.columns).copy()
print(cat_temp)

# Containers
col_temp = cat_temp.copy()
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# One Group_Statistics pass per key column (Group_Statistics is defined
# outside this chunk)
for keyCol in cat_temp:
    features_train1, features_test1, colNames_train, colNames_test = Group_Statistics(
        keyCol,
        train_temp,
        test_temp,
        col_num=numeric_cols,
        col_cat=category_cols)

    colNames_train_new.extend(colNames_train)
    colNames_test_new.extend(colNames_test)
    features_train_new.append(features_train1)
    features_test_new.append(features_test1)

# Merge the per-key results column-wise
features_train_new = pd.concat(features_train_new, axis=1)
features_test_new = pd.concat(features_test_new, axis=1)

# Size of the derived feature sets
print(features_train_new.shape)
print(features_test_new.shape)

['OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0', 'OnlineSecurity&Contract&tenure_year_2020_0.0&0.0&0.0', 'OnlineSecurity&Contract&TechSupport_0.0&0.0&0.0', 'Contract&TechSupport&tenure_year_2014_0.0&0.0&0.0', 'Contract&TechSupport&tenure_year_2020_0.0&0.0&0.0']
(5282, 840)
(1761, 840)


最后查看衍生特征和标签的相关系数：

In [None]:
# Re-align the derived features with the training index
features_train_new.index = X_train.index

# Put the derived features and the label side by side
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Correlation of every column with the label, computed one column at a time.
# The values are collected in a list and wrapped in a Series once: growing a
# Series with Series.append inside the loop is quadratic and the method was
# removed in pandas 2.0.
# Zero-variance columns yield NaN and trigger a NumPy RuntimeWarning.
corr_values = [np.corrcoef(df_temp[col], df_temp['Churn'])[0, 1] for col in df_temp]
df_corr = pd.Series(corr_values, index=list(df_temp.columns), dtype=np.float64)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
# Inspect the 20 columns with the largest absolute correlation with the label
np.abs(df_corr).sort_values(ascending = False)[: 20]

Churn                                                                           1.000000
MonthlyCharges_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_q2          0.443017
gender_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_var                 0.443017
DeviceProtection_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_var       0.443017
TotalCharges_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_min           0.443017
TotalCharges_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_var           0.443017
DeviceProtection_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_mean      0.443017
PaymentMethod_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_mean         0.443017
Contract_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_mean              0.443017
PaperlessBilling_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_mean      0.443017
PaperlessBilling_OnlineSecurity&Contract&tenure_year_2014_0.0&0.0&0.0_var       0.443017
tenure_OnlineSecurity

<center><img src="https://s2.loli.net/2022/03/17/o1Vi2h6RDeMEmkb.png" alt="image-20220317153432864" style="zoom:40%;" />

<center><img src="https://s2.loli.net/2022/03/17/U8ZbmpgyFSucCDz.png" alt="image-20220317153443674" style="zoom:50%;" />

In [None]:
# Label column(s) handed to Target_Encode
col_cat = [target]
print(col_cat)

# Containers
col_temp = category_cols.copy()
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Target-encode the label grouped by each categorical field in turn.
# Target_Encode is defined outside this chunk; judging by the resulting column
# names, extension=True also adds derived statistics - TODO confirm.
for keyCol in col_temp:
    # Unpack the per-key names into their own variables. Unpacking straight
    # into colNames_*_new would overwrite the accumulators and then extend
    # them with stale names left behind by an earlier cell.
    features_train1, features_test1, colNames_train, colNames_test = Target_Encode(
        keyCol,
        X_train_OE,
        y_train,
        X_test_OE,
        col_cat=col_cat,
        extension=True)

    colNames_train_new.extend(colNames_train)
    colNames_test_new.extend(colNames_test)
    features_train_new.append(features_train1)
    features_test_new.append(features_test1)

['Churn']


In [None]:
# Inspect the encoded frame produced for the first key column
features_train_new[0]

Unnamed: 0,Churn_gender_mean_kfold,Churn_gender_var_kfold,Churn_gender_max_kfold,Churn_gender_min_kfold,Churn_gender_median_kfold,Churn_gender_count_kfold,Churn_gender_nunique_kfold,Churn_gender_q1_kfold,Churn_gender_q2_kfold,Churn_dive1_Churn_gender_mean_kfold,Churn_dive2_Churn_gender_median_kfold,Churn_minus1_Churn_gender_mean_kfold,Churn_minus2_Churn_gender_mean_kfold,Churn_norm_gender_kfold,Churn_gap_gender_kfold,Churn_mag1_gender_kfold,Churn_mag2_gender_kfold,Churn_cv_gender_kfold
0,0.272425,0.198304,1,0,0,2107,2,0,1.00,0.000000,0.0,-0.272425,-0.272425,-0.611747,1.00,-0.272425,0.0,1.634565
1,0.258141,0.191594,1,0,0,2119,2,0,1.00,3.873707,100000.0,0.741859,0.741859,1.694807,1.00,-0.258141,0.0,1.695580
3,0.248839,0.187005,1,0,0,2154,2,0,0.00,4.018495,100000.0,0.751161,0.751161,1.736985,0.00,-0.248839,0.0,1.737762
4,0.272425,0.198304,1,0,0,2107,2,0,1.00,0.000000,0.0,-0.272425,-0.272425,-0.611747,1.00,-0.272425,0.0,1.634565
6,0.260540,0.192750,1,0,0,2111,2,0,1.00,3.838035,100000.0,0.739460,0.739460,1.684253,1.00,-0.260540,0.0,1.685025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.258141,0.191594,1,0,0,2119,2,0,1.00,3.873707,100000.0,0.741859,0.741859,1.694807,1.00,-0.258141,0.0,1.695580
7039,0.264775,0.194761,1,0,0,2115,2,0,1.00,0.000000,0.0,-0.264775,-0.264775,-0.599952,1.00,-0.264775,0.0,1.666700
7040,0.264775,0.194761,1,0,0,2115,2,0,1.00,0.000000,0.0,-0.264775,-0.264775,-0.599952,1.00,-0.264775,0.0,1.666700
7041,0.250000,0.187589,1,0,0,2096,2,0,0.25,3.999840,100000.0,0.750000,0.750000,1.731598,0.25,-0.250000,0.0,1.732395


In [None]:
# Join the per-key target-encoding frames side by side, for each split
train_new_TE = pd.concat(features_train_new, axis='columns')
test_new_TE = pd.concat(features_test_new, axis='columns')

In [None]:
# Preview the first rows of the combined target-encoding features
train_new_TE.head()

Unnamed: 0,Churn_gender_mean_kfold,Churn_gender_var_kfold,Churn_gender_max_kfold,Churn_gender_min_kfold,Churn_gender_median_kfold,Churn_gender_count_kfold,Churn_gender_nunique_kfold,Churn_gender_q1_kfold,Churn_gender_q2_kfold,Churn_dive1_Churn_gender_mean_kfold,...,Churn_PaymentMethod_q2_kfold,Churn_dive1_Churn_PaymentMethod_mean_kfold,Churn_dive2_Churn_PaymentMethod_median_kfold,Churn_minus1_Churn_PaymentMethod_mean_kfold,Churn_minus2_Churn_PaymentMethod_mean_kfold,Churn_norm_PaymentMethod_kfold,Churn_gap_PaymentMethod_kfold,Churn_mag1_PaymentMethod_kfold,Churn_mag2_PaymentMethod_kfold,Churn_cv_PaymentMethod_kfold
0,0.272425,0.198304,1,0,0,2107,2,0,1.0,0.0,...,1,4.377378,200000.0,1.543115,1.543115,3.09663,1,-0.456885,0.0,1.090647
1,0.258141,0.191594,1,0,0,2119,2,0,1.0,3.873707,...,0,14.87491,300000.0,2.798328,2.798328,6.970235,0,-0.201672,0.0,1.990552
3,0.248839,0.187005,1,0,0,2154,2,0,0.0,4.018495,...,0,0.0,0.0,-0.148724,-0.148724,-0.417735,0,-0.148724,0.0,2.393631
4,0.272425,0.198304,1,0,0,2107,2,0,1.0,0.0,...,1,4.377378,200000.0,1.543115,1.543115,3.09663,1,-0.456885,0.0,1.090647
6,0.26054,0.19275,1,0,0,2111,2,0,1.0,3.838035,...,0,6.663735,100000.0,0.849944,0.849944,2.378554,0,-0.150056,0.0,2.381129


In [None]:
# Both splits should report the same number of derived columns
train_new_TE.shape, test_new_TE.shape

((5282, 288), (1761, 288))

In [None]:
# Re-align the target-encoded training features with the training index
train_new_TE.index = X_train.index

# Features and label side by side, then each column's correlation with Churn
df_temp = pd.concat([train_new_TE, y_train], axis=1)
df_corr = df_temp.corr()['Churn'].sort_values(ascending=False)

# 20 largest absolute correlations (the first row is Churn itself)
df_corr.abs().sort_values(ascending=False).head(20)

Churn                                           1.000000
Churn_dive1_Churn_PaymentMethod_mean_kfold      0.028967
Churn_PaymentMethod_q2_kfold                    0.028299
Churn_gap_PaymentMethod_kfold                   0.028299
Churn_cv_OnlineBackup_kfold                     0.027046
Churn_StreamingTV_q2_kfold                      0.026733
Churn_gap_StreamingMovies_kfold                 0.026733
Churn_StreamingMovies_q2_kfold                  0.026733
Churn_gap_StreamingTV_kfold                     0.026733
Churn_dive1_Churn_InternetService_mean_kfold    0.026715
Churn_cv_StreamingTV_kfold                      0.026328
Churn_norm_InternetService_kfold                0.026254
Churn_StreamingTV_var_kfold                     0.026090
Churn_OnlineBackup_var_kfold                    0.026039
Churn_OnlineBackup_count_kfold                  0.025715
Churn_PaymentMethod_count_kfold                 0.025656
Churn_mag1_PaymentMethod_kfold                  0.025561
Churn_PaymentMethod_mean_kfold 

In [None]:
# Slice [1: 2] skips position 0 (Churn itself, |corr| = 1) and keeps the single
# derived feature with the largest absolute correlation.
new_col = list(np.abs(df_corr).sort_values(ascending = False)[1: 2].index)
new_col

['Churn_dive1_Churn_PaymentMethod_mean_kfold']

In [None]:
# Selected candidate feature(s), taken from both splits
train_new_temp = train_new_TE.loc[:, new_col]
test_new_temp = test_new_TE.loc[:, new_col]

# Fit and score a model on the encoded data plus the candidate feature(s).
# features_test is defined outside this chunk; judging by the cell output it
# prints the elapsed time and returns a score with the chosen hyper-parameters.
features_test(train_new_temp, test_new_temp,
              X_train_OE, X_test_OE,
              y_train, y_test,
              category_cols, numeric_cols)

43.17520570755005 s


(0.81067880794702,
 {'columntransformer__num': StandardScaler(),
  'logit_threshold__C': 0.2,
  'logit_threshold__penalty': 'l1',
  'logit_threshold__solver': 'saga'})

<center><img src="https://s2.loli.net/2022/03/16/1UWuFzTK3dLHQMR.png" alt="image-20220316171913807" style="zoom:50%;" />

In [None]:
# Re-align the target-encoded test features with the test index
test_new_TE.index = X_test.index

# Features and test label side by side, then each column's correlation with Churn
df_temp = pd.concat([test_new_TE, y_test], axis=1)
df_corr = df_temp.corr()['Churn'].sort_values(ascending=False)

# 20 largest absolute correlations (the first row is Churn itself)
df_corr.abs().sort_values(ascending=False).head(20)

Churn                                           1.000000
Churn_Contract_count_kfold                      0.420910
Churn_gap_Contract_kfold                        0.420833
Churn_Contract_q2_kfold                         0.420833
Churn_mag1_Contract_kfold                       0.419774
Churn_Contract_mean_kfold                       0.419774
Churn_Contract_var_kfold                        0.416007
Churn_minus2_Churn_Contract_mean_kfold          0.412676
Churn_minus1_Churn_Contract_mean_kfold          0.412676
Churn_dive2_Churn_Contract_median_kfold         0.410428
Churn_cv_Contract_kfold                         0.403028
Churn_norm_Contract_kfold                       0.398627
Churn_dive1_Churn_Contract_mean_kfold           0.384253
Churn_cv_InternetService_kfold                  0.341227
Churn_InternetService_count_kfold               0.340154
Churn_InternetService_var_kfold                 0.338333
Churn_mag1_InternetService_kfold                0.334336
Churn_InternetService_mean_kfol

In [None]:
# Label column(s) handed to Target_Encode
col_cat = [target]
print(col_cat)

# Encoded features plus the tenure-derived columns
X_train_temp = pd.concat([X_train_OE, X_train_seq], axis=1)
X_test_temp = pd.concat([X_test_OE, X_test_seq], axis=1)

# Containers; the key columns are the tenure-derived ones
col_temp = list(X_train_seq.columns).copy()
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Target-encode the label grouped by each tenure-derived column in turn
# (Target_Encode is defined outside this chunk).
for keyCol in col_temp:
    # Unpack the per-key names into their own variables. Unpacking straight
    # into colNames_*_new would overwrite the accumulators and then extend
    # them with stale names left behind by an earlier cell.
    features_train1, features_test1, colNames_train, colNames_test = Target_Encode(
        keyCol,
        X_train_temp,
        y_train,
        X_test_temp,
        col_cat=col_cat,
        extension=True)

    colNames_train_new.extend(colNames_train)
    colNames_test_new.extend(colNames_test)
    features_train_new.append(features_train1)
    features_test_new.append(features_test1)

# Merge the per-key results column-wise
train_new_TE_seq = pd.concat(features_train_new, axis=1)
test_new_TE_seq = pd.concat(features_test_new, axis=1)

# Correlation check: re-align with the training index first
train_new_TE_seq.index = X_train.index

# Derived features and label side by side
df_temp = pd.concat([train_new_TE_seq, y_train], axis=1)

# Correlation of every column with the label
df_corr = df_temp.corr()['Churn'].sort_values(ascending = False)

# Inspect the 20 columns with the largest absolute correlation
np.abs(df_corr).sort_values(ascending = False)[: 20]

['Churn']


Churn                                              1.000000
Churn_gap_tenure_year_2015_kfold                   0.031359
Churn_tenure_year_2015_q2_kfold                    0.031359
Churn_dive2_Churn_tenure_year_2015_median_kfold    0.031359
Churn_tenure_year_2015_count_kfold                 0.031286
Churn_minus1_Churn_tenure_year_2015_mean_kfold     0.031271
Churn_minus2_Churn_tenure_year_2015_mean_kfold     0.031271
Churn_norm_tenure_year_2015_kfold                  0.030900
Churn_tenure_year_2015_mean_kfold                  0.030564
Churn_mag1_tenure_year_2015_kfold                  0.030564
Churn_dive1_Churn_tenure_year_2015_mean_kfold      0.030321
Churn_tenure_year_2015_var_kfold                   0.030229
Churn_cv_tenure_year_2015_kfold                    0.029787
Churn_mag1_tenure_year_2019_kfold                  0.028483
Churn_tenure_year_2019_mean_kfold                  0.028483
Churn_cv_tenure_year_2019_kfold                    0.028470
Churn_tenure_year_2019_var_kfold        

In [None]:
# Slice [1: 2] skips position 0 (Churn itself, |corr| = 1) and keeps the single
# derived feature with the largest absolute correlation.
new_col = list(np.abs(df_corr).sort_values(ascending = False)[1: 2].index)
new_col

['Churn_gap_tenure_year_2015_kfold']

In [None]:
# Selected candidate feature(s), taken from both splits
train_new_temp = train_new_TE_seq.loc[:, new_col]
test_new_temp = test_new_TE_seq.loc[:, new_col]

# Fit and score a model on the encoded data plus the candidate feature(s).
# features_test is defined outside this chunk; judging by the cell output it
# prints the elapsed time and returns a score with the chosen hyper-parameters.
features_test(train_new_temp, test_new_temp,
              X_train_OE, X_test_OE,
              y_train, y_test,
              category_cols, numeric_cols)

43.03908157348633 s


(0.8104897723689115,
 {'columntransformer__num': StandardScaler(),
  'logit_threshold__C': 0.2,
  'logit_threshold__penalty': 'l1',
  'logit_threshold__solver': 'saga'})

In [None]:
# Categorical fields plus the tenure-derived columns, for each split
train_temp = pd.concat([X_train_OE[category_cols], X_train_seq], axis=1)
test_temp = pd.concat([X_test_OE[category_cols], X_test_seq], axis=1)

# Names of all discrete columns taking part in the combination
col_temp = [*category_cols, *X_train_seq.columns]

# Two-variable cross combination (Cross_Combination is defined outside this chunk)
features_train_new, features_test_new, colNames_train_new, colNames_test_new = Cross_Combination(
    col_temp,
    train_temp,
    test_temp)

# Re-align the derived frames with the original indices
features_train_new.index = X_train.index
features_test_new.index = X_test.index

# Encoded features + tenure-derived columns + pairwise combinations
X_train_temp = pd.concat([X_train_OE, X_train_seq, features_train_new], axis=1)
X_test_temp = pd.concat([X_test_OE, X_test_seq, features_test_new], axis=1)

In [None]:
# Number of pairwise cross-combination columns that were generated
features_train_new.shape

(5282, 3589)

In [None]:
# Size of the merged training frame that the next cell feeds to Target_Encode
X_train_temp.shape

(5282, 3631)

In [None]:
# Label column(s) handed to Target_Encode
col_cat = [target]
print(col_cat)

# Key columns: every pairwise cross-combination feature. The names are captured
# before features_train_new is reused as a list container below.
key_temp = list((features_train_new.columns)).copy()
n = len(key_temp)
print(n)

colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Target-encode the label grouped by each cross-combination feature in turn
# (Target_Encode is defined outside this chunk). This is the slow step of the
# notebook: the recorded run took about 28 minutes.
for keyCol in tqdm(key_temp):
    # Unpack the per-key names into their own variables. Unpacking straight
    # into colNames_*_new would overwrite the accumulators and then extend
    # them with stale names left behind by an earlier cell.
    features_train1, features_test1, colNames_train, colNames_test = Target_Encode(
        keyCol,
        X_train_temp,
        y_train,
        X_test_temp,
        col_cat=col_cat)

    colNames_train_new.extend(colNames_train)
    colNames_test_new.extend(colNames_test)
    features_train_new.append(features_train1)
    features_test_new.append(features_test1)

# Merge the per-key results column-wise
train_new_TE_BC = pd.concat(features_train_new, axis=1)
test_new_TE_BC = pd.concat(features_test_new, axis=1)
print(train_new_TE_BC.shape)
print(test_new_TE_BC.shape)

# Release memory held by intermediates
gc.collect()

  0%|          | 0/3589 [00:00<?, ?it/s]

['Churn']
3589


100%|██████████| 3589/3589 [28:18<00:00,  2.11it/s]


(5282, 32301)
(1761, 32301)


0

In [None]:
# Size of the target-encoded cross-combination training features
train_new_TE_BC.shape

(5282, 32301)

In [None]:
# Correlation check: re-align with the training index first
train_new_TE_BC.index = X_train.index

# Derived features and label side by side
df_temp = pd.concat([train_new_TE_BC, y_train], axis=1)

# Correlation of every column with the label, computed one column at a time.
# The values are collected in a list and wrapped in a Series once: growing a
# Series with Series.append inside the loop is quadratic (noticeable with tens
# of thousands of columns) and the method was removed in pandas 2.0.
# Zero-variance columns yield NaN and trigger a NumPy RuntimeWarning.
corr_values = [np.corrcoef(df_temp[col], df_temp['Churn'])[0, 1]
               for col in tqdm(df_temp.columns)]
df_corr = pd.Series(corr_values, index=list(df_temp.columns), dtype=np.float64)

# Release memory held by intermediates
gc.collect()

  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 32302/32302 [01:38<00:00, 329.55it/s]


26

In [None]:
# Inspect the 20 columns with the largest absolute correlation with the label
np.abs(df_corr).sort_values(ascending = False)[: 20]

Churn                                                       1.000000
Churn_Contract&PaymentMethod_0.0&3.0_mean_kfold             0.048124
Churn_Contract&PaymentMethod_0.0&3.0_var_kfold              0.047889
Churn_Contract&PaymentMethod_0.0&3.0_count_kfold            0.047669
Churn_PaymentMethod&tenure_year_2015_3.0&0.0_count_kfold    0.045870
Churn_PaymentMethod&tenure_year_2015_3.0&0.0_q2_kfold       0.045795
Churn_PaymentMethod&tenure_year_2015_3.0&0.0_mean_kfold     0.045372
Churn_PaymentMethod&tenure_year_2015_3.0&0.0_var_kfold      0.045235
Churn_Partner&PaymentMethod_0.0&3.0_q2_kfold                0.042659
Churn_Partner&PaymentMethod_0.0&3.0_count_kfold             0.042645
Churn_PaymentMethod&tenure_month_5_3.0&0.0_count_kfold      0.041902
Churn_PaymentMethod&tenure_month_5_3.0&0.0_q2_kfold         0.041848
Churn_PaymentMethod&tenure_year_2014_3.0&0.0_count_kfold    0.041755
Churn_PaymentMethod&tenure_year_2014_3.0&0.0_q2_kfold       0.041701
Churn_PaymentMethod&tenure_month_5

In [None]:
# Look further down the ranking: positions 60-79 by absolute correlation
np.abs(df_corr).sort_values(ascending = False)[60: 80]

Churn_PaymentMethod&tenure_month_1_3.0&0.0_q2_kfold          0.038604
Churn_StreamingMovies&PaperlessBilling_2.0&1.0_mean_kfold    0.038552
Churn_PaymentMethod&tenure_year_2017_3.0&0.0_count_kfold     0.038336
Churn_StreamingMovies&PaperlessBilling_2.0&1.0_var_kfold     0.038299
Churn_PaymentMethod&tenure_year_2017_3.0&0.0_q2_kfold        0.038265
Churn_PaymentMethod&tenure_year_2017_3.0&0.0_mean_kfold      0.038190
Churn_PaymentMethod&tenure_year_2017_3.0&0.0_var_kfold       0.038023
Churn_PaymentMethod&tenure_month_12_3.0&0.0_mean_kfold       0.037821
Churn_PaymentMethod&tenure_month_7_3.0&0.0_count_kfold       0.037772
Churn_PaymentMethod&tenure_month_7_3.0&0.0_q2_kfold          0.037741
Churn_PaymentMethod&tenure_month_12_3.0&0.0_count_kfold      0.037736
Churn_PaymentMethod&tenure_month_12_3.0&0.0_q2_kfold         0.037713
Churn_PaymentMethod&tenure_month_12_3.0&0.0_var_kfold        0.037682
Churn_PaymentMethod&tenure_month_10_3.0&0.0_mean_kfold       0.037666
Churn_PaymentMethod&

In [None]:
# IPython help lookup: show the signature and docstring of NLP_Group_Stat
NLP_Group_Stat?

[1;31mSignature:[0m
[0mNLP_Group_Stat[0m[1;33m([0m[1;33m
[0m    [0mX_train[0m[1;33m,[0m[1;33m
[0m    [0mX_test[0m[1;33m,[0m[1;33m
[0m    [0mcol_cat[0m[1;33m,[0m[1;33m
[0m    [0mkeyCol[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtfidf[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mcountVec[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
NLP特征衍生函数

:param X_train: 训练集特征
:param X_test: 测试集特征
:param col_cat: 参与衍生的离散型变量，只能带入多个列
:param keyCol: 分组参考的关键变量，输入字符串时代表按照单独列分组，输入list代表按照多个列进行分组
:param tfidf: 是否进行tfidf计算  
:param countVec: 是否进行CountVectorizer计算

:return：NLP特征衍生后的新特征和新特征的名称
[1;31mFile:[0m      d:\work\jupyter\telco\正式课程\features_creation.py
[1;31mType:[0m      function


In [None]:
# Encoded features plus the tenure-derived columns, for each split
X_train_temp = pd.concat([X_train_OE, X_train_seq], axis=1)
X_test_temp = pd.concat([X_test_OE, X_test_seq], axis=1)

# Names of all discrete columns
col_cat = [*category_cols, *X_train_seq.columns]

# NLP-style feature derivation without a grouping key (keyCol left at its default)
features_train_new, features_test_new, colNames_train_new, colNames_test_new = NLP_Group_Stat(
    X_train_temp,
    X_test_temp,
    col_cat)

# Size of the derived data set
print(features_train_new.shape)

# Correlation check: re-align with the training index first
features_train_new.index = X_train.index

# Derived features and label side by side, then correlation with Churn
df_temp = pd.concat([features_train_new, y_train], axis=1)
df_corr = df_temp.corr()['Churn'].sort_values(ascending=False)

# 20 largest absolute correlations (the first row is Churn itself)
df_corr.abs().sort_values(ascending=False).head(20)

(5282, 39)


Churn                     1.000000
Contract_tfidf            0.398629
tenure_year_2019_tfidf    0.333148
OnlineSecurity_tfidf      0.296923
TechSupport_tfidf         0.278950
PaperlessBilling_tfidf    0.246287
tenure_quarter_4_tfidf    0.233339
tenure_year_2014_tfidf    0.226298
tenure_month_12_tfidf     0.214692
PaymentMethod_tfidf       0.186292
OnlineBackup_tfidf        0.177502
DeviceProtection_tfidf    0.171308
SeniorCitizen_tfidf       0.161434
tenure_quarter_1_tfidf    0.136922
Dependents_tfidf          0.128100
Partner_tfidf             0.127240
PhoneService_tfidf        0.126132
tenure_month_1_tfidf      0.120929
tenure_year_2015_tfidf    0.098806
MultipleLines_tfidf       0.077045
Name: Churn, dtype: float64

In [None]:
# Slice [1: 4] skips position 0 (Churn itself, |corr| = 1) and keeps the three
# derived features with the largest absolute correlation.
new_col = list(np.abs(df_corr).sort_values(ascending = False)[1: 4].index)
new_col

['Contract_tfidf', 'tenure_year_2019_tfidf', 'OnlineSecurity_tfidf']

In [None]:
# Selected candidate features, taken from both splits
train_new_temp = features_train_new.loc[:, new_col]
test_new_temp = features_test_new.loc[:, new_col]

# Fit and score a model on the encoded data plus the candidate features.
# features_test is defined outside this chunk; judging by the cell output it
# prints the elapsed time and returns a score with the chosen hyper-parameters.
features_test(train_new_temp, test_new_temp,
              X_train_OE, X_test_OE,
              y_train, y_test,
              category_cols, numeric_cols)

47.58120656013489 s


(0.8108676643444855,
 {'columntransformer__num': 'passthrough',
  'logit_threshold__C': 0.1,
  'logit_threshold__penalty': 'l2',
  'logit_threshold__solver': 'lbfgs'})

In [None]:
# Containers
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Use each discrete column once as the grouping key; all the other columns
# take part in the derivation.
for i, keyCol in enumerate(col_cat):
    col_temp = col_cat[:i] + col_cat[i + 1:]
    features_train1, features_test1, colNames_train, colNames_test = NLP_Group_Stat(
        X_train_temp,
        X_test_temp,
        col_temp,
        keyCol)

    colNames_train_new.extend(colNames_train)
    colNames_test_new.extend(colNames_test)
    features_train_new.append(features_train1)
    features_test_new.append(features_test1)

# Leave the scratch list holding the full column list again
col_temp = col_cat.copy()

# Merge the per-key results column-wise
features_train_new = pd.concat(features_train_new, axis=1)
features_test_new = pd.concat(features_test_new, axis=1)
print(features_train_new.shape)
print(features_test_new.shape)

# Release memory held by intermediates
gc.collect()

# Correlation check: re-align with the training index first
features_train_new.index = X_train.index

# Derived features and label side by side, then correlation with Churn
df_temp = pd.concat([features_train_new, y_train], axis=1)
df_corr = df_temp.corr()['Churn'].sort_values(ascending=False)

# 20 largest absolute correlations (the first row is Churn itself)
df_corr.abs().sort_values(ascending=False).head(20)

(5282, 2964)
(1761, 2964)


Churn                              1.000000
tenure_month_8_Contract_cntv       0.406389
Dependents_Contract_tfidf          0.406367
tenure_month_12_Contract_tfidf     0.406191
OnlineSecurity_Contract_tfidf      0.406165
tenure_year_2018_Contract_cntv     0.405956
tenure_month_11_Contract_cntv      0.405771
Partner_Contract_tfidf             0.405769
PaperlessBilling_Contract_tfidf    0.405416
DeviceProtection_Contract_tfidf    0.405269
SeniorCitizen_Contract_tfidf       0.405048
tenure_quarter_3_Contract_cntv     0.404980
TechSupport_Contract_tfidf         0.404805
tenure_month_10_Contract_cntv      0.404802
tenure_year_2019_Contract_tfidf    0.404660
tenure_quarter_4_Contract_tfidf    0.404501
PaymentMethod_Contract_tfidf       0.404190
tenure_quarter_4_Contract_cntv     0.404121
tenure_month_7_Contract_cntv       0.404077
tenure_month_5_Contract_cntv       0.403653
Name: Churn, dtype: float64

In [None]:
# Look further down the ranking: positions 50-69 by absolute correlation
np.abs(df_corr).sort_values(ascending = False)[50: 70]

StreamingMovies_Contract_cntv            0.366081
tenure_month_8_OnlineSecurity_cntv       0.362731
Dependents_OnlineSecurity_tfidf          0.362719
tenure_year_2017_OnlineSecurity_cntv     0.362678
tenure_month_8_OnlineSecurity_tfidf      0.362612
PhoneService_OnlineSecurity_cntv         0.362591
gender_OnlineSecurity_cntv               0.362476
tenure_quarter_3_OnlineSecurity_cntv     0.362327
tenure_year_2020_OnlineSecurity_cntv     0.362078
tenure_month_7_OnlineSecurity_cntv       0.361873
tenure_month_9_OnlineSecurity_cntv       0.361821
tenure_year_2017_OnlineSecurity_tfidf    0.361776
tenure_month_10_OnlineSecurity_cntv      0.361756
tenure_month_11_OnlineSecurity_cntv      0.361098
tenure_quarter_3_Contract_tfidf          0.360663
SeniorCitizen_OnlineSecurity_cntv        0.360471
StreamingTV_OnlineSecurity_cntv          0.359959
gender_OnlineSecurity_tfidf              0.359015
tenure_month_6_OnlineSecurity_cntv       0.358508
PaymentMethod_OnlineSecurity_cntv        0.358384


In [None]:
# IPython help lookup: show the signature and docstring of NLP_Group_Stat
# (keyCol accepts either one column name or a list of columns)
NLP_Group_Stat?

[1;31mSignature:[0m
[0mNLP_Group_Stat[0m[1;33m([0m[1;33m
[0m    [0mX_train[0m[1;33m,[0m[1;33m
[0m    [0mX_test[0m[1;33m,[0m[1;33m
[0m    [0mcol_cat[0m[1;33m,[0m[1;33m
[0m    [0mkeyCol[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtfidf[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mcountVec[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
NLP特征衍生函数

:param X_train: 训练集特征
:param X_test: 测试集特征
:param col_cat: 参与衍生的离散型变量，只能带入多个列
:param keyCol: 分组参考的关键变量，输入字符串时代表按照单独列分组，输入list代表按照多个列进行分组
:param tfidf: 是否进行tfidf计算  
:param countVec: 是否进行CountVectorizer计算

:return：NLP特征衍生后的新特征和新特征的名称
[1;31mFile:[0m      d:\work\jupyter\telco\正式课程\features_creation.py
[1;31mType:[0m      function


In [None]:
# Categorical fields plus the tenure-derived columns, for each split
train_temp = pd.concat([X_train_OE[category_cols], X_train_seq], axis=1)
test_temp = pd.concat([X_test_OE[category_cols], X_test_seq], axis=1)

# Names of all discrete columns
col_cat = [*category_cols, *X_train_seq.columns]
n = len(col_cat)

# Containers
colNames_train_new = []
colNames_test_new = []
features_train_new = []
features_test_new = []

# Use every unordered pair of columns (i < j) as a two-column grouping key;
# the remaining columns take part in the derivation.
for i in tqdm(range(n)):
    for j in range(i + 1, n):
        keyCol = [col_cat[i], col_cat[j]]
        col_temp = [name for pos, name in enumerate(col_cat) if pos != i and pos != j]
        features_train1, features_test1, colNames_train, colNames_test = NLP_Group_Stat(
            train_temp,
            test_temp,
            col_temp,
            keyCol)

        colNames_train_new.extend(colNames_train)
        colNames_test_new.extend(colNames_test)
        features_train_new.append(features_train1)
        features_test_new.append(features_test1)

# Leave the scratch variables in their reset state
keyCol = []
col_temp = col_cat.copy()

# Merge the per-pair results column-wise
features_train_new = pd.concat(features_train_new, axis=1)
features_test_new = pd.concat(features_test_new, axis=1)
print(features_train_new.shape)
print(features_test_new.shape)

# Release memory held by intermediates
gc.collect()

100%|██████████| 39/39 [00:16<00:00,  2.38it/s]


(5282, 54834)
(1761, 54834)


26

In [None]:
# Correlation check: re-align with the training index first
features_train_new.index = X_train.index

# Derived features and label side by side
df_temp = pd.concat([features_train_new, y_train], axis=1)

# Correlation of every column with the label, computed one column at a time.
# The values are collected in a list and wrapped in a Series once: growing a
# Series with Series.append inside the loop is quadratic (noticeable with tens
# of thousands of columns) and the method was removed in pandas 2.0.
corr_values = [np.corrcoef(df_temp[col], df_temp['Churn'])[0, 1]
               for col in tqdm(df_temp.columns)]
df_corr = pd.Series(corr_values, index=list(df_temp.columns), dtype=np.float64)

# Release memory held by intermediates
gc.collect()

100%|██████████| 54835/54835 [00:39<00:00, 1392.86it/s]


26

In [None]:
# Inspect the 20 columns with the largest absolute correlation with the label
np.abs(df_corr).sort_values(ascending = False)[: 20]

Churn                                             1.000000
tenure_month_11_InternetService&Contract_cntv     0.467464
tenure_month_9_InternetService&Contract_cntv      0.466496
tenure_month_10_InternetService&Contract_cntv     0.465473
tenure_quarter_4_InternetService&Contract_cntv    0.464573
tenure_quarter_3_InternetService&Contract_cntv    0.463899
PaymentMethod_InternetService&Contract_cntv       0.460555
tenure_month_8_InternetService&Contract_cntv      0.459681
PaperlessBilling_InternetService&Contract_cntv    0.458440
tenure_month_7_InternetService&Contract_cntv      0.458075
gender_InternetService&Contract_cntv              0.457934
tenure_year_2018_InternetService&Contract_cntv    0.457297
tenure_year_2018_OnlineSecurity&Contract_cntv     0.455537
tenure_month_8_OnlineSecurity&Contract_cntv       0.454826
TechSupport_InternetService&Contract_tfidf        0.454414
tenure_month_12_InternetService&Contract_cntv     0.454358
tenure_year_2017_OnlineSecurity&Contract_cntv     0.4541

In [None]:
# Slice [1: 4] skips position 0 (Churn itself, |corr| = 1) and keeps the three
# derived features with the largest absolute correlation.
new_col = list(np.abs(df_corr).sort_values(ascending = False)[1: 4].index)
new_col

['tenure_month_11_InternetService&Contract_cntv',
 'tenure_month_9_InternetService&Contract_cntv',
 'tenure_month_10_InternetService&Contract_cntv']

In [None]:
# Selected candidate features, taken from both splits
train_new_temp = features_train_new.loc[:, new_col]
test_new_temp = features_test_new.loc[:, new_col]

# Fit and score a model on the encoded data plus the candidate features.
# features_test is defined outside this chunk; judging by the cell output it
# prints the elapsed time and returns a score with the chosen hyper-parameters.
features_test(train_new_temp, test_new_temp,
              X_train_OE, X_test_OE,
              y_train, y_test,
              category_cols, numeric_cols)

56.599395513534546 s


(0.8101111636707662,
 {'columntransformer__num': StandardScaler(),
  'logit_threshold__C': 0.30000000000000004,
  'logit_threshold__penalty': 'l1',
  'logit_threshold__solver': 'saga'})