In [65]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings
warnings.filterwarnings('ignore')

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin

# 自定义模块
from telcoFunc import *
# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc

In [2]:
# 读取数据
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# 标注连续/离散字段
# 离散字段
category_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                'PaymentMethod']

# 连续字段
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
 
# 标签
target = 'Churn'

# ID列
ID_col = 'customerID'

# 验证是否划分能完全
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# 连续字段转化
tcc['TotalCharges']= tcc['TotalCharges'].apply(lambda x: x if x!= ' ' else np.nan).astype(float)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# 缺失值填补
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# 标签值手动转化 
tcc['Churn'].replace(to_replace='Yes', value=1, inplace=True)
tcc['Churn'].replace(to_replace='No',  value=0, inplace=True)

In [3]:
features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc['Churn'].copy()

In [4]:
# 划分训练集和测试集
train, test = train_test_split(tcc, random_state=22)

X_train = train.drop(columns=[ID_col, target]).copy()
X_test = test.drop(columns=[ID_col, target]).copy()

y_train = train['Churn'].copy()
y_test = test['Churn'].copy()

X_train_seq = pd.DataFrame()
X_test_seq = pd.DataFrame()

# 年份衍生
X_train_seq['tenure_year'] = ((72 - X_train['tenure']) // 12) + 2014
X_test_seq['tenure_year'] = ((72 - X_test['tenure']) // 12) + 2014

# 月份衍生
X_train_seq['tenure_month'] = (72 - X_train['tenure']) % 12 + 1
X_test_seq['tenure_month'] = (72 - X_test['tenure']) % 12 + 1

# 季度衍生
X_train_seq['tenure_quarter'] = ((X_train_seq['tenure_month']-1) // 3) + 1
X_test_seq['tenure_quarter'] = ((X_test_seq['tenure_month']-1) // 3) + 1

# 独热编码
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)

# 创建带有列名称的独热编码之后的df
X_train_seq = pd.DataFrame(enc.transform(X_train_seq).toarray(), 
                           columns = cate_colName(enc, seq_new, drop=None))

X_test_seq = pd.DataFrame(enc.transform(X_test_seq).toarray(), 
                          columns = cate_colName(enc, seq_new, drop=None))

# 调整index
X_train_seq.index = X_train.index
X_test_seq.index = X_test.index

In [5]:
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

X_train_OE = pd.DataFrame(ord_enc.transform(X_train[category_cols]), columns=category_cols)
X_train_OE.index = X_train.index
X_train_OE = pd.concat([X_train_OE, X_train[numeric_cols]], axis=1)

X_test_OE = pd.DataFrame(ord_enc.transform(X_test[category_cols]), columns=category_cols)
X_test_OE.index = X_test.index
X_test_OE = pd.concat([X_test_OE, X_test[numeric_cols]], axis=1)

### TimeSeries feature creation

In [6]:
X_train_seq.head()

Unnamed: 0,tenure_year_2014,tenure_year_2015,tenure_year_2016,tenure_year_2017,tenure_year_2018,tenure_year_2019,tenure_year_2020,tenure_month_1,tenure_month_2,tenure_month_3,...,tenure_month_7,tenure_month_8,tenure_month_9,tenure_month_10,tenure_month_11,tenure_month_12,tenure_quarter_1,tenure_quarter_2,tenure_quarter_3,tenure_quarter_4
4067,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3306,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3391,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3249,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2674,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
cat_all = (category_cols + list(X_train_seq.columns)).copy()

In [8]:
X_train_seq.to_csv('X_train_seq.csv', index=False)
X_test_seq.to_csv('X_test_seq.csv', index=False)

23 new features are created by TimeSeries feature creation

### Cross Combination Feature Creation

In [9]:
# 调整index
X_train_seq.index = X_train.index
X_test_seq.index = X_test.index

# 拼接数据集
train_temp = pd.concat([X_train, X_train_seq], axis=1)
test_temp = pd.concat([X_test, X_test_seq], axis=1)

# 带有时序特征的交叉组合      
CrossComb_train, CrossComb_test, colNames_train_new, colNames_test_new = Cross_Combination(cat_all, 
                                                                                           train_temp, 
                                                                                           test_temp)

In [10]:
CrossComb_train.head()

Unnamed: 0,gender&SeniorCitizen_Female&0,gender&SeniorCitizen_Female&1,gender&SeniorCitizen_Male&0,gender&SeniorCitizen_Male&1,gender&Partner_Female&No,gender&Partner_Female&Yes,gender&Partner_Male&No,gender&Partner_Male&Yes,gender&Dependents_Female&No,gender&Dependents_Female&Yes,...,tenure_quarter_1&tenure_quarter_4_1.0&0.0,tenure_quarter_2&tenure_quarter_3_0.0&0.0,tenure_quarter_2&tenure_quarter_3_0.0&1.0,tenure_quarter_2&tenure_quarter_3_1.0&0.0,tenure_quarter_2&tenure_quarter_4_0.0&0.0,tenure_quarter_2&tenure_quarter_4_0.0&1.0,tenure_quarter_2&tenure_quarter_4_1.0&0.0,tenure_quarter_3&tenure_quarter_4_0.0&0.0,tenure_quarter_3&tenure_quarter_4_0.0&1.0,tenure_quarter_3&tenure_quarter_4_1.0&0.0
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Cross Combination Feature Selection

- variance filtering

In [11]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold()
sel.fit(CrossComb_train)
CrossComb_cols = CrossComb_train.columns[sel.variances_ > 0.0099]
CrossComb_cols

Index(['gender&SeniorCitizen_Female&0', 'gender&SeniorCitizen_Female&1',
       'gender&SeniorCitizen_Male&0', 'gender&SeniorCitizen_Male&1',
       'gender&Partner_Female&No', 'gender&Partner_Female&Yes',
       'gender&Partner_Male&No', 'gender&Partner_Male&Yes',
       'gender&Dependents_Female&No', 'gender&Dependents_Female&Yes',
       ...
       'tenure_quarter_1&tenure_quarter_4_1.0&0.0',
       'tenure_quarter_2&tenure_quarter_3_0.0&0.0',
       'tenure_quarter_2&tenure_quarter_3_0.0&1.0',
       'tenure_quarter_2&tenure_quarter_3_1.0&0.0',
       'tenure_quarter_2&tenure_quarter_4_0.0&0.0',
       'tenure_quarter_2&tenure_quarter_4_0.0&1.0',
       'tenure_quarter_2&tenure_quarter_4_1.0&0.0',
       'tenure_quarter_3&tenure_quarter_4_0.0&0.0',
       'tenure_quarter_3&tenure_quarter_4_0.0&1.0',
       'tenure_quarter_3&tenure_quarter_4_1.0&0.0'],
      dtype='object', length=3474)

- chi-square

In [12]:
from sklearn.feature_selection import chi2

In [16]:
chi2(CrossComb_train[CrossComb_cols], y_train)
chi2_p = chi2(CrossComb_train[CrossComb_cols], y_train)[1]
chi2_CrossComb_cols = []

for pValue, colname in zip(chi2_p, CrossComb_cols):
    if pValue < 0.01:
        chi2_CrossComb_cols.append(colname)

print(len(chi2_CrossComb_cols))

2495


- Mutual Information

In [18]:
from sklearn.feature_selection import mutual_info_classif
MI = mutual_info_classif(CrossComb_train[CrossComb_cols], y_train, discrete_features=True, random_state=22)
MI_threshold = MI.mean() * 0.1
MI_CrossComb_cols = []

for MIvalue, colname in zip(MI, CrossComb_cols):
    if MIvalue > MI_threshold:
        MI_CrossComb_cols.append(colname)

print(len(MI_CrossComb_cols))    

2419


In [19]:
CrossComb_cols_select = list(set(chi2_CrossComb_cols) & set(MI_CrossComb_cols))
len(CrossComb_cols_select)

2369

2369 new features are created by Cross Combination feature creation

In [20]:
CrossComb_train[CrossComb_cols_select].to_csv('X_train_CrossComb.csv', index=False)
CrossComb_test[CrossComb_cols_select].to_csv('X_test_CrossComb.csv', index=False)

### Polynomial Feature Creation

In [21]:
Poly_train, Poly_test, colNames_train_new, colNames_test_new = Polynomial_Features(numeric_cols, 
                                                                                   3, 
                                                                                   X_train, 
                                                                                   X_test, 
                                                                                   multi=False)

In [22]:
Poly_train.head()

Unnamed: 0,tenure**2*MonthlyCharges**0,tenure**1*MonthlyCharges**1,tenure**0*MonthlyCharges**2,tenure**3*MonthlyCharges**0,tenure**2*MonthlyCharges**1,tenure**1*MonthlyCharges**2,tenure**0*MonthlyCharges**3,tenure**2*TotalCharges**0,tenure**1*TotalCharges**1,tenure**0*TotalCharges**2,...,tenure**2*TotalCharges**1,tenure**1*TotalCharges**2,tenure**0*TotalCharges**3,MonthlyCharges**2*TotalCharges**0,MonthlyCharges**1*TotalCharges**1,MonthlyCharges**0*TotalCharges**2,MonthlyCharges**3*TotalCharges**0,MonthlyCharges**2*TotalCharges**1,MonthlyCharges**1*TotalCharges**2,MonthlyCharges**0*TotalCharges**3
0,4624.0,5412.8,6336.16,314432.0,368070.4,430858.88,504358.336,4624.0,375074.4,30424050.0,...,25505059.2,2068835000.0,167813000000.0,6336.16,439057.68,30424050.0,504358.336,34948990.0,2421754000.0,167813000000.0
1,9.0,240.0,6400.0,27.0,720.0,19200.0,512000.0,9.0,723.9,58225.69,...,2171.7,174677.1,14049860.0,6400.0,19304.0,58225.69,512000.0,1544320.0,4658055.0,14049860.0
2,16.0,76.0,361.0,64.0,304.0,1444.0,6859.0,16.0,293.8,5394.903,...,1175.2,21579.61,396255.6,361.0,1395.55,5394.903,6859.0,26515.45,102503.1,396255.6
3,100.0,555.5,3085.8025,1000.0,5555.0,30858.025,171416.328875,100.0,5513.0,303931.7,...,55130.0,3039317.0,167557500.0,3085.8025,30624.715,303931.7,171416.328875,1701203.0,16883410.0,167557500.0
4,16.0,80.2,402.0025,64.0,320.8,1608.01,8060.150125,16.0,365.8,8363.103,...,1463.2,33452.41,764805.7,402.0025,1833.5725,8363.103,8060.150125,36763.13,167680.2,764805.7


### Polynomial Feature Selection

- variance filtering

In [24]:
sel = VarianceThreshold()
sel.fit(Poly_train)
Poly_cols = Poly_train.columns[sel.variances_ > 0]
Poly_cols = list(Poly_cols)
len(Poly_cols)

21

- ANOVA

In [25]:
from sklearn.feature_selection import f_classif
f_classif_p = f_classif(Poly_train, y_train)[1]
f_classif_Poly_cols = []

for pValue, colname in zip(f_classif_p, Poly_cols):
    if pValue < 0.01:
        f_classif_Poly_cols.append(colname)

print(len(f_classif_Poly_cols))

21


- Mutual Information

In [26]:
MI = mutual_info_classif(Poly_train[Poly_cols], y_train)
MI_threshold = MI.mean() * 0.1
MI_Ploy_cols = []

for MIvalue, colname in zip(MI, Poly_cols):
    if MIvalue > MI_threshold:
        MI_Ploy_cols.append(colname)

print(len(MI_Ploy_cols))

21


In [27]:
Poly_cols_select = Poly_cols

In [28]:
Poly_train[Poly_cols_select].to_csv('X_train_Poly.csv', index=False)
Poly_test[Poly_cols_select].to_csv('X_test_Poly.csv', index=False)

- 21 new features are created by Polynomial feature creation

### Grouped Statistical Feature Creation

- choosing the keycols

In [29]:
train_temp_OE = pd.concat([X_train_OE, X_train_seq], axis=1)
test_temp_OE = pd.concat([X_test_OE, X_test_seq], axis=1)

In [30]:
train_temp_OE[cat_all]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,tenure_month_7,tenure_month_8,tenure_month_9,tenure_month_10,tenure_month_11,tenure_month_12,tenure_quarter_1,tenure_quarter_2,tenure_quarter_3,tenure_quarter_4
4067,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3306,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3391,1.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3249,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2674,0.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5478,1.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,2.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
356,0.0,1.0,0.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4908,0.0,1.0,0.0,0.0,1.0,2.0,1.0,0.0,2.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6276,0.0,0.0,1.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [31]:
chi2_p = chi2(train_temp_OE[cat_all], y_train)[1]
chi2_select_cols = []

for pValue, colname in zip(chi2_p, cat_all):
    if pValue < 0.01:
        chi2_select_cols.append(colname)

print(len(chi2_select_cols))

26


In [32]:
MI = mutual_info_classif(train_temp_OE[cat_all], y_train, discrete_features=True, random_state=22)
MI_select_cols = []
MI_threshold = MI.mean() * 0.1

for MIvalue, colname in zip(MI, cat_all):
    if MIvalue > MI_threshold:
        MI_select_cols.append(colname)

print(len(MI_select_cols))        

23


In [35]:
temp_col = list(set(chi2_select_cols) & set(MI_select_cols))

In [38]:
X_group_temp = train_temp_OE[temp_col]
X_group_temp = pd.concat([X_group_temp,y_train],axis = 1)
X_group_temp.head()

Unnamed: 0,StreamingTV,tenure_month_1,Contract,tenure_year_2019,tenure_year_2015,Partner,DeviceProtection,tenure_month_12,PaymentMethod,tenure_year_2016,...,OnlineSecurity,PaperlessBilling,StreamingMovies,TechSupport,Dependents,tenure_quarter_4,tenure_year_2014,InternetService,SeniorCitizen,Churn
4067,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.0,1.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0
3306,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0.0,1.0,0.0,2.0,1.0,1.0,0.0,1.0,0.0,0
3391,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0
3249,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,3.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
2674,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,0


In [42]:
cor = np.abs(X_group_temp.corr()['Churn']).sort_values(ascending = False)
cor

Churn               1.000000
Contract            0.394464
tenure_year_2019    0.320558
OnlineSecurity      0.307484
TechSupport         0.288867
tenure_year_2014    0.227663
tenure_quarter_4    0.201279
OnlineBackup        0.197237
tenure_month_12     0.196918
DeviceProtection    0.191164
PaperlessBilling    0.186661
Partner             0.159577
tenure_quarter_1    0.155000
Dependents          0.152544
SeniorCitizen       0.140164
tenure_month_1      0.131205
PaymentMethod       0.116375
tenure_year_2015    0.100355
tenure_month_2      0.082142
tenure_year_2016    0.065928
tenure_quarter_2    0.062907
InternetService     0.054444
StreamingMovies     0.050142
StreamingTV         0.039879
Name: Churn, dtype: float64

In [44]:
keycol = list(cor[1:3].index)
keycol

['Contract', 'tenure_year_2019']

In [45]:
cat_rest = []

for col in cat_all:
    if col not in keycol:
        cat_rest.append(col)

cat_rest

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'PaperlessBilling',
 'PaymentMethod',
 'tenure_year_2014',
 'tenure_year_2015',
 'tenure_year_2016',
 'tenure_year_2017',
 'tenure_year_2018',
 'tenure_year_2020',
 'tenure_month_1',
 'tenure_month_2',
 'tenure_month_3',
 'tenure_month_4',
 'tenure_month_5',
 'tenure_month_6',
 'tenure_month_7',
 'tenure_month_8',
 'tenure_month_9',
 'tenure_month_10',
 'tenure_month_11',
 'tenure_month_12',
 'tenure_quarter_1',
 'tenure_quarter_2',
 'tenure_quarter_3',
 'tenure_quarter_4']

In [46]:
col_temp = keycol.copy()
GroupStat_train = pd.DataFrame()
GroupStat_test = pd.DataFrame()

for i in range(len(col_temp)):
    keyCol = col_temp.pop(i)
    features_train1, features_test1, colNames_train, colNames_test = Group_Statistics(keyCol,
                                                                                      train_temp_OE,
                                                                                      test_temp_OE,
                                                                                      col_num=numeric_cols,
                                                                                      col_cat=col_temp+cat_rest, 
                                                                                      extension=True)
    
    GroupStat_train = pd.concat([GroupStat_train, features_train1],axis=1)
    GroupStat_test = pd.concat([GroupStat_test, features_test1],axis=1)
    
    col_temp = keycol.copy()

In [47]:
GroupStat_train.head()

Unnamed: 0,tenure_Contract_mean,tenure_Contract_var,tenure_Contract_max,tenure_Contract_min,tenure_Contract_skew,tenure_Contract_median,MonthlyCharges_Contract_mean,MonthlyCharges_Contract_var,MonthlyCharges_Contract_max,MonthlyCharges_Contract_min,...,tenure_month_7_cv_tenure_year_2019,tenure_month_8_cv_tenure_year_2019,tenure_month_9_cv_tenure_year_2019,tenure_month_10_cv_tenure_year_2019,tenure_month_11_cv_tenure_year_2019,tenure_month_12_cv_tenure_year_2019,tenure_quarter_1_cv_tenure_year_2019,tenure_quarter_2_cv_tenure_year_2019,tenure_quarter_3_cv_tenure_year_2019,tenure_quarter_4_cv_tenure_year_2019
0,57.263658,320.476388,72,0,-1.408662,65.0,61.567933,1208.501297,118.75,18.4,...,3.354528,3.553765,3.532399,3.49069,3.525354,3.318175,1.474138,1.871994,1.834105,1.811573
1,17.902773,311.729585,72,1,1.063953,12.0,65.948391,710.028343,116.5,18.8,...,4.308545,3.887438,3.316136,3.178877,2.782575,1.577867,2.400843,2.255585,2.022068,1.018098
2,17.902773,311.729585,72,1,1.063953,12.0,65.948391,710.028343,116.5,18.8,...,4.308545,3.887438,3.316136,3.178877,2.782575,1.577867,2.400843,2.255585,2.022068,1.018098
3,17.902773,311.729585,72,1,1.063953,12.0,65.948391,710.028343,116.5,18.8,...,4.308545,3.887438,3.316136,3.178877,2.782575,1.577867,2.400843,2.255585,2.022068,1.018098
4,17.902773,311.729585,72,1,1.063953,12.0,65.948391,710.028343,116.5,18.8,...,4.308545,3.887438,3.316136,3.178877,2.782575,1.577867,2.400843,2.255585,2.022068,1.018098


### Grouped Statistical Feature Selection

- variance filtering

In [48]:
sel = VarianceThreshold()
sel.fit(GroupStat_train)
GroupStat_cols = list(GroupStat_train.columns[sel.variances_ > 0])
len(GroupStat_cols)

990

- ANOVA

In [49]:
f_classif_p = f_classif(GroupStat_train[GroupStat_cols], y_train)[1]
f_classif_GroupStat_cols = []

for pValue, colname in zip(f_classif_p, GroupStat_cols):
    if pValue < 0.01:
        f_classif_GroupStat_cols.append(colname)

print(len(f_classif_GroupStat_cols))

988


- Mutual Information

In [50]:
MI = mutual_info_classif(GroupStat_train[GroupStat_cols], y_train, random_state=22)
MI_threshold = MI.mean() * 0.1
MI_GroupStat_cols = []

for MIvalue, colname in zip(MI, GroupStat_cols):
    if MIvalue > MI_threshold:
        MI_GroupStat_cols.append(colname)

print(len(MI_GroupStat_cols))      

989


In [51]:
GroupStat_cols_select = list(set(f_classif_GroupStat_cols) & set(MI_GroupStat_cols))
len(GroupStat_cols_select)

987

In [52]:
GroupStat_train[GroupStat_cols_select]

Unnamed: 0,OnlineBackup_mag1_Contract,tenure_month_2_tenure_year_2019_count,tenure_quarter_4_tenure_year_2019_count,StreamingTV_minus1_StreamingTV_tenure_year_2019_mean,tenure_month_4_minus1_tenure_month_4_Contract_mean,TotalCharges_minus1_TotalCharges_tenure_year_2019_mean,tenure_month_8_Contract_count,PaymentMethod_minus2_PaymentMethod_Contract_mean,MonthlyCharges_tenure_year_2019_mean,OnlineSecurity_tenure_year_2019_median,...,DeviceProtection_dive2_DeviceProtection_tenure_year_2019_median,tenure_month_12_gap_tenure_year_2019,SeniorCitizen_minus1_SeniorCitizen_Contract_mean,StreamingMovies_norm_tenure_year_2019,Dependents_cv_tenure_year_2019,tenure_dive2_tenure_tenure_year_2019_median,tenure_month_4_dive2_tenure_month_4_Contract_median,tenure_month_1_gap_Contract,tenure_month_10_minus1_tenure_month_10_tenure_year_2019_mean,tenure_cv_tenure_year_2019
0,-0.235154,3639,3639,-1.120363,1.923990,-3181.296057,1263,0.805226,68.656829,1.0,...,0.0,0.0,1.916073,-1.276860,1.378410,0.000000,200000.0,1.0,-0.075845,0.434130
1,-0.668264,1643,1643,0.326841,-0.058199,-268.222794,2921,-1.801438,55.869842,0.0,...,100000.0,1.0,-0.201986,0.406711,1.965432,0.249999,0.0,0.0,0.909921,0.770803
2,-0.668264,1643,1643,0.326841,-0.058199,-268.222794,2921,-1.801438,55.869842,0.0,...,100000.0,1.0,-0.201986,0.406711,1.965432,0.249999,0.0,0.0,0.909921,0.770803
3,-0.668264,1643,1643,0.326841,-0.058199,-268.222794,2921,-1.801438,55.869842,0.0,...,100000.0,1.0,-0.201986,0.406711,1.965432,0.249999,0.0,0.0,0.909921,0.770803
4,-0.668264,1643,1643,0.326841,-0.058199,-268.222794,2921,-1.801438,55.869842,0.0,...,100000.0,1.0,-0.201986,0.406711,1.965432,0.249999,0.0,0.0,0.909921,0.770803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,-0.235154,3639,3639,-1.120363,1.923990,-3181.296057,1263,0.805226,68.656829,1.0,...,0.0,0.0,1.916073,-1.276860,1.378410,0.000000,200000.0,1.0,-0.075845,0.434130
5278,-0.668264,3639,3639,-1.120363,-0.058199,-3181.296057,2921,-1.801438,68.656829,1.0,...,0.0,0.0,-0.201986,-1.276860,1.378410,0.000000,0.0,0.0,-0.075845,0.434130
5279,-0.668264,3639,3639,-1.120363,-0.058199,-3181.296057,2921,-1.801438,68.656829,1.0,...,0.0,0.0,-0.201986,-1.276860,1.378410,0.000000,0.0,0.0,-0.075845,0.434130
5280,-0.079235,3639,3639,-1.120363,0.924408,-3181.296057,1098,-0.423497,68.656829,1.0,...,0.0,0.0,0.873406,-1.276860,1.378410,0.000000,100000.0,0.0,-0.075845,0.434130


In [53]:
GroupStat_train[GroupStat_cols_select].to_csv('X_train_GroupStat.csv', index=False)
GroupStat_test[GroupStat_cols_select].to_csv('X_test_GroupStat.csv', index=False)

In [13]:
X_train_seq = pd.read_csv('X_train_seq.csv')
X_train_CrossComb = pd.read_csv('X_train_CrossComb.csv')
X_train_Poly = pd.read_csv('X_train_Poly.csv')
X_train_GroupStat = pd.read_csv('X_train_GroupStat.csv')

In [14]:
X_test_seq = pd.read_csv('X_test_seq.csv')
X_test_CrossComb = pd.read_csv('X_test_CrossComb.csv')
X_test_Poly = pd.read_csv('X_test_Poly.csv')
X_test_GroupStat = pd.read_csv('X_test_GroupStat.csv')

In [15]:
X_train_seq.index = X_train.index
X_train_CrossComb.index = X_train.index
X_train_Poly.index = X_train.index
X_train_GroupStat.index = X_train.index
X_test_seq.index = X_test.index
X_test_CrossComb.index = X_test.index
X_test_Poly.index = X_test.index
X_test_GroupStat.index = X_test.index

In [16]:
features_train_new = pd.concat([X_train_seq, 
                                X_train_CrossComb, 
                                X_train_Poly, 
                                X_train_GroupStat], axis=1)
features_test_new = pd.concat([X_test_seq, 
                               X_test_CrossComb, 
                               X_test_Poly, 
                               X_test_GroupStat], axis=1)

In [17]:
features_train_new.shape

(5282, 3400)

In [18]:
features_test_new.shape

(1761, 3400)

In [24]:
from sklearn.ensemble import RandomForestClassifier

# 初始化随机森林模型
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 训练模型
rf.fit(features_train_new, y_train)

# 获取特征重要性
importances = rf.feature_importances_

# 将特征重要性与特征名称配对
feature_names = features_train_new.columns
feature_importances = pd.DataFrame(importances, index=feature_names, columns=["importance"])

# 按特征重要性降序排序
feature_importances_sorted = feature_importances.sort_values(by="importance", ascending=False)

In [82]:
print(feature_importances_sorted[:60])

                                                    importance
tenure**0*MonthlyCharges**3                           0.009293
MonthlyCharges**3*TotalCharges**0                     0.008855
tenure**1*MonthlyCharges**1                           0.008839
MonthlyCharges**1*TotalCharges**2                     0.008586
tenure**2*MonthlyCharges**1                           0.008159
tenure**0*MonthlyCharges**2                           0.008103
MonthlyCharges**2*TotalCharges**0                     0.007872
MonthlyCharges**2*TotalCharges**1                     0.007866
MonthlyCharges**0*TotalCharges**2                     0.007850
MonthlyCharges**1*TotalCharges**1                     0.007520
MonthlyCharges**0*TotalCharges**3                     0.007465
tenure**2*TotalCharges**1                             0.007399
tenure**1*TotalCharges**2                             0.007373
tenure**1*TotalCharges**1                             0.007346
tenure**1*MonthlyCharges**2                           0

In [141]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def evaluate_threshold(threshold):
    selected_features = feature_importances_sorted[feature_importances_sorted['importance'] > threshold].index.tolist()
    if not selected_features: 
        return 0, []  
    X_train_selected = features_train_new[selected_features]
    if X_train_selected.empty: 
        return 0, []  

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    
    scores = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='accuracy')

    return np.mean(scores), selected_features


thresholds = np.linspace(0.00054, 0.0009, num=450)

results = []

for threshold in thresholds:
    accuracy, selected_features = evaluate_threshold(threshold)
    results.append((threshold, accuracy, len(selected_features)))

results_df = pd.DataFrame(results, columns=['Threshold', 'Accuracy', 'Num_Features'])

best_result = results_df.loc[results_df['Accuracy'].idxmax()]

print(best_result['Threshold'])
print(best_result['Accuracy'])
print(best_result['Num_Features'])


0.0006779064587973274
0.795720807889682
239.0


In [142]:
importance_col = feature_importances_sorted[:239].index.tolist()

In [143]:
len(importance_col)

239

In [145]:
features_train_new1 = features_train_new[importance_col]
features_test_new1 = features_test_new[importance_col]

In [146]:
train_new1 = pd.concat([X_train_OE, features_train_new1, y_train], axis = 1)
test_new1 = pd.concat([X_test_OE, features_test_new1, y_test], axis =1)

In [147]:
train_new1.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,SeniorCitizen&Partner_0&No,Partner&OnlineBackup_No&No,OnlineBackup&tenure_quarter_4_No&1.0,PaymentMethod&tenure_month_9_Electronic check&0.0,MultipleLines&tenure_year_2014_Yes&0.0,gender&TechSupport_Male&No,gender&OnlineSecurity_Female&No,gender&Partner_Female&Yes,PaymentMethod&tenure_month_7_Electronic check&0.0,Churn
4067,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3306,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0
3391,1.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3249,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
2674,0.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [148]:
train_new = train_new1.to_csv('train_new.csv',index = False)
test_new = test_new1.to_csv('test_new.csv', index = False)