In [1]:
import pandas as pd
import numpy as np

In [2]:
Datasets_PATH = '~/OneDrive/ProjectSpace/Jupyter/datasets/'
TrainSets_filename = 'TrainSet.csv'
TestSets_filename = 'TestSet.csv'

### 数据预处理

In [3]:
from sklearn.model_selection import train_test_split
TrainSet_Raw = pd.read_csv(Datasets_PATH+TrainSets_filename,encoding='GBK')

In [4]:
# 数值类数据
TrainSet_Raw.columns[TrainSet_Raw.dtypes != 'object']
# 非数值类数据
TrainSet_Raw.columns[TrainSet_Raw.dtypes == 'object']

Index(['id', 'Y', 'COL_004', 'COL_007', 'COL_012', 'COL_025', 'COL_046',
       'COL_048', 'COL_051', 'COL_052', 'COL_148', 'COL_150', 'COL_153',
       'COL_154', 'COL_157', 'COL_158', 'COL_161', 'COL_163', 'COL_166',
       'COL_167', 'COL_168', 'COL_175', 'COL_178', 'COL_180', 'COL_181',
       'COL_183', 'COL_198', 'COL_202', 'COL_207', 'COL_214', 'COL_225',
       'COL_226', 'COL_227', 'AGE', 'KD_ACTIVE', 'KD_XY', 'KD_STATUS',
       'FEE_BUS', 'BANDWIDTH_IN', 'DUR60_USED', 'CNT_USED_DAY', 'DATE_USED',
       'AREA_TYPE', 'IS_QF', 'JTTF', 'IS_TV', 'DURATION60_TV', 'LOGIN_CNT',
       'TV_ZZ', 'TV_ZZ_3', 'JT_ZH', 'JT_FH', 'YX_MON', 'YX_MON_3', 'IS_4G',
       'IS_5G', 'TOUSU_SUM', 'TOUSU_NUM', 'IS_TOUSU_TV', 'IS_TOUSU_KD', 'ARPU',
       'ARPU_3', 'GPRS_BHD', 'RWY_BHD', 'VOI_BHD', 'YWKD', 'TOWN_TYPE'],
      dtype='object')

Index(['COL_109', 'SEX', 'KD_TYPE', 'ZC_TYPE', 'ZHICHA_2', 'TOUSU_TYPE'], dtype='object')

In [5]:
# 非数值类数据分析
for idx_object in TrainSet_Raw.columns[TrainSet_Raw.dtypes == 'object']:
    print(idx_object, len(TrainSet_Raw[idx_object].unique()), TrainSet_Raw[idx_object].unique())

COL_109 5 ['中' '高' '未知' '一般' '低']
SEX 3 ['女' '男' nan]
KD_TYPE 3 ['fttb' 'ftth' nan]
ZC_TYPE 32 [nan 'OTT机顶盒无改有' 'WIFI弱连接' 'OTT机顶盒无改有,WIFI弱连接,光猫托管' 'OTT机顶盒无改有,光猫托管'
 'OTT机顶盒无改有,OTT严重卡顿' 'OTT机顶盒无改有,WIFI弱连接' '光猫托管' '光猫弱光' 'OTT机顶盒无改有,光猫弱光'
 '质差自购路由器' 'WIFI弱连接,光猫弱光,光猫托管' 'OTT严重卡顿' 'OTT机顶盒无改有,质差自购路由器'
 'OTT机顶盒无改有,OTT严重卡顿,光猫弱光' 'OTT机顶盒无改有,OTT严重卡顿,WIFI弱连接'
 'OTT机顶盒无改有,OTT严重卡顿,质差自购路由器' 'OTT机顶盒无改有,光猫弱光,质差自购路由器'
 'OTT机顶盒无改有,光猫托管,质差自购路由器' 'WIFI弱连接,质差自购路由器'
 'OTT机顶盒无改有,OTT严重卡顿,WIFI弱连接,光猫托管' 'OTT机顶盒无改有,WIFI弱连接,光猫弱光' 'WIFI弱连接,光猫托管'
 'WIFI弱连接,光猫弱光' '光猫弱光,质差自购路由器' 'OTT机顶盒无改有,OTT严重卡顿,光猫托管' 'OTT严重卡顿,WIFI弱连接'
 'OTT机顶盒无改有,WIFI弱连接,质差自购路由器' '光猫弱光,光猫托管' '光猫托管,质差自购路由器'
 'WIFI弱连接,光猫托管,质差自购路由器' 'OTT机顶盒无改有,光猫弱光,光猫托管']
ZHICHA_2 3 [nan '电视严重卡顿' 'ONU弱光']
TOUSU_TYPE 9 [nan '售后服务' '功能使用' '功能使用,售后服务' '费用质疑' '办理规范' '功能使用,营销宣传' '营销宣传' '业务规则']


In [6]:
# 去除无用数据
TrainSet_Raw.drop(['ZC_TYPE'],axis=1, inplace=True)

In [7]:
# 处理分类数据，使用Onehot码
TrainSet_Raw = TrainSet_Raw.merge(pd.get_dummies(TrainSet_Raw[['COL_109', 'SEX', 'KD_TYPE', 'ZHICHA_2', 'TOUSU_TYPE']]),
                        left_index=True, right_index=True)

In [8]:
TrainSet_Raw.drop(['COL_109', 'SEX', 'KD_TYPE', 'ZHICHA_2', 'TOUSU_TYPE'], axis=1, inplace=True)

In [9]:
# 处理缺失值(用0填充)
TrainSet_Raw.fillna(0, inplace=True)

In [10]:
X_train_raw = TrainSet_Raw.drop(['id','Y'], axis=1)

In [11]:
y_train_raw = TrainSet_Raw['Y']

In [12]:
# 拆分训练、测试数据
X_train, X_test, y_train, y_test = train_test_split(X_train_raw, y_train_raw, test_size=0.3)

### 预处理测试集数据（此部分在拆分数据前已处理，无需再次处理）

In [None]:
# 去除无用数据
X_test.drop(['ZC_TYPE'],axis=1, inplace=True)

In [None]:
# 处理分类数据，使用Onehot码
X_test = X_test.merge(pd.get_dummies(X_test[['COL_109', 'SEX', 'KD_TYPE', 'ZHICHA_2', 'TOUSU_TYPE']]),
                        left_index=True, right_index=True)

In [None]:
X_test.drop(['COL_109', 'SEX', 'KD_TYPE', 'ZHICHA_2', 'TOUSU_TYPE'], axis=1, inplace=True)

In [None]:
# 处理缺失值
X_test.fillna(0, inplace=True)

### 模型选择

In [13]:
# 决策树
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
tree_pred = tree_reg.predict(X_test)
tree_msle = mean_squared_log_error(tree_pred, y_test)
tree_rmsle = np.sqrt(tree_msle)
tree_rmsle

DecisionTreeRegressor()

0.5115556842534272

In [15]:
# 十折交叉验证
from sklearn.model_selection import cross_val_score

tree_reg = DecisionTreeRegressor()
scores = cross_val_score(tree_reg, X_train, y_train,
                         scoring="neg_mean_squared_log_error", cv=10)

tree_rmsle_scores = np.sqrt(-scores)
tree_rmsle_scores.mean()

0.5173426412235282

In [None]:
# 随机森林
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
forest_pred = forest_reg.predict(X_test)
forest_msle = mean_squared_log_error(forest_pred, y_test)
forest_rmsle = np.sqrt(forest_msle)
forest_rmsle

In [None]:
# 十折交叉验证
from sklearn.model_selection import cross_val_score

forest_reg = RandomForestRegressor()
scores = cross_val_score(forest_reg, X_train, y_train,
                         scoring="neg_mean_squared_log_error", cv=10)

forest_rmsle_scores = np.sqrt(-scores)
forest_rmsle_scores.mean()

In [None]:
# AdaBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
ada_reg = AdaBoostRegressor()
ada_reg.fit(X_train, y_train)
ada_pred = ada_reg.predict(X_test)
ada_msle = mean_squared_log_error(ada_pred, y_test)
ada_rmsle = np.sqrt(ada_msle)
ada_rmsle

In [None]:
# 十折交叉验证
from sklearn.model_selection import cross_val_score

ada_reg = AdaBoostRegressor()
scores = cross_val_score(ada_reg, X_train, y_train,
                         scoring="neg_mean_squared_log_error", cv=10)

ada_rmsle_scores = np.sqrt(-scores)
ada_rmsle_scores.mean()

In [None]:
# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
grad_reg = GradientBoostingRegressor()
grad_reg.fit(X_train, y_train)
grad_pred = grad_reg.predict(X_test)
grad_msle = mean_squared_log_error(grad_pred, y_test)
grad_rmsle = np.sqrt(grad_msle)
grad_rmsle

In [None]:
# 十折交叉验证
from sklearn.model_selection import cross_val_score

grad_reg = GradientBoostingRegressor()
scores = cross_val_score(grad_reg, X_train, y_train,
                         scoring="neg_mean_squared_log_error", cv=10)

grad_rmsle_scores = np.sqrt(-scores)
grad_rmsle_scores.mean()

#### 回归自动训练

In [15]:
# 选择模型训练（回归）
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor

from sklearn.model_selection import cross_val_score

In [20]:
### 归一化
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
# Don't cheat - fit only on training data
scaler.fit(X_train)  
X_train_scale = scaler.transform(X_train)  
# X_test = scaler.transform(X_test)  

StandardScaler()

|              Scoring              |                                                                                                                 Function                                                                                                                  |             Comment              |
| --------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------- |
| **Classification**                |                                                                                                                                                                                                                                           |                                  |
| ‘accuracy’                        | [`metrics.accuracy_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score "sklearn.metrics.accuracy_score")                                                         |                                  |
| ‘balanced_accuracy’               | [`metrics.balanced_accuracy_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html#sklearn.metrics.balanced_accuracy_score "sklearn.metrics.balanced_accuracy_score")                     |                                  |
| ‘average_precision’               | [`metrics.average_precision_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score "sklearn.metrics.average_precision_score")                     |                                  |
| ‘neg\_brier\_score’               | [`metrics.brier_score_loss`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.brier_score_loss.html#sklearn.metrics.brier_score_loss "sklearn.metrics.brier_score_loss")                                                 |                                  |
| ‘f1’                              | [`metrics.f1_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score "sklearn.metrics.f1_score")                                                                                 | for binary targets               |
| ‘f1_micro’                        | [`metrics.f1_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score "sklearn.metrics.f1_score")                                                                                 | micro-averaged                   |
| ‘f1_macro’                        | [`metrics.f1_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score "sklearn.metrics.f1_score")                                                                                 | macro-averaged                   |
| ‘f1_weighted’                     | [`metrics.f1_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score "sklearn.metrics.f1_score")                                                                                 | weighted average                 |
| ‘f1_samples’                      | [`metrics.f1_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score "sklearn.metrics.f1_score")                                                                                 | by multilabel sample             |
| ‘neg\_log\_loss’                  | [`metrics.log_loss`] | requires `predict_proba` support |
| ‘precision’ etc.                  | [`metrics.precision_score`]                   | suffixes apply as with ‘f1’      |
| ‘recall’ etc.                     | [`metrics.recall_score`]                                | suffixes apply as with ‘f1’      |
| ‘jaccard’ etc.                    | [`metrics.jaccard_score`]                              | suffixes apply as with ‘f1’      |
| ‘roc_auc’                         | [`metrics.roc_auc_score`]                                                       |                                  |
| ‘roc\_auc\_ovr’                   | [`metrics.roc_auc_score`]                                     |                                  |
| ‘roc\_auc\_ovo’                   | [`metrics.roc_auc_score`]                                                           |                                  |
| ‘roc\_auc\_ovr_weighted’          | [`metrics.roc_auc_score`]                                                            |                                  |
| ‘roc\_auc\_ovo_weighted’          | [`metrics.roc_auc_score`]                                                            |                                  |
| **Clustering**                    |                                                                                                                                                                                                                                           |                                  |
| ‘adjusted\_mutual\_info_score’    | [`metrics.adjusted_mutual_info_score`]         |                                  |
| ‘adjusted\_rand\_score’           | [`metrics.adjusted_rand_score`]     |                                  |
| ‘completeness_score’              | [`metrics.completeness_score`]                                       |                                  |
| ‘fowlkes\_mallows\_score’         | [`metrics.fowlkes_mallows_score`]                                |
| ‘homogeneity_score’               | [`metrics.homogeneity_score`]             |                                  |
| ‘mutual\_info\_score’             | [`metrics.mutual_info_score`]                                          |                                  |
| ‘normalized\_mutual\_info_score’  | [`metrics.normalized_mutual_info_score`] |                                  |
| ‘v\_measure\_score’               | [`metrics.v_measure_score`]                  |                                  |
| **Regression**                    |                                                                                                                                                                                                                                           |                                  |
| ‘explained_variance’              | [`metrics.explained_variance_score`]                 |                                  |
| ‘max_error’                       | [`metrics.max_error`]                                                                     |                                  |
| ‘neg\_mean\_absolute_error’       | [`metrics.mean_absolute_error`]    |                                  |
| ‘neg\_mean\_squared_error’        | [`metrics.mean_squared_error`]        |                                  |
| ‘neg\_root\_mean\_squared\_error’ | [`metrics.mean_squared_error`]        |                                  |
| ‘neg\_mean\_squared\_log\_error’  | [`metrics.mean_squared_log_error`]                         |
| ‘neg\_median\_absolute_error’     | [`metrics.median_absolute_error`]                           |                                  |
| ‘r2’                              | [`metrics.r2_score`]                                                                             |                                  |
| ‘neg\_mean\_poisson_deviance’     | [`metrics.mean_poisson_deviance`]                            |                                  |
| ‘neg\_mean\_gamma_deviance’       | [`metrics.mean_gamma_deviance`]

In [16]:
score_metric = 'neg_mean_squared_log_error'
# 回归训练 
for model_select in [GradientBoostingRegressor(), AdaBoostRegressor(), RandomForestRegressor(), SVR(), 
                     SGDRegressor(), KNeighborsRegressor(), GaussianProcessRegressor(), MLPRegressor(), XGBRegressor()]:
    regresor = model_select
    reg_score = cross_val_score(regresor, X_train, y_train,
                               scoring=score_metric, cv=10, n_jobs=-1)
    
    reg_rmsle_scores = np.sqrt(-reg_score)
    print(str(regresor), score_metric, reg_rmsle_scores.mean())

GradientBoostingRegressor() neg_mean_squared_log_error 0.3545367534878658
AdaBoostRegressor() neg_mean_squared_log_error 0.355019535165337
RandomForestRegressor() neg_mean_squared_log_error 0.3557691178056485
SVR() neg_mean_squared_log_error 0.3780034502920132


ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

GradientBoostingRegressor() neg_mean_squared_log_error 0.352358110906292
AdaBoostRegressor() neg_mean_squared_log_error 0.352789060016812
RandomForestRegressor() neg_mean_squared_log_error 0.3536608862052085
SVR() neg_mean_squared_log_error 0.37536296946841224
KNeighborsRegressor() neg_mean_squared_log_error 0.37127147932634175

#### 分类自动预测

In [17]:
# 选择模型训练（分类）
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import cross_val_score

In [18]:
from sklearn.metrics import make_scorer

In [19]:
# 评价标准
from sklearn.metrics import cohen_kappa_score
score_metric = make_scorer(cohen_kappa_score)
# score_metric = 'f1'
#from sklearn.metrics import f1_score
#score_metric = make_scorer(f1_score, average='weighted')

# 分类训练 
for model_select in [GradientBoostingClassifier(), AdaBoostClassifier(), RandomForestClassifier(), SVC(), 
                     SGDClassifier(), KNeighborsClassifier(), GaussianProcessClassifier(), MLPClassifier(), XGBClassifier()]:
    clf = model_select
    clf_score = cross_val_score(clf, X_train, y_train,
                               scoring=score_metric, cv=10, n_jobs=-1)
    
    clf_f1_scores = clf_score
    print(str(clf), str(score_metric), clf_f1_scores.mean())

GradientBoostingClassifier() make_scorer(cohen_kappa_score) 0.0020048118677434166
AdaBoostClassifier() make_scorer(cohen_kappa_score) 0.0011272477961544402
RandomForestClassifier() make_scorer(cohen_kappa_score) 0.0014933170482912539
SVC() make_scorer(cohen_kappa_score) 0.0
SGDClassifier() make_scorer(cohen_kappa_score) 0.0030668196029185026
KNeighborsClassifier() make_scorer(cohen_kappa_score) -0.0008495999976736335


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGSEGV(-11), SIGSEGV(-11), SIGSEGV(-11), SIGSEGV(-11), SIGSEGV(-11), SIGSEGV(-11), SIGSEGV(-11), SIGSEGV(-11)}

GradientBoostingClassifier() make_scorer(f1_score, average=weighted) 0.27770921004003346
AdaBoostClassifier() make_scorer(f1_score, average=weighted) 0.276852844875935
RandomForestClassifier() make_scorer(f1_score, average=weighted) 0.28406589909762425
SVC() make_scorer(f1_score, average=weighted) 0.27656356005292493
SGDClassifier() make_scorer(f1_score, average=weighted) 0.14433238655778674
KNeighborsClassifier() make_scorer(f1_score, average=weighted) 0.2780129419725555
MLPClassifier() make_scorer(f1_score, average=weighted) 0.1779162425669787
XGBClassifier make_scorer(f1_score, average=weighted) 0.29400521520758044

In [None]:
from xgboost.sklearn import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_log_error
xgb_msle = mean_squared_log_error(xgb_pred, y_test)
xgb_rmsle = np.sqrt(xgb_msle)
xgb_rmsle

### 训练数据

In [None]:
# AdaBoostRegressor
from sklearn.ensemble import AdaBoostRegressor

In [36]:
ada_boost_reg = AdaBoostRegressor(n_estimators=80)

In [38]:
ada_boost_reg.fit(X_train, y_train)

AdaBoostRegressor(n_estimators=80)

In [None]:
y_pred = ada_boost_reg.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

In [None]:
ada_boost_msle = mean_squared_log_error(y_pred, y_test)
ada_boost_rmsle = np.sqrt(ada_boost_msle)
ada_boost_rmsle

### 预测数据

In [71]:
PredSet_Raw = pd.read_csv(Datasets_PATH+TestSets_filename, encoding='gbk')

In [72]:
# 去除无用数据
X_pred = PredSet_Raw.drop(['ZC_TYPE','id'],axis=1)

In [73]:
# 处理分类数据，使用Onehot码
X_pred = X_pred.merge(pd.get_dummies(X_pred[['COL_109', 'SEX', 'KD_TYPE', 'ZHICHA_2', 'TOUSU_TYPE']]),
                        left_index=True, right_index=True)

In [74]:
X_pred.drop(['COL_109', 'SEX', 'KD_TYPE', 'ZHICHA_2', 'TOUSU_TYPE'], axis=1, inplace=True)

In [75]:
# 处理缺失值
X_pred.fillna(0, inplace=True)

In [83]:
# 训练集和测试集变换后的字段变换情况
X_pred.columns
X_train.columns
set(X_pred.columns) - set(X_train.columns)
set(X_train.columns) - set(X_pred.columns)

Index(['COL_004', 'COL_007', 'COL_012', 'COL_025', 'COL_046', 'COL_048',
       'COL_051', 'COL_052', 'COL_148', 'COL_150', 'COL_153', 'COL_154',
       'COL_157', 'COL_158', 'COL_161', 'COL_163', 'COL_166', 'COL_167',
       'COL_168', 'COL_175', 'COL_178', 'COL_180', 'COL_181', 'COL_183',
       'COL_198', 'COL_202', 'COL_207', 'COL_214', 'COL_225', 'COL_226',
       'COL_227', 'AGE', 'KD_ACTIVE', 'KD_XY', 'KD_STATUS', 'FEE_BUS',
       'BANDWIDTH_IN', 'DUR60_USED', 'CNT_USED_DAY', 'DATE_USED', 'AREA_TYPE',
       'IS_QF', 'JTTF', 'IS_TV', 'DURATION60_TV', 'LOGIN_CNT', 'TV_ZZ',
       'TV_ZZ_3', 'JT_ZH', 'JT_FH', 'YX_MON', 'YX_MON_3', 'IS_4G', 'IS_5G',
       'TOUSU_SUM', 'TOUSU_NUM', 'IS_TOUSU_TV', 'IS_TOUSU_KD', 'ARPU',
       'ARPU_3', 'GPRS_BHD', 'RWY_BHD', 'VOI_BHD', 'YWKD', 'TOWN_TYPE',
       'COL_109_一般', 'COL_109_中', 'COL_109_低', 'COL_109_未知', 'COL_109_高',
       'SEX_女', 'SEX_男', 'KD_TYPE_fttb', 'KD_TYPE_ftth', 'ZHICHA_2_ONU弱光',
       'ZHICHA_2_电视严重卡顿', 'TOUSU_TYPE_业务规则', 

Index(['COL_004', 'COL_007', 'COL_012', 'COL_025', 'COL_046', 'COL_048',
       'COL_051', 'COL_052', 'COL_148', 'COL_150', 'COL_153', 'COL_154',
       'COL_157', 'COL_158', 'COL_161', 'COL_163', 'COL_166', 'COL_167',
       'COL_168', 'COL_175', 'COL_178', 'COL_180', 'COL_181', 'COL_183',
       'COL_198', 'COL_202', 'COL_207', 'COL_214', 'COL_225', 'COL_226',
       'COL_227', 'AGE', 'KD_ACTIVE', 'KD_XY', 'KD_STATUS', 'FEE_BUS',
       'BANDWIDTH_IN', 'DUR60_USED', 'CNT_USED_DAY', 'DATE_USED', 'AREA_TYPE',
       'IS_QF', 'JTTF', 'IS_TV', 'DURATION60_TV', 'LOGIN_CNT', 'TV_ZZ',
       'TV_ZZ_3', 'JT_ZH', 'JT_FH', 'YX_MON', 'YX_MON_3', 'IS_4G', 'IS_5G',
       'TOUSU_SUM', 'TOUSU_NUM', 'IS_TOUSU_TV', 'IS_TOUSU_KD', 'ARPU',
       'ARPU_3', 'GPRS_BHD', 'RWY_BHD', 'VOI_BHD', 'YWKD', 'TOWN_TYPE',
       'COL_109_一般', 'COL_109_中', 'COL_109_低', 'COL_109_未知', 'COL_109_高',
       'SEX_女', 'SEX_男', 'KD_TYPE_fttb', 'KD_TYPE_ftth', 'ZHICHA_2_ONU弱光',
       'ZHICHA_2_电视严重卡顿', 'TOUSU_TYPE_业务规则', 

set()

set()

In [77]:
# 增加缺失的数据列
for miss_columns in set(X_train.columns) - set(X_pred.columns):
    print(miss_columns)
    X_pred[miss_columns] = 0
    
# 去除增加的数据列
for add_columns in set(X_pred.columns) - set(X_train.columns):
    print(add_columns)
    X_pred.drop(add_columns,axis=1,inplace=True)

COL_109_低
TOUSU_TYPE_功能使用,营销宣传
COL_109_一般
TOUSU_TYPE_办理规范,业务规则


In [81]:
# 测试集按预测集的列进行排序
X_pred = X_pred[X_train.columns]

In [84]:
y_pred_rlt = ada_boost_reg.predict(X_pred)

In [89]:
y_pred_rlt

array([7.70420893, 7.26138538, 6.74061773, ..., 7.8664626 , 7.21688187,
       7.21688187])

In [96]:
# 合并结果
PredSet_Raw['pred_result']=y_pred_rlt
PredSet_Raw[['id','pred_result']]

Unnamed: 0,id,pred_result
0,1,7.704209
1,2,7.261385
2,3,6.740618
3,4,7.704209
4,5,7.704209
...,...,...
3995,3996,7.704209
3996,3997,7.216882
3997,3998,7.866463
3998,3999,7.216882


In [95]:
# 数据结果
#PredSet_Raw[['id','pred_result']].to_csv('test_ada_rlt.csv', index=False, header=False)
PredSet_Raw[['id','pred_result']].to_csv('test_ada_rlt.csv', index=False)

### 其他

In [97]:
# 参数调优
from sklearn.model_selection import GridSearchCV

In [103]:
param_grid = [
    {'n_estimators': [10,50,80,100], 'loss': ['linear', 'exponential'], 'learning_rate':[0.5, 1, 1.5]},
]

ada_reg = AdaBoostRegressor()

grid_search = GridSearchCV(ada_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_log_error', n_jobs=-1)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=AdaBoostRegressor(), n_jobs=-1,
             param_grid=[{'learning_rate': [0.5, 1, 1.5],
                          'loss': ['linear', 'exponential'],
                          'n_estimators': [10, 50, 80, 100]}],
             scoring='neg_mean_squared_log_error')

In [104]:
grid_search.best_estimator_

AdaBoostRegressor(learning_rate=0.5, n_estimators=10)

In [105]:
grid_search.predict(X_pred)

array([7.57086375, 7.57086375, 7.39737991, ..., 7.86355082, 7.45162636,
       7.39413745])

In [None]:
X_pred.to_csv('datasets/Export_pred.csv',encoding='GBK')

In [None]:
%history