# 一、准备工作

In [1]:
import sys
print(sys.version)
print(sys.version_info)

3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]
sys.version_info(major=3, minor=8, micro=3, releaselevel='final', serial=0)


## 1.1安装工具包

In [None]:
# 安装lgb、xgb、ctb决策树、matplotlib
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple lightgbm
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple xgboost
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple catboost
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple matplotlib

## 1.2导入库

In [2]:
# pandas和numpy用于数据预处理；sklearn用于模型训练；xgb、lgb、ctb为决策树，用在模型主体；matplotlib用于绘图
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, RepeatedKFold
from scipy import sparse
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split

## 1.3导入数据

In [3]:
# 导入数据
data=pd.read_excel("./data/data_proc_3_fix0.xlsx")

# 二、数据预处理

## 2.1观察数据

In [4]:
#观察数据大小
print(data.shape)
#简单查看数据
print(data.head())
#查看数据是否缺失
print(data.info(verbose=True,null_counts=True))
print(data.columns)#查看train所有特征列

(227, 10)
     pre       2W       4W       6W       8W      10W    12W  label  \
0  77.83  0.02612  0.02600  0.00001  0.00000  0.00000  0.003      0   
1   5.37  0.12300  0.00388  0.16800  0.00514  0.00905  0.296      0   
2  23.29  0.19200  0.00000  0.02700  0.00030  0.00066  0.060      0   
3  20.00  0.00671  0.00002  0.06000  0.00002  0.00001  0.035      0   
4   4.66  0.12200  0.01000  0.00102  0.01000  0.00087  0.010      0   

   Unnamed: 8  Unnamed: 9  
0         NaN         NaN  
1         NaN         NaN  
2         NaN         NaN  
3         NaN         NaN  
4         NaN         NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pre         227 non-null    float64
 1   2W          227 non-null    float64
 2   4W          227 non-null    float64
 3   6W          227 non-null    float64
 4   8W          227 non-null    float64
 5 

## 2.2划分训练测试

In [5]:
X = data.loc[:,['pre','2W','4W','6W','8W','10W','12W']]
Y = data.loc[:,['label']]
print(X)
print('---')
print(Y)
seed = 7
test_size = 0.33
X_train_, X_test_, y_train_, y_test_ = train_test_split(X, Y, test_size=test_size, random_state=seed)

        pre       2W       4W       6W       8W      10W    12W
0    77.830  0.02612  0.02600  0.00001  0.00000  0.00000  0.003
1     5.370  0.12300  0.00388  0.16800  0.00514  0.00905  0.296
2    23.290  0.19200  0.00000  0.02700  0.00030  0.00066  0.060
3    20.000  0.00671  0.00002  0.06000  0.00002  0.00001  0.035
4     4.660  0.12200  0.01000  0.00102  0.01000  0.00087  0.010
..      ...      ...      ...      ...      ...      ...    ...
222   8.160  0.38400  0.03000  0.00600  0.00000  0.00000  0.000
223  21.000  0.64000  0.06000  0.11000  0.00932  0.00002  0.000
224  15.300  0.00542  0.05000  0.00100  0.00000  0.00000  0.003
225  23.000  0.75000  0.11000  0.03000  0.00000  0.00000  0.000
226  14.205  0.00477  0.00001  0.01500  0.00001  0.01200  0.000

[227 rows x 7 columns]
---
     label
0        0
1        0
2        0
3        0
4        0
..     ...
222      1
223      1
224      1
225      1
226      1

[227 rows x 1 columns]


# 三、模型训练

## 4.1划分数据集

In [6]:
print(X_train_.head())
print(X_test_.head())
print(y_train_.head())
print(y_test_.head())

        pre       2W       4W       6W       8W  10W  12W
51   10.759  0.00361  0.00026  0.77700  0.00026  0.0  0.0
199  85.920  0.53800  0.04900  0.00800  0.00014  0.0  0.0
154   4.799  0.19000  0.00800  0.00177  0.00000  0.0  0.0
95   52.920  2.65000  1.65000  0.28237  0.00060  0.0  0.0
203  77.705  1.47400  0.10700  0.00700  0.00000  0.0  0.0
        pre        2W     4W     6W       8W      10W      12W
98  204.000  10.26300  0.000  1.880  0.00000  2.38000  0.00000
46    5.680   3.17215  0.600  0.373  0.86807  0.66117  0.24034
99  213.146   8.61800  1.283  0.642  0.00000  0.00000  0.00000
22    8.439   0.30800  0.000  0.025  0.00122  0.00127  0.02600
78   18.644   2.78500  3.591  4.155  1.01757  0.05198  0.00042
     label
51       0
199      1
154      1
95       0
203      1
    label
98      0
46      0
99      0
22      0
78      0


## 4.2转numpy数组

In [7]:
X_train = np.array(X_train_)
X_test = np.array(X_test_)
y_train = np.array(y_train_)
y_test  = np.array(y_test_)

In [8]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(152, 7)
(152, 1)
(75, 7)


## 4.3自定义评价函数

In [9]:
#自定义评价函数
def myFeval(preds, xgbtrain):
    label = xgbtrain.get_label()
    score = mean_squared_error(label,preds)
    return 'myFeval',score

## 4.4xgb

In [10]:
##### xgb

xgb_params = {"booster":'gbtree','eta': 0.005, 'max_depth': 5, 'subsample': 0.7, 
              'colsample_bytree': 0.8, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True, 'nthread': 8}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_xgb = np.zeros(len(X_train))
predictions_xgb = np.zeros(len(X_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
    
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params,feval = myFeval)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
    
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, y_train_)))

fold n°1
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:0.49790	valid_data-rmse:0.49833	train-myFeval:0.24791	valid_data-myFeval:0.24833
Multiple eval metrics have been passed: 'valid_data-myFeval' will be used for early stopping.

Will train until valid_data-myFeval hasn't improved in 200 rounds.
[100]	train-rmse:0.34749	valid_data-rmse:0.38154	train-myFeval:0.12075	valid_data-myFeval:0.14557
[200]	train-rmse:0.25386	valid_data-rmse:0.32629	train-myFeval:0.06444	valid_data-myFeval:0.10646
[300]	train-rmse:0.19547	valid_data-rmse:0.29802	train-myFeval:0.03821	valid_data-myFeval:0.08881
[400]	train-rmse:0.15729	valid_data-rmse:0.28751	train-myFeval:0.02474	valid_data-myFeval:0.08266
[500]	train-rmse:0.13072	valid_data-rmse:0.28356	train-myFe

Multiple eval metrics have been passed: 'valid_data-myFeval' will be used for early stopping.

Will train until valid_data-myFeval hasn't improved in 200 rounds.
[100]	train-rmse:0.34780	valid_data-rmse:0.37789	train-myFeval:0.12097	valid_data-myFeval:0.14280
[200]	train-rmse:0.25495	valid_data-rmse:0.31166	train-myFeval:0.06500	valid_data-myFeval:0.09713
[300]	train-rmse:0.19634	valid_data-rmse:0.28070	train-myFeval:0.03855	valid_data-myFeval:0.07879
[400]	train-rmse:0.15967	valid_data-rmse:0.26736	train-myFeval:0.02549	valid_data-myFeval:0.07148
[500]	train-rmse:0.13243	valid_data-rmse:0.26313	train-myFeval:0.01754	valid_data-myFeval:0.06924
[600]	train-rmse:0.11331	valid_data-rmse:0.25896	train-myFeval:0.01284	valid_data-myFeval:0.06706
[700]	train-rmse:0.09664	valid_data-rmse:0.25924	train-myFeval:0.00934	valid_data-myFeval:0.06721
Stopping. Best iteration:
[599]	train-rmse:0.11343	valid_data-rmse:0.25892	train-myFeval:0.01287	valid_data-myFeval:0.06704

CV score: 0.07923888


## 4.5lgb

In [12]:
##### lgb

param = {'boosting_type': 'gbdt',
         'num_leaves': 20,
         'min_data_in_leaf': 20, 
         'objective':'regression',
         'max_depth':6,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         
         "feature_fraction": 0.8,
         "bagging_freq": 1,
         "bagging_fraction": 0.8 ,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(X_train_))
predictions_lgb = np.zeros(len(X_test_))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
   # print(trn_idx)
   # print(".............x_train.........")
   # print(X_train[trn_idx])
  #  print(".............y_train.........")
  #  print(y_train[trn_idx])
    # 新加的
    y_train=y_train.ravel() # Wrong type(ndarray) for label.It should be list, numpy 1-D array or pandas Series
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, y_train_)))

fold n°1
Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.0675672	valid_1's l2: 0.0949555
[400]	training's l2: 0.0507131	valid_1's l2: 0.0875135
Early stopping, best iteration is:
[400]	training's l2: 0.0507131	valid_1's l2: 0.0875135
fold n°2
Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.0655695	valid_1's l2: 0.108912
Early stopping, best iteration is:
[251]	training's l2: 0.0580633	valid_1's l2: 0.106434
fold n°3
Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.0662394	valid_1's l2: 0.0891842
[400]	training's l2: 0.0474784	valid_1's l2: 0.0863749
Early stopping, best iteration is:
[326]	training's l2: 0.0527244	valid_1's l2: 0.0845736
fold n°4
Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.0708227	valid_1's l2: 0.0929258
[400]	training's l2: 0.0540186	valid_1's l2: 0.0843286
Early stopping, best iteration is:
[493]	training's l2: 0.05042

## 4.6ctb

In [13]:
from catboost import Pool, CatBoostRegressor
# cat_features=[0,2,3,10,11,13,15,16,17,18,19]
from sklearn.model_selection import train_test_split


#X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_train_, y_train_, test_size=0.3, random_state=2019)
# train_pool = Pool(X_train_s, y_train_s,cat_features=[0,2,3,10,11,13,15,16,17,18,19])
# val_pool = Pool(X_test_s, y_test_s,cat_features=[0,2,3,10,11,13,15,16,17,18,19])
# test_pool = Pool(X_test_ ,cat_features=[0,2,3,10,11,13,15,16,17,18,19]) 


kfolder = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_cb = np.zeros(len(X_train))
predictions_cb = np.zeros(len(X_test))
kfold = kfolder.split(X_train, y_train)
fold_=0
#X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_train, y_train, test_size=0.3, random_state=2019)
for train_index, vali_index in kfold:
    print("fold n°{}".format(fold_))
    fold_=fold_+1
    k_x_train = X_train[train_index]
    k_y_train = y_train[train_index]
    k_x_vali = X_train[vali_index]
    k_y_vali = y_train[vali_index]
    cb_params = {
         'n_estimators': 100000,
         'loss_function': 'RMSE',
         'eval_metric':'RMSE',
         'learning_rate': 0.05,
         'depth': 5,
         'use_best_model': True,
         'subsample': 0.6,
         'bootstrap_type': 'Bernoulli',
         'reg_lambda': 3
    }
    model_cb = CatBoostRegressor(**cb_params)
    #train the model
    model_cb.fit(k_x_train, k_y_train,eval_set=[(k_x_vali, k_y_vali)],verbose=100,early_stopping_rounds=50)
    oof_cb[vali_index] = model_cb.predict(k_x_vali, ntree_end=model_cb.best_iteration_)
    predictions_cb += model_cb.predict(X_test, ntree_end=model_cb.best_iteration_) / kfolder.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_cb, y_train)))

fold n°0
0:	learn: 0.4843726	test: 0.4869667	best: 0.4869667 (0)	total: 55.3ms	remaining: 1h 32m 7s
100:	learn: 0.1877214	test: 0.2189398	best: 0.2189398 (100)	total: 196ms	remaining: 3m 13s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.2181682768
bestIteration = 101

Shrink model to first 102 iterations.
fold n°1
0:	learn: 0.4857866	test: 0.4862094	best: 0.4862094 (0)	total: 2.48ms	remaining: 4m 8s
100:	learn: 0.1754912	test: 0.2962325	best: 0.2948263 (94)	total: 121ms	remaining: 2m
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.2948263133
bestIteration = 94

Shrink model to first 95 iterations.
fold n°2
0:	learn: 0.4823434	test: 0.5031143	best: 0.5031143 (0)	total: 2.43ms	remaining: 4m 3s
100:	learn: 0.1412649	test: 0.3459489	best: 0.3440296 (69)	total: 149ms	remaining: 2m 27s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3440296317
bestIteration = 69

Shrink model to first 70 iterations.
fold n°3
0:	learn: 0.4826076	te

## 4.7模型融合

In [14]:
from sklearn import linear_model
# 将lgb和xgb和ctb的结果进行stacking
train_stack = np.vstack([oof_lgb,oof_xgb,oof_cb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb,predictions_cb]).transpose()


folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2018)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,y_train)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
    val_data, val_y = train_stack[val_idx], y_train[val_idx]
    
    clf_3 = linear_model.BayesianRidge()
    #clf_3 =linear_model.Ridge()
    clf_3.fit(trn_data, trn_y)
    
    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10
    
print("CV score: {:<8.8f}".format(mean_squared_error(oof_stack, y_train_)))

fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
CV score: 0.07329928


# 五、预测结果

In [18]:
result=list(predictions)
print(result)
predict = pd.DataFrame(data=result)
predict.to_csv("predict_1725.csv", index=False)

[0.12734082272889655, 0.03887187117817929, -0.0576205274910133, 0.03879736211464524, -0.04360689206360637, 0.8222233306869584, 0.9515586588595087, 1.000891209560275, -0.07381754848832098, 0.9526447197757097, 0.40647369448158766, 1.0068931435056276, -0.0372806395365506, 0.9120110962512827, 0.06245978770049241, 0.011371135314740434, 1.0107981605962724, 0.9722686475256033, -0.028137830672559223, 1.0002590007189391, 0.6665636863682503, 0.9930730491576473, -0.052245334760276735, -0.04337148973310682, 1.0064121522248062, 1.0043702142671547, 0.9492914960697486, -0.017612472164497753, 0.9059627109670109, 0.8914232923437804, -0.04627023214273087, 0.24216359248430722, 0.8807670793209079, 0.565855542580656, -0.04437965732987747, 0.922832755976519, 0.45139915150870324, 0.8281685910341943, -0.022992957724800225, 0.04964267959196532, 0.48323071868449147, 0.7976465621455322, 0.038878435723425184, 0.5353419384672655, -0.038552744990501334, -0.03742550204062106, 0.0602928903345251, 0.9309745932664336, 

ValueError: Length of values does not match length of index

In [42]:
data=data.drop(["id"], axis=1)
X_train_ = data[:train.shape[0]] # 训练集
X_test_  = data[train.shape[0]:] # 测试集
target_column = 'happiness'
feature_columns=list(X_test_.columns) 
feature_columns

['survey_type',
 'province',
 'city',
 'county',
 'gender',
 'birth',
 'nationality',
 'religion',
 'religion_freq',
 'edu',
 'edu_status',
 'edu_yr',
 'income',
 'political',
 'join_party',
 'floor_area',
 'property',
 'height_cm',
 'weight_jin',
 'health',
 'health_problem',
 'depression',
 'hukou',
 'hukou_loc',
 'media_1',
 'media_2',
 'media_3',
 'media_4',
 'media_5',
 'media_6',
 'leisure_1',
 'leisure_2',
 'leisure_3',
 'leisure_4',
 'leisure_5',
 'leisure_6',
 'leisure_7',
 'leisure_8',
 'leisure_9',
 'leisure_10',
 'leisure_11',
 'leisure_12',
 'socialize',
 'relax',
 'learn',
 'social_neighbor',
 'social_friend',
 'socia_outing',
 'equity',
 'class',
 'class_10_before',
 'class_10_after',
 'class_14',
 'work_exper',
 'work_status',
 'work_yr',
 'work_type',
 'work_manage',
 'insur_1',
 'insur_2',
 'insur_3',
 'insur_4',
 'family_income',
 'family_m',
 'family_status',
 'house',
 'car',
 'invest',
 'son',
 'daughter',
 'minor_child',
 'marital',
 'marital_1st',
 's_birth',
 '