# import包

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')



# 准备数据

In [2]:
data = pd.read_csv('./flchain.csv')

In [3]:
used_features = ['age', 'chapter', 'creatinine', 'flc.grp', 'kappa', 'lambda', 'mgus', 'sample.yr', 'sex']
cat_features = ['chapter', 'mgus', 'sex']
label = 'label'
data = data[used_features + [label]]

In [4]:
for f in tqdm_notebook(cat_features):
    lbl = LabelEncoder()
    data[f] = lbl.fit_transform(data[f])

  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
data.head()

Unnamed: 0,age,chapter,creatinine,flc.grp,kappa,lambda,mgus,sample.yr,sex,label
0,97.0,1,1.7,10,5.7,4.86,0,1997,0,85.0
1,92.0,12,0.9,1,0.87,0.683,0,2000,0,1281.0
2,94.0,1,1.4,10,4.36,3.85,0,1997,0,69.0
3,92.0,1,1.0,9,2.42,2.22,0,1996,0,115.0
4,93.0,1,1.1,6,1.32,1.69,0,1996,0,1039.0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(data[used_features], data[label], test_size=0.2, 
                                                    random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=.2)

# xgboost

In [7]:
params = {
        'objective': 'reg:linear', 
        'eval_metric':  'rmse',
        'learning_rate': 0.1,
        'max_depth': 2,
}

trn_data = xgb.DMatrix(X_train, label=y_train)
val_data = xgb.DMatrix(X_valid, label=y_valid)

clf = xgb.train(params, trn_data, 
                num_boost_round=2000, 
                evals=[(trn_data, 'train'), (val_data, 'valid')],
                verbose_eval=50, 
                early_stopping_rounds=10)

xgb_preds = clf.predict(xgb.DMatrix(X_test))
xgb_rmse = mean_squared_error(y_test, xgb_preds, squared=False)

[0]	train-rmse:3641.23877	valid-rmse:3658.87207
[50]	train-rmse:1803.66504	valid-rmse:1862.48181
[100]	train-rmse:1781.60059	valid-rmse:1849.19141
[133]	train-rmse:1769.63159	valid-rmse:1847.17151


In [8]:
print(xgb_rmse)

1807.7553467860912


# lightgbm

In [9]:
params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'max_depth': 2
        }

trn_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
val_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features)
   
lgb1 = lgb.train(params, trn_data, 
                num_boost_round=2000, 
                valid_sets=[trn_data, val_data], 
                verbose_eval=50,
                early_stopping_rounds=20)

lgb_preds = lgb1.predict(X_test)
lgb_rmse = mean_squared_error(y_test, lgb_preds, squared=False)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 629
[LightGBM] [Info] Number of data points in the train set: 5039, number of used features: 9
[LightGBM] [Info] Start training from score -2476.726136
Training until validation scores don't improve for 20 rounds
[50]	training's rmse: 1800.93	valid_1's rmse: 1865.63
[100]	training's rmse: 1780.1	valid_1's rmse: 1853.02
[150]	training's rmse: 1766.67	valid_1's rmse: 1847.39
[200]	training's rmse: 1756.53	valid_1's rmse: 1845.68
[250]	training's rmse: 1745.99	valid_1's rmse: 1844.14
Early stopping, best iteration is:
[274]	training's rmse: 1742.34	valid_1's rmse: 1843.84


In [10]:
print(lgb_rmse)

1799.8615816321906


# catboost

In [11]:
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, label=y_valid, cat_features=cat_features)

cat1 = CatBoostRegressor(iterations=1000,
                         loss_function='RMSE',
                         eval_metric='RMSE',
                         metric_period=50,
                         max_depth=3,
                         early_stopping_rounds = 20
                         )
cat1.fit(train_pool, eval_set=valid_pool)

cat_preds = cat1.predict(X_test)
cat_rmse = mean_squared_error(y_test, cat_preds, squared=False)

Learning rate set to 0.065639
0:	learn: 2944.6173637	test: 3027.0159274	best: 3027.0159274 (0)	total: 54ms	remaining: 54s
50:	learn: 1817.5933203	test: 1850.5257173	best: 1850.5257173 (50)	total: 159ms	remaining: 2.96s




100:	learn: 1788.9828308	test: 1827.0155133	best: 1827.0155133 (100)	total: 242ms	remaining: 2.16s
150:	learn: 1775.0737893	test: 1821.6662264	best: 1821.6662264 (150)	total: 331ms	remaining: 1.86s
200:	learn: 1765.6089023	test: 1818.8915384	best: 1818.8915384 (200)	total: 410ms	remaining: 1.63s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1818.339935
bestIteration = 211

Shrink model to first 212 iterations.


In [12]:
print(cat_rmse)

1788.9309995540748


# 结果对比   

In [13]:
# xgboost: 1800.8353166725096 
# lightgbm: 1800.565240086788
# catboost: 1783.0293068431859