In [1]:
import lightgbm as lgb
import sklearn
import numpy
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import colorama
import numpy as np
from sklearn.model_selection import KFold
import pandas as pd
from sklearn.metrics import r2_score

In [2]:
def get_lgb_params(space):
    lgb_params = dict()
    lgb_params['boosting_type'] = space['boosting_type'] if 'boosting_type' in space else 'gbdt'
    lgb_params['objective'] = 'regression'
    lgb_params['metric'] = 'rmse'
    lgb_params['learning_rate'] = space['learning_rate']
    lgb_params['num_leaves'] = int(space['num_leaves'])
    lgb_params['min_data_in_leaf'] = int(space['min_data_in_leaf'])
    lgb_params['min_sum_hessian_in_leaf'] = space['min_sum_hessian_in_leaf']
    lgb_params['max_depth'] = -1
    lgb_params['lambda_l1'] = space['lambda_l1'] if 'lambda_l1' in space else 0.0
    lgb_params['lambda_l2'] = space['lambda_l2'] if 'lambda_l2' in space else 0.0
    lgb_params['max_bin'] = int(space['max_bin']) if 'max_bin' in space else 256
    lgb_params['feature_fraction'] = space['feature_fraction']
    lgb_params['bagging_fraction'] = space['bagging_fraction']
    lgb_params['bagging_freq'] = int(space['bagging_freq']) if 'bagging_freq' in space else 1
    lgb_params['nthread'] = 4
    return lgb_params

In [3]:
lgb_best_params_space = {'bagging_fraction': 0.7767407798955681, 'bagging_freq': 1.0, 'feature_fraction': 0.8777729558158374, 'lambda_l1': 5.345467708327675, 'lambda_l2': 5.553820084786194, 'learning_rate': 0.006729207978658059, 'max_bin': 189.0, 'min_data_in_leaf': 27.0, 'min_sum_hessian_in_leaf': 3.9732496490135323, 'num_leaves': 41.0}

params = get_lgb_params(lgb_best_params_space)

In [7]:
folds = KFold(n_splits=5, shuffle=True, random_state=0)

test = pd.read_csv('C:/Users/Boniface/Desktop/房租预测/datasets/cluster_test.csv')
test.pop('Unnamed: 0')
test.pop('tradeMoney')

categorical_feats = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration',  'region', 'plate','cluster']
feature = pd.read_csv('C:/Users/Boniface/Desktop/房租预测/datasets/cluster_train.csv')
label = feature.pop('tradeMoney')
feature.pop('Unnamed: 0')

# 1
y_pre_list = []
r2_list = []
train_feat = pd.Series()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(feature.values, label)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(feature.iloc[trn_idx], label = label[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(feature.iloc[val_idx], label = label[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(params, trn_data, num_round,valid_sets=[trn_data, val_data], verbose_eval=500,
                    early_stopping_rounds=200)
    y_pre = clf.predict(feature.iloc[val_idx], num_iteration=clf.best_iteration)
    r2 = r2_score(y_pre,label[val_idx])
    r2_list.append(r2)
    train_feat = train_feat.append(pd.Series(y_pre,index=val_idx))
    y_pre_test = clf.predict(test,num_iteration=clf.best_iteration)
    y_pre_list.append(y_pre_test)
print('r2 score{:}'.format(r2))
print('r2:{:}'.format(np.mean(r2_list)))

y_pred_final=  (y_pre_list[0]+y_pre_list[1]+y_pre_list[2]+y_pre_list[3]+y_pre_list[4])/5
feature['pre'] = train_feat
test['pre'] = y_pred_final

fold 0




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 785.454	valid_1's rmse: 824.023
[1000]	training's rmse: 682.661	valid_1's rmse: 757.639
[1500]	training's rmse: 632.226	valid_1's rmse: 737.01
[2000]	training's rmse: 595.882	valid_1's rmse: 726.618
[2500]	training's rmse: 567.448	valid_1's rmse: 719.996
[3000]	training's rmse: 544.024	valid_1's rmse: 716.165
[3500]	training's rmse: 523.791	valid_1's rmse: 713.45
[4000]	training's rmse: 506.036	valid_1's rmse: 711.746
[4500]	training's rmse: 490.114	valid_1's rmse: 710.403
[5000]	training's rmse: 475.724	valid_1's rmse: 709.267
[5500]	training's rmse: 462.698	valid_1's rmse: 708.368
[6000]	training's rmse: 450.501	valid_1's rmse: 707.809
[6500]	training's rmse: 439.086	valid_1's rmse: 707.477
[7000]	training's rmse: 428.426	valid_1's rmse: 707.252
Early stopping, best iteration is:
[6951]	training's rmse: 429.459	valid_1's rmse: 707.234
fold 1




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 781.5	valid_1's rmse: 841.428
[1000]	training's rmse: 677.854	valid_1's rmse: 774.497
[1500]	training's rmse: 627.771	valid_1's rmse: 753.789
[2000]	training's rmse: 592.021	valid_1's rmse: 741.267
[2500]	training's rmse: 563.777	valid_1's rmse: 733.185
[3000]	training's rmse: 540.433	valid_1's rmse: 728.038
[3500]	training's rmse: 520.46	valid_1's rmse: 724.248
[4000]	training's rmse: 502.75	valid_1's rmse: 721.205
[4500]	training's rmse: 487.079	valid_1's rmse: 719.257
[5000]	training's rmse: 472.773	valid_1's rmse: 717.89
[5500]	training's rmse: 459.46	valid_1's rmse: 716.765
[6000]	training's rmse: 447.194	valid_1's rmse: 715.741
[6500]	training's rmse: 435.841	valid_1's rmse: 715.141
[7000]	training's rmse: 425.3	valid_1's rmse: 714.428
[7500]	training's rmse: 415.322	valid_1's rmse: 714.002
[8000]	training's rmse: 405.922	valid_1's rmse: 713.766
[8500]	training's rmse: 396.961	valid_1's rmse: 713



Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 783.884	valid_1's rmse: 842
[1000]	training's rmse: 678.939	valid_1's rmse: 774.828
[1500]	training's rmse: 627.667	valid_1's rmse: 754.273
[2000]	training's rmse: 591.672	valid_1's rmse: 743.41
[2500]	training's rmse: 563.56	valid_1's rmse: 736.722
[3000]	training's rmse: 540.28	valid_1's rmse: 732.603
[3500]	training's rmse: 520.46	valid_1's rmse: 729.942
[4000]	training's rmse: 502.893	valid_1's rmse: 728.499
[4500]	training's rmse: 487.207	valid_1's rmse: 727.283
[5000]	training's rmse: 472.91	valid_1's rmse: 726.293
[5500]	training's rmse: 459.975	valid_1's rmse: 725.675
[6000]	training's rmse: 447.83	valid_1's rmse: 725.282
[6500]	training's rmse: 436.619	valid_1's rmse: 724.855
Early stopping, best iteration is:
[6548]	training's rmse: 435.598	valid_1's rmse: 724.782
fold 3




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 777.448	valid_1's rmse: 860.033
[1000]	training's rmse: 672.558	valid_1's rmse: 796.204
[1500]	training's rmse: 622.031	valid_1's rmse: 775.388
[2000]	training's rmse: 586.491	valid_1's rmse: 763.828
[2500]	training's rmse: 558.364	valid_1's rmse: 757.343
[3000]	training's rmse: 535.037	valid_1's rmse: 753.026
[3500]	training's rmse: 515.089	valid_1's rmse: 750.176
[4000]	training's rmse: 497.604	valid_1's rmse: 748.121
[4500]	training's rmse: 482.013	valid_1's rmse: 746.684
[5000]	training's rmse: 467.784	valid_1's rmse: 745.789
[5500]	training's rmse: 454.67	valid_1's rmse: 745.153
[6000]	training's rmse: 442.61	valid_1's rmse: 744.776
[6500]	training's rmse: 431.293	valid_1's rmse: 744.355
Early stopping, best iteration is:
[6503]	training's rmse: 431.239	valid_1's rmse: 744.342
fold 4




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 784.33	valid_1's rmse: 825.399
[1000]	training's rmse: 680.633	valid_1's rmse: 759.321
[1500]	training's rmse: 629.705	valid_1's rmse: 741.074
[2000]	training's rmse: 593.533	valid_1's rmse: 731.246
[2500]	training's rmse: 565.378	valid_1's rmse: 725.399
[3000]	training's rmse: 542.178	valid_1's rmse: 721.704
[3500]	training's rmse: 522.259	valid_1's rmse: 719.075
[4000]	training's rmse: 504.721	valid_1's rmse: 717.443
[4500]	training's rmse: 488.988	valid_1's rmse: 716.172
[5000]	training's rmse: 474.641	valid_1's rmse: 715.034
[5500]	training's rmse: 461.521	valid_1's rmse: 714.42
Early stopping, best iteration is:
[5521]	training's rmse: 461.007	valid_1's rmse: 714.321
r2 score0.9121685465571714
r2:0.908676496578698


In [9]:
# 2
y_pre_list = []
r2_list = []
train_feat = pd.Series()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(feature.values, label)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(feature.iloc[trn_idx], label[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(feature.iloc[val_idx], label[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(params, trn_data, num_round, #feval=get_r2_metric,
                    valid_sets=[trn_data, val_data], verbose_eval=500,
                    early_stopping_rounds=200)
    y_pre = clf.predict(feature.iloc[val_idx], num_iteration=clf.best_iteration)
    r2 = r2_score(y_pre,label[val_idx])
    r2_list.append(r2)
    train_feat = train_feat.append(pd.Series(y_pre,index=val_idx))
    y_pre_test = clf.predict(test,num_iteration=clf.best_iteration)
    y_pre_list.append(y_pre_test)
print('r2 score{:}'.format(r2))
print('r2:{:}'.format(np.mean(r2_list)))
    
y_pred_final=  (y_pre_list[0]+y_pre_list[1]+y_pre_list[2]+y_pre_list[3]+y_pre_list[4])/5
feature['pre_2'] = train_feat
test['pre_2'] = y_pred_final

fold 0




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 686.507	valid_1's rmse: 731.981
Early stopping, best iteration is:
[572]	training's rmse: 673.768	valid_1's rmse: 730.545
fold 1




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 683.071	valid_1's rmse: 739.317
Early stopping, best iteration is:
[581]	training's rmse: 669.049	valid_1's rmse: 737.369
fold 2




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 682.805	valid_1's rmse: 744.122
Early stopping, best iteration is:
[610]	training's rmse: 664.909	valid_1's rmse: 742.36
fold 3




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 676.652	valid_1's rmse: 763.548
Early stopping, best iteration is:
[528]	training's rmse: 671.245	valid_1's rmse: 763.391
fold 4




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 684.578	valid_1's rmse: 739.807
Early stopping, best iteration is:
[607]	training's rmse: 667.178	valid_1's rmse: 736.915
r2 score0.9019336711167829
r2:0.8984746729841946


In [10]:
# 3
y_pre_list = []
r2_list = []
train_feat = pd.Series()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(feature.values, label)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(feature.iloc[trn_idx], label[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(feature.iloc[val_idx], label[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(params, trn_data, num_round, #feval=get_r2_metric,
                    valid_sets=[trn_data, val_data], verbose_eval=500,
                    early_stopping_rounds=200)
    y_pre = clf.predict(feature.iloc[val_idx], num_iteration=clf.best_iteration)
    r2 = r2_score(y_pre,label[val_idx])
    r2_list.append(r2)
    train_feat = train_feat.append(pd.Series(y_pre,index=val_idx))
    y_pre_test = clf.predict(test,num_iteration=clf.best_iteration)
    y_pre_list.append(y_pre_test)
print('r2 score{:}'.format(r2))
print('r2:{:}'.format(np.mean(r2_list)))
    
y_pred_final=  (y_pre_list[0]+y_pre_list[1]+y_pre_list[2]+y_pre_list[3]+y_pre_list[4])/5

fold 0




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 684.594	valid_1's rmse: 729.311
Early stopping, best iteration is:
[584]	training's rmse: 669.959	valid_1's rmse: 727.389
fold 1




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 680.741	valid_1's rmse: 737.425
Early stopping, best iteration is:
[577]	training's rmse: 667.178	valid_1's rmse: 735.73
fold 2




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 680.711	valid_1's rmse: 741.708
Early stopping, best iteration is:
[581]	training's rmse: 666.733	valid_1's rmse: 740.359
fold 3




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 674.453	valid_1's rmse: 761.095
Early stopping, best iteration is:
[510]	training's rmse: 672.398	valid_1's rmse: 760.971
fold 4




Training until validation scores don't improve for 200 rounds
[500]	training's rmse: 682.551	valid_1's rmse: 737.308
Early stopping, best iteration is:
[574]	training's rmse: 669.826	valid_1's rmse: 734.858
r2 score0.9018904241406162
r2:0.8989908337107823


In [11]:
y_pred_final

array([4760.0218159 , 4722.31963732, 5954.12419982, ..., 5166.19662778,
       5476.54079688, 3586.48503007])

In [13]:
pd.DataFrame(y_pred_final).to_csv("pre.csv",header=None,index=None)

In [14]:
judge = pd.read_csv('C:/Users/Boniface/Desktop/房租预测/datasets/评分文件/sub_a_913.csv')
print(r2_score(judge, y_pred_final))

0.9894221610228148


In [None]:
def blend(train,test,target):
    '''5折'''
    # n_flods = 5
    # skf = list(StratifiedKFold(y, n_folds=n_flods))
    '''切分训练数据集为d1,d2两部分'''
    X_d1, X_d2, y_d1, y_d2 = train_test_split(train, target, test_size=0.5, random_state=914)

    train_ = np.zeros((X_d2.shape[0],len(clfs*3)))
    test_ = np.zeros((test.shape[0],len(clfs*3)))

    for j,clf in enumerate(clfs):
        '''依次训练各个单模型'''
        # print(j, clf)
        '''使用第1个部分作为预测，第2部分来训练模型，获得其预测的输出作为第2部分的新特征。'''
        # X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        X_d1fillna=X_d1.fillna(0)
        X_d2fillna = X_d2.fillna(0)

        X_predictfillna= test.fillna(0)

        clf.fit(X_d1fillna,y_d1)
        y_submission = clf.predict(X_d2fillna)
        y_test_submission = clf.predict(X_predictfillna)

        train_[:,j*3] = y_submission*y_submission
        '''对于测试集，直接用这k个模型的预测值作为新的特征。'''
        test_[:, j*3] = y_test_submission*y_test_submission

        train_[:, j+1] =(y_submission - y_submission.min()) /(y_submission.max() - y_submission.min())
        '''对于测试集，直接用这k个模型的预测值作为新的特征。'''
        y_test_submission = (y_test_submission - y_test_submission.min()) / \
                            (y_test_submission.max() - y_test_submission.min())
        test_[:, j+1] = y_test_submission

        train_[:, j+2] = np.log(y_submission)
        '''对于测试集，直接用这k个模型的预测值作为新的特征。'''
        y_test_submission =np.log(y_test_submission)
        test_[:, j+2] = y_test_submission



        # print("val auc Score: %f" % r2_score(y_predict, dataset_d2[:, j]))
        print('已完成第',j)

    train_.to_csv('train_blending.csv', index=False)
    test_.to_csv('test_blending.csv', index=False)
