In [1]:
import pandas as pd

train_data_file = "./zhengqi_train.txt"
test_data_file =  "./zhengqi_test.txt"

train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')

In [2]:
epsilon=1e-5

#组交叉特征，可以自行定义，如增加： x*x/y, log(x)/y 等等
func_dict = {
            'add': lambda x,y: x+y,
            'mins': lambda x,y: x-y,
            'div': lambda x,y: x/(y+epsilon),
            'multi': lambda x,y: x*y
            }

In [3]:
def auto_features_make(train_data,test_data,func_dict,col_list):
    train_data, test_data = train_data.copy(), test_data.copy()
    for col_i in col_list:
        for col_j in col_list:
            for func_name, func in func_dict.items():
                for data in [train_data,test_data]:
                    func_features = func(data[col_i],data[col_j])
                    col_func_features = '-'.join([col_i,func_name,col_j])
                    data[col_func_features] = func_features
    return train_data,test_data

In [4]:
train_data2, test_data2 = auto_features_make(train_data,test_data,func_dict,col_list=test_data.columns)

In [5]:
from sklearn.decomposition import PCA   #主成分分析法

#PCA方法降维
pca = PCA(n_components=500)
train_data2_pca = pca.fit_transform(train_data2.iloc[:,0:-1])
test_data2_pca = pca.transform(test_data2)
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
train_data2_pca['target'] = train_data2['target']

In [6]:
X_train2 = train_data2[test_data2.columns].values
y_train = train_data2['target']

In [None]:
# ls_validation i
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np

# 5折交叉验证
Folds=5
kf = KFold(len(X_train2), n_splits=Folds, random_state=2019, shuffle=True)
# 记录训练和预测MSE
MSE_DICT = {
    'train_mse':[],
    'test_mse':[]
}

# 线下训练预测
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # lgb树模型
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )
   
    # 切分训练集和预测集
    X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
    y_train_KFold, y_test_KFold = y_train[train_index], y_train[test_index]
    
    # 训练模型
    lgb_reg.fit(
            X=X_train_KFold,y=y_train_KFold,
            eval_set=[(X_train_KFold, y_train_KFold),(X_test_KFold, y_test_KFold)],
            eval_names=['Train','Test'],
            early_stopping_rounds=100,
            eval_metric='MSE',
            verbose=50
        )


    # 训练集预测 测试集预测
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_) 
    
    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
    train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
    print('------\n', '训练MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
    print('------\n', '预测MSE\n', test_mse, '\n------\n')
    
    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.418976	Test's l2: 0.105755
[100]	Train's l2: 0.203665	Test's l2: 0.0242962
[150]	Train's l2: 0.114456	Test's l2: 0.00489616
[200]	Train's l2: 0.0741974	Test's l2: 2.93437e-07
[250]	Train's l2: 0.0535211	Test's l2: 0.000800416
Early stopping, best iteration is:
[199]	Train's l2: 0.0747595	Test's l2: 3.3059e-08
第0折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.07475946684347008 
------
------
 预测MSE
 3.306387977686079e-08 
------

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.419015	Test's l2: 0.140632
[100]	Train's l2: 0.203691	Test's l2: 0.0985215
[150]	Train's l2: 0.114536	Test's l2: 0.079759
[200]	Train's l2: 0.074307	Test's l2: 0.0605385
[250]	Train's l2: 0.0536499	Test's l2: 0.0513692
[300]	Train's l2: 0.0416162	Test's l2: 0.049854
[350]	Train's l2: 0.0335032	Test's l2: 0.0429859
[400]	Train's l2: 0.0276629	Test's l2: 0.0404468
[450]	Train's l2: 0.0231634	Test's l2: 0.0412943
[5

第11折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.009365707625869515 
------
------
 预测MSE
 0.014969987964355553 
------

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.419045	Test's l2: 0.097808
[100]	Train's l2: 0.203876	Test's l2: 0.0476716
[150]	Train's l2: 0.114709	Test's l2: 0.0217354
[200]	Train's l2: 0.0743301	Test's l2: 0.00727449
[250]	Train's l2: 0.0536459	Test's l2: 0.00342079
[300]	Train's l2: 0.0415101	Test's l2: 0.000914694
[350]	Train's l2: 0.0334051	Test's l2: 0.000388864
[400]	Train's l2: 0.0276018	Test's l2: 0.000330803
[450]	Train's l2: 0.0231272	Test's l2: 4.72281e-05
[500]	Train's l2: 0.0194803	Test's l2: 1.97692e-05
[550]	Train's l2: 0.0165513	Test's l2: 6.51461e-05
Early stopping, best iteration is:
[473]	Train's l2: 0.0213632	Test's l2: 1.14763e-08
第12折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.021363228053157143 
------
------
 预测MSE
 1.1479735117855689e-08 
------

Training until validation scores don't improve for 100 rounds
[50]	Train's l2:

[350]	Train's l2: 0.0333003	Test's l2: 0.0223327
Early stopping, best iteration is:
[259]	Train's l2: 0.0508166	Test's l2: 0.0148776
第22折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.05081657768118204 
------
------
 预测MSE
 0.014877617179682922 
------

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.418947	Test's l2: 0.0193329
[100]	Train's l2: 0.203666	Test's l2: 0.0145756
[150]	Train's l2: 0.11451	Test's l2: 0.00108674
[200]	Train's l2: 0.0743262	Test's l2: 0.0040723
[250]	Train's l2: 0.0536434	Test's l2: 0.0160333
Early stopping, best iteration is:
[166]	Train's l2: 0.0982785	Test's l2: 2.97378e-07
第23折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.0982784845306778 
------
------
 预测MSE
 2.973757296419554e-07 
------

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.41895	Test's l2: 0.390101
[100]	Train's l2: 0.203691	Test's l2: 0.215401
[150]	Train's l2: 0.114491	Test's l2: 0.143047
[200]	Train's l2: 0.0741461	Test's l2: 0.0992432
[250]	

[100]	Train's l2: 0.203728	Test's l2: 0.0150115
Early stopping, best iteration is:
[28]	Train's l2: 0.598496	Test's l2: 7.50898e-08
第36折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.5984960635214709 
------
------
 预测MSE
 7.509136281470761e-08 
------

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.419132	Test's l2: 0.0934202
[100]	Train's l2: 0.203797	Test's l2: 0.0222217
[150]	Train's l2: 0.114612	Test's l2: 0.0029676
[200]	Train's l2: 0.0742617	Test's l2: 1.44043e-06
[250]	Train's l2: 0.0535756	Test's l2: 0.00178206
Early stopping, best iteration is:
[199]	Train's l2: 0.0747999	Test's l2: 2.24209e-07
第37折 训练和预测 训练MSE 预测MSE
------
 训练MSE
 0.07479993937220178 
------
------
 预测MSE
 2.2419136575330266e-07 
------

