In [1]:
import numpy as np
import pandas as pd
# Cap how much of a DataFrame is rendered in cell output.
pd.set_option("display.max_columns", 10)
pd.set_option("display.max_rows", 20)

# Disable scientific notation in numpy output and widen the printed lines.
np.set_printoptions(
    linewidth=150,
    threshold=2000,
    precision=10,
    suppress=True,
)

# Render pandas floats with two decimal places.
pd.set_option("display.float_format", lambda value: '%.2f' % value)

In [2]:
# Paths of the tab-separated training and test files.
train_data_file, test_data_file = "./zhengqi_train.txt", "./zhengqi_test.txt"

# Both files are read with the same parser settings.
train_data = pd.read_csv(
    train_data_file,
    sep='\t',
    encoding='utf-8',
)
test_data = pd.read_csv(
    test_data_file,
    sep='\t',
    encoding='utf-8',
)

In [10]:
# Binary operations used to build cross features.
# `epsilon` is added to the denominator of 'div' to avoid dividing by exactly zero.
epsilon = 1e-5

# Maps an operation name (used in the generated column name) to the operation.
# More combinations can be added here, e.g. x*x/y or log(x)/y.
func_dict = dict(
    add=lambda left, right: left + right,
    mins=lambda left, right: left - right,
    div=lambda left, right: left / (right + epsilon),
    multi=lambda left, right: left * right,
)

In [13]:
# Feature-construction helper
def auto_features_make(train_data,test_data,func_dict,col_list):
    """Add pairwise cross features to copies of the training and test frames.

    For every ordered pair ``(col_i, col_j)`` taken from ``col_list`` (a column
    is also paired with itself) and every ``(func_name, func)`` entry of
    ``func_dict``, a column named ``"<col_i>-<func_name>-<col_j>"`` holding
    ``func(data[col_i], data[col_j])`` is added to each frame.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Input frames. They are not modified; copies are returned.
    func_dict : dict
        Maps an operation name to a callable taking two columns.
    col_list : iterable
        Labels of the columns to combine; each must exist in both frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` with the generated columns appended after
        the existing ones, in ``col_i`` / ``col_j`` / ``func_dict`` order.
    """
    def _with_cross_features(frame):
        # Work on a copy so the caller's frame is left untouched.
        data = frame.copy()
        pending = {}
        for col_i in col_list:
            for col_j in col_list:
                for func_name, func in func_dict.items():
                    col_func_features = '-'.join(map(str, (col_i, func_name, col_j)))
                    func_features = func(data[col_i], data[col_j])
                    if col_func_features in data.columns:
                        # Name already present: overwrite in place so the
                        # column keeps its position.
                        data[col_func_features] = func_features
                    else:
                        pending[col_func_features] = func_features
        if not pending:
            return data
        # Attach all new columns in a single concat. Inserting them one at a
        # time fragments the frame and triggers pandas' PerformanceWarning
        # once many columns have been added.
        generated = pd.DataFrame(pending, index=data.index)
        return pd.concat([data, generated], axis=1)

    return _with_cross_features(train_data), _with_cross_features(test_data)
# Build the cross features for both frames, combining only the columns that
# the test set has.
train_data2, test_data2 = auto_features_make(
    train_data=train_data,
    test_data=test_data,
    func_dict=func_dict,
    col_list=test_data.columns,
)

In [14]:
# Preview the first five rows of the augmented training frame.
train_data2.head(n=5)

Unnamed: 0,V0,V1,V2,V3,V4,...,V37-multi-V36,V37-add-V37,V37-mins-V37,V37-div-V37,V37-multi-V37
0,0.57,0.02,-0.14,0.41,0.45,...,9.15,-7.02,0.0,1.0,12.31
1,0.97,0.44,0.07,0.57,0.19,...,0.24,-1.46,0.0,1.0,0.53
2,1.01,0.57,0.23,0.37,0.11,...,-0.45,-1.18,0.0,1.0,0.35
3,0.73,0.37,0.28,0.17,0.6,...,-0.04,-0.22,0.0,1.0,0.01
4,0.68,0.64,0.26,0.21,0.34,...,0.01,-0.06,0.0,1.0,0.0


In [15]:
from sklearn.decomposition import PCA

# Select the feature columns by name. The engineered columns are appended
# after the original ones, so 'target' is NOT the last column of train_data2:
# a positional slice such as iloc[:, 0:-1] would keep the target (leaking it
# into the PCA) and drop a real feature, leaving train and test misaligned.
feature_columns = test_data2.columns

# random_state pins the result when sklearn chooses a randomized SVD solver.
pca = PCA(n_components=500, random_state=2019)
train_data2_pca = pca.fit_transform(train_data2[feature_columns])
test_data2_pca = pca.transform(test_data2[feature_columns])

# Keep the original row labels so the target assignment below aligns by index.
train_data2_pca = pd.DataFrame(train_data2_pca, index=train_data2.index)
test_data2_pca = pd.DataFrame(test_data2_pca, index=test_data2.index)
train_data2_pca['target'] = train_data2['target']

# X_train2 is built from the engineered features directly, not from the PCA
# projection computed above.
X_train2 = train_data2[feature_columns].values
y_train = train_data2['target']

In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np

# 5-fold cross-validation
Folds = 5
kf = KFold(n_splits=Folds, shuffle=True, random_state=2019)
# Per-fold training and validation MSE
MSE_DICT = {'train_mse': [], 'test_mse': []}

# KFold.split yields integer positions. Index a plain array so the lookup is
# positional even if y_train is a Series with a non-default index.
y_train_values = np.asarray(y_train)

# Offline training and evaluation
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # LightGBM tree model
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )

    # Split into the training part and the held-out part of this fold
    X_train_KFold = X_train2[train_index]
    X_test_KFold = X_train2[test_index]
    y_train_KFold = y_train_values[train_index]
    y_test_KFold = y_train_values[test_index]

    # Train the model.
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose` arguments of fit() were deprecated in
    # LightGBM 3.3 and removed in 4.0.
    # NOTE(review): the held-out fold also drives early stopping, so the
    # validation MSE reported below is likely somewhat optimistic.
    lgb_reg.fit(X=X_train_KFold,
                y=y_train_KFold,
                eval_set=[(X_train_KFold, y_train_KFold),
                          (X_test_KFold, y_test_KFold)],
                eval_names=['Train', 'Test'],
                eval_metric='MSE',
                callbacks=[lgb.early_stopping(stopping_rounds=100),
                           lgb.log_evaluation(period=50)])

    # Predict on the training part and on the held-out part
    y_train_KFold_predict = lgb_reg.predict(
        X_train_KFold, num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(
        X_test_KFold, num_iteration=lgb_reg.best_iteration_)

    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
    # mean_squared_error takes (y_true, y_pred)
    train_mse = mean_squared_error(y_train_KFold,
                                   y_train_KFold_predict)
    print('------\n', '训练MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold,
                                  y_test_KFold_predict)
    print('------\n', '预测MSE\n', test_mse, '\n------\n')

    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)

# Summary over all folds: per-fold values followed by their mean
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n',
      np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n',
      np.mean(MSE_DICT['test_mse']), '\n------')

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.413128	Test's l2: 0.455026
[100]	Train's l2: 0.198017	Test's l2: 0.245523
[150]	Train's l2: 0.108839	Test's l2: 0.164432
[200]	Train's l2: 0.0683439	Test's l2: 0.132008
[250]	Train's l2: 0.0478354	Test's l2: 0.117217
[300]	Train's l2: 0.035836	Test's l2: 0.110572
[350]	Train's l2: 0.0279916	Test's l2: 0.10673
[400]	Train's l2: 0.0225218	Test's l2: 0.104686
[450]	Train's l2: 0.0183733	Test's l2: 0.103133
[500]	Train's l2: 0.0151476	Test's l2: 0.102168
[550]	Train's l2: 0.012598	Test's l2: 0.101216
[600]	Train's l2: 0.0105448	Test's l2: 0.100722
[650]	Train's l2: 0.00886925	Test's l2: 0.100606
[700]	Train's l2: 0.00751108	Test's l2: 0.100288
[750]	Train's l2: 0.00639588	Test's l2: 0.100224
[800]	Train's l2: 0.00547284	Test's l2: 0.100142
[850]	Train's l2: 0.00469886	Test's l2: 0.0999705
[900]	Train's l2: 0.00405206	Test's l2: 0.0997473
[950]	Train's l2: 0.00350702	Test's l2: 0.0997148
[1000]	Train's l2: 0.00