In [None]:
# Feature optimization: load the raw training and test tables
import pandas as pd

train_data_file = "../data/zhengqi_train.txt"
test_data_file = "../data/zhengqi_test.txt"
# Both files are parsed with identical options (tab-separated, UTF-8).
train_data, test_data = (
    pd.read_csv(data_path, sep='\t', encoding='utf-8')
    for data_path in (train_data_file, test_data_file)
)

In [None]:
# Small offset added to the divisor of the 'div' operation.
epsilon = 1e-5

# Pairwise operations for building combination features, keyed by a short
# operation name. Each entry takes two operands and returns their combination.
func_dict = dict(
    add=lambda x, y: x + y,
    mins=lambda x, y: x - y,
    div=lambda x, y: x / (y + epsilon),
    multi=lambda x, y: x * y,
)

In [None]:
# Feature construction
def _add_combinations(frame, func_dict, col_list):
    """Return a copy of ``frame`` extended with every pairwise combination column."""
    result = frame.copy()
    existing = set(result.columns)
    generated = {}
    for col_i in col_list:
        for col_j in col_list:
            for func_name, func in func_dict.items():
                name = '-'.join([col_i, func_name, col_j])
                values = func(result[col_i], result[col_j])
                if name in existing:
                    # A column with this name is already present: overwrite it
                    # in place, exactly as a plain assignment would.
                    result[name] = values
                else:
                    generated[name] = values
    if not generated:
        return result
    # Append all new columns in one concat instead of thousands of single
    # inserts, which fragments the frame and is slow.
    return pd.concat(
        [result, pd.DataFrame(generated, index=result.index)], axis=1
    )


def auto_features_make(train_data, test_data, func_dict, col_list):
    """Build pairwise combination features for the train and test frames.

    For every ordered pair ``(col_i, col_j)`` drawn from ``col_list``
    (including ``col_i == col_j``) and every ``(func_name, func)`` in
    ``func_dict``, a column named ``"<col_i>-<func_name>-<col_j>"`` holding
    ``func(data[col_i], data[col_j])`` is added to both frames.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Input frames; they are not modified.
    func_dict : dict
        Maps an operation name to a two-argument callable.
    col_list : iterable of str
        Column labels to combine. Labels must be strings because they are
        joined into the new column names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` copies with the generated columns
        appended after the existing ones, in loop order.
    """
    # Materialize once so a one-shot iterable survives the nested loops.
    col_list = list(col_list)
    return (
        _add_combinations(train_data, func_dict, col_list),
        _add_combinations(test_data, func_dict, col_list),
    )

In [None]:
# Dimensionality reduction: build combination features, then apply PCA
from sklearn.decomposition import PCA

train_data2, test_data2 = auto_features_make(
    train_data, test_data, func_dict, col_list=test_data.columns
)

# Fit PCA on exactly the columns present in the test frame. The generated
# columns are appended after 'target' in train_data2, so slicing off the last
# column (iloc[:, 0:-1]) would keep the label in the PCA input and drop a
# feature, leaving the train and test matrices misaligned.
# random_state keeps the result reproducible if scikit-learn selects a
# randomized SVD solver for this shape.
pca = PCA(n_components=500, random_state=2019)
train_data2_pca = pca.fit_transform(train_data2[test_data2.columns])
test_data2_pca = pca.transform(test_data2)
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
train_data2_pca['target'] = train_data2['target']

# NOTE(review): the model input below is the un-reduced feature matrix; the
# PCA outputs above are not used by X_train2. Confirm this is intended.
X_train2 = train_data2[test_data2.columns].values
y_train = train_data2['target']

In [None]:
# Train and evaluate a LightGBM regressor with K-fold cross-validation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import numpy as np  # was aliased as `pd`, which shadowed pandas

# 5-fold cross-validation
Folds = 5
kf = KFold(n_splits=Folds, shuffle=True, random_state=2019)
# Per-fold MSE on the fitting split ('train_mse') and the held-out split ('test_mse')
MSE_dict = {'train_mse': [], 'test_mse': []}
# Offline training and validation
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )
    # Split features and target for this fold. KFold yields positional
    # indices, so the target Series is indexed with .iloc rather than by label.
    X_train_Kfold = X_train2[train_index]
    X_test_Kfold = X_train2[test_index]
    y_train_Kfold = y_train.iloc[train_index]
    y_test_Kfold = y_train.iloc[test_index]
    # Fit with early stopping monitored on the evaluation sets.
    # NOTE(review): LightGBM >= 4.0 removed the `early_stopping_rounds` and
    # `verbose` arguments of fit() in favour of
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)];
    # confirm the installed version before running.
    lgb_reg.fit(X=X_train_Kfold,
                y=y_train_Kfold,
                eval_set=[(X_train_Kfold, y_train_Kfold),
                          (X_test_Kfold, y_test_Kfold)],
                eval_names=['Train', 'Test'],
                early_stopping_rounds=100,
                eval_metric='MSE',
                verbose=50
    )
    # Predict both splits using the best iteration found by early stopping
    y_train_Kfold_predict = lgb_reg.predict(
        X_train_Kfold, num_iteration=lgb_reg.best_iteration_
    )
    y_test_Kfold_predict = lgb_reg.predict(
        X_test_Kfold, num_iteration=lgb_reg.best_iteration_
    )
    print('第{}折训练和预测 训练MSE预测'.format(i))
    train_mse = mean_squared_error(y_train_Kfold, y_train_Kfold_predict)
    print('训练MSE：', train_mse, '\n')
    test_mse = mean_squared_error(y_test_Kfold, y_test_Kfold_predict)
    # Label fixed: this value is the held-out (prediction) MSE, not the training MSE.
    print('预测MSE：', test_mse, '\n')
    MSE_dict['train_mse'].append(train_mse)
    MSE_dict['test_mse'].append(test_mse)

# Summary over all folds: per-fold values followed by their mean.
print('\n', '训练MSE\n', MSE_dict['train_mse'], '\n', np.mean(MSE_dict['train_mse']), '\n')
print('\n', '预测MSE\n', MSE_dict['test_mse'], '\n', np.mean(MSE_dict['test_mse']), '\n')