In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_data = pd.read_csv('zhengqi_train.txt',sep="\t")
test_data = pd.read_csv('zhengqi_test.txt',sep="\t")

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import lightgbm as lgb

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

In [5]:
##删除异常值
train_data = train_data[train_data['V9']>-7.5]
test_data = test_data[test_data['V9']>-7.5]

##归一化数据
from sklearn import preprocessing
features_columns = [col for col in train_data.columns if col not in ['target']]
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(train_data[features_columns])
train_data_scaler = min_max_scaler.transform(train_data[features_columns])
test_data_scaler = min_max_scaler.transform(test_data[features_columns])

train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns = features_columns
test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = features_columns
train_data_scaler['target'] = train_data['target']

##PCA降维 保持90%的信息
from sklearn.decomposition import PCA   #主成分分析法
pca = PCA(n_components=0.9)
new_train_pca_90 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_90 = pca.transform(test_data_scaler)
new_train_pca_90 = pd.DataFrame(new_train_pca_90)
new_test_pca_90 = pd.DataFrame(new_test_pca_90)
new_train_pca_90['target'] = train_data_scaler['target']

pca = PCA(n_components=0.95)
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_16 = pca.transform(test_data_scaler)
new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_test_pca_16 = pd.DataFrame(new_test_pca_16)
new_train_pca_16['target'] = train_data_scaler['target']

In [6]:
#切分数据集
from sklearn.model_selection import train_test_split  # 切分数据
new_train_pca_16 = new_train_pca_16.fillna(0)  #采用 pca 保留16维特征的数据
train = new_train_pca_16[new_test_pca_16.columns]
target = new_train_pca_16['target']

In [7]:
train_data,test_data,train_target,test_target = train_test_split(train,target,test_size=0.2,random_state=0)

In [8]:
clf = LinearRegression()
clf.fit(train_data,train_target)
score = mean_squared_error(test_target,clf.predict(test_data))
score

0.26423379176281725

In [9]:
clf = lgb.LGBMRegressor(learning_rate=0.01,
                       max_depth=-1,
                       n_estimators=5000,
                       boosting_type='gbdt',
                       random_state=2019,
                       objective='regression')
clf.fit(X=train_data,y=train_target,eval_metric='MSE',verbose=50)

LGBMRegressor(learning_rate=0.01, n_estimators=5000, objective='regression',
              random_state=2019)

In [10]:
score = mean_squared_error(test_target,clf.predict(test_data))
score

0.23134714151049274

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
parameters={
    'learning_rate':[0.01,0.1,1],
    'n_estimators':[20,40]
}
clf = GridSearchCV(clf,parameters,cv=5)
clf.fit(train_data,train_target)
clf.best_params_

{'learning_rate': 0.1, 'n_estimators': 40}

In [12]:
score_test = mean_squared_error(test_target,clf.predict(test_data))
score_test

0.2528012673741091

In [13]:
from sklearn.model_selection import KFold
Folds=5
kf = KFold(n_splits = Folds,shuffle=True,random_state=2019)
#记录训练和预测MSE
MSE_DICT = {'train_mse':[],'test_mse':[]}

In [22]:
train_data2_f=train.values
train_data2_target=target.values

In [23]:
# 线下训练预测
for i, (train_index, test_index) in enumerate(kf.split(train_data2_f)):
    # lgb树模型
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )
    # 切分训练集和预测集
    X_train_KFold, X_test_KFold = train_data2_f[train_index], train_data2_f[test_index]
    y_train_KFold, y_test_KFold = train_data2_target[train_index], train_data2_target[test_index]
    # 训练模型
    lgb_reg.fit(X=X_train_KFold,
                y=y_train_KFold,
                eval_set=[(X_train_KFold, y_train_KFold),
                          (X_test_KFold, y_test_KFold)],
                eval_names=['Train', 'Test'],
                early_stopping_rounds=100,
                eval_metric='MSE',
                verbose=50)

    # 训练集预测 测试集预测
    y_train_KFold_predict = lgb_reg.predict(
        X_train_KFold, num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(
        X_test_KFold, num_iteration=lgb_reg.best_iteration_)

    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i + 1))
    train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
    print('------\n', '训练MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
    print('------\n', '预测MSE\n', test_mse, '\n------\n')

    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.535721	Test's l2: 0.602118
[100]	Train's l2: 0.349454	Test's l2: 0.432722
[150]	Train's l2: 0.25538	Test's l2: 0.35617
[200]	Train's l2: 0.201518	Test's l2: 0.317182
[250]	Train's l2: 0.166098	Test's l2: 0.29417
[300]	Train's l2: 0.140931	Test's l2: 0.280006
[350]	Train's l2: 0.121575	Test's l2: 0.267594
[400]	Train's l2: 0.106663	Test's l2: 0.260325
[450]	Train's l2: 0.0949007	Test's l2: 0.255661
[500]	Train's l2: 0.0848952	Test's l2: 0.252331
[550]	Train's l2: 0.0762856	Test's l2: 0.250039
[600]	Train's l2: 0.0694708	Test's l2: 0.249684
[650]	Train's l2: 0.0637134	Test's l2: 0.248453
[700]	Train's l2: 0.0587498	Test's l2: 0.247276
[750]	Train's l2: 0.0541193	Test's l2: 0.246845
[800]	Train's l2: 0.0500765	Test's l2: 0.246357
[850]	Train's l2: 0.0464123	Test's l2: 0.246028
[900]	Train's l2: 0.0430409	Test's l2: 0.245343
[950]	Train's l2: 0.0399874	Test's l2: 0.245034
[1000]	Train's l2: 0.0373059	Test's l2

In [24]:
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n',
      np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n',
      np.mean(MSE_DICT['test_mse']), '\n------')

------
 训练MSE
 [0.010488074363309601, 0.04079059094902118, 0.05694998678880694, 0.0834800097560256, 0.0057367365462344] 
 0.039489079680679544 
------
------
 预测MSE
 [0.23677867170871048, 0.25968472174738577, 0.26443951549859535, 0.2871568829117033, 0.24483567618315835] 
 0.25857909360991066 
------
