In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn import cross_validation, preprocessing, linear_model #機械学習用のライブラリを利用
from sklearn.linear_model import BayesianRidge
import sklearn
from sklearn.metrics import make_scorer
from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from copy import deepcopy
from collections import defaultdict 





#警告文を無視
np.warnings.filterwarnings('ignore')

sklearn.__version__

#データのimport
train_preprocessed = pd.read_csv('train_preprocessed.csv', index_col=0)
y_train = train_preprocessed['y'].values
X_temp = train_preprocessed.drop('y', axis= 1 )
X_train = X_temp.values
X_temp.columns

#評価関数の定義
def rmsle(predicted, real):
    return np.sqrt(np.mean((np.log(real+1) - np.log(predicted+1))**2))

rmsle_score = make_scorer(rmsle, greater_is_better=False)


# ridge

In [51]:

#5：Ridge Regressorを適用する------------------------------------------- -> Ridge回帰を採択
ridge_params = {
    'alpha' : [10,300, 540,579,620,626,628,30,650,800,1000]
} 
# for temp in alpha_list:
clf_ridge= linear_model.Ridge(alpha=0.7)#0.7
clf_ridge.fit(X_train, y_train)
print("\nRidgeでの偏回帰係数")
print(clf_ridge.intercept_) 
# print(np.sort(abs(clf_ridge_J1.coef_)))
# print(clf_ridge_J1.coef_) 


clf_ridge =  linear_model.Ridge()#0.7
# # ハイパーパラメータ探索
clf_ridge_cv = GridSearchCV(clf_ridge, ridge_params, scoring= rmsle_score, cv=5,verbose=1)
clf_ridge_cv.fit(X_train, np.exp(y_train))
print(clf_ridge_cv.best_params_, -1*clf_ridge_cv.best_score_)
clf_ridge = linear_model.Ridge( **clf_ridge_cv.best_params_)
clf_ridge.fit(X_train, y_train)


scores = -cross_validation.cross_val_score(clf_ridge, X_train, np.exp(y_train), cv=5, scoring='neg_mean_squared_error')
print( "cv_score: %0.3f" % (scores.mean()**(1/2)))

#モデルの保存
with open('model_ridge.pickle', mode='wb') as fp:
    pickle.dump(clf_ridge, fp, protocol=2)

df_importance_ridge = pd.DataFrame(clf_ridge.coef_,X_temp.columns).reset_index()
df_importance_ridge = df_importance_ridge.rename(columns={'index': 'val_name', 0:'importance'})
df_importance_ridge = df_importance_ridge.sort_values('importance', ascending=False)
df_importance_ridge['ridge_importance_per_score'] = df_importance_ridge['importance']/ scores.mean()




Ridgeでの偏回帰係数
9.684017206920583
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 628} 0.26704856431340773
cv_score: 5131.123


[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:    0.7s finished


# Baysian Ridge

In [57]:

from sklearn.linear_model import BayesianRidge

#5：Ridge Regressorを適用する------------------------------------------- -> Ridge回帰を採択
Bridge_params = {
    'alpha_1' : [1,1.5,2],
    'alpha_2' : [15,18,20],
    'lambda_1' : [1,1.5,2],
    'lambda_2' : [3,4,5],
} 
# for temp in alpha_list:
clf_Bridge= BayesianRidge()#0.7
clf_Bridge.fit(X_train, y_train)
print("\nRidgeでの偏回帰係数")
print(clf_Bridge.intercept_) 
# print(np.sort(abs(clf_ridge_J1.coef_)))
# print(clf_ridge_J1.coef_) 


clf_Bridge =  BayesianRidge()#0.7
# # ハイパーパラメータ探索
clf_Bridge_cv = GridSearchCV(clf_Bridge, Bridge_params, scoring= rmsle_score, cv=5,verbose=1)
clf_Bridge_cv.fit(X_train, np.exp(y_train))
print(clf_Bridge_cv.best_params_, -1*clf_Bridge_cv.best_score_)
clf_Bridge = BayesianRidge( **clf_Bridge_cv.best_params_)
clf_Bridge.fit(X_train, y_train)

#　交差検証
scores = -cross_validation.cross_val_score(clf_Bridge, X_train, np.exp(y_train), cv=5, scoring='neg_mean_squared_error')
print( "cv_score: %0.3f" % (scores.mean()**(1/2)))

#モデルの保存
with open('model_Bridge.pickle', mode='wb') as fp:
    pickle.dump(clf_Bridge, fp, protocol=2)

    
df_importance_Bridge = pd.DataFrame(clf_Bridge.coef_,X_temp.columns).reset_index()
df_importance_Bridge = df_importance_Bridge.rename(columns={'index': 'val_name', 0:'importance'})
df_importance_Bridge = df_importance_Bridge.sort_values('importance', ascending=False)
df_importance_Bridge['bridge_importance_per_score'] = df_importance_Bridge['importance']/ scores.mean()



Ridgeでの偏回帰係数
9.68355431626439
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed:   23.5s finished


{'alpha_1': 1, 'alpha_2': 20, 'lambda_1': 1, 'lambda_2': 4} 0.27825016226548754
cv_score: 5133.413


# XGBoost 

In [40]:
# Gradient Boosting
xgb_params = {
#      'n_estimators': [1000],
      'learning_rate':[0.09],
     #max_features': 0.2,
    'gamma':[0.1],
    'max_depth': [6],
    'min_child_weight':[ 3],
     'reg_alpha':[10,15],
      'reg_lambda':[15],
    'seed':[71],
    'subsample':[0.8],
#     'min_samples_leaf': [2],
#     'verbose': 0
}


clf= xgb.XGBRegressor()#0.7
# ハイパーパラメータ探索
clf_xgb_cv = GridSearchCV(clf, xgb_params, scoring= rmsle_score, cv=5,verbose=1)
clf_xgb_cv.fit(X_train, np.exp(y_train))
print(clf_xgb_cv.best_params_, -1*clf_xgb_cv.best_score_)

clf= xgb.XGBRegressor(**clf_xgb_cv.best_params_)#0.7
clf.fit(X_train, y_train)


scores = -cross_validation.cross_val_score(clf, X_train, np.exp(y_train), cv=5, scoring='neg_mean_squared_error')
print( "cv_score: %0.3f" % (scores.mean()**(1/2)))

with open('model_xgb.pickle', mode='wb') as fp:
    pickle.dump(clf, fp,protocol=2)
    
df_importance_xgb = pd.DataFrame(clf.feature_importances_ ,X_temp.columns).reset_index()
df_importance_xgb = df_importance_xgb.rename(columns={'index': 'val_name', 0:'importance'})
df_importance_xgb = df_importance_xgb.sort_values('importance', ascending=False)
df_importance_xgb['xgb_importance_per_score'] = df_importance_xgb['importance']/ scores.mean()
# df_importance_xgb

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   16.7s finished


{'gamma': 0.1, 'learning_rate': 0.09, 'max_depth': 6, 'min_child_weight': 3, 'reg_alpha': 15, 'reg_lambda': 15, 'seed': 71, 'subsample': 0.8} 0.25091831998856995
cv_score: 4677.799


# Random Forest

In [41]:
#ランダムフォレスト
from sklearn.ensemble import RandomForestRegressor

rf_params ={
    'random_state':[0],
     'n_estimators':[1000],
    'min_samples_leaf':[2], 
     'max_depth':[14],
#             criterion=/rmsle_score,
            'n_jobs':[-1],                        
}
clf_rf =RandomForestRegressor()
clf_rf.fit(X_train, y_train)


# # ハイパーパラメータ探索
clf_rf_cv = GridSearchCV(clf_rf, rf_params, scoring= rmsle_score, cv=5,verbose=1)
clf_rf_cv.fit(X_train, np.exp(y_train))
print(clf_rf_cv.best_params_, -1*clf_rf_cv.best_score_)

clf_rf= RandomForestRegressor(**clf_rf_cv.best_params_)#0.7
clf_rf.fit(X_train, y_train)


scores = -cross_validation.cross_val_score(clf_rf, X_train, np.exp(y_train), cv=5, scoring='neg_mean_squared_error')
print( "cv_score: %0.3f" % (scores.mean()**(1/2)))

clf_rf.fit(X_train, y_train)
with open('model_rf.pickle', mode='wb') as fp:
    pickle.dump(clf_rf, fp, protocol=2)

    
df_importance_rf= pd.DataFrame(clf_rf.feature_importances_ ,X_temp.columns).reset_index()
df_importance_rf = df_importance_rf.rename(columns={'index': 'val_name', 0:'importance'})
df_importance_rf = df_importance_rf.sort_values('importance', ascending=False)
df_importance_rf['rf_importance_per_score'] = df_importance_rf['importance']/ scores.mean()


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   25.6s finished


{'max_depth': 14, 'min_samples_leaf': 2, 'n_estimators': 1000, 'n_jobs': -1, 'random_state': 0} 0.26130224697049453
cv_score: 4809.785


# LightGBM 

In [42]:
import lightgbm as lgb
# 上記のパラメータでモデルを学習する
clf_lgb = lgb.LGBMRegressor(lerning_rate=0.09, max_depth=5, n_estimators=100,min_child_samples=5, reg_lambda=5, reg_alpha=5)
clf_lgb.fit(X_train, y_train)


# # # ハイパーパラメータ探索
# clf_cv = GridSearchCV(clf_rf, rf_params, scoring= rmsle_score, cv=5,verbose=1)
# clf_cv.fit(X_train, np.exp(y_train))
# print(clf_cv.best_params_, -1*clf_cv.best_score_)

# # clf_rf= RandomForestRegressor(**clf_cv.best_params_)#0.7
# clf_rf.fit(X_train, y_train)


scores = -cross_validation.cross_val_score(clf_lgb, X_train, np.exp(y_train), cv=5, scoring='neg_mean_squared_error')
print( "cv_score: %0.3f" % (scores.mean()**(1/2)))

clf_lgb.fit(X_train, y_train)
with open('model_lgb.pickle', mode='wb') as fp:
    pickle.dump(clf_lgb, fp, protocol=2)

    
df_importance_lgb= pd.DataFrame(clf_lgb.feature_importances_ ,X_temp.columns).reset_index()
df_importance_lgb = df_importance_lgb.rename(columns={'index': 'val_name', 0:'importance'})
df_importance_lgb = df_importance_lgb.sort_values('importance', ascending=False)
df_importance_lgb['lgb_importance_per_score'] = df_importance_lgb['importance']/ scores.mean()


cv_score: 4675.290


# Gradient Boosting Regressor

In [43]:
from sklearn.ensemble import GradientBoostingRegressor


est_params = {
    'n_estimators':[100],
    'learning_rate':[0.1],
    'max_depth':[4,5,6],
    'random_state':[0]
}
clf_est = GradientBoostingRegressor()

# # # ハイパーパラメータ探索
clf_cv = GridSearchCV(clf_est, est_params, scoring= rmsle_score, cv=5,verbose=1)
clf_cv.fit(X_train, np.exp(y_train))
print(clf_cv.best_params_, -1*clf_cv.best_score_)


clf_est= GradientBoostingRegressor(**clf_cv.best_params_)#0.7
clf_est.fit(X_train, y_train)

scores = -cross_validation.cross_val_score(clf_est, X_train, np.exp(y_train), cv=5, scoring='neg_mean_squared_error')
print( "cv_score: %0.3f" % (scores.mean() **(1/2)))

clf_est.fit(X_train, y_train)
with open('model_est.pickle', mode='wb') as fp:
    pickle.dump(clf_est, fp, protocol=2)

    
df_importance_est= pd.DataFrame(clf_est.feature_importances_ ,X_temp.columns).reset_index()
df_importance_est = df_importance_est.rename(columns={'index': 'val_name', 0:'importance'})
df_importance_est = df_importance_est.sort_values('importance', ascending=False)
df_importance_est['est_importance_per_score'] = df_importance_est['importance']/ scores.mean()

# df_importance_est['est_importance_per_score'] = df_importance_est['importance']/ scores.mean()
# df_importance_est

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   18.1s finished


{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'random_state': 0} 0.25362463490201054
cv_score: 4768.276


# 標準偏回帰係数を並べてみた

In [44]:
df_importance = df_importance_ridge.sort_index()[['val_name', 'ridge_importance_per_score']]
df_importance['bridge_importance_per_score'] = df_importance_Bridge.sort_index()['bridge_importance_per_score']
df_importance['xgb_importance_per_score'] = df_importance_xgb.sort_index()['xgb_importance_per_score']
df_importance['rf_importance_per_score'] = df_importance_rf.sort_index()['rf_importance_per_score']
df_importance['lgb_importance_per_score'] = df_importance_lgb.sort_index()['lgb_importance_per_score']
df_importance['est_importance_per_score'] = df_importance_est.sort_index()['est_importance_per_score']

import scipy.stats
def SS(text):
    return scipy.stats.zscore(text)

df_importance['ridge_importance_per_score'] = SS(df_importance['ridge_importance_per_score'])
df_importance['xgb_importance_per_score'] =SS(df_importance['xgb_importance_per_score'])
df_importance['rf_importance_per_score'] = SS(df_importance['rf_importance_per_score'])
df_importance['lgb_importance_per_score'] =SS(df_importance['lgb_importance_per_score'])
df_importance['est_importance_per_score'] =SS(df_importance['est_importance_per_score'])

df_importance['total_importance'] =  (df_importance['xgb_importance_per_score'] +  df_importance['rf_importance_per_score'] +  df_importance['ridge_importance_per_score'] +  df_importance['lgb_importance_per_score'] +df_importance['est_importance_per_score']  ) /4
df_importance= df_importance.sort_values('total_importance', ascending=False)
df_importance


Unnamed: 0,val_name,ridge_importance_per_score,bridge_importance_per_score,xgb_importance_per_score,rf_importance_per_score,lgb_importance_per_score,est_importance_per_score,total_importance
0,capacity,4.572861,4.146096e-09,7.409554,11.798330,5.186703,8.322013,9.322365
6,match_sec,0.988318,9.144133e-10,5.079573,0.754772,6.222234,5.413891,4.614697
1,home_history,0.634748,6.384953e-10,2.283597,0.849459,6.255639,4.527882,3.637831
99,home_team_浦和,3.648484,3.842493e-09,2.438929,1.004708,0.777344,0.472772,2.085559
3,home_star_stats,2.260999,1.447350e-09,1.972933,0.299864,1.879684,1.659676,2.018289
132,away_team_浦和,3.085583,2.884314e-09,2.438929,0.346233,1.278408,0.697941,1.961773
2,distance_from_big_city,-0.796795,-9.351120e-10,3.060257,0.322236,3.215853,1.912177,1.928432
4,temperature,0.399280,4.650884e-10,0.419612,0.150262,1.879684,2.373277,1.305529
113,away_team_G大阪,1.715240,1.552887e-09,1.662268,0.033674,0.877557,0.713282,1.250505
92,home_team_新潟,2.580334,2.139360e-09,1.506936,-0.074874,0.677132,-0.319016,1.092628


 # アンサンブル学習 スタッキングを実装　最終出力は３つのモデルの調和平均

In [45]:
class StackingClassifer():
    def __init__(self, estimators, estimators_second, merge_estimator, ):
        
        self.original_clfs = dict(estimators)
        self.original_clfs_second = dict(estimators_second)

        self.m_clf = merge_estimator
        
        self.clf_dict = defaultdict(list)
        self.clf_dict_second = defaultdict(list)
        self.clfs_index = sorted(self.original_clfs.keys())
        self.clfs_index_second = sorted(self.original_clfs_second.keys())

        
    def fit(self, X,y):
        #リストの初期化
        self.clfs_dict =  defaultdict(list) 
        #交差検証(k=5)
        K = 5
        kf = KFold(n_splits=K, shuffle=True, random_state=1 )

        #indexのリストを作成
        index_list = list(kf.split(X, y))
        
        #第一段の学習
        #第一段の予測を次の特徴量としてリストにする
        merge_feature_list = []
        # 初期の学習器で学習を行う
        count = 0
        for clf_name in self.clfs_index:
            #学習器をセット
            clf_origin = self.original_clfs[clf_name]
            #予測のりすと
            preds_temp_list = []
            #交差検証実施
            for train_index, test_index in index_list:
                #この時点で学習器をコピー
                clf_copy = deepcopy(clf_origin)
                #学習スタート
                clf_copy.fit(X[train_index], y[train_index])
                #予測のリストを格納
                preds_temp_list.extend(
                (clf_copy.predict(X[test_index])).tolist())
#                 print(preds_temp_list)
            
                #学習ずみのモデルを格納
                self.clf_dict[clf_name].append(clf_copy)
            if count == 0:
                merge_feature_list = preds_temp_list
                merge_feature_list  = np.array(merge_feature_list).reshape(-1,1)
                
            else:
                preds_temp_list = np.array(preds_temp_list).reshape(-1,1)
                merge_feature_list = np.concatenate((merge_feature_list, preds_temp_list), axis=1)

        #予測のリストを次の学習の特徴量とする.
#             preds_temp_list = np.array(preds_temp_list)
#             merge_feature_list.append(preds_temp_list)
            count += 1

#         print(len(merge_feature_list))
        merge_feature_list = np.array(merge_feature_list)
        print('第一段階の特徴量のサイズ:{0}'.format(merge_feature_list.shape))
        
#         #Xも特徴量に追加 <--これは削除しても良い？
#         merge_feature_list = np.concatenate((merge_feature_list, X), axis=1)
#         print(merge_feature_list.shape)
#         print()
        
    
        #第二学習フェーズ
        y_merged_second = np.hstack([y[test_index]
                              for _, test_index in index_list])
        
        #リストの初期化
        self.clfs_dict_second =  defaultdict(list) 
        #交差検証(k=5)
        K = 5
        kf = KFold(n_splits=K, shuffle=True, random_state=1 )
        #indexのリストを作成
        index_list = list(kf.split(merge_feature_list, y_merged_second))
        
        merge_feature_list_second = []
        # 初期の学習器で学習を行う
        count = 0
        for clf_name in self.clfs_index_second:
            #学習器をセット
            clf_origin_second = self.original_clfs_second[clf_name]
            #予測のりすと
            preds_temp_list_second = []
            #交差検証実施
            for train_index, test_index in index_list:
                #この時点で学習器をコピー
                clf_copy_second = deepcopy(clf_origin_second)
                #学習スタート
                clf_copy_second.fit(merge_feature_list[train_index], y_merged_second[train_index])
                #予測のリストを格納
                preds_temp_list_second.extend(
                (clf_copy_second.predict(merge_feature_list[test_index])).tolist())
#                 print(preds_temp_list)
            
                #学習ずみのモデルを格納
                self.clf_dict_second[clf_name].append(clf_copy_second)
            if count == 0:
                merge_feature_list_second = preds_temp_list_second
                merge_feature_list_second  = np.array(merge_feature_list_second).reshape(-1,1)
                
            else:
                preds_temp_list_second = np.array(preds_temp_list_second).reshape(-1,1)
                merge_feature_list_second = np.concatenate((merge_feature_list_second, preds_temp_list_second), axis=1)

        #予測のリストを次の学習の特徴量とする.
#             preds_temp_list = np.array(preds_temp_list)
#             merge_feature_list.append(preds_temp_list)
            count += 1

#         print(len(merge_feature_list))
            merge_feature_list_second = np.array(merge_feature_list_second)
#         print(merge_feature_list)

        print('第二段階の特徴量のサイズ:{0}'.format(merge_feature_list_second.shape))
    
        #第三学習フェーズ
        X_merged = merge_feature_list_second
        
        print(X_merged.shape)
        
#         print(X_merged.shape)
        y_merged = np.hstack([y_merged_second[test_index]
                              for _, test_index in index_list])
        print(y_merged.shape)
        
        self.m_clf.fit(X_merged, y_merged)
        
        return self
    
    def predict(self, X):
        #第一段階
        #特徴量のリストを作成
        merge_feature_for_test = []
        # それぞれの学習器で学習
        count = 0
        for clf_name in self.clfs_index:
        #予測のリストを作成
            print(clf_name)
            temp_proba_list = []
            # 各学習済みモデルを引き出す
            for clf in self.clf_dict[clf_name]:
                #予測のリストに各学習器の予測を追加
                temp_proba_list.append(clf.predict(X))
            # 予測を行方向に平均をとる。
            merge_feature_for_test.append(
                np.mean(temp_proba_list, axis=0))
            
        print('(予測)第一段階の特徴量のサイズ:{0}'.format(np.array(merge_feature_for_test).T))
        

            
        #第二段階
        #特徴量のリストを作成
        X_merged_second = np.array(merge_feature_for_test).T
#         #Xも特徴量に追加 <--これは削除しても良い？
#         X_merged_second = np.concatenate((X_merged_second, X), axis=1)
        print(X_merged_second.shape)
        merge_feature_for_test_second = []
        # それぞれの学習器で学習
        count = 0
        for clf_name in self.clfs_index_second:
            print(clf_name)
        #予測のリストを作成
            temp_proba_list_second = []
            # 各学習済みモデルを引き出す
            for clf in self.clf_dict_second[clf_name]:
                #予測のリストに各学習器の予測を追加
                temp_proba_list_second.append(clf.predict(X_merged_second))
            # 予測を行方向に平均をとる。
            merge_feature_for_test_second.append(
                np.mean(temp_proba_list_second, axis=0))
            
        print('(予測)第二段階の特徴量のサイズ:{0}'.format(np.array(merge_feature_for_test_second).T))

            
        # 特徴量リストを
        X_merged = np.array(merge_feature_for_test_second).T
        print(X_merged.shape)
        print(X_merged[:,0])
        X_merged_harmonic_mean = 1/((1/X_merged[:,0]  + 1/X_merged[:, 1]  + 1/X_merged[:,2] ) * (1/3))
        print(X_merged_harmonic_mean)
        print(X_merged_harmonic_mean.shape)
        predict_X = np.average(X_merged, axis=1)
        print(predict_X)      
        print(np.exp(predict_X))



        return X_merged_harmonic_mean
            

# 各モデルを定義

In [46]:
ridge= linear_model.Ridge(**clf_ridge_cv.best_params_)#0.7
xgboost =  xgb.XGBRegressor(**clf_xgb_cv.best_params_)#0.7(**clf_xgb_cv.best_params_)#0.7
rf = RandomForestRegressor(**clf_rf_cv.best_params_)
lgbm = lgb.LGBMRegressor(lerning_rate=0.09, max_depth=6, n_estimators=100)
# svr = SVR(C=gridsearch.best_params_["C"], epsilon=gridsearch.best_params_["epsilon"])
est= GradientBoostingRegressor(**clf_cv.best_params_)#0.7
Bridge = BayesianRidge( **clf_Bridge_cv.best_params_)


In [47]:
# 第一段階で用いるモデルを定義
first_estimators = list(zip(["ridge","xgb","Bridge", "rf","lgb", "est"],
                          [ridge, xgboost,Bridge,  rf, lgbm, est]))
#第二段階で用いるモデルを定義
second_estimators = list(zip(["xgb", "ridge" ,"lgb"],[ xgboost, ridge, lgbm]))

#スタッキングクラスの定義
clf_stcl = StackingClassifer(first_estimators, second_estimators , xgb.XGBRegressor(**clf_xgb_cv.best_params_))#xgb.XGBRegressor(**clf_xgb_cv.best_params_)

 #学習
clf_stcl.fit(X_train, y_train)

#評価
print(rmsle(np.exp(clf_stcl.predict(X_train)), np.exp(y_train)))
print(np.exp(y_train))

#モデルの保存
with open('model_stcl_develop.pickle', mode='wb') as fp:
    pickle.dump(clf_stcl, fp, protocol=2)


第一段階の特徴量のサイズ:(3364, 6)
第二段階の特徴量のサイズ:(3364, 3)
(3364, 3)
(3364,)
Bridge
est
lgb
rf
ridge
xgb
(予測)第一段階の特徴量のサイズ:[[ 9.94894732  9.95335275  9.95697898  9.83501464  9.90248974  9.71342278]
 [ 9.34537213  9.44279754  9.44117455  9.44225514  9.36458537  9.26028252]
 [ 9.91000384 10.03944502  9.98900818 10.03206778  9.9100048  10.07854939]
 ...
 [10.02969346 10.10685382 10.16673586 10.0940703  10.00295396 10.03244114]
 [ 9.46998901  9.64441811  9.58008309  9.5463718   9.49398077  9.54263973]
 [ 9.32267049  9.42496458  9.37185891  9.43382532  9.33834591  9.41420364]]
(3364, 6)
lgb
ridge
xgb
(予測)第二段階の特徴量のサイズ:[[ 9.91179509  9.86487748  9.90713787]
 [ 9.45753692  9.43582282  9.41057587]
 [10.07270816  9.94433255 10.00866127]
 ...
 [10.10862516 10.01814342 10.1089201 ]
 [ 9.6335358   9.5696763   9.6185379 ]
 [ 9.33939272  9.42924715  9.38733673]]
(3364, 3)
[ 9.91179509  9.45753692 10.07270816 ... 10.10862516  9.6335358
  9.33939272]
[ 9.8945584   9.43460616 10.00829288 ... 10.07838124  9.6071725
  

In [48]:
X_train.shape

(3364, 144)