In [1]:
import numpy as np 
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pickle
from sklearn.externals import joblib
plt.rcParams['font.family'] = 'AppleGothic'



In [2]:
X_train = pd.read_csv('X_train.csv', index_col='id')
y_train = pd.read_csv("y_train.csv", index_col='id')

In [4]:
# DecisionTree 학습
decision_tree_regressor = DecisionTreeRegressor(max_depth=24, random_state=0)
decision_tree_regressor = decision_tree_regressor.fit(X_train, y_train)
joblib.dump(decision_tree_regressor, 'decision_tree_regressor.pkl') 

['decision_tree_regressor.pkl']

In [3]:
# RandomForest 학습
random_forest_regressor = RandomForestRegressor(n_estimators=150, n_jobs=-1)
random_forest_regressor = random_forest_regressor.fit(X_train, y_train)
joblib.dump(random_forest_regressor, 'random_forest_regressor.pkl') 

['random_forest_regressor.pkl']

In [4]:
# XGBoost 학습
xgboost_regressor = XGBRegressor(max_depth=13)
xgboost_regressor = xgboost_regressor.fit(X_train, y_train)
joblib.dump(xgboost_regressor, 'xgboost_regressor.pkl') 



['xgboost_regressor.pkl']

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, n_folds ):
    kf = StratifiedKFold(n_splits=n_folds, random_state=0)
    #추후에 메타 모델이 사용할 학습 데이터 반환을 위한 넘파이 배열 초기화 
    train_fold_pred = np.zeros((X_train_n.shape[0] ,1 ))
    print(model.__class__.__name__ , ' model 시작 ')
    
    for folder_counter , (train_index, valid_index) in enumerate(kf.split(X_train_n, y_train_n)):
        #입력된 학습 데이터에서 기반 모델이 학습/예측할 폴드 데이터 셋 추출 
        print('\t 폴드 세트: ',folder_counter,' 시작 ')
        X_tr = X_train_n.iloc[train_index]
        y_tr = y_train_n.iloc[train_index] 
        X_te = X_train_n.iloc[valid_index]  
        
        #폴드 세트 내부에서 다시 만들어진 학습 데이터로 기반 모델의 학습 수행.
        model.fit(X_tr , y_tr)       
        #폴드 세트 내부에서 다시 만들어진 검증 데이터로 기반 모델 예측 후 데이터 저장.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1,1)
    #train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터, test_pred_mean은 테스트 데이터
    return train_fold_pred

In [4]:
dtr = DecisionTreeRegressor(max_depth=24, random_state=0)
rfr = RandomForestRegressor(n_estimators=200, n_jobs=-1)
xr = XGBRegressor(max_depth=13)

In [5]:
# CV스태킹 알고리즘 각 모델에 적용
rf_train = get_stacking_base_datasets(rfr, X_train, y_train, 3)
dt_train = get_stacking_base_datasets(dtr, X_train, y_train,  3)    
xgb_train = get_stacking_base_datasets(xr, X_train, y_train, 3)

RandomForestRegressor  model 시작 
	 폴드 세트:  0  시작 
	 폴드 세트:  1  시작 
	 폴드 세트:  2  시작 
DecisionTreeRegressor  model 시작 
	 폴드 세트:  0  시작 
	 폴드 세트:  1  시작 
	 폴드 세트:  2  시작 
XGBRegressor  model 시작 
	 폴드 세트:  0  시작 
	 폴드 세트:  1  시작 
	 폴드 세트:  2  시작 


In [6]:
# CV스태킹 알고리즘 결과로 메타 모델 학습/시험에 필요한 result_a result_b 만들기 
Stack_final_X_train = np.concatenate((rf_train, dt_train, xgb_train), axis=1)

In [7]:
print(Stack_final_X_train.shape)

(916557, 3)


In [9]:
pd.DataFrame(Stack_final_X_train).to_csv("stack_final_X_train.csv")

In [14]:
recent_train_df = pd.read_csv("./recent_train_df.csv", index_col='idx')

In [16]:
X_train_cf = X_train[['contactDay', 'floorRate']]
X_train_cf.reset_index(drop=True, inplace=True)

y_train.reset_index(drop=True, inplace=True)

In [17]:
pred_X_train = pd.DataFrame(Stack_final_X_train, columns=['rf', 'dt', 'xgb'])

final_X_train = pd.concat([recent_train_df, X_train_cf, pred_X_train], axis=1)

In [18]:
train_data = pd.concat([final_X_train, y_train], axis=1)

In [19]:
train_data = train_data[train_data['recentContactDay']!=0]

In [20]:
corr = train_data.corr()
print (corr['price'].sort_values(ascending=False))

price               1.000000
xgb                 0.964531
rf                  0.950257
dt                  0.902649
contactDay          0.327626
floorRate           0.009318
recentFloorRate     0.002364
recentContactDay    0.001786
recentPrice        -0.001318
Name: price, dtype: float64


In [21]:
train_data['recentPrice'] = np.log2(train_data['recentPrice'])
train_data['price'] = np.log2(train_data['price'])
train_data['rf'] = np.log2(train_data['rf'])
train_data['dt'] = np.log2(train_data['dt'])
train_data['xgb'] = np.log2(train_data['xgb'])

train_data.dropna(inplace=True, axis=0)

In [22]:
final_X_train = train_data.drop(['price'], axis=1)
final_y_train = train_data['price']

In [23]:
final_X_train

Unnamed: 0,recentContactDay,recentFloorRate,recentPrice,contactDay,floorRate,rf,dt,xgb
0,3006,0.625000,28.060852,7044,0.416667,28.418896,28.275864,28.487397
1,3378,0.486111,27.071120,4503,0.555556,27.635500,27.538899,27.758191
2,6229,0.546875,28.060852,2855,0.555556,27.390201,27.297891,27.461650
3,2660,0.151515,28.248529,2473,0.277778,27.027282,26.568193,27.161782
4,7203,0.291667,28.315273,2464,0.555556,27.012386,26.568193,27.078489
...,...,...,...,...,...,...,...,...
916552,6412,0.054348,29.034856,2492,0.125000,27.932815,27.575425,28.043891
916553,6814,0.500000,30.005041,2460,0.562500,27.972690,27.744475,28.112489
916554,5621,0.638298,29.340960,2445,0.062500,27.867968,27.744475,27.901835
916555,3467,0.555556,27.160387,2443,0.312500,27.921331,27.744475,28.051566


In [24]:
final_X_train.to_csv("final_X_train.csv")
final_y_train.to_csv("final_y_train.csv")

In [25]:
# 메타 모델 학습 2
metaModel = XGBRegressor(max_depth=6).fit(final_X_train, final_y_train)
joblib.dump(metaModel, 'metaModel.pkl') 



['metaModel.pkl']