In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import pandas as pd
import random
import os
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(0,13): # ignore 'ID'
        rmse = mean_squared_error(np.array(gt)[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(np.array(gt)[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/LG Aimer/train.csv').drop(columns="ID")
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

In [None]:
train_x = train_x.drop(columns=['X_04','X_23','X_47','X_48'])
train_x=train_x[['X_03','X_05','X_07','X_08','X_09','X_10','X_11','X_13','X_14','X_17','X_18','X_19','X_20','X_21','X_22','X_30','X_32','X_49','X_50','X_51','X_52','X_53','X_54','X_55','X_56']]

In [None]:
#xgboost

x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.3, random_state=42)

model = MultiOutputRegressor(XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7))
model.fit(x_train, y_train)

y_predict = model.predict(x_test)

score = lg_nrmse(y_test,y_predict)
print(score)

1.9426425412471673


In [None]:
#randomforest

x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.3, random_state=42)

model = MultiOutputRegressor(RandomForestRegressor(n_estimators=300, n_jobs=-1))
model.fit(x_train, y_train)

y_predict = model.predict(x_test)

score = lg_nrmse(y_test,y_predict)
print(score)

1.9418420327153105


In [None]:
#lgbm
import lightgbm as lgb
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.3, random_state=42)
lightgb = lgb.LGBMRegressor(objective='regression', learning_rate=0.1, num_leaves = 25)
model = MultiOutputRegressor(lightgb)
model.fit(x_train, y_train)

y_predict = model.predict(x_test)

score = lg_nrmse(y_test,y_predict)
print(score)

1.9445237637921848


In [None]:
#xgboost
x_temp =train_x.drop(columns=['X_10','X_11'])
x_train, x_test, y_train, y_test = train_test_split(x_temp, train_y, test_size = 0.3, random_state=42)

model = MultiOutputRegressor(XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7))
model.fit(x_train, y_train)

y_predict = model.predict(x_test)

score = lg_nrmse(y_test,y_predict)
print(score)

1.9427390478336046


In [None]:
#lgbm
import lightgbm as lgb
x_train, x_test, y_train, y_test = train_test_split(x_temp, train_y, test_size = 0.3, random_state=42)
lightgb = lgb.LGBMRegressor(objective='regression', learning_rate=0.1, num_leaves = 25)
model = MultiOutputRegressor(lightgb)
model.fit(x_train, y_train)

y_predict = model.predict(x_test)

score = lg_nrmse(y_test,y_predict)
print(score)

1.9438635871265932


In [None]:
#randomforest

x_train, x_test, y_train, y_test = train_test_split(x_temp, train_y, test_size = 0.3, random_state=42)

model = MultiOutputRegressor(RandomForestRegressor(n_estimators=300, n_jobs=-1))
model.fit(x_train, y_train)

y_predict = model.predict(x_test)

score = lg_nrmse(y_test,y_predict)
print(score)

1.9421188919622998


In [None]:
x_temp

Unnamed: 0,X_03,X_05,X_07,X_08,X_09,X_13,X_14,X_17,X_18,X_19,...,X_30,X_32,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,67.47,101.892,29.45,62.38,245.71,0.18,13.34,13.52,13.44,3.11,...,1.49,1.46,9706.03,137.043591,135.359219,147.837968,134.313475,125.605427,136.721425,125.028256
1,65.17,101.944,28.73,61.23,233.61,0.18,13.33,13.51,13.42,2.97,...,1.49,1.45,10423.43,133.736691,135.979817,149.924692,123.630583,127.893337,143.322659,124.877308
2,64.07,103.153,28.81,105.77,272.20,0.15,13.36,13.51,13.43,3.04,...,1.49,1.46,10948.53,132.805112,131.055355,146.814592,128.939070,127.012195,140.395688,122.238232
3,67.57,101.971,28.92,115.21,255.36,0.21,13.30,13.51,13.40,3.05,...,1.47,1.47,15007.03,134.138760,133.239422,139.720132,132.260824,130.723186,147.624829,134.875225
4,63.57,101.981,29.68,103.38,241.46,0.16,13.35,13.50,13.42,3.04,...,1.49,1.47,11051.03,142.728970,136.620022,134.853555,134.760252,125.647793,139.331105,123.272762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,62.27,103.150,30.20,77.83,298.05,0.15,13.37,13.52,13.46,3.20,...,1.37,1.36,60630.73,129.965741,130.807148,133.481737,125.273130,121.780933,133.780110,129.029812
39603,62.77,102.021,29.21,102.25,270.67,0.13,13.36,13.49,13.44,3.15,...,1.40,1.37,60763.43,127.633885,120.158764,142.667802,122.465490,122.987209,143.090741,122.811413
39604,64.67,103.144,29.96,102.61,198.07,0.14,13.38,13.52,13.46,3.23,...,1.39,1.37,8813.33,132.501286,136.893025,134.419328,129.115431,130.920147,140.489232,119.166699
39605,63.67,102.025,30.30,112.60,275.52,0.16,13.36,13.52,13.46,3.18,...,1.37,1.36,62222.33,128.189679,121.495930,141.288011,130.141676,125.518825,136.603634,124.525929


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.3, random_state=42)

random_search = {
               'max_depth': [2],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [4, 6, 8],
               'min_samples_split': [5, 7,10],
               'n_estimators': [100,200,300]}
rf=RandomForestRegressor()
model_rf = RandomizedSearchCV(estimator = rf, param_distributions = random_search, n_iter = 10, 
                               cv = 4, verbose= 1, random_state= 10, n_jobs = -1)
model_rf.fit(x_train,y_train)



Fitting 4 folds for each of 10 candidates, totalling 40 fits


RandomizedSearchCV(cv=4, estimator=RandomForestRegressor(), n_jobs=-1,
                   param_distributions={'max_depth': [2],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [4, 6, 8],
                                        'min_samples_split': [5, 7, 10],
                                        'n_estimators': [100, 200, 300]},
                   random_state=10, verbose=1)

In [None]:
random_rf = model_rf.best_estimator_.predict(x_test)
print(model_rf.best_params_)

{'n_estimators': 100, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 2}


In [None]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.3, random_state=42)


In [None]:
random_search.best_params_

{'subsample': 0.6,
 'objective': 'reg:squarederror',
 'n_estimators': 500,
 'min_child_weight': 5,
 'max_depth': 2,
 'gamma': 0.4,
 'eval_metric': 'rmse',
 'eta': 0.4,
 'colsample_bytree': 0.6,
 'booster': 'gblinear'}

In [None]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.3, random_state=42)


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV


grid_search_xgb = {
    'n_estimators':[200,300,400],
    'min_child_weight':[4,5], 
    'gamma':[i/10.0 for i in range(3,6)],  
    'subsample':[i/10.0 for i in range(6,11)],
    'colsample_bytree':[i/10.0 for i in range(6,11)], 
    'max_depth': [2,3,4,6,7],
    'objective': ['reg:squarederror', 'reg:tweedie'],
    'booster': ['gbtree', 'gblinear'],
    'eval_metric': ['rmse'],
    'eta': [i/10.0 for i in range(3,6)]
}

reg = XGBRegressor()

# run randomized search
n_iter_search = 10
model_xgb = GridSearchCV(reg, grid_search_xgb,
                                    cv=4,verbose=1,n_jobs=-1)
model_xgb.fit(x_train, y_train)

print(model_xgb.best_params_)



Fitting 4 folds for each of 27000 candidates, totalling 108000 fits


ERROR:concurrent.futures:exception calling callback for <Future at 0x7fe131c3da50 state=finished returned list>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))
AttributeErro

KeyboardInterrupt: ignored

ERROR:concurrent.futures:exception calling callback for <Future at 0x7fe131cc85d0 state=finished returned list>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))
AttributeErro

In [None]:
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
train_y01=train_y['Y_01']


In [None]:
selector = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y01)



In [None]:
selected_columns = train_x.columns
selected_columns[selector.get_support()]

Index(['X_03', 'X_07', 'X_09', 'X_13', 'X_18', 'X_19', 'X_20', 'X_21', 'X_22',
       'X_49'],
      dtype='object')

In [None]:
train_y02=train_y['Y_02']
selector02 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y02)



In [None]:
selected_columns02 = train_x.columns
selected_columns02[selector02.get_support()]

Index(['X_03', 'X_05', 'X_06', 'X_07', 'X_13', 'X_14', 'X_18', 'X_22', 'X_43',
       'X_49'],
      dtype='object')

In [None]:
train_y03=train_y['Y_03']
selector03 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y03)
          



In [None]:
selected_columns03 = train_x.columns
selected_columns03[selector03.get_support()]

Index(['X_05', 'X_07', 'X_13', 'X_18', 'X_19', 'X_22', 'X_28', 'X_32', 'X_43',
       'X_49'],
      dtype='object')

In [None]:
train_y04=train_y['Y_04']
selector04 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y04)
          



In [None]:
selected_columns04 = train_x.columns
selected_columns04[selector04.get_support()]

Index(['X_13', 'X_19', 'X_21', 'X_29', 'X_30', 'X_32', 'X_34', 'X_40', 'X_46',
       'X_49'],
      dtype='object')

In [None]:
train_y05=train_y['Y_05']
selector05 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y05)
          



In [None]:
selected_columns05 = train_x.columns
selected_columns05[selector05.get_support()]

Index(['X_09', 'X_13', 'X_17', 'X_18', 'X_32', 'X_40', 'X_49', 'X_51', 'X_54',
       'X_56'],
      dtype='object')

In [None]:
train_y06=train_y['Y_06']
selector06 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y06)
          



In [None]:
selected_columns06 = train_x.columns
selected_columns06[selector06.get_support()]

Index(['X_02', 'X_03', 'X_05', 'X_07', 'X_09', 'X_13', 'X_14', 'X_38', 'X_44',
       'X_49'],
      dtype='object')

In [None]:
train_y07=train_y['Y_07']
selector07 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y07)
          



In [None]:
selected_columns07 = train_x.columns
selected_columns07[selector07.get_support()]

Index(['X_03', 'X_07', 'X_09', 'X_13', 'X_14', 'X_17', 'X_19', 'X_21', 'X_32',
       'X_49'],
      dtype='object')

In [None]:
train_y08=train_y['Y_08']
selector08 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y08)
          



In [None]:
selected_columns08 = train_x.columns
selected_columns08[selector08.get_support()]

Index(['X_03', 'X_07', 'X_08', 'X_10', 'X_20', 'X_21', 'X_22', 'X_30', 'X_32',
       'X_49'],
      dtype='object')

In [None]:
train_y09=train_y['Y_09']
selector09 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y09)
          



In [None]:
selected_columns09 = train_x.columns
selected_columns09[selector09.get_support()]

Index(['X_03', 'X_07', 'X_10', 'X_11', 'X_20', 'X_21', 'X_22', 'X_30', 'X_32',
       'X_49'],
      dtype='object')

In [None]:
train_y10=train_y['Y_10']
selector10 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y10)
          



In [None]:
selected_columns10 = train_x.columns
selected_columns10[selector10.get_support()]

Index(['X_03', 'X_05', 'X_07', 'X_09', 'X_16', 'X_18', 'X_19', 'X_21', 'X_32',
       'X_49'],
      dtype='object')

In [None]:
train_y11=train_y['Y_11']
selector11 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y11)
          



In [None]:
selected_columns11 = train_x.columns
selected_columns11[selector11.get_support()]

Index(['X_06', 'X_07', 'X_08', 'X_09', 'X_10', 'X_13', 'X_17', 'X_22', 'X_32',
       'X_49'],
      dtype='object')

In [None]:
train_y12=train_y['Y_12']
selector12 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y12)
          



In [None]:
selected_columns12 = train_x.columns
selected_columns12[selector12.get_support()]

Index(['X_03', 'X_07', 'X_10', 'X_11', 'X_20', 'X_21', 'X_22', 'X_30', 'X_32',
       'X_49'],
      dtype='object')

In [None]:
train_y13=train_y['Y_13']
selector13 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y13)
          



In [None]:
selected_columns13 = train_x.columns
selected_columns13[selector13.get_support()]

Index(['X_03', 'X_07', 'X_09', 'X_10', 'X_20', 'X_21', 'X_22', 'X_30', 'X_32',
       'X_49'],
      dtype='object')

In [None]:
train_y14=train_y['Y_14']
selector14 = SelectFromModel(estimator = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, gamma=1, subsample=0.75,
                           colsample_bytree=1, max_depth=7),max_features=10).fit(train_x,train_y14)
          



In [None]:
selected_columns14 = train_x.columns
selected_columns14[selector14.get_support()]

Index(['X_03', 'X_07', 'X_10', 'X_11', 'X_20', 'X_21', 'X_22', 'X_30', 'X_32',
       'X_49'],
      dtype='object')

In [None]:
list1=[3,7,9,13,18,19,20,21,22,49]
list2=[3,5,6,7,13,14,18,22,43,49]
list3=[5,7,13,18,19,22,28,32,43,49]
list4=[13,19,21,29,30,32,34,40,46,49]
list5=[9,13,17,18,32,40,49,51,54,56]
list6=[2,3,5,7,9,13,14,38,44,49]
list7=[3,7,9,13,14,17,19,21,32,49]
list8=[3,7,8,10,20,21,22,30,32,49]
list9=[3,7,10,11,20,21,22,30,32,49]
list10=[3,5,7,9,16,18,19,21,32,49]
list11=[6,7,8,9,10,13,17,22,32,49]
list12=[3,7,10,11,20,21,22,30,32,49]
list13=[3,7,9,10,20,21,22,30,32,49]
list14=[3,7,10,11,20,21,22,30,32,49]

In [None]:
union = list(set(list1) | set(list2) | set(list3) | set(list4) | set(list5) | set(list6) | set(list7) | set(list8) | set(list9) | set(list10) | set(list11) | set(list12) | set(list13) | set(list14))
print(union)


[2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 18, 19, 20, 21, 22, 28, 29, 30, 32, 34, 38, 40, 43, 44, 46, 49, 51, 54, 56]


In [None]:
intersection = list(set(list1) & set(list2) & set(list3) & set(list4) & set(list5) & set(list6) & set(list7) & set(list8) & set(list9) & set(list10) & set(list11) & set(list12) & set(list13) & set(list14))
intersection

[49]

아래건 다른 방법, kbest

In [None]:
# target(Price)와 가장 correlated 된 features 를 k개 고르기.
## f_regresison, SelectKBest 불러오기.
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.model_selection import train_test_split
x_train, x_test, y01_train, y01_test = train_test_split(train_x, train_y01, test_size = 0.3, random_state=42)

## selctor 정의하기.
selector_kbest01 = SelectKBest(score_func=f_regression, k=20)
## 학습데이터에 fit_transform 
X_train_selected01 = selector.fit_transform(x_train, y01_train)
## 테스트 데이터는 transform
X_test_selected01 = selector.transform(x_test)
all_names = x_train.columns
## selector.get_support()
selected_mask = selector.get_support()
## 선택된 특성(변수)들
selected_names = all_names[selected_mask]
## 선택되지 않은 특성(변수)들
unselected_names = all_names[~selected_mask]
print('Selected names: ', selected_names)
print('Unselected names: ', unselected_names)


Selected names:  Index(['X_03', 'X_05', 'X_07', 'X_09', 'X_18', 'X_19', 'X_20', 'X_21', 'X_22',
       'X_49'],
      dtype='object')
Unselected names:  Index(['X_01', 'X_02', 'X_06', 'X_08', 'X_10', 'X_11', 'X_12', 'X_13', 'X_14',
       'X_15', 'X_16', 'X_17', 'X_24', 'X_25', 'X_26', 'X_27', 'X_28', 'X_29',
       'X_30', 'X_31', 'X_32', 'X_33', 'X_34', 'X_35', 'X_36', 'X_37', 'X_38',
       'X_39', 'X_40', 'X_41', 'X_42', 'X_43', 'X_44', 'X_45', 'X_46', 'X_50',
       'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56'],
      dtype='object')
