In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
# Later cells read the CSV inputs from, and write the submission to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import random
import os
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [3]:
def seed_everything(seed):
    """Seed the RNGs this notebook touches.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed passed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

def lg_nrmse(gt, preds):
    """Weighted sum of per-target NRMSE across all target columns.

    For every column of ``gt`` the RMSE between ``gt`` and ``preds`` is divided
    by the mean absolute ground-truth value of that column. The first eight
    columns are weighted 1.2 and every remaining column is weighted 1.0.

    The previous version looped over ``range(0, 13)`` and therefore never
    scored the 14th target column. It also relied on the ``squared=False``
    keyword of ``mean_squared_error``, which recent scikit-learn releases no
    longer accept. RMSE is now computed directly with NumPy.

    Args:
        gt: 2-D array-like of ground-truth targets, shape (n_samples, n_targets),
            without an ID column.
        preds: 2-D array-like of predictions with at least as many columns as ``gt``.

    Returns:
        The weighted NRMSE score as a NumPy float.

    Raises:
        ValueError: If ``gt`` and ``preds`` have different numbers of rows.
    """
    gt_arr = np.asarray(gt, dtype=float)
    pred_arr = np.asarray(preds, dtype=float)
    if gt_arr.shape[0] != pred_arr.shape[0]:
        raise ValueError(
            "gt and preds must have the same number of rows: "
            f"{gt_arr.shape[0]} != {pred_arr.shape[0]}"
        )

    # Score every ground-truth column instead of a hard-coded count.
    n_targets = gt_arr.shape[1]
    residual = gt_arr - pred_arr[:, :n_targets]
    rmse = np.sqrt(np.mean(np.square(residual), axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt_arr), axis=0)

    # First eight targets carry a 20% heavier weight than the rest.
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
    return score

In [4]:
def lg_nrmse_one(gt, preds):
    """NRMSE for a single target, with weight 1.0.

    The RMSE between ``gt`` and ``preds`` is divided by the mean absolute
    value of ``gt``. No 1.2 weighting is applied here; that weighting only
    exists in the multi-target ``lg_nrmse`` score.

    RMSE is computed with NumPy rather than
    ``mean_squared_error(..., squared=False)``, because recent scikit-learn
    releases no longer accept the ``squared`` keyword.

    Args:
        gt: Ground-truth values; a 1-D array-like or a single-column
            DataFrame/2-D array of shape (n_samples, 1).
        preds: Predictions with the same number of elements as ``gt``.

    Returns:
        The NRMSE as a NumPy float.

    Raises:
        ValueError: If ``preds`` cannot be reshaped to match ``gt``.
    """
    gt_arr = np.asarray(gt, dtype=float)
    # Treat the data as (n_samples, n_outputs) so that an (n, 1) ground truth and
    # an (n,) prediction line up element-wise instead of broadcasting to (n, n).
    gt_cols = gt_arr.reshape(gt_arr.shape[0], -1)
    pred_cols = np.asarray(preds, dtype=float).reshape(gt_cols.shape)

    # Per-output RMSE, averaged uniformly over outputs (a single output in practice).
    rmse = np.mean(np.sqrt(np.mean(np.square(gt_cols - pred_cols), axis=0)))
    nrmse = rmse / np.mean(np.abs(gt_cols))
    return 1.0 * nrmse

#얘를 쓸것!!!

In [5]:
# Read the training table and discard the ID column straight away.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/train.csv')
train_df = train_df.drop(columns="ID")

# Inputs: every column whose name matches 'X', minus four excluded columns.
train_x = train_df.filter(regex='X').drop(columns=['X_04','X_23','X_47','X_48'])

# Outputs: every column whose name matches 'Y'.
train_y = train_df.filter(regex='Y')

#Y_14


In [6]:
# Pull the Y_14 target out of the target frame as a Series and display it.
train_y14 = train_y.loc[:, 'Y_14']
train_y14

0       -25.304
1       -26.438
2       -26.370
3       -25.345
4       -24.974
          ...  
39602   -26.580
39603   -27.325
39604   -26.601
39605   -26.635
39606   -26.093
Name: Y_14, Length: 39607, dtype: float64

In [7]:
# Re-wrap the Y_14 target as a DataFrame aligned to train_y's index, then display it.
train_y14 = pd.DataFrame(data=train_y14, index=train_y.index)
train_y14

Unnamed: 0,Y_14
0,-25.304
1,-26.438
2,-26.370
3,-25.345
4,-24.974
...,...
39602,-26.580
39603,-27.325
39604,-26.601
39605,-26.635


#클리핑

In [8]:
from sklearn.model_selection import train_test_split

# -----------------------------------
# Per-column 1st and 99th percentiles of the training inputs.
p01, p99 = (train_x.quantile(q) for q in (0.01, 0.99))

# Values below the 1st percentile are raised to it; values above the
# 99th percentile are lowered to it (column by column).
train_x = train_x.clip(lower=p01, upper=p99, axis=1)

In [9]:
# Connector pin bending: X__21..X__26 hold the difference between
# X_24..X_29 (in that order) and X_12.
for _new_col, _src_col in (
    ('X__21', 'X_24'),
    ('X__22', 'X_25'),
    ('X__23', 'X_26'),
    ('X__24', 'X_27'),
    ('X__25', 'X_28'),
    ('X__26', 'X_29'),
):
    train_x[_new_col] = train_x[_src_col] - train_x['X_12']
# The source columns (12, 24, 25, 26, 27, 28, 29) are kept for now, because RFE
# may still rank them as important alongside the derived columns.
# Note: the pass/fail indicator columns have already been dropped from train_x by this point.

In [10]:
# Feature columns fed to the XGBoost model, in the order the model sees them.
xgbcols = [
    'X_20', 'X_32', 'X_30', 'X_07', 'X_49', 'X_03', 'X_21', 'X_09', 'X_22',
    'X_08', 'X_16', 'X_05', 'X_06', 'X_19', 'X_41', 'X_51', 'X__24', 'X_52',
    'X_53', 'X_18', 'X_45', 'X_55', 'X__23', 'X_14', 'X__25', 'X_54', 'X_24',
    'X_56', 'X__21', 'X_40', 'X__22', 'X_50', 'X_42', 'X__26', 'X_27', 'X_25',
    'X_39', 'X_44', 'X_38', 'X_17', 'X_15', 'X_33', 'X_13', 'X_26', 'X_31',
]

# Restrict the training inputs to those columns and display the result.
train_set = train_x[xgbcols]
train_set

Unnamed: 0,X_20,X_32,X_30,X_07,X_49,X_03,X_21,X_09,X_22,X_08,...,X_25,X_39,X_44,X_38,X_17,X_15,X_33,X_13,X_26,X_31
0,3.17,1.46,1.49,29.45,9706.030,67.47,3.06,245.71,3.13,62.38,...,2.07,-16.36,21.09,-16.41,13.52,13.37,1.74,0.18,2.05,1.69
1,3.11,1.45,1.49,28.73,10423.430,65.17,2.98,233.61,3.20,61.23,...,2.10,-16.11,21.13,-16.06,13.51,13.33,1.66,0.18,2.10,1.67
2,3.04,1.46,1.49,28.81,10948.530,64.07,3.01,272.20,3.12,105.77,...,2.06,-16.17,21.12,-16.16,13.51,13.36,1.68,0.15,2.04,1.69
3,3.01,1.47,1.47,28.92,15007.030,67.57,3.02,255.36,3.08,115.21,...,2.05,-16.03,21.09,-16.05,13.51,13.33,1.68,0.21,2.03,1.68
4,3.07,1.47,1.49,29.68,11051.030,63.57,3.00,241.46,3.12,103.38,...,2.06,-16.23,21.10,-16.25,13.50,13.34,1.82,0.16,2.09,1.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,3.03,1.36,1.37,30.20,56583.294,62.27,3.06,298.05,3.13,77.83,...,2.11,-16.15,21.19,-16.09,13.52,13.38,1.67,0.15,2.08,1.60
39603,3.06,1.37,1.40,29.21,56583.294,62.77,3.05,270.67,3.06,102.25,...,2.12,-15.74,21.19,-15.70,13.49,13.36,1.77,0.13,2.09,1.68
39604,3.09,1.37,1.39,29.96,9092.372,64.67,3.07,198.07,3.12,102.61,...,2.09,-16.19,21.19,-16.12,13.52,13.39,1.58,0.14,2.08,1.61
39605,3.01,1.36,1.37,30.30,56583.294,63.67,3.15,275.52,3.09,112.60,...,2.03,-16.36,21.13,-16.36,13.52,13.38,1.67,0.16,2.05,1.56


In [11]:
import xgboost as xgb

# Hold out 30% of the rows for validation.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_set,
    train_y14,
    test_size=0.3,
    random_state=42,
)

# Fit the regressor on the training split only.
XGB = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
)
XGB.fit(x_train, y_train)

# Score the held-out split with the single-target NRMSE.
y_predict = XGB.predict(x_valid)
lg_score = lg_nrmse_one(y_valid, y_predict)
print('NRMSE :', lg_score)

NRMSE : 0.0239495725337811


In [12]:
# Load the test inputs and drop ID plus the four X columns excluded from training.
test_x = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/test.csv')
test_x = test_x.drop(columns="ID").drop(columns=['X_04','X_23','X_47','X_48'])

# Clip with the percentiles computed on the training inputs (p01 / p99),
# not with percentiles of the test data.
test_x = test_x.clip(lower=p01, upper=p99, axis=1)

# Connector pin bending: same derived columns as for the training inputs.
for _new_col, _src_col in (
    ('X__21', 'X_24'),
    ('X__22', 'X_25'),
    ('X__23', 'X_26'),
    ('X__24', 'X_27'),
    ('X__25', 'X_28'),
    ('X__26', 'X_29'),
):
    test_x[_new_col] = test_x[_src_col] - test_x['X_12']

# Keep only the model's feature columns, in training order.
test_x = test_x[xgbcols]

# Refit a fresh model with the same hyperparameters on the full training set.
XGB = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
)
XGB.fit(train_set, train_y14)

# Predict Y_14 for the test rows and wrap the result for export.
y_pred = XGB.predict(test_x)
submit = pd.DataFrame(data=y_pred, index=test_x.index, columns=['Y_14'])



In [13]:
# Display the Y_14 prediction frame built in the previous cell.
submit

Unnamed: 0,Y_14
0,-26.117653
1,-26.136555
2,-25.835609
3,-25.504868
4,-25.605021
...,...
39603,-26.388384
39604,-26.276550
39605,-26.380911
39606,-26.390242


In [16]:
# Display the Y_14 prediction frame again.
# NOTE(review): this cell's execution count (16) is later than the export cell's (14), and its
# output differs from the preview above, so it may come from a different run. Confirm which
# predictions were actually written to disk.
submit

Unnamed: 0,Y_14
0,-25.874207
1,-26.075535
2,-25.666723
3,-25.576336
4,-25.578844
...,...
39603,-26.450916
39604,-26.345762
39605,-26.458601
39606,-26.428410


In [14]:
# Write the Y_14 predictions to Drive; the row index is not written to the file.
submit.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/LG/submit_y_14_new.csv',
    index=False,
)