In [27]:
# Mount Google Drive at /content/drive/ so that later cells can read the
# CSV files under /content/drive/MyDrive/ and write the submission there.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [28]:
import pandas as pd
import random
import os
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [29]:
def seed_everything(seed):
    """Seed the global RNGs used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG
    with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

def lg_nrmse(gt, preds):
    """Weighted sum of per-target normalised RMSE over every target column.

    For each column the RMSE between ``gt`` and ``preds`` is divided by the
    mean absolute ground-truth value of that column. The first eight columns
    (Y_01 ~ Y_08) are weighted 1.2 and all remaining columns 1.0.

    Parameters
    ----------
    gt : array-like of shape (n_samples, n_targets)
        Ground-truth targets, without the 'ID' column.
    preds : array-like of shape (n_samples, n_targets)
        Predictions, in the same column order as ``gt``.

    Returns
    -------
    float
        The weighted NRMSE total.

    Raises
    ------
    ValueError
        If ``gt`` is not 2-D or ``gt`` and ``preds`` differ in shape.
    """
    truth = np.asarray(gt, dtype=float)
    pred = np.asarray(preds, dtype=float)
    if truth.ndim != 2 or truth.shape != pred.shape:
        raise ValueError(
            "gt and preds must be 2-D with identical shapes, got "
            f"{truth.shape} and {pred.shape}"
        )

    # Score every target column. The previous hard-coded range(0, 13)
    # stopped one column short whenever 14 targets were supplied.
    # RMSE is computed with NumPy because the `squared=False` keyword of
    # `mean_squared_error` is not accepted by recent scikit-learn releases.
    rmse = np.sqrt(np.mean((truth - pred) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(truth), axis=0)

    # 20% extra weight on the first eight targets, plain weight on the rest.
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
    return score

In [30]:
def lg_nrmse_one(gt, preds, weight=1.2):
    """Weighted normalised RMSE for a single target.

    Parameters
    ----------
    gt : array-like
        Ground-truth values, either 1-D or an (n, 1) frame/array.
    preds : array-like
        Predictions, one per ground-truth value, 1-D or (n, 1).
    weight : float, default 1.2
        Multiplier applied to the NRMSE. Targets up to Y_08 use 1.2;
        pass 1.0 for the targets after Y_08.

    Returns
    -------
    float
        ``weight * RMSE(gt, preds) / mean(|gt|)``. If several columns are
        given, the per-column RMSEs are averaged before normalising.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes cannot be matched.
    """
    truth = np.atleast_1d(np.asarray(gt, dtype=float))
    pred = np.atleast_1d(np.asarray(preds, dtype=float))
    if truth.size == 0 or truth.size != pred.size:
        raise ValueError(
            "gt and preds must be non-empty and hold the same number of "
            f"values, got {truth.size} and {pred.size}"
        )

    # Bring both sides to (n_samples, n_columns) so that an (n, 1) frame and
    # an (n,) prediction vector line up instead of broadcasting to (n, n).
    truth = truth.reshape(truth.shape[0], -1)
    pred = pred.reshape(pred.shape[0], -1)
    if truth.shape != pred.shape:
        raise ValueError(
            f"gt and preds have incompatible shapes {truth.shape} and {pred.shape}"
        )

    # RMSE is computed with NumPy because the `squared=False` keyword of
    # `mean_squared_error` is not accepted by recent scikit-learn releases.
    rmse = np.mean(np.sqrt(np.mean((truth - pred) ** 2, axis=0)))
    nrmse = rmse / np.mean(np.abs(truth))
    return weight * nrmse

# Use this one (`lg_nrmse_one`, the single-target score)!!!

In [31]:
# Load the training table and discard its ID column.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/train.csv')
train_df = train_df.drop(columns="ID")

# Input: the X features (columns whose name contains 'X'), minus four excluded columns.
train_x = train_df.filter(regex='X').drop(columns=['X_04', 'X_23', 'X_47', 'X_48'])

# Output: the Y features (columns whose name contains 'Y').
train_y = train_df.filter(regex='Y')

# Y_04


In [32]:
# Pull the Y_04 target out of the output frame as a Series and display it.
train_y04 = train_y.loc[:, 'Y_04']
train_y04

0        10.502
1        18.507
2        14.082
3        16.975
4        15.047
          ...  
39602    10.874
39603     8.759
39604    13.159
39605     9.123
39606    10.421
Name: Y_04, Length: 39607, dtype: float64

In [33]:
# Re-wrap the Y_04 values as a DataFrame on train_y's index, then display it.
train_y04 = pd.DataFrame(data=train_y04, index=train_y.index)
train_y04

Unnamed: 0,Y_04
0,10.502
1,18.507
2,14.082
3,16.975
4,15.047
...,...
39602,10.874
39603,8.759
39604,13.159
39605,9.123


# Clipping (클리핑)

In [34]:
from sklearn.model_selection import train_test_split

# -----------------------------------
# Per-column 1st and 99th percentiles of the training features.
p01, p99 = (train_x.quantile(q) for q in (0.01, 0.99))

# Values below the 1% point are raised to it and values above the 99% point
# are lowered to it, column by column.
train_x = train_x.clip(lower=p01, upper=p99, axis=1)

In [35]:
# Connector pin bending: each new X__2k column is one of X_24..X_29 minus X_12.
for new_col, src_col in (
    ('X__21', 'X_24'),
    ('X__22', 'X_25'),
    ('X__23', 'X_26'),
    ('X__24', 'X_27'),
    ('X__25', 'X_28'),
    ('X__26', 'X_29'),
):
    train_x[new_col] = train_x[src_col] - train_x['X_12']
# The original variables that went into the new ones (12, 24, 25, 26, 27, 28, 29)
# might still be picked up as important by RFE later, so they are kept for now.
# Also, the pass/fail variables have all been dropped from train_x at this point.

In [36]:
# Columns (original and engineered) that the XGBoost model is trained on.
xgbcols = [
    'X_32', 'X_49', 'X_46', 'X_21', 'X_07', 'X_09',
    'X_19', 'X_29', 'X_52', 'X_54', 'X_51', 'X__23',
    'X_13', 'X_30', 'X_03', 'X_34', 'X_20', 'X_56',
    'X_01', 'X_25', 'X__21', 'X_08', 'X_05',
]

# Restrict the training features to that subset and display the result.
train_set = train_x.loc[:, xgbcols]
train_set

Unnamed: 0,X_32,X_49,X_46,X_21,X_07,X_09,X_19,X_29,X_52,X_54,...,X_30,X_03,X_34,X_20,X_56,X_01,X_25,X__21,X_08,X_05
0,1.46,9706.030,1463,3.06,29.45,245.71,3.11,2.34,147.837968,125.605427,...,1.49,67.47,12.99,3.17,125.028256,70.544,2.07,-2.29,62.38,101.892
1,1.45,10423.430,1463,2.98,28.73,233.61,3.04,2.28,149.924692,127.893337,...,1.49,65.17,12.92,3.11,124.877308,69.524,2.10,-2.28,61.23,101.944
2,1.46,10948.530,1468,3.01,28.81,272.20,3.04,2.26,146.814592,127.012195,...,1.49,64.07,12.97,3.04,122.238232,72.583,2.06,-2.29,105.77,103.153
3,1.47,15007.030,1469,3.02,28.92,255.36,3.05,2.12,139.720132,130.723186,...,1.47,67.57,12.91,3.01,134.875225,71.563,2.05,-2.27,115.21,101.971
4,1.47,11051.030,1469,3.00,29.68,241.46,3.04,2.13,134.853555,125.647793,...,1.49,63.57,12.96,3.07,123.272762,69.524,2.06,-2.26,103.38,101.981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,1.36,56583.294,1469,3.06,30.20,298.05,3.20,2.20,133.481737,121.780933,...,1.37,62.27,12.89,3.03,129.029812,66.465,2.11,-2.26,77.83,103.150
39603,1.37,56583.294,1459,3.05,29.21,270.67,3.15,2.20,142.667802,122.987209,...,1.40,62.77,12.88,3.06,122.811413,66.465,2.12,-2.28,102.25,102.021
39604,1.37,9092.372,1459,3.07,29.96,198.07,3.23,2.16,134.419328,130.920147,...,1.39,64.67,12.98,3.09,119.166699,68.504,2.09,-2.25,102.61,103.144
39605,1.36,56583.294,1469,3.15,30.30,275.52,3.18,2.12,141.288011,125.518825,...,1.37,63.67,12.91,3.01,124.525929,66.465,2.03,-2.26,112.60,102.025


In [37]:
import xgboost as xgb

# Hold out 30% of the rows for validation (fixed split seed).
x_train, x_valid, y_train, y_valid = train_test_split(
    train_set,
    train_y04,
    test_size=0.3,
    random_state=42,
)

XGB = xgb.XGBRegressor(
    max_depth=7,
    n_estimators=200,
    learning_rate=0.05,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
)
XGB.fit(x_train, y_train)  # training done

# Score the held-out predictions with the single-target metric.
y_predict = XGB.predict(x_valid)
lg_score = lg_nrmse_one(y_valid, y_predict)
print('NRMSE :', lg_score)

NRMSE : 0.23531584277253675


In [38]:
# Load the test features and drop the same columns that were dropped from the training features.
test_x = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/test.csv')
test_x = test_x.drop(columns="ID").drop(columns=['X_04', 'X_23', 'X_47', 'X_48'])

# Clip with the 1% / 99% points computed earlier from the training features.
test_x = test_x.clip(lower=p01, upper=p99, axis=1)

# Connector pin bending: each new X__2k column is one of X_24..X_29 minus X_12.
for new_col, src_col in (
    ('X__21', 'X_24'),
    ('X__22', 'X_25'),
    ('X__23', 'X_26'),
    ('X__24', 'X_27'),
    ('X__25', 'X_28'),
    ('X__26', 'X_29'),
):
    test_x[new_col] = test_x[src_col] - test_x['X_12']

test_x = test_x.loc[:, xgbcols]
# The original variables that went into the new ones (12, 24, 25, 26, 27, 28, 29)
# might still be picked up as important by RFE later, so they are kept for now.
# Also, the pass/fail variables have all been dropped from train_x at this point.

# Refit a fresh model with the same hyperparameters on the full training set.
XGB = xgb.XGBRegressor(
    max_depth=7,
    n_estimators=200,
    learning_rate=0.05,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
)
XGB.fit(train_set, train_y04)

# Predict Y_04 for the test rows and wrap the result in a one-column frame.
y_pred = XGB.predict(test_x)
submit = pd.DataFrame(y_pred, index=test_x.index, columns=['Y_04'])



In [39]:
# Display the Y_04 prediction frame.
submit

Unnamed: 0,Y_04
0,13.930837
1,13.689375
2,14.570437
3,14.812147
4,14.405573
...,...
39603,12.887421
39604,13.513806
39605,12.853561
39606,13.293039


In [40]:
# Write the predictions to Drive; index=False leaves the row index out of the CSV.
submit.to_csv('/content/drive/MyDrive/Colab Notebooks/LG/submit_y_04_new.csv', index=False)