In [1]:
# Mount Google Drive into the Colab filesystem; the CSV paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import random
import os
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [3]:
def seed_everything(seed):
    """Seed the RNGs this notebook relies on.

    Exports ``PYTHONHASHSEED`` and seeds both Python's ``random`` module and
    NumPy's legacy global generator with the same value.

    Args:
        seed: Integer seed shared by every generator.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here likely only affects child processes - confirm if hash
    # determinism matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(42)  # fix the seed for the whole notebook

def lg_nrmse(gt, preds):
    """Weighted sum of per-target NRMSE over every target column.

    For each column the RMSE is divided by the mean absolute ground-truth value
    of that column. The first eight columns (Y_01 .. Y_08) are weighted 1.2,
    every remaining column is weighted 1.0.

    Args:
        gt: 2-D array-like of ground-truth targets, shape (n_samples, n_targets),
            without the 'ID' column.
        preds: 2-D array-like of predictions with the same shape as ``gt``.

    Returns:
        The weighted NRMSE total as a NumPy float.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    # Convert once up front; this also lets DataFrames be passed for either argument.
    truth = np.asarray(gt, dtype=float)
    guess = np.asarray(preds, dtype=float)
    if truth.ndim != 2 or truth.shape != guess.shape:
        raise ValueError(
            f"gt and preds must be 2-D with identical shapes, got {truth.shape} and {guess.shape}"
        )

    # Score every column that is present instead of a hard-coded count of 13,
    # which silently dropped the last target when 14 targets are supplied.
    # RMSE is computed with NumPy so the metric does not depend on the
    # `squared=False` argument that newer scikit-learn releases removed.
    rmse = np.sqrt(np.mean((truth - guess) ** 2, axis=0))
    nrmse = rmse / np.mean(np.abs(truth), axis=0)

    return 1.2 * np.sum(nrmse[:8]) + 1.0 * np.sum(nrmse[8:])

In [4]:
def lg_nrmse_one(gt, preds, weight=1.2):
    """Weighted NRMSE for a single target.

    The RMSE is divided by the mean absolute ground-truth value and multiplied
    by ``weight``. The default of 1.2 is the weight used for Y_01 .. Y_08; pass
    ``weight=1.0`` when scoring one of the later targets.

    Args:
        gt: Ground-truth values; 1-D, or 2-D such as a one-column DataFrame.
        preds: Predicted values with the same number of samples as ``gt``.
        weight: Multiplier applied to the NRMSE. Defaults to 1.2.

    Returns:
        ``weight * RMSE / mean(|gt|)`` as a NumPy float.

    Raises:
        ValueError: If ``gt`` and ``preds`` do not line up after 1-D inputs are
            promoted to a single column.
    """
    truth = np.asarray(gt, dtype=float)
    guess = np.asarray(preds, dtype=float)

    # Promote 1-D inputs to one column so an (n, 1) frame and an (n,) prediction
    # vector are compared element-wise instead of broadcasting to (n, n).
    if truth.ndim == 1:
        truth = truth.reshape(-1, 1)
    if guess.ndim == 1:
        guess = guess.reshape(-1, 1)
    if truth.shape != guess.shape:
        raise ValueError(
            f"gt and preds must have matching shapes, got {truth.shape} and {guess.shape}"
        )

    # Per-column RMSE, then a uniform average across columns; computed with NumPy
    # so the metric does not rely on scikit-learn's removed `squared=False` flag.
    rmse = np.mean(np.sqrt(np.mean((truth - guess) ** 2, axis=0)))
    nrmse = rmse / np.mean(np.abs(truth))
    return weight * nrmse

#얘를 쓸것!!!

In [5]:
# Read the training table from Drive and discard the ID column.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/train.csv')
train_df = train_df.drop(columns="ID")

# Inputs: every column whose name matches 'X', minus four excluded columns
# (presumably the pass/fail flags mentioned in the feature-engineering cell - confirm).
train_x = train_df.filter(regex='X').drop(columns=['X_04','X_23','X_47','X_48'])

# Outputs: every column whose name matches 'Y'.
train_y = train_df.filter(regex='Y')

#Y_02


In [6]:
# Pull the single target modelled in this notebook out of the target frame.
train_y02 = train_y.loc[:, 'Y_02']
train_y02

0        1.456
1        1.184
2        0.665
3        1.079
4        0.646
         ...  
39602    1.215
39603    0.606
39604    1.154
39605    0.187
39606    0.348
Name: Y_02, Length: 39607, dtype: float64

In [7]:
# Keep Y_02 as a one-column DataFrame that shares the index of train_y.
train_y02 = train_y[['Y_02']]
train_y02

Unnamed: 0,Y_02
0,1.456
1,1.184
2,0.665
3,1.079
4,0.646
...,...
39602,1.215
39603,0.606
39604,1.154
39605,0.187


#클리핑

In [8]:
from sklearn.model_selection import train_test_split

# -----------------------------------
# Per-column 1st and 99th percentiles of the training inputs.
# p01 / p99 are kept as notebook-level names because the test set is clipped
# with the same bounds further down.
# NOTE(review): the bounds are taken from the full training frame, i.e. before
# the train/validation split performed later.
p01, p99 = (train_x.quantile(q) for q in (0.01, 0.99))

# Values below the 1% point are raised to it; values above the 99% point are lowered to it.
train_x = train_x.clip(lower=p01, upper=p99, axis=1)

In [9]:
# Connector pin bending: each engineered column is one of X_24 .. X_29 minus X_12.
train_x = train_x.assign(**{
    new_col: train_x[src_col] - train_x['X_12']
    for new_col, src_col in (
        ('X__21', 'X_24'),
        ('X__22', 'X_25'),
        ('X__23', 'X_26'),
        ('X__24', 'X_27'),
        ('X__25', 'X_28'),
        ('X__26', 'X_29'),
    )
})
# The source columns (12, 24, 25, 26, 27, 28, 29) are left in place for now, since a
# later RFE pass might still rank them as important.
# Also, at this point the pass/fail columns have already been dropped from train_x.

In [10]:
# Feature subset fed to the XGBoost model, in the order the model sees them.
xgbcols = [
    'X_22', 'X_13', 'X_07', 'X_18', 'X_03',
    'X_43', 'X_06', 'X__22', 'X_05', 'X_49',
    'X_09', 'X_14', 'X_32', 'X_17', 'X_19',
    'X__24', 'X_45', 'X_20', 'X_21',
]

train_set = train_x.loc[:, xgbcols]
train_set

Unnamed: 0,X_22,X_13,X_07,X_18,X_03,X_43,X_06,X__22,X_05,X_49,X_09,X_14,X_32,X_17,X_19,X__24,X_45,X_20,X_21
0,3.13,0.18,29.45,13.44,67.47,21.28,74.983,-2.27,101.892,9706.030,245.71,13.34,1.46,13.52,3.11,-2.28,0.26,3.17,3.06
1,3.20,0.18,28.73,13.42,65.17,21.16,72.943,-2.28,101.944,10423.430,233.61,13.33,1.45,13.51,3.04,-2.28,0.13,3.11,2.98
2,3.12,0.15,28.81,13.43,64.07,21.17,72.943,-2.30,103.153,10948.530,272.20,13.36,1.46,13.51,3.04,-2.29,0.14,3.04,3.01
3,3.08,0.21,28.92,13.40,67.57,21.20,76.002,-2.28,101.971,15007.030,255.36,13.30,1.47,13.51,3.05,-2.27,0.22,3.01,3.02
4,3.12,0.16,29.68,13.42,63.57,21.18,70.904,-2.29,101.981,11051.030,241.46,13.35,1.47,13.50,3.04,-2.23,0.22,3.07,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,3.13,0.15,30.20,13.46,62.27,21.19,66.825,-2.25,103.150,56583.294,298.05,13.37,1.36,13.52,3.20,-2.25,0.11,3.03,3.06
39603,3.06,0.13,29.21,13.44,62.77,21.21,66.825,-2.28,102.021,56583.294,270.67,13.36,1.37,13.49,3.15,-2.28,0.12,3.06,3.05
39604,3.12,0.14,29.96,13.46,64.67,21.22,68.864,-2.29,103.144,9092.372,198.07,13.38,1.37,13.52,3.23,-2.31,0.13,3.09,3.07
39605,3.09,0.16,30.30,13.46,63.67,21.16,67.845,-2.30,102.025,56583.294,275.52,13.36,1.36,13.52,3.18,-2.28,0.11,3.01,3.15


In [11]:
import xgboost as xgb

# Hold out 30% of the rows for validation.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_set,
    train_y02,
    test_size=0.3,
    random_state=42,
)

# Build the regressor and fit it on the training split (fit returns the estimator).
XGB = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
).fit(x_train, y_train)
# training complete

# Score the held-out rows with the single-target metric.
y_predict = XGB.predict(x_valid)
lg_score = lg_nrmse_one(y_valid, y_predict)
print('NRMSE :', lg_score)

NRMSE : 0.4254993304410972


In [12]:
# Load the test inputs and repeat the training-side preprocessing: drop ID and the
# four excluded columns, then clip with the bounds computed on the training data.
test_x = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/test.csv')
    .drop(columns="ID")
    .drop(columns=['X_04','X_23','X_47','X_48'])
    .clip(lower=p01, upper=p99, axis=1)
)

# Connector pin bending: same engineered columns as on the training side
# (each of X_24 .. X_29 minus X_12).
test_x = test_x.assign(**{
    new_col: test_x[src_col] - test_x['X_12']
    for new_col, src_col in (
        ('X__21', 'X_24'),
        ('X__22', 'X_25'),
        ('X__23', 'X_26'),
        ('X__24', 'X_27'),
        ('X__25', 'X_28'),
        ('X__26', 'X_29'),
    )
})

# Keep only the model's feature subset, in the same column order as train_set.
test_x = test_x.loc[:, xgbcols]

# Refit a fresh regressor with the same hyper-parameters on all training rows.
XGB = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
).fit(train_set, train_y02)

# Predict Y_02 for the test rows and wrap the result in a one-column frame.
y_pred = XGB.predict(test_x)
submit = pd.DataFrame(y_pred, index=test_x.index, columns=['Y_02'])



In [13]:
# Display the prediction frame (the notebook renders a cell's last expression).
submit

Unnamed: 0,Y_02
0,1.220496
1,1.214667
2,1.187168
3,1.131825
4,1.017552
...,...
39603,1.052265
39604,0.928657
39605,0.973512
39606,0.926908


In [15]:
# Display the prediction frame again.
# NOTE(review): this cell carries execution count 15 yet sits before cell 14, and its
# recorded values differ from the identical cell above - `submit` was presumably
# regenerated in between. Restart and run all cells before trusting the saved CSV.
submit

Unnamed: 0,Y_02
0,1.156688
1,1.152798
2,1.122332
3,1.122852
4,0.997621
...,...
39603,1.046393
39604,0.988491
39605,0.954453
39606,0.900861


In [14]:
# Write the predictions to Drive. index=False omits the row index, so the file
# contains only the columns of `submit`.
submit.to_csv('/content/drive/MyDrive/Colab Notebooks/LG/submit_y_02_new.csv', index=False)