In [2]:
# Mount Google Drive into the Colab runtime; the CSV files read and written by
# this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
import pandas as pd
import random
import os
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [4]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = f'{seed}'
    np.random.seed(seed)


seed_everything(42)  # fix the seed once for the whole notebook

def lg_nrmse(gt, preds):
    """Weighted sum of per-target NRMSE values.

    For every target column the RMSE between ``gt`` and ``preds`` is divided by
    the mean absolute ground-truth value of that column. The first eight
    columns (Y_01 ~ Y_08) are weighted by 1.2 and every remaining column by 1.0.

    All columns of ``gt`` are scored (the previous version stopped after the
    13th column), so ``gt`` must contain target columns only, with no ID column.

    Args:
        gt: 2-D array-like (ndarray or DataFrame) of ground-truth values, one
            column per target.
        preds: 2-D array-like of predictions with the same shape as ``gt``.

    Returns:
        The weighted NRMSE total as a NumPy float.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    y_true = np.asarray(gt, dtype=float)
    y_pred = np.asarray(preds, dtype=float)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            f"gt and preds must be 2-D with identical shapes, got {y_true.shape} and {y_pred.shape}"
        )

    # RMSE is computed with NumPy rather than
    # sklearn.metrics.mean_squared_error(squared=False): the `squared` keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    nrmse = rmse / np.mean(np.abs(y_true), axis=0)

    # 20% extra weight on the first eight targets, plain weight on the rest.
    return 1.2 * np.sum(nrmse[:8]) + 1.0 * np.sum(nrmse[8:])

In [5]:
def lg_nrmse_one(gt, preds, weight=1.2):
    """Weighted NRMSE for a single target column.

    The RMSE between ``gt`` and ``preds`` is divided by the mean absolute
    ground-truth value and multiplied by ``weight``.

    Args:
        gt: Ground-truth values for one target (1-D array-like, Series, or a
            one-column DataFrame).
        preds: Predictions for the same rows, in the same order.
        weight: Score weight for the target. Targets up to Y_08 use 1.2 (the
            default); pass 1.0 for the targets after Y_08.

    Returns:
        The weighted NRMSE as a NumPy float.

    Raises:
        ValueError: If ``gt`` and ``preds`` hold a different number of values.
    """
    # Flatten both inputs: a one-column DataFrame has shape (n, 1) while model
    # predictions have shape (n,), and subtracting those would broadcast to (n, n).
    y_true = np.asarray(gt, dtype=float).ravel()
    y_pred = np.asarray(preds, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"gt and preds must hold the same number of values, got {y_true.size} and {y_pred.size}"
        )

    # NumPy RMSE instead of sklearn.metrics.mean_squared_error(squared=False):
    # the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    nrmse = rmse / np.mean(np.abs(y_true))
    return weight * nrmse

# Use this one (`lg_nrmse_one`) for scoring!

In [6]:
# Read the training table from Drive and drop the ID column.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/train.csv')
train_df = train_df.drop(columns="ID")

# Input: X features = every column whose name matches "X", minus four columns
# excluded from modelling (presumably the pass/fail flag columns — confirm).
train_x = train_df.filter(regex='X').drop(columns=['X_04', 'X_23', 'X_47', 'X_48'])

# Output: Y features = every column whose name matches "Y".
train_y = train_df.filter(regex='Y')

# Y_01
To model a different target, change the target number (`Y_01`) in the cells below.


In [7]:
# Pick the single target column modelled in this notebook (returned as a Series).
train_y01 = train_y.loc[:, 'Y_01']
train_y01

0        2.056
1        1.446
2        1.251
3        1.464
4        0.983
         ...  
39602    1.382
39603    1.482
39604    1.117
39605    0.895
39606    1.147
Name: Y_01, Length: 39607, dtype: float64

In [8]:
# Wrap the target in a DataFrame indexed like train_y.
train_y01 = pd.DataFrame(data=train_y01, index=train_y.index)
train_y01

Unnamed: 0,Y_01
0,2.056
1,1.446
2,1.251
3,1.464
4,0.983
...,...
39602,1.382
39603,1.482
39604,1.117
39605,0.895


# Clipping

In [9]:
# -----------------------------------
# Per-column 1st and 99th percentiles of the training inputs. p01 / p99 stay in
# the notebook namespace because the test inputs are clipped with them later.
# (train_test_split is already imported in the imports cell, so the duplicate
# import that used to sit here has been removed.)
# NOTE(review): the bounds are computed on all training rows, before the
# train/validation split made further down, so validation rows influence them.
p01 = train_x.quantile(q=0.01)
p99 = train_x.quantile(q=0.99)

# Values below the 1% point are raised to it; values above the 99% point are
# lowered to it. axis=1 matches the bounds to columns by label.
train_x = train_x.clip(p01, p99, axis=1)

In [10]:
# Connector pin bending: each derived column is one of X_24 ~ X_29 minus X_12.
train_x = train_x.assign(**{
    derived: train_x[source] - train_x['X_12']
    for derived, source in [
        ('X__21', 'X_24'),
        ('X__22', 'X_25'),
        ('X__23', 'X_26'),
        ('X__24', 'X_27'),
        ('X__25', 'X_28'),
        ('X__26', 'X_29'),
    ]
})
# The source columns (12, 24, 25, 26, 27, 28, 29) are kept for now, because RFE
# may still rank the columns that fed the new features as important.
# Also note that the pass/fail columns have all been dropped from train_x at this point.

In [11]:
# Feature subset fed to the XGBoost model; train_set keeps this column order.
xgbcols = [
    'X_07', 'X_22', 'X_21', 'X_18', 'X_49', 'X_03', 'X_20', 'X_13',
    'X_19', 'X_09', 'X__21', 'X_17', 'X_06', 'X_14', 'X_46', 'X_32',
    'X_43', 'X_05', 'X_51', 'X_52', 'X__22', 'X__25', 'X_24', 'X_38',
    'X_56', 'X_31', 'X_01', 'X_39', 'X_28',
]

train_set = train_x.loc[:, xgbcols]
train_set

Unnamed: 0,X_07,X_22,X_21,X_18,X_49,X_03,X_20,X_13,X_19,X_09,...,X_52,X__22,X__25,X_24,X_38,X_56,X_31,X_01,X_39,X_28
0,29.45,3.13,3.06,13.44,9706.030,67.47,3.17,0.18,3.11,245.71,...,147.837968,-2.27,-2.28,2.05,-16.41,125.028256,1.69,70.544,-16.36,2.06
1,28.73,3.20,2.98,13.42,10423.430,65.17,3.11,0.18,3.04,233.61,...,149.924692,-2.28,-2.21,2.10,-16.06,124.877308,1.67,69.524,-16.11,2.17
2,28.81,3.12,3.01,13.43,10948.530,64.07,3.04,0.15,3.04,272.20,...,146.814592,-2.30,-2.26,2.07,-16.16,122.238232,1.69,72.583,-16.17,2.10
3,28.92,3.08,3.02,13.40,15007.030,67.57,3.01,0.21,3.05,255.36,...,139.720132,-2.28,-2.26,2.06,-16.05,134.875225,1.68,71.563,-16.03,2.07
4,29.68,3.12,3.00,13.42,11051.030,63.57,3.07,0.16,3.04,241.46,...,134.853555,-2.29,-2.07,2.09,-16.25,123.272762,1.68,69.524,-16.23,2.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,30.20,3.13,3.06,13.46,56583.294,62.27,3.03,0.15,3.20,298.05,...,133.481737,-2.25,-2.26,2.10,-16.09,129.029812,1.60,66.465,-16.15,2.10
39603,29.21,3.06,3.05,13.44,56583.294,62.77,3.06,0.13,3.15,270.67,...,142.667802,-2.28,-2.29,2.12,-15.70,122.811413,1.68,66.465,-15.74,2.11
39604,29.96,3.12,3.07,13.46,9092.372,64.67,3.09,0.14,3.23,198.07,...,134.419328,-2.29,-2.26,2.13,-16.12,119.166699,1.61,68.504,-16.19,2.12
39605,30.30,3.09,3.15,13.46,56583.294,63.67,3.01,0.16,3.18,275.52,...,141.288011,-2.30,-2.23,2.07,-16.36,124.525929,1.56,66.465,-16.36,2.10


In [12]:
import xgboost as xgb

# Hold out 30% of the rows to validate the model (fixed split via random_state).
x_train, x_valid, y_train, y_valid = train_test_split(
    train_set,
    train_y01,
    test_size=0.3,
    random_state=42,
)

# Build the regressor and fit it on the training split.
XGB = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
XGB.fit(x_train, y_train)

# Score the held-out rows with the single-target metric.
y_predict = XGB.predict(x_valid)
lg_score = lg_nrmse_one(y_valid, y_predict)
print('NRMSE :', lg_score)

NRMSE : 0.306564726648865


In [13]:
# Load the test inputs and repeat the training-side preprocessing: drop ID and
# the four excluded columns, then clip with the bounds (p01 / p99) computed on
# the training inputs.
test_x = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/test.csv')
    .drop(columns="ID")
    .drop(columns=['X_04', 'X_23', 'X_47', 'X_48'])
    .clip(p01, p99, axis=1)
)

# Connector pin bending: each derived column is one of X_24 ~ X_29 minus X_12.
test_x = test_x.assign(**{
    derived: test_x[source] - test_x['X_12']
    for derived, source in [
        ('X__21', 'X_24'),
        ('X__22', 'X_25'),
        ('X__23', 'X_26'),
        ('X__24', 'X_27'),
        ('X__25', 'X_28'),
        ('X__26', 'X_29'),
    ]
})

# Keep only the model's feature subset, in the same column order as train_set.
test_x = test_x[xgbcols]

# Refit a fresh regressor on every training row (no validation hold-out).
XGB = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
XGB.fit(train_set, train_y01)

y_pred = XGB.predict(test_x)

# One-column frame of Y_01 predictions, indexed like the test rows.
submit = pd.DataFrame(y_pred, index=test_x.index, columns=['Y_01'])



In [14]:
submit # trained on the full training set

Unnamed: 0,Y_01
0,1.413998
1,1.518339
2,1.472182
3,1.412046
4,1.410268
...,...
39603,1.287902
39604,1.266021
39605,1.228359
39606,1.214165


In [18]:
# NOTE(review): this cell's execution count (18) is out of sequence with the
# cells around it, so its output presumably came from a different model state
# than the cell above — confirm before relying on it.
submit # NOT trained on the full training set

Unnamed: 0,Y_01
0,1.413429
1,1.553484
2,1.451596
3,1.388363
4,1.387532
...,...
39603,1.275408
39604,1.222636
39605,1.255226
39606,1.239365


In [15]:
# Save the Y_01 predictions to Drive. index=False leaves the row index out, so
# the file holds only the Y_01 column.
submit.to_csv('/content/drive/MyDrive/Colab Notebooks/LG/submit_y_01_new.csv', index=False)