In [1]:
# Mount Google Drive inside the Colab runtime; the CSV paths used below live under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import random
import os
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [3]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG, and stores the
    seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value shared by every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

def lg_nrmse(gt, preds):
    """Weighted sum of the per-target NRMSE over all target columns.

    For each column the RMSE between ``gt`` and ``preds`` is divided by the
    mean absolute ground-truth value of that column. The first eight columns
    (Y_01 ~ Y_08) are weighted by 1.2, every later column by 1.0.

    Args:
        gt: 2-D array-like of ground-truth targets, one column per target
            (the 'ID' column must not be included).
        preds: 2-D array-like of predictions with the same shape as ``gt``.

    Returns:
        The weighted NRMSE sum as a NumPy float.

    Raises:
        ValueError: If ``gt`` is not 2-D or the two shapes differ.
    """
    gt_arr = np.asarray(gt, dtype=float)
    pred_arr = np.asarray(preds, dtype=float)
    if gt_arr.ndim != 2 or gt_arr.shape != pred_arr.shape:
        raise ValueError(
            "gt and preds must be 2-D with identical shapes, got "
            f"{gt_arr.shape} and {pred_arr.shape}"
        )

    # Score every column. The previous hard-coded range(0, 13) evaluated only
    # the first 13 columns, so any trailing target column was silently ignored.
    # RMSE is computed with NumPy rather than mean_squared_error(squared=False),
    # whose `squared` argument was removed from recent scikit-learn releases.
    rmse = np.sqrt(np.mean((gt_arr - pred_arr) ** 2, axis=0))
    nrmse = rmse / np.mean(np.abs(gt_arr), axis=0)

    return 1.2 * np.sum(nrmse[:8]) + 1.0 * np.sum(nrmse[8:])

In [4]:
def lg_nrmse_one(gt, preds):
    """NRMSE of one target, multiplied by the 1.2 weight.

    The 1.2 factor is applied unconditionally. Per the original note, targets
    after Y_08 carry weight 1.0, so the result is over-weighted for those.

    Args:
        gt: Ground-truth values, either 1-D or a single-column 2-D array-like
            (e.g. a one-column DataFrame).
        preds: Predictions, 1-D or 2-D, with one row per row of ``gt``.

    Returns:
        ``1.2 * RMSE / mean(|gt|)`` as a NumPy float.

    Raises:
        ValueError: If ``gt`` and ``preds`` do not line up after 1-D inputs
            are promoted to a single column.
    """
    gt_arr = np.asarray(gt, dtype=float)
    pred_arr = np.asarray(preds, dtype=float)

    # Promote flat vectors to one column so that an (n, 1) truth frame and an
    # (n,) prediction vector subtract element-wise instead of broadcasting
    # to an (n, n) matrix.
    gt_2d = gt_arr[:, None] if gt_arr.ndim == 1 else gt_arr
    pred_2d = pred_arr[:, None] if pred_arr.ndim == 1 else pred_arr
    if gt_2d.shape != pred_2d.shape:
        raise ValueError(
            f"gt and preds shapes do not match: {gt_arr.shape} vs {pred_arr.shape}"
        )

    # RMSE via NumPy (mean over columns of the per-column RMSE) instead of
    # mean_squared_error(squared=False), whose `squared` argument was removed
    # from recent scikit-learn releases.
    rmse = np.mean(np.sqrt(np.mean((gt_2d - pred_2d) ** 2, axis=0)))
    nrmse = rmse / np.mean(np.abs(gt_arr))
    return 1.2 * nrmse

#Use this one (lg_nrmse_one, defined just above) for scoring!!!

In [5]:
# Read the training table without its ID column, then split it into targets and inputs.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/train.csv').drop(columns="ID")
train_y = train_df.filter(regex='Y')  # Output: Y features
train_x = (
    train_df
    .filter(regex='X')  # Input: X features
    .drop(columns=['X_04', 'X_23', 'X_47', 'X_48'])
)

#Y_03


In [6]:
# Take the Y_03 column of the target frame as a Series and display it.
train_y03 = train_y.loc[:, 'Y_03']
train_y03

0        1.680
1        1.268
2        0.782
3        1.052
4        0.689
         ...  
39602    1.263
39603    1.083
39604    0.993
39605    0.477
39606    0.852
Name: Y_03, Length: 39607, dtype: float64

In [7]:
# Hold Y_03 as a one-column DataFrame (same index as train_y) and display it.
train_y03 = train_y[['Y_03']]
train_y03

Unnamed: 0,Y_03
0,1.680
1,1.268
2,0.782
3,1.052
4,0.689
...,...
39602,1.263
39603,1.083
39604,0.993
39605,0.477


#클리핑

In [8]:
from sklearn.model_selection import train_test_split

# -----------------------------------
# For every column, find the 1% and 99% points of the training data
p01, p99 = (train_x.quantile(q) for q in (0.01, 0.99))

# Values below the 1% point are raised to it, values above the 99% point are lowered to it
train_x = train_x.clip(lower=p01, upper=p99, axis=1)

In [9]:
# Connector pin bending: X__21 ~ X__26 are X_24 ~ X_29 minus X_12, in that order
train_x = train_x.assign(**{
    f'X__{21 + step}': train_x[f'X_{24 + step}'] - train_x['X_12']
    for step in range(6)
})
# The source variables of the new features (12, 24, 25, 26, 27, 28, 29) may still be picked as important by a later RFE, so they are kept for now.
# Also, at this point the pass/fail variables have all been dropped from train_x.

In [10]:
# Columns fed to the XGBoost model, in the order the model sees them.
xgbcols = [
    'X_22', 'X_19', 'X_13', 'X_18', 'X__23', 'X_43',
    'X_49', 'X_07', 'X_05', 'X_32', 'X__22', 'X_16',
    'X_03', 'X__25', 'X_09', 'X_21', 'X_45', 'X_28',
    'X_06', 'X_30', 'X_14', 'X_25', 'X_51', 'X_50',
]

train_set = train_x.loc[:, xgbcols]
train_set

Unnamed: 0,X_22,X_19,X_13,X_18,X__23,X_43,X_49,X_07,X_05,X_32,...,X_09,X_21,X_45,X_28,X_06,X_30,X_14,X_25,X_51,X_50
0,3.13,3.11,0.18,13.44,-2.29,21.28,9706.030,29.45,101.892,1.46,...,245.71,3.06,0.26,2.06,74.983,1.49,13.34,2.07,135.359219,137.043591
1,3.20,3.04,0.18,13.42,-2.28,21.16,10423.430,28.73,101.944,1.45,...,233.61,2.98,0.13,2.17,72.943,1.49,13.33,2.10,135.979817,133.736691
2,3.12,3.04,0.15,13.43,-2.32,21.17,10948.530,28.81,103.153,1.46,...,272.20,3.01,0.14,2.10,72.943,1.49,13.36,2.06,131.055355,132.805112
3,3.08,3.05,0.21,13.40,-2.30,21.20,15007.030,28.92,101.971,1.47,...,255.36,3.02,0.22,2.07,76.002,1.47,13.30,2.05,133.239422,134.138760
4,3.12,3.04,0.16,13.42,-2.26,21.18,11051.030,29.68,101.981,1.47,...,241.46,3.00,0.22,2.28,70.904,1.49,13.35,2.06,136.620022,142.728970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,3.13,3.20,0.15,13.46,-2.28,21.19,56583.294,30.20,103.150,1.36,...,298.05,3.06,0.11,2.10,66.825,1.37,13.37,2.11,130.807148,129.965741
39603,3.06,3.15,0.13,13.44,-2.31,21.21,56583.294,29.21,102.021,1.37,...,270.67,3.05,0.12,2.11,66.825,1.40,13.36,2.12,120.158764,127.633885
39604,3.12,3.23,0.14,13.46,-2.30,21.22,9092.372,29.96,103.144,1.37,...,198.07,3.07,0.13,2.12,68.864,1.39,13.38,2.09,136.893025,132.501286
39605,3.09,3.18,0.16,13.46,-2.28,21.16,56583.294,30.30,102.025,1.36,...,275.52,3.15,0.11,2.10,67.845,1.37,13.36,2.03,121.495930,128.189679


In [11]:
import xgboost as xgb

# Hold out 30% of the rows for validation, with a fixed split seed.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_set,
    train_y03,
    test_size=0.3,
    random_state=42,
)

XGB = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
XGB.fit(x_train, y_train)
# training finished

y_predict = XGB.predict(x_valid)

# Score the held-out rows with the single-target metric.
lg_score = lg_nrmse_one(y_valid, y_predict)
print('NRMSE :', lg_score)

NRMSE : 0.41565292912702406


In [12]:
# Load the test inputs and repeat the training preprocessing:
# drop ID and the excluded columns, then clip with the training percentiles.
test_x = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/LG/test.csv')
    .drop(columns=["ID", 'X_04', 'X_23', 'X_47', 'X_48'])
    .clip(lower=p01, upper=p99, axis=1)
)

# Connector pin bending: X__21 ~ X__26 are X_24 ~ X_29 minus X_12, in that order
test_x = test_x.assign(**{
    f'X__{21 + step}': test_x[f'X_{24 + step}'] - test_x['X_12']
    for step in range(6)
})

# Keep only the model's feature columns, in training order.
test_x = test_x.loc[:, xgbcols]
# The source variables of the new features (12, 24, 25, 26, 27, 28, 29) may still be picked as important by a later RFE, so they are kept for now.
# Also, at this point the pass/fail variables have all been dropped from train_x.

# Refit on the full training set with the same hyperparameters as the validation run.
XGB = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    gamma=1,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
XGB.fit(train_set, train_y03)

y_pred = XGB.predict(test_x)

submit = pd.DataFrame({'Y_03': y_pred}, index=test_x.index)



In [13]:
# Display the Y_03 prediction frame.
submit

Unnamed: 0,Y_03
0,1.106832
1,1.153311
2,1.166778
3,0.977684
4,0.948229
...,...
39603,1.021235
39604,1.007390
39605,1.010640
39606,0.929542


In [16]:
# NOTE(review): this cell carries execution count 16 but sits before In [14], and the
# values shown below differ from the earlier display of `submit` - presumably an
# upstream cell was re-run in between; confirm with Restart & Run All.
submit

Unnamed: 0,Y_03
0,1.193610
1,1.182692
2,1.014048
3,1.001785
4,0.966265
...,...
39603,1.006554
39604,1.003109
39605,1.006087
39606,0.962937


In [14]:
# Write the predictions to Drive. index=False omits the row index, so the file contains only the Y_03 column.
submit.to_csv('/content/drive/MyDrive/Colab Notebooks/LG/submit_y_03_new.csv', index=False)