1. Feature Engineering
  - Feature Extraction (계절, 년중 일수)
  - Feature Selection (월, 일, 측정 시간대, 이슬점 온도(°C), 대기압(mbar), 증기압 부족량(mbar), 공기 밀도 (g/m**3), 풍향 (deg), Season, 년중 일수)
  - Label Encoding (측정 시간대)
  - Standard Scaling
2. Stacking
  - 총 4개의 예측 모형을 Ensemble
  - KNeighborsRegressor, BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor


Stacking

In [5]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

import argparse

# Hyper-parameters are declared through argparse so every default lives in one
# place. A notebook has no real command line, so an empty argument list is
# parsed and each option simply takes its default value.
SCALER_CLASSES = {
    'standard': StandardScaler,
    'minmax': MinMaxScaler,
    'robust': RobustScaler,
}

parser = argparse.ArgumentParser(description='stacking')
parser.add_argument('--best_n', default=4, type=int)
parser.add_argument('--scaler', default='standard', type=str)
parser.add_argument('--cv', default=10, type=int)
parser.add_argument('--seed', default=826, type=int)
args = parser.parse_args([])

best_n = args.best_n
cv = args.cv
seed = args.seed

# Resolve the scaler name to an instance up front. An unknown name used to
# leave `scaler` as a plain string, which only failed much later at
# `scaler.fit(...)`; fail here with a message that names the valid options.
if args.scaler not in SCALER_CLASSES:
    raise ValueError(
        f"Unknown --scaler {args.scaler!r}; expected one of {sorted(SCALER_CLASSES)}"
    )
scaler = SCALER_CLASSES[args.scaler]()

def set_seeds(seed):
    """Seed the stdlib ``random`` and NumPy global generators with ``seed``.

    The string form of ``seed`` is also written to the ``PYTHONHASHSEED``
    environment variable.

    NOTE(review): hash randomisation is presumably fixed when the interpreter
    starts, so the environment variable likely only matters for processes
    launched after this call - confirm.
    """
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Fix the global random generators before anything stochastic runs.
set_seeds(seed)

# Both CSV files are read from a relative data directory.
path = 'data/'
train, test = (pd.read_csv(path + file_name) for file_name in ('train.csv', 'test.csv'))

# Show the last two training rows.
train.tail(2)

Unnamed: 0,ID,월,일,측정 시간대,섭씨 온도(°⁣C),절대 온도(K),이슬점 온도(°C),상대 습도 (%),대기압(mbar),포화 증기압(mbar),실제 증기압(mbar),증기압 부족량(mbar),수증기 함량 (g/kg),공기 밀도 (g/m**3),풍향 (deg),풍속 (m/s)
36579,TRAIN_36579,9,10,저녁,25.65,299.81,15.3,52.81,988.39,32.98,17.41,15.56,11.03,1144.61,225.4,0.36
36580,TRAIN_36580,3,11,오전,3.14,276.3,1.88,91.4,1000.01,7.66,7.0,0.66,4.37,1257.47,30.67,4.33


Preprocessing

In [6]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36581 entries, 0 to 36580
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              36581 non-null  object 
 1   월               36581 non-null  int64  
 2   일               36581 non-null  int64  
 3   측정 시간대          36581 non-null  object 
 4   섭씨 온도(°⁣C)      36581 non-null  float64
 5   절대 온도(K)        36581 non-null  float64
 6   이슬점 온도(°C)      36581 non-null  float64
 7   상대 습도 (%)       36581 non-null  float64
 8   대기압(mbar)       36581 non-null  float64
 9   포화 증기압(mbar)    36581 non-null  float64
 10  실제 증기압(mbar)    36581 non-null  float64
 11  증기압 부족량(mbar)   36581 non-null  float64
 12  수증기 함량 (g/kg)   36581 non-null  float64
 13  공기 밀도 (g/m**3)  36581 non-null  float64
 14  풍향 (deg)        36581 non-null  float64
 15  풍속 (m/s)        36581 non-null  float64
dtypes: float64(12), int64(2), object(2)
memory usage: 4.5+ MB


In [7]:
# Integer-encode the '측정 시간대' column. The encoder is fitted on the training
# column only and then applied to both frames so they share a single mapping.
le = LabelEncoder().fit(train['측정 시간대'])
for frame in (train, test):
    frame['측정 시간대'] = le.transform(frame['측정 시간대'])

train.tail(2)

Unnamed: 0,ID,월,일,측정 시간대,섭씨 온도(°⁣C),절대 온도(K),이슬점 온도(°C),상대 습도 (%),대기압(mbar),포화 증기압(mbar),실제 증기압(mbar),증기압 부족량(mbar),수증기 함량 (g/kg),공기 밀도 (g/m**3),풍향 (deg),풍속 (m/s)
36579,TRAIN_36579,9,10,3,25.65,299.81,15.3,52.81,988.39,32.98,17.41,15.56,11.03,1144.61,225.4,0.36
36580,TRAIN_36580,3,11,1,3.14,276.3,1.88,91.4,1000.01,7.66,7.0,0.66,4.37,1257.47,30.67,4.33


In [33]:
def preprocess_month(df):
    """Add a ``Season`` column derived from the month column ``'월'``.

    Months map to season codes as follows:
    3-5 -> 0, 6-8 -> 1, 9-11 -> 2, 12/1/2 -> 3.

    Args:
        df: DataFrame with a ``'월'`` column holding month numbers 1-12.

    Returns:
        The same ``df`` object (modified in place) with ``Season`` added or
        overwritten.

    Raises:
        ValueError: if any month is missing or not in 1-12. Such rows used to
            fall through to season 0 without any warning.
    """
    season_by_month = {
        3: 0, 4: 0, 5: 0,
        6: 1, 7: 1, 8: 1,
        9: 2, 10: 2, 11: 2,
        12: 3, 1: 3, 2: 3,
    }
    season = df['월'].map(season_by_month)

    # Anything the table does not cover comes back as NaN; refuse to guess.
    unmapped = season.isna()
    if unmapped.any():
        bad_values = df.loc[unmapped, '월'].unique().tolist()
        raise ValueError(f"'월' contains values outside 1-12: {bad_values}")

    df['Season'] = season.astype(int)
    return df

# Attach the season feature to both splits with the same helper.
train, test = map(preprocess_month, (train, test))

train.tail(2)

Unnamed: 0,ID,월,일,측정 시간대,섭씨 온도(°⁣C),절대 온도(K),이슬점 온도(°C),상대 습도 (%),대기압(mbar),포화 증기압(mbar),실제 증기압(mbar),증기압 부족량(mbar),수증기 함량 (g/kg),공기 밀도 (g/m**3),풍향 (deg),풍속 (m/s),Season
36579,TRAIN_36579,9,10,3,25.65,299.81,15.3,52.81,988.39,32.98,17.41,15.56,11.03,1144.61,225.4,0.36,2
36580,TRAIN_36580,3,11,1,3.14,276.3,1.88,91.4,1000.01,7.66,7.0,0.66,4.37,1257.47,30.67,4.33,0


In [34]:
# Approximate day-of-year: every month is counted as 30 days long.
# NOTE(review): this is not the calendar day-of-year, and day 31 of one month
# gets the same value as day 1 of the next month.
for frame in (train, test):
    frame['년중 일수'] = (frame['월'] - 1) * 30 + frame['일']

train.tail(2)

Unnamed: 0,ID,월,일,측정 시간대,섭씨 온도(°⁣C),절대 온도(K),이슬점 온도(°C),상대 습도 (%),대기압(mbar),포화 증기압(mbar),실제 증기압(mbar),증기압 부족량(mbar),수증기 함량 (g/kg),공기 밀도 (g/m**3),풍향 (deg),풍속 (m/s),Season,년중 일수
36579,TRAIN_36579,9,10,3,25.65,299.81,15.3,52.81,988.39,32.98,17.41,15.56,11.03,1144.61,225.4,0.36,2,250
36580,TRAIN_36580,3,11,1,3.14,276.3,1.88,91.4,1000.01,7.66,7.0,0.66,4.37,1257.47,30.67,4.33,0,71


Training

Modeling

In [35]:
# Base (level-0) regressors for the stack. The three ensemble models share the
# same parallelism setting and the notebook-wide seed.
ensemble_kwargs = dict(n_jobs=-1, random_state=seed)

knn = KNeighborsRegressor(n_neighbors=10, n_jobs=-1)
bagging = BaggingRegressor(n_estimators=20, **ensemble_kwargs)
ets = ExtraTreesRegressor(n_estimators=200, **ensemble_kwargs)
rf = RandomForestRegressor(n_estimators=200, **ensemble_kwargs)

Stacking

In [36]:
def get_stacking_ml_datasets(model, x_train, y_train, x_test, n_folds, random_state=None):
    """Build stacking meta-features for one base model via K-fold CV.

    For each fold the model is fitted on the training part, predicts the
    held-out part (filling the out-of-fold column) and predicts the whole test
    set. The per-fold test predictions are averaged at the end.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place once per fold and is left fitted on the last
            fold's training part.
        x_train: training features, array-like of shape (n_train, n_features).
        y_train: training target, array-like with n_train entries.
        x_test: test features, array-like of shape (n_test, n_features).
        n_folds: number of K-fold splits.
        random_state: seed for the shuffled K-fold split. ``None`` (default)
            falls back to the notebook-level ``seed`` variable, which is what
            the function always used before this parameter existed.

    Returns:
        ``(train_fold_pred, test_pred_mean)`` with shapes ``(n_train, 1)`` and
        ``(n_test, 1)``.
    """
    if random_state is None:
        random_state = seed

    # KFold yields positional indices. Convert to plain arrays so that the
    # indexing below is positional too; indexing a pandas Series with them is
    # label-based and only happens to work for a default RangeIndex.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    train_fold_pred = np.zeros((x_train.shape[0], 1))
    test_pred = np.zeros((x_test.shape[0], n_folds))

    for idx, (train_idx, val_idx) in enumerate(kf.split(x_train, y_train)):
        model.fit(x_train[train_idx], y_train[train_idx])

        # Out-of-fold prediction: each training row is predicted exactly once,
        # by the model that did not see it.
        train_fold_pred[val_idx, :] = model.predict(x_train[val_idx]).reshape(-1, 1)
        test_pred[:, idx] = model.predict(x_test)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)
    return train_fold_pred, test_pred_mean

In [39]:
# Features passed to the base models.
select_columns = [
    '월',
    '일',
    '측정 시간대',
    '이슬점 온도(°C)',
    '대기압(mbar)',
    '증기압 부족량(mbar)',
    '공기 밀도 (g/m**3)',
    '풍향 (deg)',
    'Season',
    '년중 일수',
]

# Target column.
y_train = train['풍속 (m/s)']

# Drop the identifier and the target, then keep only the selected features.
x_train = train.drop(columns=['ID', '풍속 (m/s)'])[select_columns]
x_test = test[select_columns]

# The scaler is fitted on the training features only and reused for the test set.
x_train = scaler.fit(x_train).transform(x_train)
x_test = scaler.transform(x_test)

x_train.shape, x_test.shape

((36581, 10), (15678, 10))

In [40]:
# Base models, in the order their predictions become meta-feature columns.
best_ml = [knn, bagging, ets, rf]

meta_ml_x_train, meta_ml_x_test = [], []

for base_model in best_ml:
    print(base_model)
    oof_pred, test_pred_mean = get_stacking_ml_datasets(base_model, x_train, y_train, x_test, cv)

    # One out-of-fold column and one averaged test column per base model.
    meta_ml_x_train.append(oof_pred)
    meta_ml_x_test.append(test_pred_mean)

KNeighborsRegressor(n_jobs=-1, n_neighbors=10)
BaggingRegressor(n_estimators=20, n_jobs=-1, random_state=826)
ExtraTreesRegressor(n_estimators=200, n_jobs=-1, random_state=826)
RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=826)


In [50]:
# Join the per-model (n_samples, 1) prediction columns into one matrix with a
# column per base model.
# The names are rebound from list to array, so the stacking is guarded: running
# this cell a second time would otherwise hand the already-stacked 2-D array to
# np.hstack, which iterates its rows and flattens it.
if not isinstance(meta_ml_x_train, np.ndarray):
    meta_ml_x_train = np.hstack(meta_ml_x_train)
if not isinstance(meta_ml_x_test, np.ndarray):
    meta_ml_x_test = np.hstack(meta_ml_x_test)

meta_ml_x_train.shape, meta_ml_x_test.shape

((36581, 4), (15678, 4))

In [51]:
# Meta (level-1) model: a linear regression fitted on the base models'
# out-of-fold predictions, then applied to their averaged test predictions.
meta_clf = LinearRegression().fit(meta_ml_x_train, y_train)
pred = meta_clf.predict(meta_ml_x_test)
pred.shape

(15678,)

Submission

In [53]:
# Fill the sample submission's target column with the stacked predictions,
# rounded to two decimals, and write the file into the data directory.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv - confirm.
rounded_pred = np.round(pred, 2)
submission = pd.read_csv(path + 'sample_submission.csv').assign(**{'풍속 (m/s)': rounded_pred})
submission.to_csv(path + '3rd_submission.csv', index=False)
submission.head()

Unnamed: 0,ID,풍속 (m/s)
0,TEST_00000,1.71
1,TEST_00001,1.03
2,TEST_00002,2.01
3,TEST_00003,1.05
4,TEST_00004,1.14
