PyCaret의 model selection과 stack ensemble 기능을 사용해 모델을 구성했습니다. [ExtraTree, RandomForest, XGBoost, LGBM]

주기성을 띠는 feature에 대해서는 sin, cos 변환을 적용했고, 범주형 feature는 target encoding을 진행했습니다.

비슷한 feature 값을 가진 샘플끼리 묶어 모델 학습에 도움을 주고자 clustering을 진행했습니다.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from category_encoders import * 

In [2]:
# Random seed reused as `session_id` by the PyCaret clustering and regression setups.
seed = 842

In [7]:
def preprocess(df, test=False):
    """Rename the raw weather columns and derive the engineered features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose columns are, positionally, the 15 feature columns
        listed in ``feature_columns`` below, followed by the target column
        when ``test`` is false.
    test : bool, default False
        ``True`` when ``df`` has no target column (15 columns); ``False``
        when the trailing ``'Velocity (m/s)'`` target is present (16 columns).

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns plus the derived features, minus
        ``'ID'``, ``'T (K)'`` and ``'Vapor Content (g/kg)'``. The frame passed
        in is left untouched.

    Raises
    ------
    ValueError
        If the number of columns does not match the expected layout, or if
        ``'Measurement Time'`` holds a value that is neither one of the four
        known labels nor numeric.
    """
    # NOTE: the temperature name contains U+2063 (invisible separator) between
    # the degree sign and "C". It is spelled as an escape here so the hidden
    # character is visible to readers; the runtime string is unchanged.
    feature_columns = [
        'ID', 'Month', 'Day', 'Measurement Time', 'T (°\u2063C)', 'T (K)',
        'Dew T(°C)', 'Relative Humidity (%)', 'P (mbar)',
        'Saturation Vapor P(mbar)', 'Actual Vapor P(mbar)',
        'Vapor P Shortage (mbar)', 'Vapor Content (g/kg)',
        'Air Density (g/m**3)', 'Direction (deg)',
    ]

    # Work on a copy so the caller's frame is never renamed or extended in place.
    df = df.copy()
    df.columns = feature_columns if test else feature_columns + ['Velocity (m/s)']

    # Ordinal code for the time of day: dawn, morning, afternoon, evening.
    # Values that are not one of the labels pass through unchanged, and
    # to_numeric then makes the integer dtype explicit instead of relying on
    # the implicit downcast of `Series.replace`.
    time_codes = {'새벽': 0, '오전': 1, '오후': 2, '저녁': 3}
    df['Measurement Time'] = pd.to_numeric(
        df['Measurement Time'].map(lambda label: time_codes.get(label, label))
    )

    # Four time slots mapped onto a circle: one slot is a quarter turn.
    time_angle = np.pi * df['Measurement Time'] / 2
    df['Time_cos'] = np.cos(time_angle)
    df['Time_sin'] = np.sin(time_angle)

    # Bucket the direction into 22.5-degree sectors; the 11.25 offset centres
    # sector 0 on 0 degrees and the modulo wraps the top of the range back to it.
    sector = ((df['Direction (deg)'] + 11.25) % 360) // 22.5
    df['Direction (sign)'] = sector.astype('int')

    # Zero-padded month + day string (e.g. '0910'), used as a categorical key.
    df['Date'] = df['Month'].map('{:02}'.format) + df['Day'].map('{:02}'.format)

    # Categorical views of the calendar/time columns.
    df['cat_Month'] = df['Month'].astype('category')
    df['cat_day'] = df['Day'].astype('category')
    df['cat_Measurement Time'] = df['Measurement Time'].astype('category')

    # Unit-circle components of the direction (degrees converted to radians).
    direction_rad = np.pi * df['Direction (deg)'] / 180
    df['Direction_x'] = np.cos(direction_rad)
    df['Direction_y'] = np.sin(direction_rad)

    # Twelve months mapped onto a circle, with month 1 at angle 0.
    month_angle = (df['Month'] - 1) / 6 * np.pi
    df['Month_cos'] = np.cos(month_angle)
    df['Month_sin'] = np.sin(month_angle)

    # NOTE(review): the dropped columns are presumably an identifier and
    # measurements redundant with retained ones — confirm against the dataset.
    return df.drop(['ID', 'T (K)', 'Vapor Content (g/kg)'], axis=1)

In [8]:
# Load the raw training table and build the engineered training frame.
# A copy is handed to preprocess so the raw `df` keeps its original columns.
df = pd.read_csv('data/train.csv')
df_train = preprocess(df.copy())
# Echo the last two rows as a quick sanity check.
df_train.tail(2)

Unnamed: 0,Month,Day,Measurement Time,T (°⁣C),Dew T(°C),Relative Humidity (%),P (mbar),Saturation Vapor P(mbar),Actual Vapor P(mbar),Vapor P Shortage (mbar),...,Time_sin,Direction (sign),Date,cat_Month,cat_day,cat_Measurement Time,Direction_x,Direction_y,Month_cos,Month_sin
36579,9,10,3,25.65,15.3,52.81,988.39,32.98,17.41,15.56,...,-1.0,10,910,9,10,3,-0.702153,-0.712026,-0.5,-0.866025
36580,3,11,1,3.14,1.88,91.4,1000.01,7.66,7.0,0.66,...,1.0,1,311,3,11,1,0.860119,0.510093,0.5,0.866025


In [9]:
# Print the column dtypes and non-null counts of the engineered training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36581 entries, 0 to 36580
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Month                     36581 non-null  int64   
 1   Day                       36581 non-null  int64   
 2   Measurement Time          36581 non-null  int64   
 3   T (°⁣C)                   36581 non-null  float64 
 4   Dew T(°C)                 36581 non-null  float64 
 5   Relative Humidity (%)     36581 non-null  float64 
 6   P (mbar)                  36581 non-null  float64 
 7   Saturation Vapor P(mbar)  36581 non-null  float64 
 8   Actual Vapor P(mbar)      36581 non-null  float64 
 9   Vapor P Shortage (mbar)   36581 non-null  float64 
 10  Air Density (g/m**3)      36581 non-null  float64 
 11  Direction (deg)           36581 non-null  float64 
 12  Velocity (m/s)            36581 non-null  float64 
 13  Time_cos                  36581 non-null  floa

Clustering

In [15]:
from pycaret import clustering
from sklearn.preprocessing import StandardScaler

# Clustering configuration: how many k-means clusters to build and which
# columns of the training frame they are built from.
num_cluster = 3
clustering_features = [
    'Month_cos',
    'Month_sin',
    'T (°\u2063C)',
    'P (mbar)',
    'Actual Vapor P(mbar)',
    'Direction_x',
    'Direction_y',
]

# Every column outside the chosen subset is passed to PyCaret as an ignored
# feature, so only `clustering_features` take part in the clustering.
cluster = clustering.setup(
    data=df_train,
    ignore_features=[name for name in df_train.columns if name not in clustering_features],
    session_id=seed,
    use_gpu=True,
    normalize=True,
)
kmeans = clustering.create_model('kmeans', num_clusters=num_cluster)

Unnamed: 0,Description,Value
0,Session id,842
1,Original data shape,"(36581, 24)"
2,Transformed data shape,"(36581, 7)"
3,Ignore features,17
4,Numeric features,7
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Normalize,True


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2473,12999.8747,1.5495,0,0,0


In [24]:
# Attach the k-means assignment of every training row as an integer feature.
df_train['Cluster'] = clustering.predict_model(model=kmeans, data=df_train.astype(np.float32))['Cluster']
# The label is assumed to end in the cluster number (e.g. 'Cluster 0').
# Parse the whole trailing number: keeping only the last character would
# collapse cluster ids >= 10 onto a single digit.
df_train['Cluster'] = (
    df_train['Cluster'].astype(str).str.extract(r'(\d+)$', expand=False).astype('int64')
)

# Compute the per-cluster transform once instead of once per cluster, then
# spread its columns into one feature per cluster.
# NOTE(review): for scikit-learn's KMeans these are distances to each cluster
# centre — confirm for the estimator PyCaret builds when use_gpu=True.
cluster_distances = kmeans.transform(cluster.train_transformed)
for i in range(num_cluster):
    df_train[f'Cluster Distance {i}'] = cluster_distances[:, i]

df_train.tail(2)

Unnamed: 0,Month,Day,Measurement Time,T (°⁣C),Dew T(°C),Relative Humidity (%),P (mbar),Saturation Vapor P(mbar),Actual Vapor P(mbar),Vapor P Shortage (mbar),...,cat_day,cat_Measurement Time,Direction_x,Direction_y,Month_cos,Month_sin,Cluster,Cluster Distance 0,Cluster Distance 1,Cluster Distance 2
36579,9,10,3,25.65,15.3,52.81,988.39,32.98,17.41,15.56,...,10,3,-0.702153,-0.712026,-0.5,-0.866025,0,2.056279,4.296998,4.377728
36580,3,11,1,3.14,1.88,91.4,1000.01,7.66,7.0,0.66,...,11,1,0.860119,0.510093,0.5,0.866025,1,3.981288,2.712475,3.057602


In [33]:
from pycaret.regression import *
import category_encoders

# Configure the PyCaret regression experiment on the engineered training frame,
# with 'Velocity (m/s)' as the target and the two direction components ignored.
# NOTE(review): max_encoding_ohe=1 presumably routes every categorical column
# to `encoding_method` (the target encoder) rather than one-hot encoding —
# confirm against the PyCaret setup documentation.
reg = setup(data=df_train, target='Velocity (m/s)', ignore_features=['Direction_x', 'Direction_y'],
            use_gpu=True, session_id=seed, max_encoding_ohe=1, 
            encoding_method=category_encoders.TargetEncoder(smoothing=10))


numpy.ufunc size changed, may indicate binary incompatibility. Expected 216 from C header, got 232 from PyObject



Unnamed: 0,Description,Value
0,Session id,842
1,Target,Velocity (m/s)
2,Target type,Regression
3,Original data shape,"(36581, 28)"
4,Transformed data shape,"(36581, 26)"
5,Transformed train set shape,"(25606, 26)"
6,Transformed test set shape,"(10975, 26)"
7,Ignore features,2
8,Numeric features,21
9,Categorical features,4


In [34]:
# Echo the experiment's `pipeline` attribute for inspection in the notebook.
reg.pipeline

In [35]:
# Compare the candidate regressors and keep four of them; `best` is indexed
# as a list by the tuning step that follows.
best = compare_models(n_select=4)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.392,0.2944,0.5424,0.8774,0.1823,0.3113,1.202
rf,Random Forest Regressor,0.4152,0.3238,0.5688,0.8652,0.1921,0.3433,3.522
xgboost,Extreme Gradient Boosting,0.4882,0.4295,0.6553,0.8212,0.2219,0.3985,0.502
catboost,CatBoost Regressor,0.5323,0.5062,0.7114,0.7893,0.2363,0.4342,13.604
lightgbm,Light Gradient Boosting Machine,0.549,0.5319,0.7292,0.7787,0.2438,0.4596,0.884
dt,Decision Tree Regressor,0.5397,0.6036,0.7767,0.7489,0.2569,0.4091,0.473
knn,K Neighbors Regressor,0.5975,0.6901,0.8305,0.7126,0.2804,0.5435,0.223
gbr,Gradient Boosting Regressor,0.6598,0.7738,0.8796,0.6779,0.2857,0.552,7.012
lr,Linear Regression,0.7725,1.0595,1.0293,0.559,0.3297,0.6415,0.169
br,Bayesian Ridge,0.7725,1.0596,1.0293,0.5589,0.3297,0.6415,0.175


In [36]:
# Tune the third and fourth selected models in place, optimising MAE over
# 100 search iterations each.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# the later cells have no recorded output.
best[2] = tune_model(best[2], optimize='MAE', n_iter=100)
best[3] = tune_model(best[3], optimize='MAE', n_iter=100)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


KeyboardInterrupt: 

In [None]:
# Stack the selected models into one ensemble, optimising MAE.
# NOTE(review): with choose_better=True PyCaret may return something other
# than the stacked model if it does not improve the metric — confirm in the docs.
stack_lr = stack_models(best, optimize='MAE', choose_better=True)

In [None]:
# Finalize the stacked model for inference.
# NOTE(review): finalize_model presumably refits on the full dataset,
# hold-out split included — confirm against the PyCaret docs.
stack_finalized = finalize_model(stack_lr)

In [None]:
# Build the engineered test frame (no target column, hence test=True).
df_test = preprocess(pd.read_csv('data/test.csv'), True)

# Assign each test row to one of the k-means clusters fitted on the training data.
df_test['Cluster'] = clustering.predict_model(model=kmeans, data=df_test.astype(np.float32))['Cluster']
# The label is assumed to end in the cluster number (e.g. 'Cluster 0').
# Parse the whole trailing number: keeping only the last character would
# collapse cluster ids >= 10 onto a single digit.
df_test['Cluster'] = (
    df_test['Cluster'].astype(str).str.extract(r'(\d+)$', expand=False).astype('int64')
)

# Push the clustering features through the clustering experiment's pipeline,
# then compute the per-cluster transform once (the method is `transform`; the
# previous `transfrom` spelling raised AttributeError) and spread its columns
# into one feature per cluster.
test_transformed = cluster.get_config('pipeline').transform(df_test[clustering_features])
test_distances = kmeans.transform(test_transformed)
for i in range(num_cluster):
    df_test[f'Cluster Distance {i}'] = test_distances[:, i]

df_test

In [None]:
# Drop the direction components, which the regression setup lists under ignore_features.
df_test = df_test.drop(['Direction_x', 'Direction_y'], axis=1)

In [None]:
# Load the sample submission and fill its wind-speed column ('풍속 (m/s)')
# with the finalized model's predictions for the test frame.
df_submit = pd.read_csv('data/sample_submission.csv')
df_submit['풍속 (m/s)'] = stack_finalized.predict(df_test)
df_submit

In [None]:
# Write the submission file without the DataFrame index column.
df_submit.to_csv('data/2nd_submission.csv', index=False)