PyCaret의 model selection과 stack ensemble 기능을 사용해 모델을 구성했습니다. [ExtraTree, RandomForest, XGBoost, LGBM]

주기성을 띠는 feature에 대해서는 sin, cos 변환을 적용했고, 범주형 feature는 target encoding을 진행했습니다.

비슷한 feature 끼리 묶어 모델 학습에 도움을 주고자 clustering을 진행했습니다.


In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from category_encoders import * 

In [38]:
# Fixed random seed; passed as `session_id` to both the clustering and the
# regression PyCaret setups further down.
seed = 842

In [39]:
def preprocess(df, test=False):
    """Rename the raw weather columns and derive the model features.

    The input frame is copied first, so the caller's DataFrame is left
    untouched (neither renamed nor extended).

    Args:
        df: Raw data with 16 columns in the order listed in ``columns``
            below, or 15 columns (no target) when ``test`` is true.
        test: True for the test set, which has no 'Velocity (m/s)' column.

    Returns:
        A new DataFrame with the renamed columns plus cyclic encodings of
        time-of-day, month and wind direction, a 16-sector direction index,
        a zero-padded 'MMDD' string, and categorical copies of month, day
        and time-of-day. 'ID', 'T (K)' and 'Vapor Content (g/kg)' are
        dropped.

    Raises:
        ValueError: If the number of columns does not match the expected
            layout (raised by pandas when assigning ``df.columns``).
    """
    # NOTE: the temperature column name contains U+2063 (INVISIBLE SEPARATOR)
    # between the degree sign and 'C'. It is spelled as an escape here so the
    # character cannot be lost silently; other cells look the column up under
    # exactly this name.
    columns = [
        'ID', 'Month', 'Day', 'Measurement Time', 'T (°\u2063C)', 'T (K)',
        'Dew T(°C)', 'Relative Humidity (%)', 'P (mbar)', 'Saturation Vapor P(mbar)',
        'Actual Vapor P(mbar)', 'Vapor P Shortage (mbar)', 'Vapor Content (g/kg)',
        'Air Density (g/m**3)', 'Direction (deg)',
    ]
    if not test:
        columns.append('Velocity (m/s)')

    df = df.copy()
    df.columns = columns

    # Time of day -> ordinal code 0..3. Values that are not one of the four
    # labels pass through unchanged, matching what `Series.replace` did,
    # without relying on pandas' deprecated silent downcast.
    time_codes = {'새벽': 0, '오전': 1, '오후': 2, '저녁': 3}
    df['Measurement Time'] = df['Measurement Time'].map(
        lambda label: time_codes.get(label, label)
    )
    # Four slots per day -> quarter-turn steps on the unit circle.
    time_angle = np.pi * df['Measurement Time'] / 2
    df['Time_cos'] = np.cos(time_angle)
    df['Time_sin'] = np.sin(time_angle)

    # 16 compass sectors of 22.5 degrees; the 11.25 offset centres sector 0 on 0 degrees.
    sector = ((df['Direction (deg)'] + 11.25) % 360) // 22.5
    df['Direction (sign)'] = sector.astype('int')

    # Zero-padded 'MMDD' kept as a string so it is handled as a category.
    two_digits = '{:02}'.format
    df['Date'] = df['Month'].map(two_digits) + df['Day'].map(two_digits)

    df['cat_Month'] = df['Month'].astype('category')
    df['cat_day'] = df['Day'].astype('category')
    df['cat_Measurement Time'] = df['Measurement Time'].astype('category')

    # Wind direction as a unit vector (degrees -> radians).
    direction_rad = np.pi * df['Direction (deg)'] / 180
    df['Direction_x'] = np.cos(direction_rad)
    df['Direction_y'] = np.sin(direction_rad)

    # Month on a 12-step circle, with January at angle 0.
    month_angle = (df['Month'] - 1) / 6 * np.pi
    df['Month_cos'] = np.cos(month_angle)
    df['Month_sin'] = np.sin(month_angle)

    return df.drop(['ID', 'T (K)', 'Vapor Content (g/kg)'], axis=1)

In [40]:
# Load the raw training data; preprocess() is given a copy so `df` stays as loaded.
df = pd.read_csv('data/train.csv')
df_train = preprocess(df.copy())
# Show the last two engineered rows as a quick sanity check.
df_train.tail(2)

Unnamed: 0,Month,Day,Measurement Time,T (°⁣C),Dew T(°C),Relative Humidity (%),P (mbar),Saturation Vapor P(mbar),Actual Vapor P(mbar),Vapor P Shortage (mbar),...,Time_sin,Direction (sign),Date,cat_Month,cat_day,cat_Measurement Time,Direction_x,Direction_y,Month_cos,Month_sin
36579,9,10,3,25.65,15.3,52.81,988.39,32.98,17.41,15.56,...,-1.0,10,910,9,10,3,-0.702153,-0.712026,-0.5,-0.866025
36580,3,11,1,3.14,1.88,91.4,1000.01,7.66,7.0,0.66,...,1.0,1,311,3,11,1,0.860119,0.510093,0.5,0.866025


In [41]:
# Print column dtypes and non-null counts of the engineered training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36581 entries, 0 to 36580
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Month                     36581 non-null  int64   
 1   Day                       36581 non-null  int64   
 2   Measurement Time          36581 non-null  int64   
 3   T (°⁣C)                   36581 non-null  float64 
 4   Dew T(°C)                 36581 non-null  float64 
 5   Relative Humidity (%)     36581 non-null  float64 
 6   P (mbar)                  36581 non-null  float64 
 7   Saturation Vapor P(mbar)  36581 non-null  float64 
 8   Actual Vapor P(mbar)      36581 non-null  float64 
 9   Vapor P Shortage (mbar)   36581 non-null  float64 
 10  Air Density (g/m**3)      36581 non-null  float64 
 11  Direction (deg)           36581 non-null  float64 
 12  Velocity (m/s)            36581 non-null  float64 
 13  Time_cos                  36581 non-null  floa

Clustering

In [42]:
from pycaret import clustering
from sklearn.preprocessing import StandardScaler

# K-Means configuration: how many clusters to fit and which columns drive them.
num_cluster = 3
clustering_features = [
    'Month_cos', 'Month_sin', 'T (°\u2063C)', 'P (mbar)',
    'Actual Vapor P(mbar)', 'Direction_x', 'Direction_y',
]

# Every column that is not a clustering feature is hidden from the clustering setup.
non_clustering_columns = [
    column for column in df_train.columns if column not in clustering_features
]
cluster = clustering.setup(
    data=df_train,
    ignore_features=non_clustering_columns,
    session_id=seed,
    use_gpu=True,
    normalize=True,
)
kmeans = clustering.create_model('kmeans', num_clusters=num_cluster)

Unnamed: 0,Description,Value
0,Session id,842
1,Original data shape,"(36581, 24)"
2,Transformed data shape,"(36581, 7)"
3,Ignore features,17
4,Numeric features,7
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Normalize,True


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2473,12999.8747,1.5495,0,0,0


In [43]:
# Assign every training row to its K-Means cluster.
train_cluster_labels = clustering.predict_model(model=kmeans, data=df_train.astype(np.float32))['Cluster']
# The labels are strings ending in the cluster index. Extract the whole
# trailing number instead of only the last character, so the parse stays
# correct if num_cluster is raised above 10.
df_train['Cluster'] = train_cluster_labels.str.extract(r'(\d+)$', expand=False).astype('int64')

# Distance from every row to each centroid. The transform does not depend on
# the loop index, so it is computed once instead of once per cluster.
train_distances = kmeans.transform(cluster.train_transformed)
for i in range(num_cluster):
    df_train[f'Cluster Distance {i}'] = train_distances[:, i]

df_train.tail(2)

Unnamed: 0,Month,Day,Measurement Time,T (°⁣C),Dew T(°C),Relative Humidity (%),P (mbar),Saturation Vapor P(mbar),Actual Vapor P(mbar),Vapor P Shortage (mbar),...,cat_day,cat_Measurement Time,Direction_x,Direction_y,Month_cos,Month_sin,Cluster,Cluster Distance 0,Cluster Distance 1,Cluster Distance 2
36579,9,10,3,25.65,15.3,52.81,988.39,32.98,17.41,15.56,...,10,3,-0.702153,-0.712026,-0.5,-0.866025,0,2.056279,4.296998,4.377728
36580,3,11,1,3.14,1.88,91.4,1000.01,7.66,7.0,0.66,...,11,1,0.860119,0.510093,0.5,0.866025,1,3.981288,2.712475,3.057602


In [44]:
from pycaret.regression import *
import category_encoders

# Regression experiment predicting 'Velocity (m/s)'. Direction_x / Direction_y
# are excluded from the regression features, and categorical columns are encoded
# with a smoothed TargetEncoder.
# NOTE(review): max_encoding_ohe=1 presumably stops PyCaret from one-hot
# encoding any categorical column so all of them go through the target
# encoder — confirm against the PyCaret setup() documentation.
reg = setup(data=df_train, target='Velocity (m/s)', ignore_features=['Direction_x', 'Direction_y'],
            use_gpu=True, session_id=seed, max_encoding_ohe=1, 
            encoding_method=category_encoders.TargetEncoder(smoothing=10))

Unnamed: 0,Description,Value
0,Session id,842
1,Target,Velocity (m/s)
2,Target type,Regression
3,Original data shape,"(36581, 28)"
4,Transformed data shape,"(36581, 26)"
5,Transformed train set shape,"(25606, 26)"
6,Transformed test set shape,"(10975, 26)"
7,Ignore features,2
8,Numeric features,21
9,Categorical features,4


In [45]:
# Display the preprocessing pipeline attached to the regression experiment.
reg.pipeline

In [46]:
# Compare PyCaret's candidate regressors and keep the four top-ranked ones;
# `best` is a list and is indexed by rank in the tuning step below.
best = compare_models(n_select=4)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.392,0.2944,0.5424,0.8774,0.1823,0.3113,0.54
rf,Random Forest Regressor,0.4152,0.3238,0.5688,0.8652,0.1921,0.3433,0.467
xgboost,Extreme Gradient Boosting,0.4882,0.4295,0.6553,0.8212,0.2219,0.3985,0.327
catboost,CatBoost Regressor,0.5323,0.5062,0.7114,0.7893,0.2363,0.4342,0.286
lightgbm,Light Gradient Boosting Machine,0.549,0.5319,0.7292,0.7787,0.2438,0.4596,0.319
dt,Decision Tree Regressor,0.5397,0.6036,0.7767,0.7489,0.2569,0.4091,0.276
knn,K Neighbors Regressor,0.5975,0.6901,0.8305,0.7126,0.2804,0.5435,0.314
gbr,Gradient Boosting Regressor,0.6598,0.7738,0.8796,0.6779,0.2857,0.552,0.289
lr,Linear Regression,0.7725,1.0595,1.0293,0.559,0.3297,0.6415,0.278
br,Bayesian Ridge,0.7725,1.0596,1.0293,0.5589,0.3297,0.6415,0.287


In [47]:
# Tune only the 3rd- and 4th-ranked models (indices 2 and 3) for MAE, 100
# search iterations each; the two top-ranked models are left as they are.
for rank in (2, 3):
    best[rank] = tune_model(best[rank], optimize='MAE', n_iter=100)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4102,0.3332,0.5773,0.8625,0.1929,0.3323
1,0.4125,0.3162,0.5623,0.866,0.191,0.3339
2,0.4041,0.3077,0.5547,0.8722,0.1855,0.3146
3,0.4061,0.2992,0.547,0.8713,0.1864,0.3275
4,0.4195,0.3304,0.5748,0.865,0.1914,0.3212
5,0.4206,0.3436,0.5862,0.8522,0.1952,0.3253
6,0.3989,0.2927,0.5411,0.8799,0.1866,0.3193
7,0.415,0.3189,0.5647,0.8724,0.1898,0.3278
8,0.4122,0.3387,0.582,0.859,0.194,0.3296
9,0.4055,0.3188,0.5646,0.8674,0.1923,0.335


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4712,0.3991,0.6318,0.8353,0.2163,0.3878
1,0.4647,0.3836,0.6193,0.8375,0.2143,0.3841
2,0.4576,0.3679,0.6066,0.8472,0.2056,0.3613
3,0.4502,0.3581,0.5984,0.846,0.2068,0.3667
4,0.4732,0.4019,0.6339,0.8358,0.2119,0.3643
5,0.47,0.398,0.6309,0.8288,0.2138,0.3627
6,0.4596,0.372,0.6099,0.8473,0.209,0.3655
7,0.4768,0.406,0.6372,0.8375,0.215,0.3777
8,0.4649,0.4033,0.635,0.8321,0.2163,0.3794
9,0.4674,0.4044,0.6359,0.8317,0.2159,0.3816


Fitting 10 folds for each of 100 candidates, totalling 1000 fits




60 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "d:\project\wind_speed\wind_speed\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\project\wind_speed\wind_speed\lib\site-packages\pycaret\internal\pipeline.py", line 260, in fit
    fitted_estimator = self._memory_fit(
  File "d:\project\wind_speed\wind_speed\lib\site-packages\joblib\memory.py", line 655, in __call__
    return self._cached_call(args, kwargs)[0]
  File "d:\project\wind_speed\wind_speed\lib\site-packages\pycaret\internal\memory.py", line 398, in _cached_c

In [48]:
# Stack the four selected models, optimising MAE.
# NOTE(review): with choose_better=True PyCaret may hand back one of the base
# models instead of the stacker when that scores better — confirm in the
# PyCaret stack_models() documentation.
stack_lr = stack_models(best, optimize='MAE', choose_better=True)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3926,0.3068,0.5539,0.8734,0.1836,0.308
1,0.3931,0.2914,0.5398,0.8765,0.1826,0.3117
2,0.3854,0.2761,0.5255,0.8853,0.1759,0.2932
3,0.3804,0.2695,0.5191,0.8841,0.1735,0.2938
4,0.3942,0.2951,0.5432,0.8794,0.1809,0.2978
5,0.3963,0.3053,0.5525,0.8687,0.183,0.2963
6,0.3712,0.2523,0.5023,0.8964,0.1725,0.2853
7,0.3927,0.2837,0.5326,0.8865,0.179,0.3017
8,0.3887,0.2966,0.5446,0.8765,0.1816,0.3053
9,0.3795,0.2824,0.5314,0.8825,0.1811,0.3077


In [49]:
# Finalize the stacked model for inference.
# NOTE(review): per the PyCaret docs this refits on the full dataset including
# the hold-out split — confirm for the installed version.
stack_finalized = finalize_model(stack_lr)

In [54]:
# Build the same engineered features for the test set (it has no target column).
df_test = preprocess(pd.read_csv('data/test.csv'), test=True)

# Assign each test row to a cluster using the K-Means model fitted on the training data.
test_cluster_labels = clustering.predict_model(model=kmeans, data=df_test.astype(np.float32))['Cluster']
# The labels are strings ending in the cluster index. Extract the whole
# trailing number instead of only the last character, so the parse stays
# correct if num_cluster is raised above 10.
df_test['Cluster'] = test_cluster_labels.str.extract(r'(\d+)$', expand=False).astype('int64')

# Run the clustering features through the fitted clustering pipeline and
# compute the centroid distances once; neither step depends on the loop index.
test_transformed = cluster.get_config('pipeline').transform(df_test[clustering_features])
test_distances = kmeans.transform(test_transformed)
for i in range(num_cluster):
    df_test[f'Cluster Distance {i}'] = test_distances[:, i]

df_test

Unnamed: 0,Month,Day,Measurement Time,T (°⁣C),Dew T(°C),Relative Humidity (%),P (mbar),Saturation Vapor P(mbar),Actual Vapor P(mbar),Vapor P Shortage (mbar),...,cat_day,cat_Measurement Time,Direction_x,Direction_y,Month_cos,Month_sin,Cluster,Cluster Distance 0,Cluster Distance 1,Cluster Distance 2
0,3,24,0,4.28,-0.21,72.5,984.48,8.30,6.02,2.28,...,24,0,-0.312335,-0.949972,5.000000e-01,0.866025,2,3.676719,2.789189,1.130520
1,9,24,3,13.40,10.36,81.8,996.98,15.40,12.59,2.80,...,24,3,-0.699663,-0.714473,-5.000000e-01,-0.866025,0,1.962391,2.683555,3.430493
2,5,28,3,19.89,14.95,73.2,984.83,23.26,17.03,6.23,...,28,3,0.983603,0.180347,-5.000000e-01,0.866025,0,2.474972,4.602650,3.988125
3,1,17,3,-2.88,-4.47,88.7,998.02,4.94,4.38,0.56,...,17,3,-0.170209,-0.985408,1.000000e+00,0.000000,1,4.546688,1.981464,2.610154
4,10,22,2,6.97,4.36,83.4,987.00,10.01,8.35,1.66,...,22,2,-0.130526,-0.991445,-1.836970e-16,-1.000000,1,2.937851,2.215346,2.666295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15673,3,16,0,1.61,-0.49,85.9,1001.26,6.86,5.89,0.97,...,16,0,0.556586,0.830790,5.000000e-01,0.866025,1,4.233205,2.741520,3.196230
15674,9,5,2,18.27,13.35,73.0,990.98,21.03,15.35,5.68,...,5,2,-0.081939,-0.996637,-5.000000e-01,-0.866025,0,1.869830,3.532801,3.834535
15675,8,8,0,16.08,12.25,78.0,994.97,18.30,14.28,4.03,...,8,0,-0.985408,-0.170209,-8.660254e-01,-0.500000,0,1.409275,3.258580,3.607442
15676,2,4,0,3.07,-0.95,74.8,994.31,7.62,5.70,1.92,...,4,0,-0.005236,-0.999986,8.660254e-01,0.500000,2,3.932658,2.155779,1.928483


In [55]:
# Drop the two direction components before predicting; the same two columns
# were passed as ignore_features to the regression setup.
df_test = df_test.drop(['Direction_x', 'Direction_y'], axis=1)

In [56]:
# Fill the sample submission's wind-speed column with the finalized model's predictions.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order as test.csv — confirm.
df_submit = pd.read_csv('data/sample_submission.csv')
df_submit['풍속 (m/s)'] = stack_finalized.predict(df_test)
df_submit

Unnamed: 0,ID,풍속 (m/s)
0,TEST_00000,1.947530
1,TEST_00001,1.118981
2,TEST_00002,2.030871
3,TEST_00003,1.116085
4,TEST_00004,1.196756
...,...,...
15673,TEST_15673,4.784213
15674,TEST_15674,1.935273
15675,TEST_15675,1.161204
15676,TEST_15676,2.007375


In [57]:
# Write the submission file without the DataFrame index column.
df_submit.to_csv('data/2nd_submission.csv', index=False)