# AutoML을 활용한 모델링

In [2]:
# preprocessing
import numpy as np
import pandas as pd
import tqdm
from scipy import stats

# imputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# 경고 무시
import warnings
warnings.filterwarnings('ignore')

# model learning
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from sklearn.utils.class_weight import compute_sample_weight

# 평가 지표
from sklearn.metrics import mean_absolute_error

# 모델 저장
import pickle

In [3]:
# 계절 이름
my_season = 'summer'

In [4]:
# load data
data = pd.read_csv('../data/' + my_season + '_mice_twice_train.csv')
test = pd.read_csv('../data/' + my_season + '_mice_twice_test.csv')

In [5]:
# split the train & valid data
X_train, X_valid, y_train, y_valid = train_test_split(
    data[data.columns.difference(['ts', 'stn', 'year'])],
    data['ts'],
    test_size=0.3,
    random_state=42,
)

In [10]:
data.dtypes

ta                float64
td                float64
hm                float64
ws                float64
sun               float64
rain              float64
rn                float64
re                float64
si                float64
ss                float64
mm                float64
dd                float64
hh                float64
ts                float64
year               object
stn                 int64
sensory_temp      float64
vapor_pressure    float64
air_pressure      float64
abs_hm            float64
air_mix           float64
dtype: object

In [7]:
data.describe()

Unnamed: 0,ta,td,hm,ws,sun,rain,rn,re,si,ss,mm,dd,hh,ts,stn,sensory_temp,vapor_pressure,air_pressure,abs_hm,air_mix
count,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0,110233.0
mean,22.177477,16.115262,71.80895,2.144995,0.612158,0.42676,0.233774,3.926609,0.802685,0.304204,5.999447,15.832537,11.499751,25.666181,5.499605,24.137018,27.831578,19.46502,14.249243,0.012213
std,5.029071,6.047881,20.134226,1.439565,0.48726,0.716518,2.43777,12.838278,1.058413,0.422075,0.82089,8.854828,6.923801,9.199028,2.872973,5.2649,8.45565,6.763213,4.818149,0.00432
min,3.6,-11.1,8.6,0.0,0.0,0.0,-0.6019,-6.5746,-5.1532,-0.3858,5.0,1.0,0.0,0.0,1.0,5.5576,7.909,2.6385,1.9648,0.0016
25%,18.8,12.4,57.7,1.1,0.0,0.0,0.0,0.0,0.0,0.0,5.0,8.0,5.0,19.5,3.0,20.608,21.7074,14.4049,10.6775,0.009
50%,22.2,16.8,74.4,1.9,1.0,0.0,0.0,0.0,0.24,0.0,6.0,16.0,11.0,24.0,6.0,24.178,26.7707,19.1377,14.0838,0.012
75%,25.7,20.7,89.1,2.9,1.0,1.0,0.0,0.0,1.4,0.8,7.0,23.0,18.0,30.2,8.0,27.808,33.0317,24.4689,17.8397,0.0154
max,37.8,29.1,100.0,16.0,1.0,2.0,623.5,60.0,4.77,1.206,7.0,31.0,23.0,64.9,10.0,40.8783,65.5504,40.2995,28.7462,0.0258


In [7]:
# smape
def smape(y_true, y_predicted, sample_weight=None):
    score = 100 / len(y_true) * np.sum(np.abs(y_predicted - y_true) / (np.abs(y_true) + np.abs(y_predicted)))
    return score

In [8]:
# automl modeling
automl = AutoML(mode="Compete",
                algorithms=["Baseline",
                            "CatBoost",
                            "Xgboost",
                            "Random Forest",
                            "Extra Trees",
                            "LightGBM",
                            "Neural Network",
                            "CatBoost"],
                ml_task = "regression",
                eval_metric = smape,
                random_state = 42
               )

- `Explain` : To to be used when the user wants to explain and understand the data.
                    - Uses 75%/25% train/test split.
                    - Uses the following models: `Baseline`, `Linear`, `Decision Tree`, `Random Forest`, `XGBoost`, `Neural Network`, and `Ensemble`.
                    - Has full explanations in reports: learning curves, importance plots, and SHAP plots.
                    
- `Perform` : To be used when the user wants to train a model that will be used in real-life use cases.
                    - Uses 5-fold CV (Cross-Validation).
                    - Uses the following models: `Linear`, `Random Forest`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`, and `Ensemble`.
                    - Has learning curves and importance plots in reports.

- `Compete` : To be used for machine learning competitions (maximum performance).
                    - Uses 80/20 train/test split, or 5-fold CV, or 10-fold CV (Cross-Validation) - it depends on `total_time_limit`. If not set directly, AutoML will select validation automatically.
                    - Uses the following models: `Decision Tree`, `Random Forest`, `Extra Trees`, `LightGBM`,  `XGBoost`, `CatBoost`, `Neural Network`,
                        `Nearest Neighbors`, `Ensemble`, and `Stacking`.
                    - It has only learning curves in the reports.

- `Optuna` : To be used for creating highly-tuned machine learning models.
                    - Uses 10-fold CV (Cross-Validation).
                    - It tunes with Optuna the following algorithms: `Random Forest`, `Extra Trees`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`.
                    - It applies `Ensemble` and `Stacking` for trained models.
                    - It has only learning curves in the reports.

In [9]:
# fit automl mljar
automl.fit(X_train, y_train)

AutoML directory: AutoML_1
The task is regression with evaluation metric user_defined_metric
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Xgboost', 'Random Forest', 'Extra Trees', 'LightGBM', 'Neural Network', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree user_defined_metric 5.161197 trained in 0.62 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 1 model
1_Baseline user_defined_metric 13.817937 trained in 2.15 seconds
* Step default_algorithms will try to check up to 6 models
2_Default_LightGBM user_defined_metric 

AutoML(algorithms=['Baseline', 'CatBoost', 'Xgboost', 'Random Forest',
                   'Extra Trees', 'LightGBM', 'Neural Network', 'CatBoost'],
       eval_metric=<function smape at 0x0000026E67510438>, ml_task='regression',
       mode='Compete', random_state=42)

In [6]:
# predict
automl_pred = automl.predict(X_valid)

In [7]:
# MAE값 구하기
mean_absolute_error(y_valid, automl_pred)

1.518110842907827

In [8]:
# testset의 예측값 구하기
# pred_test = automl.predict(test[test.columns.difference(['stn'])])
pred_test = automl.predict(test)

In [9]:
# 값 가져오기
test_result = test.copy()

In [10]:
# 예측값 넣기
test_result['pred_ts'] = pred_test

In [11]:
# write.csv
test_result.to_csv('../data/' + my_season + '_test_result.csv', index = False)

In [12]:
# 모델 저장
model_name = '008_plan'
with open(model_name+'_' + my_season + '_automl.pickle','wb') as fw:
    pickle.dump(automl, fw)

# 모델 불러오기
# with open('model_210519.pickle', 'rb') as f: 
#     model = pickle.load(f)

=======================================================================================================


### 기존에 존재하는 모델이 있을 경우 아래 코드만 사용해서 불러오기

In [3]:
# 모델 불러오기
with open('008_plan_' + my_season + '_automl.pickle', 'rb') as f: 
     model = pickle.load(f)

In [4]:
# 예측값 생성
pred_test = model.predict(test)

In [5]:
# 값 가져오기
test_result = test.copy()

In [6]:
# 예측값 넣기
test_result['pred_ts'] = pred_test

In [7]:
# write.csv
test_result.to_csv('../data/' + my_season + '_test_result.csv', index = False)