# AutoML을 활용한 모델링

In [1]:
# preprocessing
import numpy as np
import pandas as pd
import tqdm
from scipy import stats

# imputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# 경고 무시
import warnings
warnings.filterwarnings('ignore')

# model learning
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
from sklearn.utils.class_weight import compute_sample_weight

# 평가 지표
from sklearn.metrics import mean_absolute_error

# 모델 저장
import pickle

In [2]:
# load data
data = pd.read_csv('../data/summer_mice_twice_train.csv')
test = pd.read_csv('../data/summer_mice_twice_test.csv')

In [3]:
# split the train & valid data
X_train, X_valid, y_train, y_valid = train_test_split(
    data[data.columns.difference(['ts', 'stn', 'year'])],
    data['ts'],
    test_size=0.3,
    random_state=42,
)

In [4]:
# automl modeling
automl = AutoML(mode="Compete",
                algorithms=["Baseline",
                            "CatBoost",
                            "Xgboost",
                            "Random Forest",
                            "Extra Trees",
                            "LightGBM",
                            "Neural Network",
                            "CatBoost"],
                ml_task = "regression",
                eval_metric = "mae",
                random_state = 42
               )

- `Explain` : To to be used when the user wants to explain and understand the data.
                    - Uses 75%/25% train/test split.
                    - Uses the following models: `Baseline`, `Linear`, `Decision Tree`, `Random Forest`, `XGBoost`, `Neural Network`, and `Ensemble`.
                    - Has full explanations in reports: learning curves, importance plots, and SHAP plots.
                    
- `Perform` : To be used when the user wants to train a model that will be used in real-life use cases.
                    - Uses 5-fold CV (Cross-Validation).
                    - Uses the following models: `Linear`, `Random Forest`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`, and `Ensemble`.
                    - Has learning curves and importance plots in reports.

- `Compete` : To be used for machine learning competitions (maximum performance).
                    - Uses 80/20 train/test split, or 5-fold CV, or 10-fold CV (Cross-Validation) - it depends on `total_time_limit`. If not set directly, AutoML will select validation automatically.
                    - Uses the following models: `Decision Tree`, `Random Forest`, `Extra Trees`, `LightGBM`,  `XGBoost`, `CatBoost`, `Neural Network`,
                        `Nearest Neighbors`, `Ensemble`, and `Stacking`.
                    - It has only learning curves in the reports.

- `Optuna` : To be used for creating highly-tuned machine learning models.
                    - Uses 10-fold CV (Cross-Validation).
                    - It tunes with Optuna the following algorithms: `Random Forest`, `Extra Trees`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`.
                    - It applies `Ensemble` and `Stacking` for trained models.
                    - It has only learning curves in the reports.

In [5]:
# fit automl mljar
automl.fit(X_train, y_train)

AutoML directory: AutoML_19
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Baseline', 'CatBoost', 'Xgboost', 'Random Forest', 'Extra Trees', 'LightGBM', 'Neural Network', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mae 2.68495 trained in 0.64 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 1 model
1_Baseline mae 7.087233 trained in 2.2 seconds
* Step default_algorithms will try to check up to 6 models
2_Default_LightGBM mae 1.552603 trained in 103.81 seconds
3_Default_Xgboost mae 1.603158 

AutoML(algorithms=['Baseline', 'CatBoost', 'Xgboost', 'Random Forest',
                   'Extra Trees', 'LightGBM', 'Neural Network', 'CatBoost'],
       eval_metric='mae', ml_task='regression', mode='Compete',
       random_state=42)

In [6]:
# predict
automl_pred = automl.predict(X_valid)

In [7]:
# MAE값 구하기
mean_absolute_error(y_valid, automl_pred)

1.518110842907827

In [8]:
# testset의 예측값 구하기
# pred_test = automl.predict(test[test.columns.difference(['stn'])])
pred_test = automl.predict(test)

In [9]:
# 값 가져오기
test_result = test.copy()

In [10]:
# 예측값 넣기
test_result['pred_ts'] = pred_test

In [11]:
# write.csv
test_result.to_csv('../data/summer_test_result.csv', index = False)

In [12]:
# 모델 저장
model_name = '008_plan'
with open(model_name+'_summer_automl.pickle','wb') as fw:
    pickle.dump(automl, fw)

# 모델 불러오기
# with open('model_210519.pickle', 'rb') as f: 
#     model = pickle.load(f)

=======================================================================================================

In [3]:
with open('008_plan_summer_automl.pickle', 'rb') as f: 
     model = pickle.load(f)

In [4]:
pred_test = model.predict(test)

In [5]:
# 값 가져오기
test_result = test.copy()

In [6]:
# 예측값 넣기
test_result['pred_ts'] = pred_test

In [7]:
# write.csv
test_result.to_csv('../data/summer_test_result.csv', index = False)