In [10]:
import pandas as pd
import json
import mlflow
from mlflow.models.signature import infer_signature
from mlflow.lightgbm import log_model
from sklearn.model_selection import TimeSeriesSplit
import yaml
from settings import settings
import lgbm_tuner.columns_filter as col_filter
import lgbm_tuner.tuner as tuner
from ml_tune_helpers.lgbm_optuna.optuna_lgb_search import OptunaLgbSearch
import warnings
# Installs a blanket "ignore" filter: no category or module is given, so from this point on
# every warning category (deprecation notices included) is hidden in this kernel.
warnings.filterwarnings('ignore')

In [2]:
def __extract_labels(df_ts: pd.DataFrame, target_column: str):
    """Split a frame into its feature columns and its target column.

    Args:
        df_ts: Frame holding the target column alongside the feature columns.
        target_column: Name of the column to separate out as the target.

    Returns:
        A ``(features, target)`` tuple: ``df_ts`` without ``target_column``
        (a new frame, the input is not modified) and ``df_ts[target_column]``.

    Raises:
        KeyError: If ``target_column`` is not present in ``df_ts``.
    """
    # Drop first so a missing column is reported by ``drop`` before any lookup.
    features = df_ts.drop(columns=[target_column])
    target = df_ts[target_column]
    return features, target

In [0]:
# Read the tuning configuration from params.yaml.
file_name = "params.yaml"
with open(file_name, encoding='UTF-8') as file_stream:
    yaml_params = yaml.safe_load(file_stream)

# Top-level sections of the config used by the rest of the notebook.
model_params = yaml_params["lgbm_pm25"]
metric = yaml_params["metric"]
optuna_params = yaml_params["optuna"]
pol_id = model_params["pol_id"]

# Target column name, resolved from the configured value type and pollutant id.
target_column_name = col_filter.get_target_column(
    pol_id=pol_id,
    prediction_value_type=model_params['prediction_value_type'],
)

# Baseline LightGBM parameters defined in the notebook itself; these are the
# `default_params` handed to tuner.init_optuna.
default_params = {
    'n_jobs': -1,
    'verbosity': -1,
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'extra_trees': True,
    'n_estimators': 1000,
    'num_leaves': 150,
    'learning_rate': 0.01,
    'subsample': 0.7,
    'subsample_freq': 5,
    'subsample_for_bin': 100000,
    'min_child_samples': 30,
    'reg_alpha': 0.1,
    'reg_lambda': 0.2,
    'max_depth': 10,
    'max_bin': 150,
}

In [17]:
# Both CSVs are read the same way: the date column becomes the index and,
# because parse_dates=True, pandas tries to parse that index as datetimes.
csv_read_options = {"parse_dates": True, "index_col": settings.DATE_COLUMN_NAME}
df_train = pd.read_csv("experiments_results/lgbm/6001/train.csv", **csv_read_options)
df_val = pd.read_csv("experiments_results/lgbm/6001/val.csv", **csv_read_options)

# Separate the target column from the features for each split.
x_train_1, y_train_1 = __extract_labels(df_train, target_column_name)
x_val_1, y_val_1 = __extract_labels(df_val, target_column_name)

In [14]:
# Date boundaries passed to tuner.init_optuna as the train and test windows.
TRAIN_DATE_FROM, TRAIN_DATE_TO = '2015-01-08', '2023-02-05'
VAL_DATE_FROM, VAL_DATE_TO = '2023-02-06', '2023-02-12'

# Joint dataset, indexed by its parsed 'DatetimeEnd' column.
# NOTE(review): the index column is spelled out here, while the train/val CSV cell uses
# settings.DATE_COLUMN_NAME - confirm both refer to the same column.
CSV_JOINT_FILE = 'datasources/ds_joint/ds.csv'
df_joint = pd.read_csv(CSV_JOINT_FILE, index_col='DatetimeEnd', parse_dates=True)

In [15]:
# Build a tuner plus train/validation splits directly from the joint dataset.
# NOTE(review): prediction_value_type is hard-coded to "AQI" here, whereas
# target_column_name is derived from model_params['prediction_value_type'] -
# confirm the config holds the same value.
optuna_tuner, x_train, y_train, x_val, y_val = tuner.init_optuna(
    df_timeseries=df_joint,
    pol_id=pol_id,
    prediction_value_type="AQI",
    # Feature-group switches.
    use_aqi_cols=True,
    use_c_mean_cols=False,
    use_lag_cols=True,
    use_gen_lags_cols=True,
    use_weather_cols=True,
    # Train / test date windows.
    train_start_dt=TRAIN_DATE_FROM,
    train_end_dt=TRAIN_DATE_TO,
    test_start_dt=VAL_DATE_FROM,
    test_end_dt=VAL_DATE_TO,
    # Search defaults.
    default_params=default_params,
    default_category=model_params["default_category"],
    categories_for_optimization=model_params["categories"],
    default_top_features_count=-1,
)

In [29]:
# Do the two data-preparation paths produce the same set of feature columns (order ignored)?
set(x_val.columns) == set(x_val_1.columns)

False

In [43]:
# Study is named after the pollutant id, or "all" when pol_id is not positive.
study_name = f'lgbm_{pol_id if pol_id > 0 else "all"}'

# Rebinds `optuna_tuner` (previously assigned from tuner.init_optuna) to a search built on
# the CSV-based splits. Default parameters come from model_params here, not from the
# notebook-level `default_params` dict.
optuna_tuner = OptunaLgbSearch(
    study_name=study_name,
    metric=metric,
    objective=optuna_params["objective"],
    x_train=x_train_1,
    y_train=y_train_1,
    x_val=x_val_1,
    y_val=y_val_1,
    default_params=model_params["default_params"],
    default_category=model_params["default_category"],
    categories_for_optimization=model_params["categories"],
    default_top_features_count=-1,
)

In [44]:
# Time-ordered cross-validation with 16 splits, handed to the search as `cv_splitter`.
time_series_cv = TimeSeriesSplit(16)

# NOTE(review): the output recorded for this cell shows trials aborting with
# LightGBMError "Check failed: (num_data) > (0)". The cause is not visible from this
# cell - TODO confirm that the data reaching LightGBM is non-empty for every fold.
optuna_tuner.run_params_search(
    n_trials=100,
    n_jobs=6,
    direction='minimize',
    with_pruner=True,
    cv_splitter=time_series_cv,
    search_category=True,
    best_features_only=True,
    save_best_params=True,
    warm_params=None,
)

run_params_search n_trials=100, search_category=True, best_features_only=True, with_pruner=True


  0%|          | 0/100 [00:00<?, ?it/s]

[LightGBM] [Fatal] Check failed: (num_data) > (0) at /__w/1/s/python-package/compile/src/io/dataset.cpp, line 33 .

[LightGBM] [Fatal] Check failed: (num_data) > (0) at /__w/1/s/python-package/compile/src/io/dataset.cpp, line 33 .



[33m[W 2023-04-14 10:43:23,409][0m Trial 2 failed with parameters: {'n_jobs': -1, 'verbosity': -1, 'objective': 'regression', 'metric': 'rmse', 'boosting_type': 'gbdt', 'extra_trees': True, 'n_estimators': 1400, 'num_leaves': 85, 'learning_rate': 0.11118757375018051, 'subsample': 0.22034051608484195, 'subsample_freq': 6, 'subsample_for_bin': 238663, 'min_child_samples': 10, 'reg_alpha': 0.8739491050653283, 'reg_lambda': 0.8299948182739849, 'max_depth': 9, 'max_bin': 175, 'categorical_features': ['month', 'weekday']} because of the following error: LightGBMError('Check failed: (num_data) > (0) at /__w/1/s/python-package/compile/src/io/dataset.cpp, line 33 .\n').[0m
Traceback (most recent call last):
  File "/home/alexandra/work/projects/air_pol/air_pollution_predict/airpollpredictor/venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/home/alexandra/work/projects/air_pol/air_pollution_predict/airpollpredictor

LightGBMError: Check failed: (num_data) > (0) at /__w/1/s/python-package/compile/src/io/dataset.cpp, line 33 .


In [None]:

# Refit and evaluate with the best parameters found by the study; the categorical
# feature list is taken from those same parameters.
best_params = optuna_tuner.study_best_params
train_score_best, val_score_best, model_best = optuna_tuner.run_model_and_eval(
    params=best_params,
    categorical_features=best_params['categorical_features'],
    best_features_only=True,
    set_as_best_model=False,
)