In [15]:
import sys
sys.path.append('../src/')
sys.dont_write_bytecode = True  # __pycache__ 生成を防ぐ

from vege_train_memo import Model, Experiments
from vege_train import TrainModel


# default
import numpy as np
import pandas as pd

# Manage experiments
import mlflow

# For training
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# import model framework
import lightgbm as lgb


# mlflow setting
DB_PATH = '../server/mlruns.db'
ARTIFACT_LOCATION = '../data/'
EXPERIMENT_NAME = '02_model_train'


## データ生成

In [16]:
lag_monthly_aggregation = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
lag_daily_modeprice = [30, 40, 50, 60]


vege = TrainModel()
train_df = vege.generate_model_data_2(
    file_name='model_input_data_daily',
    _lag_list=lag_monthly_aggregation,
    _lag_list2 = lag_daily_modeprice, 
    save_as_csv=False,
    for_submission=False
    )

test_df = vege.generate_model_data_2(
    file_name='model_input_data_daily',
    _lag_list=lag_monthly_aggregation,
    _lag_list2 = lag_daily_modeprice, 
    save_as_csv=False,
    for_submission=True
    )

#train_df.to_csv('../data/old/train_df_input.csv')
#test_df.to_csv('../data/old/test_df_input.csv')

#train_df = pd.read_csv('../data/old/train_df_input.csv')
#test_df = pd.read_csv('../data/old/test_df_input.csv')

Start TrainModel
Folder is already there
だいこん
にんじん
キャベツ
レタス
はくさい
こまつな
ほうれんそう
ねぎ
きゅうり
トマト
ピーマン
じゃがいも
なましいたけ
セルリー
そらまめ
ミニトマト


(16349, 225)

だいこん
にんじん
キャベツ
レタス
はくさい
こまつな
ほうれんそう
ねぎ
きゅうり
トマト
ピーマン
じゃがいも
なましいたけ
セルリー
そらまめ
ミニトマト


(16349, 225)

In [17]:
grouping_by = 'kind'
target_vege_type = set(train_df[grouping_by])

Y_column_name = 'mode_price'
X_column_name = set(train_df.head().select_dtypes(float).columns) - set([Y_column_name])

In [18]:
setting = {
    'CV': {
        'n_splits' : 3,
        'shuffle' : True, 
        'random_state' : 42
    },
    'model_params':{
        'params': {
            'objective': 'regression',
            'metric': 'rmse',
            'num_leaves': 100,
            'max_depth': 10,
            "feature_fraction": 0.8,
            'subsample_freq': 1,
            "bagging_fraction": 0.95,
            'min_data_in_leaf': 2,
            'learning_rate': 0.1,
            "boosting": "gbdt",
            "lambda_l1": 0.1,
            "lambda_l2": 10,
            "random_state": 42,
            "verbosity" : -1
        },
        "num_boost_round": 5000,
        "verbose_eval":-1
    },
    'data_processing' : {
        'monthly lag' : lag_monthly_aggregation,
        'daily lag ' : lag_daily_modeprice
    }
}


In [19]:

# 実験開始
Project = Experiments('Submit test')

res_list = []
for vege_type in target_vege_type:
    # 特定の野菜について取り出す
    df = train_df[train_df[grouping_by] == vege_type].select_dtypes(
        float).astype('float32')
    # 'mode_price'に欠損がない部分を訓練データとして利用する
    train = df[~df[Y_column_name].isna()]

    # 目的変数の切り分け
    X = train[X_column_name].to_numpy()
    Y = train[Y_column_name].to_numpy()
    
    # settingの追加
    setting['tag_info'] = {
        'vege_type' : vege_type
        }
    
    # モデルの学習実験
    Project.ready_experiment(setting)
    Project.start_experiment(X, Y)
    
    # 予測結果の出力
    X_test = test_df[test_df['kind'] == vege_type][X_column_name].astype('float32')
    
    out = Project.best_model_predict(X_test.to_numpy())
    
    res_list.append(
        pd.DataFrame(
            {
                'kind' : [vege_type] * len(out),
                'date' : test_df[test_df['kind'] == vege_type].date.to_list(),
                'mode_price' : out
                }
            ).sort_values(by = 'date')
        )


=== fold 0 MAE: {'RMSPE': 17.073990882049248}
=== fold 1 MAE: {'RMSPE': 14.567948627254875}
=== fold 2 MAE: {'RMSPE': 16.425685419746564}
=== CV score: 16.022541643016897
----------------------------------------------------
command1: cd ../server/
command2: mlflow ui --backend-store-uri sqlite:///mlruns.db
=== fold 0 MAE: {'RMSPE': 17.92458622046735}
=== fold 1 MAE: {'RMSPE': 15.8559427884773}
=== fold 2 MAE: {'RMSPE': 18.326492836623988}
=== CV score: 17.369007281856213
----------------------------------------------------
command1: cd ../server/
command2: mlflow ui --backend-store-uri sqlite:///mlruns.db
=== fold 0 MAE: {'RMSPE': 15.420612741953695}
=== fold 1 MAE: {'RMSPE': 15.16875985911411}
=== fold 2 MAE: {'RMSPE': 15.968696596296846}
=== CV score: 15.51935639912155
----------------------------------------------------
command1: cd ../server/
command2: mlflow ui --backend-store-uri sqlite:///mlruns.db
=== fold 0 MAE: {'RMSPE': 12.07539435186049}
=== fold 1 MAE: {'RMSPE': 10.2029678

In [20]:
out = pd.concat(res_list)
out.columns = ['kind', 'date', 'mode_price']
out.to_csv('../data/submit.csv', index = False)

In [21]:
setting

{'CV': {'n_splits': 3, 'shuffle': True, 'random_state': 42},
 'model_params': {'params': {'objective': 'regression',
   'metric': 'rmse',
   'num_leaves': 100,
   'max_depth': 10,
   'feature_fraction': 0.8,
   'subsample_freq': 1,
   'bagging_fraction': 0.95,
   'min_data_in_leaf': 2,
   'learning_rate': 0.1,
   'boosting': 'gbdt',
   'lambda_l1': 0.1,
   'lambda_l2': 10,
   'random_state': 42,
   'verbosity': -1},
  'num_boost_round': 5000,
  'verbose_eval': -1},
 'data_processing': {'monthly lag': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
  'daily lag ': [30, 40, 50, 60]},
 'tag_info': {'vege_type': 'にんじん'}}