In [1]:
import sys
sys.path.append('../src/')
sys.dont_write_bytecode = True  # __pycache__ 生成を防ぐ
from vege_train_memo import Model, Experiments


# default
import numpy as np
import pandas as pd

# Manage experiments
import mlflow

# For training
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# import model framework
import lightgbm as lgb


# mlflow setting
DB_PATH = '../server/mlruns.db'
ARTIFACT_LOCATION = '../data/'
EXPERIMENT_NAME = '02_model_train'


In [2]:
# データの読み込み
all_df = pd.read_csv('../data/model_input_data.csv')  # , index_col=0)


grouping_by = 'kind'
target_vege_type = set(all_df[grouping_by])

Y_column_name = 'mode_price'
X_column_name = set(all_df.head().select_dtypes(
    float).columns) - set(Y_column_name)

for vege_type in target_vege_type:
    # 特定の野菜について取り出す
    df = all_df[all_df[grouping_by] == vege_type].select_dtypes(
        float).astype('float32')
    # 'mode_price'に欠損がない部分を訓練データとして利用する
    train = df[~df[Y_column_name].isna()]

    # 目的変数の切り分け
    X = train[X_column_name].to_numpy()
    Y = train[Y_column_name].to_numpy()

    #X_train, X_val, y_train, y_val = train_test_split(X_train, Y_train, test_size = 0.2)

## クラスの確認

In [3]:
setting = {
    'CV': {
        'n_splits' : 3,
        'shuffle' : True, 
        'random_state' : 42
    },
    'model_params':{
        'params': {
            'objective': 'regression',
            'verbose': -1,
        },
        'num_boost_round' : 100,
        'verbose_eval' : -1
    }
}

In [4]:
Project = Experiments('project1')
Project.Ready_experiment(setting)
Project.Start_experiment(X, Y)

=== fold 0 MAE: 236.32672424316405
=== fold 1 MAE: 324.5181952582466
=== fold 2 MAE: 292.5000305175781
=== CV score: 284.44831667299627
----------------------------------------------------
command1: cd ../server/
command2: mlflow ui --backend-store-uri sqlite:///mlruns.db


## 元コード（メモ）

In [5]:
# トラッキングサーバの（バックエンドの）場所を指定
TRACKING_URL = f'sqlite:///{DB_PATH}'
mlflow.set_tracking_uri(TRACKING_URL)


# Experimentの生成
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# experiment IDの取得
if experiment is None:
    # 当該Experiment存在しないとき、新たに作成
    experiment_id = mlflow.create_experiment(
        name=EXPERIMENT_NAME,
        artifact_location=ARTIFACT_LOCATION)
else:
    # 当該Experiment存在するとき、IDを取得
    experiment_id = experiment.experiment_id

In [6]:
FOLD = 5


valid_scores = []
models = []
kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)


param_dict = {
    'params': {
        'objective': 'regression',
        'verbose': -1,
    },
    'num_boost_round' : 100,
    'verbose_eval' : -1
}

with mlflow.start_run(experiment_id=experiment_id) as run:
    
    # モデルの学習に利用したパラメータを記録
    mlflow.log_params(
        param_dict
        )
    
    model = Model(param_dict)
    
    for fold, (train_indices, valid_indices) in enumerate(kf.split(X)):
        X_train, X_valid = X[train_indices], X[valid_indices]
        y_train, y_valid = Y[train_indices], Y[valid_indices]

        # Convert data for LightGMB
        lgb_train = model.Dataset(X_train, y_train)
        lgb_eval = model.Dataset(X_valid, y_valid)

        # model train
        model.train(
            train_data = lgb_train, 
            valid_data = lgb_eval
            )
        
        # 予測
        y_valid_pred = model.predict(X_valid)

        # calc score
        score = mean_absolute_error(y_valid, y_valid_pred)
        
        # record
        mlflow.log_metrics(
            {
                'fold score' : score
            },
            step = fold
        )
        print(f'fold {fold} MAE: {score}')
        valid_scores.append(score)
        #models.append(model)
    
    # Get mean of model scores
    cv_score = np.mean(valid_scores)
    mlflow.log_metrics(
        {
            'CV_score' : cv_score
        }
    )
    print(f'CV score: {cv_score}')


fold 0 MAE: 224.253251139323
fold 1 MAE: 145.63805745442718
fold 2 MAE: 425.36007080078116
fold 3 MAE: 385.71429443359375
fold 4 MAE: 213.31964666193198
CV score: 278.8570640980114
