In [1]:
import sys, os
import pandas as pd
import pathSetting
sys.path.append("../../..")

from Clust.clust.ML.common.common import p1_integratedDataSaving as p1
from Clust.clust.ML.tool import data as ml_data
from Clust.clust.ML.tool import model as ml_model
from Clust.clust.ML.tool import clean as ml_clean
from Clust.clust.ML.tool import meta as ml_meta

import torch

#import main_regression as mr
# Pick the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"{device} is available.")

cuda is available.


In [2]:
# Task type for this run. Supported values: 'regression', 'forecast'.
mode_selection = 'regression'

# Network architecture to train. Supported values:
# 'LSTM_rg', 'GRU_rg', 'RNN_rg', 'CNN_1D_rg', 'LSTM_FCNs_rg', 'FC_rg'.
model_method = "GRU_rg"

# 2. Training 

## 2-1. Data selection

In [3]:
# Read the dataset metadata (JSON) and collect the names of the datasets it describes.
DataMeta = p1.read_json_data(pathSetting.DataMetaPath)
dataList = [dataName for dataName in DataMeta.keys()]

In [4]:
# Load the integrated dataset(s) that match `mode_selection`.
if mode_selection == 'regression':
    # Regression keeps X and y as two separate datasets:
    # the first metadata entry is X, the second is y.
    dataName_X = dataList[0]
    dataName_y = dataList[1]
    dataSaveMode_X = DataMeta[dataName_X]["integrationInfo"]["DataSaveMode"]
    dataSaveMode_y = DataMeta[dataName_y]["integrationInfo"]["DataSaveMode"]

    # The integrated data is read from a folder under the current working directory.
    current = os.getcwd()
    dataFolderName = "data_integrated_result"
    dataFolderPath = os.path.join(current, dataFolderName)

    dataX = ml_data.get_saved_integrated_data(dataSaveMode_X, dataName_X, dataFolderPath)
    datay = ml_data.get_saved_integrated_data(dataSaveMode_y, dataName_y, dataFolderPath)
    integration_freq_sec = DataMeta[dataName_X]["integrationInfo"]["integration_freq_sec"]

elif mode_selection == 'forecast':
    # Forecast uses a single dataset whose name is built from the
    # clean mode and the dataset name: 'train<cleanMode>_<datasetName>'.
    cleanParamList = ['Clean', 'NoClean']
    cleanMode = cleanParamList[1]

    datasetNameList = ['Hs1SwineFarmWithWeatherTime', 'gunwiStrawberryWithWeatherTime', 'strawberryOpenTime']
    datasetName = datasetNameList[0]

    dataName_X = f"train{cleanMode}_{datasetName}"
    dataSaveMode_X = DataMeta[dataName_X]['integrationInfo']['DataSaveMode']

    # Here the data folder comes from pathSetting rather than the working directory.
    dataX = ml_data.get_saved_integrated_data(dataSaveMode_X, dataName_X, pathSetting.dataFolderPath)
    integration_freq_sec = DataMeta[dataName_X]['integrationInfo']['integration_freq_sec']

## 2-2. Training Data Preparation

In [None]:
# Split the loaded data into scaled train/validation sets for the selected mode.
if mode_selection == 'regression':
    # Features: every column of the X and y frames is used.
    featureListX = list(dataX.columns)
    featureListy = list(datay.columns)

    # Clean mode. When X and y are stored separately (classification,
    # regression) this must currently stay fixed to 'NoClean'.
    cleanTrainDataParam = 'NoClean'

    # NaN handling options; only relevant when cleanTrainDataParam == 'Clean'.
    NaNProcessingParam = {
        "feature_cycle": 'Day',
        "feature_cycle_times": 1,
        "NanInfoForCleanData": {'type': 'num', 'ConsecutiveNanLimit': 3, 'totalNaNLimit': 30000}
    }

    # Scaling options.
    scalerParam = 'scale'
    scaleMethod = 'minmax'

    # Fraction of the data used for training; the rest is used for validation.
    splitRatio = 0.8

    # Each dataset gets its own scaler directory. The y path is built from
    # dataName_y (it was previously built from dataName_X, which filed the
    # y scaler under the X dataset's directory).
    scalerRootPath_X = os.path.join(pathSetting.scalerRootDir, dataName_X, cleanTrainDataParam)
    scalerRootPath_y = os.path.join(pathSetting.scalerRootDir, dataName_y, cleanTrainDataParam)
    train_x, val_x, X_scalerFilePath = ml_data.get_train_val_data(dataX, featureListX, scalerRootPath_X, splitRatio, scalerParam, scaleMethod)
    train_y, val_y, y_scalerFilePath = ml_data.get_train_val_data(datay, featureListy, scalerRootPath_y, splitRatio, scalerParam, scaleMethod)

elif mode_selection == 'forecast':
    # Features: a fixed subset of columns; the target is one of them.
    featureListX = ['Temperature', 'out_temp', 'sin_hour']
    target_col = 'Temperature'

    # Clean mode follows the choice made when the data was selected.
    cleanTrainDataParam = cleanMode

    # NaN handling options; only active when cleanMode == 'Clean'.
    NaNProcessingParam = {
        "feature_cycle": 'Day',
        "feature_cycle_times": 1,
        "NanInfoForCleanData": {'type': 'num', 'ConsecutiveNanLimit': 3, 'totalNaNLimit': 30000}
    }

    # Scaling options.
    scalerParam = 'scale'
    scaleMethod = 'robust'

    # Fraction of the data used for training; the rest is used for validation.
    splitRatio = 0.8

    # Scale and split, then pass both splits through the low-quality filter
    # together with the clean mode and NaN options.
    scalerRootPath_X = os.path.join(pathSetting.scalerRootDir, dataName_X, cleanMode)
    train_x, val_x, X_scalerFilePath = ml_data.get_train_val_data(dataX, featureListX, scalerRootPath_X, splitRatio, scalerParam, scaleMethod)
    train_x, val_x = ml_clean.delete_low_quality_train_val_data(train_x, val_x, cleanMode, NaNProcessingParam)

## 2-3. Training Data Transformation

In [None]:
# Convert the train/validation DataFrames into the arrays used for training.
if mode_selection == 'regression':
    # No transformation options are needed for regression.
    transformParameter = dict()

    from Clust.clust.transformation.type.DFToNPArray import transDFtoNP, trans_df_to_np, trans_df_to_np_inf

    # X and y frames are converted pairwise for each split.
    trainX, trainy = transDFtoNP(train_x, train_y)
    valX, valy = transDFtoNP(val_x, val_y)

elif mode_selection == 'forecast':
    # Step counts plus feature/target columns handed to the LSTM data builder.
    transformParameter = dict(
        future_step=2,
        past_step=24,
        feature_col=featureListX,
        target_col=target_col,
        clean_param=cleanTrainDataParam,
    )
    from Clust.clust.transformation.purpose.machineLearning import LSTMData

    # X and y arrays are both derived from the single feature frame of each split.
    LSTMD = LSTMData()
    trainX, trainy = LSTMD.transform_Xy_arr(train_x, transformParameter, transformParameter['clean_param'])
    valX, valy = LSTMD.transform_Xy_arr(val_x, transformParameter, transformParameter['clean_param'])

## 2-4. Set Model Parameters & Train Parameters

In [None]:
# Model input dimensions are taken from axes 1 and 2 of the training array.
input_size, seq_len = trainX.shape[1], trainX.shape[2]
print(input_size, seq_len)

# Recurrent cell type implied by each RNN-family model method, so that the
# 'rnn_type' entry always agrees with the selected `model_method`
# (it was previously fixed to 'lstm' even when 'GRU_rg' or 'RNN_rg' was chosen).
# NOTE(review): assumes the model accepts 'gru' and 'rnn' as rnn_type values - confirm.
rnnTypeByMethod = {'LSTM_rg': 'lstm', 'GRU_rg': 'gru', 'RNN_rg': 'rnn'}

# RNN models (RNN, LSTM, GRU) parameters
if model_method in rnnTypeByMethod:
    modelParameter = {
        'rnn_type': rnnTypeByMethod[model_method],
        'input_size': input_size,
        'hidden_size': 64,
        'num_layers': 2,
        'output_dim': 1,
        'dropout': 0.1,
        'bidirectional': True
    }
# CNN_1D model parameters
elif model_method == 'CNN_1D_rg':
    modelParameter = {
        'input_size': input_size,
        'seq_len': seq_len,
        'output_channels': 64,
        'kernel_size': 3,
        'stride': 1,
        'padding': 0,
        'dropout': 0.1
    }
# LSTM_FCNs model parameters
elif model_method == 'LSTM_FCNs_rg':
    modelParameter = {
        'input_size': input_size,
        'num_layers': 2,
        'lstm_dropout': 0.4,
        'fc_dropout': 0.1
    }
# FC model parameters
elif model_method == 'FC_rg':
    modelParameter = {
        'input_size': input_size,
        'dropout': 0.1,
        'bias': True
    }
else:
    # Fail here with a clear message instead of a NameError on
    # `modelParameter` in a later cell.
    raise ValueError(f"Unsupported model_method: {model_method!r}")

# Optimizer and training-loop settings.
# NOTE(review): 'device' is fixed to 'cpu' although the first cell detects
# CUDA into `device`; confirm whether training is meant to stay on the CPU.
trainParameter = {
    'lr': 1e-4,
    'weight_decay': 1e-6,
    'device': 'cpu',
    'n_epochs': 10,
    'batch_size': 16
}

In [None]:
from Clust.clust.transformation.general.dataScaler import encode_hash_style

# Descriptive labels stored alongside the model metadata.
trainDataType, modelPurpose = "timeseries", "regression"
modelTags = ["aaaaa"]

# Integration settings of the X dataset, kept as training-data provenance.
trainDataInfo = DataMeta[dataName_X]['integrationInfo']

# Build the file path where the model will be saved. The path components are
# the model name, the X dataset name and an encoded form of the stringified
# training parameters.
model_name = f'Test{model_method}'
trainParameter_encode = encode_hash_style(str(trainParameter))
trainDataPathList = [model_name, dataName_X, trainParameter_encode]
modelFilePath = ml_model.get_model_file_path(trainDataPathList, model_method)

## 2-5. Training

In [None]:
from Clust.clust.ML.regression_YK.train import RegressionTrain as RML

# Configure the trainer step by step: training parameters first, then the
# model (method name + its parameters), then the train/validation arrays.
rml = RML()
rml.set_param(trainParameter)
rml.set_model(model_method, modelParameter)
rml.set_data(trainX, trainy, valX, valy)
# Run training, then save the best model to the path prepared earlier.
rml.train()
rml.save_best_model(modelFilePath)

## 2-6. Save MetaData

In [None]:
from Clust.clust.ingestion.mongo.mongo_client import MongoClient
from Clust.setting import influx_setting_KETI as ins

# Client for the metadata store that receives the model description.
mongo_client = MongoClient(ins.CLUSTMetaInfo2)

# Everything needed to identify and reload the trained model: data provenance,
# feature/target columns, all parameter sets and the saved model/scaler files.
# NOTE(review): `featureListy`, `dataName_y` and `y_scalerFilePath` are only
# defined when mode_selection == 'regression'; this cell raises NameError in
# 'forecast' mode.
modelInfoMeta = {
    "trainDataInfo": trainDataInfo,
    "modelName": model_name,
    "featureList": featureListX,
    "target": featureListy,
    "trainDataType": trainDataType,
    "modelPurpose": modelPurpose,
    "model_method": model_method,
    "modelTags": modelTags,
    "cleanTrainDataParam": cleanTrainDataParam,
    # Listed once; the literal previously repeated this key with the same value.
    "NaNProcessingParam": NaNProcessingParam,
    "trainDataName": [dataName_X, dataName_y],
    "trainParameter": trainParameter,
    "modelParameter": modelParameter,
    "transformParameter": transformParameter,
    "scalerParam": scalerParam,
    "files": {
        "modelFile": {
            "fileName": "model.pth",
            "filePath": modelFilePath
        },
        "XScalerFile": {
            "fileName": "scaler.pkl",
            "filePath": X_scalerFilePath
        },
        "yScalerFile": {
            "fileName": "scaler.pkl",
            "filePath": y_scalerFilePath
        }
    }
}

# Save the metadata and keep whatever the helper returns under the same name.
modelInfoMeta = ml_meta.save_model_meta_data(mongo_client, modelInfoMeta)
