In [1]:
import sys, os
import pandas as pd

# Extend the import search path four directories up; presumably that is where
# the `Clust` package lives relative to this notebook — TODO confirm.
sys.path.append("../../../../")

from Clust.setting import influx_setting_KETI as ins
from Clust.clust.ingestion.influx import influx_client_v2 as influx_Client
from Clust.clust.ingestion.mongo.mongo_client import MongoClient

# Database clients built from the connection settings exposed by `ins`.
db_client = influx_Client.InfluxClient(ins.CLUSTDataServer2)
mongo_client = MongoClient(ins.CLUSTMetaInfo2)

import torch

# Report "cuda" when torch can see a CUDA device, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"{device} is available.")

from Clust.clust.ML.common import ML_pipeline, tool

# Application to train for; the parameter cell below branches on
# "energy" and "Hs2SwineFarmWithWeatherTime".
app_name = "energy"

ModuleNotFoundError: No module named 'Clust'

## 1. Data Preparation

### 1-1. Parameter Setting

In [None]:
# Per-application modelling configuration. Every downstream cell relies on the
# names assigned here (model_purpose, feature_X_list, feature_y_list,
# split_mode, data_y_flag), so an unknown app_name must fail immediately.
if app_name == "energy":
    model_purpose = 'regression'
    feature_X_list = ['Press_mm_hg', 'RH_1', 'RH_2', 'RH_3', 'RH_4', 'RH_5', 'RH_6', 'RH_7',
       'RH_8', 'RH_9', 'RH_out', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7',
       'T8', 'T9', 'T_out', 'Tdewpoint', 'Visibility', 'Windspeed']
    feature_y_list = ['value']
    split_mode = "windows_split"
    data_y_flag = True  # use the y data that has already been prepared

elif app_name == "Hs2SwineFarmWithWeatherTime":
    model_purpose = 'forecasting'
    feature_X_list = ['Temperature', 'out_temp', 'sin_hour']
    feature_y_list = ['Temperature']
    split_mode = 'step_split'
    data_y_flag = False  # no separate y data; y is derived from X

else:
    # Without this guard the variables above stay undefined (or stale from a
    # previous run) and the failure only surfaces in a later cell.
    raise ValueError(
        f"Unsupported app_name {app_name!r}; "
        "expected 'energy' or 'Hs2SwineFarmWithWeatherTime'."
    )

# Settings shared by both applications; they are used below to build the
# measurement names and to select the bucket.
step = 'train'
bucket_name = 'integration'
data_clean_level = 4

In [None]:
# Show what the bucket holds: first the measurement list from db_client,
# then (after a separator line) the collection list from mongo_client.
all_integrated_ms_list = db_client.measurement_list(bucket_name)
print(
    all_integrated_ms_list,
    "==========================================================",
    sep="\n",
)
collection_list = mongo_client.get_collection_list(bucket_name)
print(collection_list)

### 1-2. Data Ingestion
#### 1-2-1. Select data name 

In [None]:
# Compose the dataset name and the X / y measurement names from the
# parameters chosen above.
dataset_name = f"{model_purpose}_{app_name}"
print(dataset_name)
data_name_X = f"{dataset_name}_cleanLevel{data_clean_level}_{step}X"
data_name_y = f"{dataset_name}_cleanLevel{data_clean_level}_{step}y"

# Look the metadata document up in the same bucket used everywhere else
# (previously a hard-coded 'integration' that could drift from bucket_name).
# The first matching document is taken; indexing raises if nothing matched.
data_meta = mongo_client.get_document_by_json(bucket_name, data_name_X, {'ms_name': data_name_X})[0]

In [None]:
# Display the metadata document fetched for data_name_X.
data_meta

#### 1-2-2. X-y Data Ingestion pipeline

In [None]:
# Data Ingestion
ingestion_method = 'ms_all'

# One parameter set per side (X and y): same bucket, different measurement
# name and feature list.
ingestion_param_X = dict(
    bucket_name=bucket_name,
    ms_name=data_name_X,
    feature_list=feature_X_list,
)
ingestion_param_y = dict(
    bucket_name=bucket_name,
    ms_name=data_name_y,
    feature_list=feature_y_list,
)

In [None]:
# Fetch the X and y data through the shared ML pipeline helper.
data_X, data_y = ML_pipeline.Xy_data_preparation(
    ingestion_param_X,
    data_y_flag,
    ingestion_param_y,
    ingestion_method,
    db_client,
)

#### 1-2-2. Random NaN Insert (Test)

In [None]:
# Ratio handed to tool.random_nan_df in the next cell (test-only NaN injection).
# Presumably 0 means "insert no NaNs" — TODO confirm against tool.random_nan_df.
nan_ratio = 0.00

In [None]:
# Run both frames through the random-NaN helper with the same ratio.
# NOTE: the inputs are overwritten, so re-running this cell applies the helper
# to its own previous output.
data_X, data_y = (
    tool.random_nan_df(data_X, nan_ratio),
    tool.random_nan_df(data_y, nan_ratio),
)

#### 1-2-3. Data scaling

In [None]:
# Scaling configuration passed to ML_pipeline.Xy_data_scaling_train below.
scaler_param = "scale"
scale_method = "minmax"
scaler_path = "./scaler/"

In [None]:
# Scale X and y; the call returns the scaled data together with the scaler
# file path for each side.
(
    dataX_scaled,
    X_scalerFilePath,
    datay_scaled,
    y_scalerFilePath,
) = ML_pipeline.Xy_data_scaling_train(
    data_name_X,
    data_X,
    data_name_y,
    data_y,
    scaler_path,
    scaler_param,
    scale_method,
)

## 2. Cleaning and split
### 2.1 Pipeline - clean low-quality columns

In [None]:

# Cleaning configuration.
#   nan_process_info    -> passed to ML_pipeline.clean_low_quality_column
#   max_nan_limit_ratio -> stored in transform_parameter further down
model_data_clean = True  # Front End Parameter
if model_data_clean:
    nan_process_info = {'type':'num', 'ConsecutiveNanLimit':10, 'totalNaNLimit':100}
    max_nan_limit_ratio = 0.9
else:
    nan_process_info = {'type':'num', 'ConsecutiveNanLimit':10000, 'totalNaNLimit':100000}
    max_nan_limit_ratio = 0.5

# The unconditional reassignment of nan_process_info that used to follow this
# branch was removed: it always replaced the model_data_clean=True limits with
# the `else` values, so the flag had no effect on nan_process_info.

In [None]:
# Clean the scaled X data according to the cleaning configuration.
dataX_scaled = ML_pipeline.clean_low_quality_column(
    model_data_clean,
    nan_process_info,
    dataX_scaled,
)

# Re-derive the X feature list from the columns of the returned data.
feature_X_list = [*dataX_scaled.columns]

### 2.2 Train/Val Split pipeline

In [None]:
# Ratio passed to ML_pipeline.split_data_by_mode below.
# Presumably the share of data kept for training — TODO confirm.
split_ratio = 0.8

In [None]:
# TODO: ask Jisu about the additionally written data-splitting part.
day_window_size = tool.get_default_day_window_size(dataX_scaled)

# Split the scaled X / y data into train and validation parts.
train_x, val_x, train_y, val_y = ML_pipeline.split_data_by_mode(
    split_mode,
    split_ratio,
    dataX_scaled,
    datay_scaled,
    day_window_size,
)

### 2.3 Data Transformation & Clean2 pipeline

In [None]:
# Transformation settings depend on the split mode: 'windows_split' uses the
# day window size as past_step; any other mode uses fixed future/past steps.
if split_mode != 'windows_split':
    transform_parameter = dict(
        future_step=2,
        past_step=12,
        max_nan_limit_ratio=max_nan_limit_ratio,
    )
else:
    transform_parameter = dict(
        past_step=day_window_size,
        max_nan_limit_ratio=max_nan_limit_ratio,
    )

In [None]:
# Apply the same transformation settings to the train split, then to the
# validation split.
train_X_array, train_y_array = ML_pipeline.transform_data_by_split_mode(
    split_mode, transform_parameter, train_x, train_y
)
val_X_array, val_y_array = ML_pipeline.transform_data_by_split_mode(
    split_mode, transform_parameter, val_x, val_y
)

In [None]:
# Print the four array shapes, one per line: train X, train y, val X, val y.
print(
    train_X_array.shape,
    train_y_array.shape,
    val_X_array.shape,
    val_y_array.shape,
    sep="\n",
)

### 2.4 Set Model and train parameter

In [None]:
# RNN models (RNN, LSTM, GRU) parameters
seq_len, input_size = train_X_array.shape[1], train_X_array.shape[2]
model_method = 'GRU_rg' # Set model methods i.e., 'LSTM_rg', 'GRU_rg', 'CNN_1D_rg', 'LSTM_FCNs_rg', 'FC_rg' 

defalut_model_info ={"LSTM_rg":{"hidden_size":64,"num_layers":2,"output_dim":1,"dropout":0.1,"bidirectional":True},
                     "GRU_rg":{"hidden_size":64,"num_layers":2,"output_dim":1,"dropout":0.1,"bidirectional":True},
                     "CNN_1D_rg":{"output_channels":64,"kernel_size":3,"stride":1,"dropout":0.1,"padding":0},
                     "LSTM_FCNs_rg":{"num_layers":2,"lstm_dropout":0.4,"fc_dropout":0.1},
                     "FC_rg":{"dropout":2,"bias":0.4}}
default_train_param_info = {"lr":1e-4,"weight_decay":1e-6,"n_epochs":100,"batch_size":16}

model_info = defalut_model_info[model_method]

from Clust.clust.ML.common import parameter_setting
model_parameter = parameter_setting.set_model_parameter(model_method, model_info, seq_len, input_size)
train_parameter = parameter_setting.set_train_parameter(default_train_param_info)


In [None]:
# Display the model parameters returned by parameter_setting.set_model_parameter.
model_parameter

### 2.5 Set Model name and path pipeline

In [None]:
# Leave model_name as None to have a default name generated below.
model_name = None
model_file_path = None

# model name & path
if model_name is None:
    # Take dataset_name from its first '_' onward (the underscore itself is
    # included in the slice). The result is kept in its own variable instead of
    # overwriting the global `app_name`: clobbering it made the earlier
    # parameter and dataset-name cells misbehave when re-run.
    collection_index = dataset_name.find('_')
    model_app_name = dataset_name[collection_index:]
    model_name = tool.get_default_model_name(model_name, model_app_name, model_method, model_data_clean)

# Default path for the model file, built from the model name, the X
# measurement name, the model method and the training parameters.
model_file_path = tool.get_default_model_path(model_name, data_name_X, model_method, train_parameter)

In [None]:
# Display the model name chosen in the previous cell.
model_name

## 3. Training

In [None]:
# Start the regression training run with the prepared parameters, model file
# path, and train / validation arrays.
ML_pipeline.CLUST_regression_train(
    train_parameter,
    model_method,
    model_parameter,
    model_file_path,
    train_X_array,
    train_y_array,
    val_X_array,
    val_y_array,
)

## 4. Save metadata

In [None]:
# Tags attached to the model metadata.
model_tags = ["model_tag_example"]
# NOTE: trainDataType is defined here but is not passed to model_meta_update below.
trainDataType = "timeseries"

from Clust.clust.ML.tool import meta as ml_meta

# Build the model metadata from the data metadata and every setting used in
# this notebook.
model_info_meta = ml_meta.model_meta_update(
    data_meta,
    model_name,
    split_mode,
    feature_X_list,
    feature_y_list,
    data_y_flag,
    model_purpose,
    model_method,
    model_tags,
    model_data_clean,
    train_parameter,
    model_parameter,
    transform_parameter,
    scaler_param,
    data_name_X,
    data_name_y,
    model_file_path,
    X_scalerFilePath,
    y_scalerFilePath,
)


In [None]:
# Display the metadata returned by ml_meta.model_meta_update.
model_info_meta