In [1]:
import sys, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
sys.path.append("../")
sys.path.append("../..")

from Clust.setting import influx_setting_KETI as ins
from Clust.setting import influx_setting_KETI as ins
from Clust.clust.ingestion.influx import influx_Client_v2 as influx_Client, multipleDataSets
db_client = influx_Client.influxClient(ins.CLUSTDataServer2)

In [2]:
start_time = "2021-01-01 00:00:00"
end_time = "2022-01-01 00:00:00"

from clust.integration.utils import param
dataInfo = [['air_indoor_modelSchool', 'ICW0W2000011'], ['air_outdoor_kweather', 'OC3CL200012'],
            ['life_additional_Info', 'trigonometicInfoByHours'],['weather_outdoor_keti_clean','seoul']]

refine_param = {
    "removeDuplication":{"flag":True},
    "staticFrequency":{"flag":True, "frequency":None}
}

CertainParam= {'flag': True}

uncertainParam= {'flag': False, "param":{
          "outlierDetectorConfig":[
                {'algorithm': 'IQR', 'percentile':99 ,'alg_parameter': {'weight':100}}    
]}}
outlier_param ={
    "certainErrorToNaN":CertainParam, 
    "unCertainErrorToNaN":uncertainParam
}
imputation_param = {
    "serialImputation":{
        "flag":True,
        "imputation_method":[{"min":0,"max":3,"method":"linear", "parameter":{}}],
        "totalNonNanRatio":80
    }
}
process_param = {'refine_param':refine_param, 'outlier_param':outlier_param, 'imputation_param':imputation_param}

integration_freq_sec = 60 * 60 # 1시간
integration_param = {
    "granularity_sec":integration_freq_sec,
    "param":{},
    "method":"meta"
}

In [3]:
re_frequency_sec = 4*60
intDataInfo = param.makeIntDataInfoSet(dataInfo, start_time, end_time)
multiple_dataset  = multipleDataSets.get_onlyNumericDataSets(db_client, intDataInfo)

# 1. Data Integration & Scaling

In [4]:
from Clust.clust.integration.integrationInterface import IntegrationInterface
originD = IntegrationInterface().clustIntegrationFromInfluxSource(db_client, intDataInfo, process_param,integration_param)

KeyError: 'integration_duration'

## 2. Feature Extension & Selection

In [None]:
from KETIAppTestCode.JWTest.LSTMTest import LSTMTest

"""
feature_col_list = ['CO2ppm', 'H2Sppm', 'Humidity', 'NH3ppm', 'Temperature', 'out_humid',
       'out_pressure', 'out_rainfall', 'out_sunshine', 'out_temp',
       'out_wind_direction', 'out_wind_speed', 'out_CO', 'out_NO2', 'out_O3',
       'out_PM10', 'out_PM25', 'out_SO2', 'day', 'sin_hour', 'cos_hour',
       'sin_day_of_week', 'cos_day_of_week', 'sin_month', 'cos_month',
       'sin_week_of_year', 'cos_week_of_year']
"""
feature_col_list1 = ['CO2ppm', 'H2Sppm', 'Humidity', 'NH3ppm', 'Temperature', 'out_humid', 'out_temp', 'out_wind_speed',  'sin_hour']
target_name = 'Humidity'

In [None]:
feature_col_list =  feature_col_list1
df_Exfeatures = LSTMTest.featureExtension2(originD)

In [None]:
df_Exfeatures.columns

## 3. Correlation Exploration

In [None]:
from KETIToolDataExploration.stats_table.corr_table import Correlation
dataSet = df_Exfeatures[feature_col_list]

dataSet.columns

from KETIToolDataExploration.plot_graph import plot_features, plot_correlation
plt.rcParams['figure.figsize'] =(18, 20)
plot_correlation.plot_correlation_chart(dataSet)
plot_features.plot_all_feature_data(dataSet)

from KETIToolDataExploration.stats_table.corr_table import Correlation
corrT = Correlation(dataSet)
topCorrelation = corrT.get_top_abs_correlations(50, target = target_name)
print(topCorrelation)

## 4. Scaling

In [None]:
#scalerRootpath = os.path.join('/Users','jw_macmini','CLUSTGit','KETIAppMachineLearning','scaler')
scalerRootpath = os.path.join('/home','keti','CLUST_KETI','Clust','KETIAppMachineLearning','scaler','Clust')
#scalerRootpath = os.path.join('/Users','bunny','Code_CLUST','KETIAppMachineLearning','scaler','CLUST')

from Clust.clust.transformation.general.dataScaler import DataScaler

DS = DataScaler('minmax', scalerRootpath )
#from Clust.clust.transformation.general import dataScaler
#feature_col_list = dataScaler.get_scalable_columns(train_o)
DS.setScaleColumns(feature_col_list)
DS.setNewScaler(dataSet)
dataSet_scaled = DS.transform(dataSet)

## 5. Cycle Data Preparation

In [None]:
dataSet = dataSet_scaled

In [None]:
from Clust.clust.transformation.dataByCycle import cycle_Module
feature_cycle = 'Day'
feature_cycle_times = 1
dayCycle = cycle_Module.getCycleSelectDataSet(dataSet, feature_cycle, feature_cycle_times)

## 6. Clean Data Preparation

In [None]:
dataSet = dayCycle
feature_list = dataSet_scaled.columns
feature_name = feature_list[0]

In [None]:
import matplotlib.pyplot as plt
NanInfoForCleanData = {'type':'num', 'ConsecutiveNanLimit':2, 'totalNaNLimit':5}

from Clust.clust.quality.NaN import clean_feature_data
integrationFreq_min = 2
CMS = clean_feature_data.CleanFeatureData(feature_list, integrationFreq_min)
refinedData, filterImputedData = CMS.getMultipleCleanDataSetsByDF(dataSet, NanInfoForCleanData)
print(len(refinedData), len(filterImputedData))
CleanData = pd.concat(filterImputedData.values())