# Import

In [1]:
import sys
import os
import datetime
import pandas as pd
sys.path.append("../")
sys.path.append("../..")

# data load
from Clust.setting import influx_setting_KETI as ins
from Clust.clust.ingestion.influx import influx_Client_v2 as influx_Client, multipleDataSets

# preprocessing
from Clust.clust.quality.NaN import clean_feature_data
from clust.preprocessing.dataPreprocessing import DataPreprocessing

# split by holiday
from Clust.clust.transformation.splitDataByCondition import holiday

# Common parameter: the InfluxDB client instance shared by the data-load cells below.
# It is built from the CLUSTDataServer2 entry of the imported settings module (`ins`).
db_client = influx_Client.influxClient(ins.CLUSTDataServer2)

- 시계열(TimeSeries) 데이터를 Holiday/notHoliday 기준으로 나누는 방법을 보여준다.
- 아래 순서에 따라 테스트를 진행한다.
    1. Input : 단일 Dataframe
    2. Input : 특정 Bucket의 Dataset
    3. Input : 특정 Bucket의 Dataset / Output : SOM 알고리즘 활용에 맞춰진 형태
- Holiday/notHoliday 를 나누는 모든 챕터들은 아래와 같은 진행 순서를 갖는다.
    1. Setting parameter
        - duration : 2022.02.01~2022.02.28
    2. Data Load
        - kWeather Indoor 초등학교 
    3. Data Preprocessing
        - 1min에서 10min으로 DownSampling
    4. splitDataByCondition

### 질문
Q1. 단일 DataFrame을 입력으로 하는 경우(1. Split data by holiday when inputting Dataframe), 아래 요구사항을 만족하도록 preprocessing을 하려면 어떻게 진행해야 할까요?
1. Freq 1 min => 10 min
2. 특정 열의 NaN 연속 개수와 총 개수에 따른 예외 처리

# 1. Split data by holiday when inputting Dataframe

## 1-1. Setting parameter 

In [11]:
## 1. Freq parameter
# Disabled in this chapter: none of the chapter-1 cells below read a target
# frequency (this is the open question Q1 raised above).
#freq_min = 10
#timedelta_frequency_sec = datetime.timedelta(minutes= freq_min)

## 2. Nan Clean Parameter
# Disabled in this chapter for the same reason: no chapter-1 cell reads it.
#NanInfoForCleanData = {'type':'num', 'ConsecutiveNanLimit':3, 'totalNaNLimit':50}

## 3. Time
# Query window for the data load: 2022-02-01 00:00:00 through 2022-02-28 23:59:59.
start_time = pd.to_datetime("2022-02-01 00:00:00")
end_time = pd.to_datetime("2022-02-28 23:59:59")

# Disabled: the chapter-1 load call takes start_time / end_time directly.
#duration = {"start_time": start_time, "end_time":end_time}

In [12]:
# Bucket and measurement identifiers passed to the chapter-1 data-load call.
bucket_name ='air_indoor_초등학교'
measurement_name = "ICW0W2000025"

## 1-2. Data Load

In [13]:
# Query one measurement of one bucket over the start_time..end_time window.
# The output shown in the next cell is a time-indexed DataFrame with one column per sensor field.
data = db_client.get_data_by_time(start_time, end_time, bucket_name, measurement_name)

In [14]:
# Preview the loaded frame; the rendered output is truncated to the first and last rows.
data

Unnamed: 0_level_0,in_co2,in_humi,in_noise,in_pm01,in_pm10,in_pm25,in_temp,in_voc
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-02-01 00:00:00+00:00,397.0,20.0,47.0,43.0,53.0,46.0,15.5,7.0
2022-02-01 00:01:00+00:00,399.0,20.0,48.0,43.0,53.0,46.0,15.5,6.0
2022-02-01 00:02:00+00:00,407.0,20.0,48.0,43.0,53.0,46.0,15.5,6.0
2022-02-01 00:03:00+00:00,398.0,20.0,48.0,43.0,56.0,47.0,15.5,6.0
2022-02-01 00:04:00+00:00,401.0,20.0,48.0,43.0,55.0,46.0,15.5,7.0
...,...,...,...,...,...,...,...,...
2022-02-28 23:55:00+00:00,404.0,24.0,46.0,21.0,26.0,22.0,18.5,6.0
2022-02-28 23:56:00+00:00,403.0,24.0,46.0,21.0,26.0,22.0,18.5,5.0
2022-02-28 23:57:00+00:00,400.0,24.0,46.0,21.0,25.0,22.0,18.5,7.0
2022-02-28 23:58:00+00:00,403.0,24.0,46.0,21.0,26.0,22.0,18.5,6.0


## 1-3. Data Preprocessing

In [6]:
# Refine the raw frame with the 'removeDuplication' and 'staticFrequency' options enabled
# (per the original note, the purpose is to fill in the missing time index).
# NOTE(review): 'frequency': None presumably lets the library infer the sampling
# frequency. The split result displayed further below is still at 1-minute spacing,
# so the 1min -> 10min downsampling described in the introduction is not applied
# in this chapter — confirm whether a 10-minute frequency should be passed here.
refine_param = {'removeDuplication': {'flag': True} , 'staticFrequency': {'flag': True, 'frequency': None}}
data_preprocessing = DataPreprocessing().get_refinedData(data, refine_param)

## 1-4. Split Data by holiday

In [7]:
# Split the refined single DataFrame by the holiday condition.
result = holiday.get_holidayCycleSet_from_dataframe(data_preprocessing)

In [8]:
# Inspect the split result: the output below is a dict whose 'holiday' key holds a
# list of DataFrames (the first one covers 2022-02-01).
result

{'holiday': [                           in_co2  in_humi  in_noise  in_pm01  in_pm10  \
  time                                                                     
  2022-02-01 00:00:00+00:00   397.0     20.0      47.0     43.0     53.0   
  2022-02-01 00:01:00+00:00   399.0     20.0      48.0     43.0     53.0   
  2022-02-01 00:02:00+00:00   407.0     20.0      48.0     43.0     53.0   
  2022-02-01 00:03:00+00:00   398.0     20.0      48.0     43.0     56.0   
  2022-02-01 00:04:00+00:00   401.0     20.0      48.0     43.0     55.0   
  ...                           ...      ...       ...      ...      ...   
  2022-02-01 23:55:00+00:00   395.0     21.0      46.0     49.0     56.0   
  2022-02-01 23:56:00+00:00   393.0     21.0      47.0     50.0     57.0   
  2022-02-01 23:57:00+00:00   398.0     21.0      47.0     50.0     57.0   
  2022-02-01 23:58:00+00:00   390.0     21.0      47.0     50.0     58.0   
  2022-02-01 23:59:00+00:00   397.0     21.0      47.0     50.0     58.0   
 

# 2. Split data by holiday when inputting Dataset

## 2-1. Setting parameter

In [2]:
## 1. Freq parameter
# Interval length in minutes, and the same interval as a timedelta
# (the timedelta is what the cleaning step in 2-3 receives).
freq_min = 10
timedelta_frequency_sec = datetime.timedelta(minutes=freq_min)

## 2. Nan Clean Parameter
# Passed as-is to the cleaning step in 2-3.
NanInfoForCleanData = dict(type='num', ConsecutiveNanLimit=3, totalNaNLimit=50)

## 3. Time
# Query window: 2022-02-01 00:00:00 through 2022-02-28 23:59:59.
start_time, end_time = (
    pd.to_datetime(stamp)
    for stamp in ("2022-02-01 00:00:00", "2022-02-28 23:59:59")
)

# The same window bundled as a dict for the cleaning step.
duration = dict(start_time=start_time, end_time=end_time)

In [3]:
# Source bucket(s) to load, and the parallel list handed to the loader alongside them
# (presumably the aliases used for the loaded data — TODO confirm).
bucket_list = [
    'air_indoor_초등학교',
]
new_bucket_list = [
    'elementarySchool',
]

# Feature columns to clean and split; they match the columns of the frame shown in chapter 1.
feature_list = [
    'in_temp',
    'in_humi',
    'in_co2',
    'in_voc',
    'in_noise',
    'in_pm10',
    'in_pm25',
    'in_pm01',
]

## 2-2. Data Load

In [4]:
# Load the data of the listed bucket(s) for the start_time..end_time window
# (presumably every measurement in each bucket — verify against the loader).
# The line printed below reports the bucket name with a length of 10.
dataSet = multipleDataSets.get_all_msdata_in_bucket_list(bucket_list, db_client, start_time, end_time, new_bucket_list)

air_indoor_초등학교  length: 10


## 2-3. Data Preprocessing

In [5]:
# Build the per-feature cleaner from the feature list and the 10-minute timedelta,
# then run it with the NaN limits and the time window defined in 2-1.
# NOTE(review): this rebinds `dataSet` to the first returned value, so re-running
# this cell without re-running the load cell feeds the already-processed value back
# in — consider a distinct name for the output if later cells allow it.
CMS = clean_feature_data.CleanFeatureData(feature_list, timedelta_frequency_sec)
dataSet, dataSetName, NaNRemovedDataSet, imputedDatasetName, imputedDataSet  = CMS.getMultipleCleanDataSetsByFeature(dataSet, NanInfoForCleanData, duration) 

In [6]:
# Inspect the cleaned data: the output below is a dict keyed by feature name, each
# holding a list of single-column DataFrames at 10-minute spacing.
imputedDataSet

{'in_temp': [                     in_temp
  time                        
  2022-02-01 00:00:00      9.0
  2022-02-01 00:10:00      9.1
  2022-02-01 00:20:00      9.0
  2022-02-01 00:30:00      9.1
  2022-02-01 00:40:00      9.1
  ...                      ...
  2022-02-28 23:00:00     16.0
  2022-02-28 23:10:00     16.0
  2022-02-28 23:20:00     16.0
  2022-02-28 23:30:00     16.0
  2022-02-28 23:40:00     16.0
  
  [4031 rows x 1 columns],
                       in_temp
  time                        
  2022-02-01 00:00:00     21.9
  2022-02-01 00:10:00     21.9
  2022-02-01 00:20:00     22.0
  2022-02-01 00:30:00     22.0
  2022-02-01 00:40:00     22.0
  ...                      ...
  2022-02-28 23:00:00     24.7
  2022-02-28 23:10:00     24.6
  2022-02-28 23:20:00     24.6
  2022-02-28 23:30:00     24.6
  2022-02-28 23:40:00     24.5
  
  [4031 rows x 1 columns],
                       in_temp
  time                        
  2022-02-01 00:00:00     15.3
  2022-02-01 00:10:00     15.3

## 2-4. Split Data by holiday

In [7]:
# Split every cleaned DataFrame of every listed feature by the holiday condition.
dataset_result = holiday.get_holidayCycleSet_from_dataset(imputedDataSet, feature_list)

In [8]:
# Inspect the split result: the output below is a dict keyed by feature name; each value
# is a list of dicts whose 'holiday' key holds a list of per-day DataFrames (144 rows each).
dataset_result

{'in_temp': [{'holiday': [                     in_temp
    time                        
    2022-02-01 00:00:00      9.0
    2022-02-01 00:10:00      9.1
    2022-02-01 00:20:00      9.0
    2022-02-01 00:30:00      9.1
    2022-02-01 00:40:00      9.1
    ...                      ...
    2022-02-01 23:10:00      7.7
    2022-02-01 23:20:00      7.7
    2022-02-01 23:30:00      7.7
    2022-02-01 23:40:00      7.8
    2022-02-01 23:50:00      7.8
    
    [144 rows x 1 columns],
                         in_temp
    time                        
    2022-02-02 00:00:00      7.8
    2022-02-02 00:10:00      7.8
    2022-02-02 00:20:00      7.8
    2022-02-02 00:30:00      7.9
    2022-02-02 00:40:00      7.9
    ...                      ...
    2022-02-02 23:10:00      8.4
    2022-02-02 23:20:00      8.4
    2022-02-02 23:30:00      8.4
    2022-02-02 23:40:00      8.4
    2022-02-02 23:50:00      8.5
    
    [144 rows x 1 columns],
                         in_temp
    time             

In [11]:
# Number of entries in the per-feature result list for 'in_temp'.
# NOTE(review): the load step reported a length of 10 but this evaluates to 9 —
# presumably one measurement was dropped during NaN cleaning; confirm.
len(dataset_result["in_temp"])

9