In [None]:
# find consecutive missing values > 2 hours
def find_missing_days(raw_data, column, counter_limit=1, trigger='DATE'):
    """Return the `trigger` values (e.g. days) that contain long runs of
    consecutive missing values in `column`.

    Rows are scanned in their existing order. A counter of back-to-back
    missing values is kept; it is cleared by any non-missing value and also
    whenever the `trigger` value changes, so a run never spans two days.
    The first `counter_limit` missing values of a run are tolerated; every
    further consecutive missing value reports the current `trigger` value.

    Arguments:
    ----------
        raw_data: dataframe
            Must contain the columns named by `column` and `trigger`.
            It is only read, never modified.

        column: column label
            The single column you want to search.

        counter_limit: int, default=1
            The number of consecutive occurrences of the missing value you allow.

        trigger: string, default='DATE'
            The column whose change of value resets the counter
            (typically a datetime-family column: year, date, month, day, hour...).

    Returns:
    --------
        missing_days: numpy.ndarray
            Sorted, de-duplicated `trigger` values in which a run of missing
            values longer than `counter_limit` was found.

    Raises:
    -------
        ValueError
            If `column` selects more than one column (e.g. a list of labels).
    """

    values = raw_data[column]
    if isinstance(values, pd.DataFrame):
        raise ValueError("`column` must select a single column")

    missing_days = []
    # NaT compares unequal to everything, so the first row always starts a new group
    time_tag = pd.NaT
    counter = 0

    # pd.isna also recognises None / NaT / pd.NA, which np.isnan rejects with a TypeError
    for tag, is_missing in zip(raw_data[trigger], pd.isna(values)):

        # reset the counter when entering a new [trigger] group
        if time_tag != tag:
            counter, time_tag = 0, tag

        if not is_missing:
            # compliant value, clear the counter
            counter = 0
        elif counter >= counter_limit:
            # the tolerated run length is already used up: report this day
            missing_days.append(tag)
        else:
            # still within the tolerated run length
            counter += 1

    return np.unique(missing_days)

In [None]:
# find_missing_days now takes counter_limit: the number of consecutive
# missing values that is tolerated.
# The data set currently in use has many missing values, so a
# counter_limit of 2 is recommended.
missing_days = find_missing_days(
    raw_data=merge_raw,
    column=column,
    counter_limit=2,
)

In [None]:
# Packs the training and test data. It only traverses the data set; the input/output variable combinations must be supplied separately.
def build_by_3day(_data_1h_raw, _data_1d_raw, _test_start_day, **args):
    """ Build train/test dataset, only used to traverse the data.

    A continuous calendar of days is built from the min/max of
    `get_date_list` applied to the hourly 'TIME_TO_INTERVAL' column.
    For each day in the calendar, the window is the 3 days before it
    (history) plus that day and the 2 days after it (target). A window is
    used only when the hourly data holds exactly 144 rows for it
    (24 hour * 3 day * 2) and none of those rows contains a null.
    The caller-supplied callbacks turn each accepted window into one sample.

    Arguments:
    ----------
        _data_1h_raw: dataframe
            Hourly data. Must have a datetime-like column 'TIME_TO_INTERVAL'
            and a column 'DATE' that is matched against `datetime.date`
            values with `isin`.

        _data_1d_raw: dataframe
            Daily data. Must have a column 'DATE' (matched the same way).

        _test_start_day: string or datetime-like
            Parsed with `pd.to_datetime` and reduced to a date.

        **args: dict
            '_set_input' (required), '_set_output' (required),
            '_set_idx' (optional). Each is called as
            f(few_day_by_1d, few_day_by_1h, target_day_by_1d, target_day_by_1h).
            Without '_set_idx' the index entry of every sample is ''.

    Returns:
    --------
        train_x/test_x: numpy.ndarray
            The stacked input data.

        train_y/test_y: numpy.ndarray
            The stacked output data.

        train_idx/test_idx: dataframe
            The dataframe of data index.

        Returned in the order
        (train_x, train_y, train_idx, test_x, test_y, test_idx).

        A sample goes to the test set when `isin_last_week(date, 7)` holds
        for each of its three target days and its second target day is later
        than `_test_start_day`; otherwise it goes to the training set.
    """

    # initialization
    _train_x, _train_y, _train_idx = [], [], []
    _test_x, _test_y, _test_idx = [], [], []
    _data_1h, _data_1d = _data_1h_raw.copy(), _data_1d_raw.copy()
    _test_start_day = pd.to_datetime(_test_start_day).date()

    # get 1day list, used to traverse the data set.
    # The calendar comes from date_range rather than from the distinct dates
    # in the data -- presumably so that days absent from the data do not make
    # non-adjacent days look consecutive (TODO confirm).
    _list_1d = _data_1h['TIME_TO_INTERVAL'].apply(lambda date: get_date_list(date))
    _list_1d = pd.date_range(start=_list_1d.min(), end=_list_1d.max())
    _list_1d = _list_1d.to_series().apply(lambda x: x.date())
    _list_1d = _list_1d.reset_index(inplace=False, drop=True)

    # loop-invariant: calendar date of every hourly row, computed once
    _date_of_1h = _data_1h['TIME_TO_INTERVAL'].dt.date

    # traverse the data set.
    # Start at 3 so the history slice never gets a negative start (which
    # would wrap around to the end of the calendar), and stop while the
    # 3 target days still fit inside the calendar.
    for i in range(3, len(_list_1d) - 2):

        _few_days = _list_1d.iloc[i-3:i]
        _target_days = _list_1d.iloc[i:i+3]

        # check whether the data in the specified range is missing in hours
        _data_in_range = _data_1h[_date_of_1h.isin(_list_1d.iloc[i-3:i+3])]

        # 24 hour * 3 day * 2 (historical+future) = 144 point
        # if there are missing values, skip this day
        if (len(_data_in_range) != 144 or _data_in_range.isnull().values.any()):
            continue

        # if there is no missing value, then pack the data
        _few_day_by_1d = _data_1d[_data_1d['DATE'].isin(_few_days)]
        _few_day_by_1h = _data_1h[_data_1h['DATE'].isin(_few_days)]
        _target_day_by_1d = _data_1d[_data_1d['DATE'].isin(_target_days)]
        _target_day_by_1h = _data_1h[_data_1h['DATE'].isin(_target_days)]

        # set your input & output
        _input = args['_set_input'](
            _few_day_by_1d, _few_day_by_1h, _target_day_by_1d, _target_day_by_1h)

        _output = args['_set_output'](
            _few_day_by_1d, _few_day_by_1h, _target_day_by_1d, _target_day_by_1h)

        if "_set_idx" in args:
            _idx = args['_set_idx'](
                _few_day_by_1d, _few_day_by_1h, _target_day_by_1d, _target_day_by_1h)
        else:
            _idx = ''

        # data for train or test: every target day must pass isin_last_week
        _isin_last_week = True
        for date in _target_days:
            _isin_last_week = _isin_last_week & isin_last_week(date, 7)

        _isin_test_data = _isin_last_week & (_list_1d.iloc[i+1] > _test_start_day)

        if(_isin_test_data):
            _test_x.append(_input)
            _test_y.append(_output)
            _test_idx.append(_idx)
        else:
            _train_x.append(_input)
            _train_y.append(_output)
            _train_idx.append(_idx)

    _train_x, _train_y = np.array(_train_x), np.array(_train_y)
    _test_x, _test_y = np.array(_test_x), np.array(_test_y)
    _train_idx, _test_idx = pd.DataFrame(_train_idx), pd.DataFrame(_test_idx)

    return _train_x, _train_y, _train_idx, _test_x, _test_y, _test_idx

In [None]:
# The main adjustment is in the section below: generating the date list
# directly from the data used to cause problems, so the date list is now
# generated with date_range instead.
_list_1d = _data_1h['TIME_TO_INTERVAL'].apply(lambda ts: get_date_list(ts))
_list_1d = (
    pd.date_range(start=_list_1d.min(), end=_list_1d.max())
    .to_series()
    .apply(lambda day: day.date())
    .reset_index(drop=True)
)