In [None]:
## Checking if raw data directory is present
import glob
import re

BASE_FOLDER = "./raw"
# A city folder is an all-caps name in the LAST path component. Anchoring to the
# end of the path keeps upper-case segments earlier in the path from being picked
# up, and accepting both separators keeps this working with backslash paths.
CITY_DIR_PATTERN = re.compile(r"[\\/]([A-Z]+)[\\/]*$")


def find_cities(base_folder=BASE_FOLDER):
    """Return the names of the all-caps sub-directories of ``base_folder``.

    Sub-directories whose name is not made only of A-Z letters are skipped
    instead of raising ``AttributeError`` on a failed regex match. A missing
    ``base_folder`` yields an empty list.
    """
    matches = (CITY_DIR_PATTERN.search(path) for path in glob.glob(f"{base_folder}/*/"))
    return [match.group(1) for match in matches if match]


cities = find_cities()
print(f'Found data for {len(cities)} cities: {", ".join(cities)}')

In [None]:
import numpy as np
import pandas as pd
from utils.date_util import generate_date_range, weekday_parser
from utils.h5utils import load_h5_file
import csv

def process_hourly_means_to_dataframe(city, date_range, volume_channels=(0, 2, 4, 6)):
    """Build a long-format table of mean traffic volume per region, hour and date.

    For every date in ``date_range`` the day's aggregated h5 file of ``city`` is
    loaded. For each of its first 24 frames and each region id found in the
    static region layer, the mean of the non-zero cells is taken separately for
    every channel in ``volume_channels``, and those per-channel means are summed.

    Parameters
    ----------
    city : str
        Name of the city folder under ``BASE_FOLDER``; also part of the file names.
    date_range : iterable
        Dates exactly as they appear in the training file names.
    volume_channels : sequence of int, optional
        Indices along the last axis of each frame to include. Defaults to
        ``(0, 2, 4, 6)`` (presumably the volume channels of the 8-channel
        layout; confirm against the dataset description).

    Returns
    -------
    pandas.DataFrame
        One row per (date, hour, region) with columns ``date``, ``weekday``,
        ``hour``, ``region_id`` and ``region_mean_volume``. The value is NaN
        when any selected channel has no non-zero cell inside the region.
    """
    # The last layer of the static file holds the region id of every cell.
    static_map_with_mask = load_h5_file(f"{BASE_FOLDER}/{city}/{city}_static_with_region_layer.h5")[-1]

    # One boolean mask per region id. Boolean masks select exactly the cells of
    # the region; indexing a frame with the (N, 2) array from np.argwhere would
    # instead treat every coordinate as a row index and pick whole rows.
    region_masks = {
        region_id: static_map_with_mask == region_id
        for region_id in np.unique(static_map_with_mask)
    }
    channels = list(volume_channels)

    rows = []
    for date in date_range:
        # Kept as a column so results can later be grouped by day of week.
        weekday = weekday_parser(date)

        data = load_h5_file(f"{BASE_FOLDER}/{city}/training/{date}_{city}_8ch_aggregated.h5")

        for hour in range(24):
            # NOTE(review): this takes the first 24 entries along the first axis;
            # confirm the aggregated files hold one frame per hour.
            # The float cast and the zero masking do not depend on the region,
            # so they are done once per hour rather than once per region.
            volumes = np.asarray(data[hour])[:, :, channels].astype('float')
            # Zeros mark unavailable values; turn them into NaN so nanmean skips them.
            volumes[volumes == 0] = np.nan

            for region_id, mask in region_masks.items():
                # volumes[mask] has shape (cells_in_region, n_channels).
                channel_means = np.nanmean(volumes[mask], axis=0)
                rows.append([date, weekday, hour, region_id, channel_means.sum()])

    return pd.DataFrame(rows, columns=['date', 'weekday', 'hour', 'region_id', 'region_mean_volume'])

In [20]:
# City processed by the cells below; also the folder name under BASE_FOLDER.
city = "BANGKOK"
# Load the hourly means saved by earlier runs (pre-COVID and in-COVID files).
# Both CSVs must already exist, otherwise read_csv raises FileNotFoundError.
pre_cov = pd.read_csv(f"{BASE_FOLDER}/{city}/hourly_processed_means_pre.csv")
cov_dataframe = pd.read_csv(f"{BASE_FOLDER}/{city}/hourly_processed_means_in_covid.csv")

In [21]:
# Sanity check: (rows, columns) of the two frames loaded from CSV.
print(pre_cov.shape, cov_dataframe.shape)

(7920, 6) (7920, 6)


In [22]:
# Uncomment and run this cell only to start the CSVs from scratch: it replaces the frames loaded above with empty ones.
# pre_cov = pd.DataFrame()
# cov_dataframe = pd.DataFrame()

In [None]:
%%time
# Compute the hourly region means for the 2019 date range; the result is appended to pre_cov below.
date_range = generate_date_range("2019-04-11", "2019-04-20")
dataframe = process_hourly_means_to_dataframe(city, date_range)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# ignore_index gives the combined frame one continuous index instead of repeating
# the 0..n-1 labels of the newly computed frame.
# NOTE: re-running this cell appends the same rows again.
pre_cov = pd.concat([pre_cov, dataframe], ignore_index=True)
# index=False: this CSV is read back without index_col, so writing the index would
# add one more "Unnamed: 0" column on every load/save round trip.
pre_cov.to_csv(f"{BASE_FOLDER}/{city}/hourly_processed_means_pre.csv", index=False)

In [None]:
%%time
cov_date_range = generate_date_range("2020-04-11", "2020-04-20")
dataframe2 = process_hourly_means_to_dataframe(city, cov_date_range)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# ignore_index gives the combined frame one continuous index instead of repeating
# the 0..n-1 labels of the newly computed frame.
# NOTE: re-running this cell appends the same rows again.
cov_dataframe = pd.concat([cov_dataframe, dataframe2], ignore_index=True)
# index=False: this CSV is read back without index_col, so writing the index would
# add one more "Unnamed: 0" column on every load/save round trip.
cov_dataframe.to_csv(f"{BASE_FOLDER}/{city}/hourly_processed_means_in_covid.csv", index=False)
cov_dataframe.shape

In [None]:
# The value column is named 'region_mean_volume'; 'region_mean' does not exist and raised KeyError.
# Regions with no non-zero cell come out as NaN; report them as 0 here.
dataframe['region_mean_volume'] = dataframe['region_mean_volume'].fillna(0)

In [None]:
# First rows of the frame computed for date_range.
dataframe.head()

In [None]:
# Summary statistics of the frame computed for date_range.
dataframe.describe()

In [None]:
# (rows, columns) of the frame computed for date_range.
dataframe.shape

In [None]:
# Same computation for MOSCOW, reusing the 2019 date_range defined above.
moscow_dataframe = process_hourly_means_to_dataframe("MOSCOW", date_range)

In [None]:
# First rows of the MOSCOW result.
moscow_dataframe.head()