In [2]:
## Checking if raw data directory is present
import glob
import re

BASE_FOLDER = "./raw"
cities = [re.search(r".*/([A-Z]+)", s).group(1) for s in glob.glob(f"{BASE_FOLDER}/*/")]
print(f'Found data for {len(cities)} cities: {", ".join(cities)}')

Found data for 2 cities: BARCELONA, MOSCOW


In [99]:
import numpy as np
import pandas as pd
from utils.date_util import generate_date_range, weekday_parser
from utils.h5utils import load_h5_file
import csv

def process_hourly_means_to_dataframe(city, date_range):
    # create empty frame structure
    frame = []
    # Fetch map mask
    static_map_with_mask = load_h5_file(f"{BASE_FOLDER}/{city}/{city}_static_with_region_layer.h5")[-1]
    #get the region indices
    regions_mask_values = np.unique(static_map_with_mask)
    region_indices = {}
    for index, region in enumerate(regions_mask_values):
        indices = np.argwhere(static_map_with_mask == region)
        region_indices[region] = indices
    # fetch and process temporal data files
    for date in date_range:
        ## Can be used to aggregate data in pandas with groupby (where weekday is 0 (i.e. sunday))
        weekday = weekday_parser(date)
        
        data = load_h5_file(f"{BASE_FOLDER}/{city}/training/2019-04-03_{city}_8ch_aggregated.h5")
        
        for hour in range(24):
            hour_frame = data[hour]
            ### for all channels, 
            for channel in [0, 2, 4, 6]:
                channel_frame = hour_frame[:,:,channel]
                for region_id, region in region_indices.items():
                    ### calculate mean voulume of the region
                    region_mean = channel_frame[(region)].sum() / np.count_nonzero(channel_frame[(region)])
                    #print(region_mean, channel_frame[(region)].sum(), np.count_nonzero(channel_frame[(region)]))
                    
                    #print(channel_frame.shape, region.shape, np.count_nonzero(channel_frame), np.count_nonzero(region_mean))
                    frame.append([date, weekday, hour, region_id, channel, region_mean])
                    ### Save zipped [date, time (hour), region_id, channel id, and mean_values to frame]
    
    # Save frame
    dataframe = pd.DataFrame(frame, columns=['date', 'weekday', 'hour', 'region_id', 'channel', 'region_mean'])
    dataframe.to_csv(f"{BASE_FOLDER}/{city}/hourly_processed_means.csv")
    return dataframe

In [None]:
date_range = generate_date_range("2019-04-01", "2019-04-04")
dataframe = process_hourly_means_to_dataframe("BARCELONA", date_range)

  region_mean = channel_frame[(region)].sum() / np.count_nonzero(channel_frame[(region)])
