In [125]:
import pandas as pd 
import numpy as np

# Data Processing

Let's make the conditional data represent, for each time step: [time interval of day, day of week, mean traffic volume, max traffic volume, grid cell with max traffic volume, weather].

TODO: expand the conditional features to include:  
- weather  
- holiday

In [126]:
import requests
import pandas as pd
import datetime

# Integer class emitted for each hour of weather.
_WEATHER_SUNNY = 0
_WEATHER_CLOUDY = 1
_WEATHER_RAIN = 2

# An hour is labelled sunny when its `sunshine_duration` exceeds this value
# (Open-Meteo documents the variable in seconds of sunshine per hour).
_SUNNY_MIN_SUNSHINE = 1000


def _classify_weather(precipitation, sunshine_duration):
    """Return the weather class (0 sunny, 1 cloudy, 2 rain) for a single hour."""
    # NOTE(review): the previous rule was
    #     (precipitation > 0) & (weather_code < 60 | weather_code > 50)
    # Because `|` binds tighter than `<`/`>`, that parsed as the chained
    # comparison `weather_code < (60 | weather_code) > 50`, which is False for
    # codes 60-63 and True for every other WMO code -- so slight/moderate rain
    # (61, 63) was never labelled as rain. Read with the boolean `or` that was
    # evidently intended, the weather-code clause is always true, so the rule
    # reduces to "any precipitation". Confirm this is the intended definition.
    if precipitation > 0:
        return _WEATHER_RAIN
    if sunshine_duration > _SUNNY_MIN_SUNSHINE:
        return _WEATHER_SUNNY
    # Cloudy is the default class; this also covers sunshine_duration == 1000,
    # which previously matched no branch and reused a stale/unbound value.
    return _WEATHER_CLOUDY


def fetch_and_encode_weather_data(latitude, longitude, start_date, end_date, timeout=30):
    """
    Fetch hourly weather from the Open-Meteo archive and encode it per 30-minute slot.

    One request is issued per calendar day in [start_date, end_date]. Each hour is
    classified as 0 (sunny), 1 (cloudy) or 2 (rain), the hourly rows are up-sampled
    to 30-minute slots by forward-filling, and the last slot is duplicated so that
    the final hour also contributes two half-hour slots.

    Parameters:
    - latitude (float): Latitude of the location.
    - longitude (float): Longitude of the location.
    - start_date (str): Start date in YYYY-MM-DD format (inclusive).
    - end_date (str): End date in YYYY-MM-DD format (inclusive).
    - timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
    - pd.Series: The weather class for every 30-minute slot, indexed 0..N-1 and
      named "encoding".

    Raises:
    - ValueError: If end_date is before start_date, or no hourly data is returned.
    - requests.HTTPError: If the API answers with an error status.
    """
    base_url = "https://archive-api.open-meteo.com/v1/archive"

    start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    if end < start:
        raise ValueError(f"end_date {end_date} is before start_date {start_date}")

    records = []
    for offset in range((end - start).days + 1):
        day = (start + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
        params = {
            "latitude": latitude,
            "longitude": longitude,
            "start_date": day,
            "end_date": day,
            "hourly": "precipitation,sunshine_duration,weather_code",
            # Timestamps are requested in local time for this fixed timezone.
            "timezone": "Asia/Shanghai",
        }

        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        hourly = response.json().get("hourly", {})

        timestamps = hourly.get("time", [])
        # A missing variable falls back to zeros for every hour of the day
        # (the former `[0]` default raised IndexError for any hour after the first).
        zeros = [0] * len(timestamps)
        precipitation_values = hourly.get("precipitation", zeros)
        sunshine_values = hourly.get("sunshine_duration", zeros)
        weather_codes = hourly.get("weather_code", zeros)

        for i, timestamp in enumerate(timestamps):
            precipitation = precipitation_values[i]
            sunshine_duration = sunshine_values[i]
            records.append({
                "timestamp": timestamp,
                "sunshine_duration": sunshine_duration,
                "precipitation": precipitation,
                "weather_code": weather_codes[i],
                "encoding": _classify_weather(precipitation, sunshine_duration),
            })

    if not records:
        raise ValueError("No hourly weather data was returned for the requested range.")

    df = pd.DataFrame(records)
    df["timestamp"] = pd.to_datetime(df["timestamp"])

    # Up-sample hourly rows to 30-minute slots; every inserted half-hour row
    # inherits the values of the hour it belongs to. "30min" and .ffill() are
    # the current spellings of the deprecated "30T" and fillna(method="ffill").
    half_hourly = (
        df.set_index("timestamp")
        .resample("30min")
        .asfreq()
        .ffill()
        .reset_index()
    )

    # Resampling stops at the last hourly timestamp, so repeat the final value
    # once to give the last hour its second half-hour slot.
    encoding = half_hourly["encoding"]
    return pd.concat([encoding, encoding.iloc[[-1]]], ignore_index=True)

# Example usage: encode the weather for Chengdu, China over 1-30 November 2016.
LATITUDE, LONGITUDE = 30.5728, 104.0668  # Chengdu, China
START_DATE, END_DATE = "2016-11-01", "2016-11-30"

# Performs live HTTP requests against the Open-Meteo archive API.
weather_data = fetch_and_encode_weather_data(
    latitude=LATITUDE,
    longitude=LONGITUDE,
    start_date=START_DATE,
    end_date=END_DATE,
)

In [127]:
def _z_score(values):
    """Z-score `values` with one global mean/std; returns (normalised, mean, std)."""
    mean = np.mean(values)
    std = np.std(values)
    # A constant input has zero spread; substitute a tiny std to avoid dividing by zero.
    if std == 0:
        std = 1e-8
    return (values - mean) / std, mean, std


def conditional_data_from_inflow(inflow_data, slots_per_day=48, first_weekday=1):
    """
    Derive per-time-step conditioning features from inflow data and z-score both.

    For every (client, time step) four features are produced:
      0. time interval of day: step index modulo `slots_per_day`
      1. day of week: (`first_weekday` + number of whole days elapsed) modulo 7
      2. max traffic volume over the grid cells of that step
      3. index of the grid cell holding that maximum (first one on ties)

    Only feature 2 is normalised in the conditional data; the inflow data is
    normalised with a single mean/std computed over the whole array.

    Parameters:
    - inflow_data (np.ndarray): Array whose third axis has length 100, i.e.
      (clients, time steps, 100). Any number of time steps is accepted.
    - slots_per_day (int): Time steps per day (default 48).
    - first_weekday (int): Day-of-week value assigned to the first day (default 1;
      the original code notes "first day is Tuesday").

    Returns:
    - tuple: (conditional_data, normalised_conditional_data, mean_conditional_data,
      std_conditional_data, inflow_data, normalised_main_data, mean_main_data,
      std_main_data). The conditional mean/std are length-4 vectors holding 0 / 1
      for the features that are left untouched, so `x * std + mean` undoes the
      normalisation feature-wise.

    Raises:
    - ValueError: If the third axis of `inflow_data` does not have length 100.
    """
    if inflow_data.shape[2] != 100:
        raise ValueError("Input data must have shape (x, 1440, 100), where x can vary.")

    n_clients, n_steps = inflow_data.shape[:2]
    # Collapse everything after the time axis so max/argmax act on one flat
    # vector per step, exactly like np.max / np.argmax on inflow_data[client, i].
    n_cells = int(np.prod(inflow_data.shape[2:]))
    per_step = np.reshape(inflow_data, (n_clients, n_steps, n_cells))

    # The time axis length comes from the data rather than a hard-coded 1440,
    # so shorter or longer series are filled completely.
    step_index = np.arange(n_steps)

    conditional_data = np.zeros((n_clients, n_steps, 4))
    conditional_data[:, :, 0] = step_index % slots_per_day
    conditional_data[:, :, 1] = (first_weekday + step_index // slots_per_day) % 7
    conditional_data[:, :, 2] = per_step.max(axis=2)
    conditional_data[:, :, 3] = per_step.argmax(axis=2)

    # Normalise only the max-volume feature; the other three are categorical / indices.
    normalised_volume, volume_mean, volume_std = _z_score(conditional_data[:, :, 2])
    normalised_conditional_data = conditional_data.copy()
    normalised_conditional_data[:, :, 2] = normalised_volume

    # Report the statistics that were actually applied (including the zero-std
    # substitute), with identity values for the features left unchanged.
    mean_conditional_data = np.zeros(4)
    mean_conditional_data[2] = volume_mean
    std_conditional_data = np.ones(4)
    std_conditional_data[2] = volume_std

    normalised_main_data, mean_main_data, std_main_data = _z_score(inflow_data)

    return (
        conditional_data,
        normalised_conditional_data,
        mean_conditional_data,
        std_conditional_data,
        inflow_data,
        normalised_main_data,
        mean_main_data,
        std_main_data,
    )

In [128]:
def save_data(num_clients, city, normalised_main_data, normalised_conditional_data, extra_data):
    """
    Write the processed arrays for one city / client-count configuration to disk.

    Three ``.npy`` files are written under ``data/<city>/<num_clients>_client/``:
    the normalised inflow data with a trailing singleton axis appended, the
    normalised conditional data, and `extra_data` (whatever object is passed in).
    The target directory must already exist.
    """
    outputs = (
        (f"data/{city}/{num_clients}_client/inflow_main.npy", normalised_main_data[..., np.newaxis]),
        (f"data/{city}/{num_clients}_client/inflow_conditional.npy", normalised_conditional_data),
        (f"data/{city}/{num_clients}_client/extra_data.npy", extra_data),
    )
    for target_path, payload in outputs:
        np.save(target_path, payload)

def data_processing(num_clients, city):
    """
    Load the inflow data for a city / client count, normalise it and save the results.

    Reads ``data/<city>/<num_clients>_client/inflow_main.npy``, derives the
    conditional features and normalisation statistics via
    `conditional_data_from_inflow`, prints the statistics and hands everything
    to `save_data`.

    Note that `save_data` writes the normalised array (with an extra trailing
    axis) back to the same ``inflow_main.npy`` path that is read here, so the
    raw input is replaced by this call.

    Raises:
    - ValueError: If the loaded array is not 3-dimensional, which is what the
      file looks like after a previous run has already overwritten it.
    """
    inflow_data = np.load(f"data/{city}/{num_clients}_client/inflow_main.npy")

    # Guard against re-processing: a second run would otherwise normalise the
    # already-normalised data and replace the saved mean/std with values that
    # can no longer undo the original scaling.
    if inflow_data.ndim != 3:
        raise ValueError(
            "Expected raw inflow data with 3 dimensions (clients, time steps, grid cells) "
            f"but got shape {inflow_data.shape}; 'inflow_main.npy' may already have been "
            "overwritten with normalised output by a previous run."
        )

    (
        _conditional_data,
        normalised_conditional_data,
        mean_conditional_data,
        std_conditional_data,
        inflow_data,
        normalised_main_data,
        mean_main_data,
        std_main_data,
    ) = conditional_data_from_inflow(inflow_data)

    # Statistics needed later to map normalised values back to the original scale.
    extra_data = {
        'mean conditional data': [mean_conditional_data],
        'std conditional data': [std_conditional_data],
        'mean main data': [mean_main_data],
        'std main data': [std_main_data],
    }
    print(extra_data)
    save_data(num_clients, city, normalised_main_data, normalised_conditional_data, extra_data)
