## Single model data analysis

In [None]:
import pickle

import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the three pickled simulation outputs from the results folder
def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


journeys_df = _read_pickle('results/journeys_df.pkl')
uxsim_df = _read_pickle('results/uxsim_df.pkl')
parked_dict = _read_pickle('results/parked_dict.pkl')

In [None]:
# Area ids that are kept when the UXsim results are filtered further down
city_areas = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
    13, 15, 28, 29, 31, 34, 41, 43, 44, 45,
]

# Pickled input tables, read so they can be compared with the model output
trips_by_hour_chances = pd.read_pickle("../data/trips_by_hour_chances.pickle")
trip_counts_distribution = pd.read_pickle("../data/trip_counts_distribution.pickle")

# Earliest and latest journey start time, truncated to whole numbers;
# these bound the x-axis of the time plots below
journey_starts = journeys_df['start_time']
start_time = int(journey_starts.min())
end_time = int(journey_starts.max())

### Journeys data

In [None]:
# Preview the first rows of the journeys table
journeys_df.head()

In [None]:
# Share of each travel mode among the journeys that start in each hour.

# Truncate the (float) start time to a whole hour
journeys_df['start_time_h'] = journeys_df['start_time'].astype(int)

# Per-hour mode shares: one row per hour, one column per mode, each row sums to 1.
# A mode that does not occur in an hour gets a share of 0.
journeys_grouped = (
    journeys_df
    .groupby(['start_time_h'])['mode']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)

# Draw on an explicit Axes so the limits and labels belong to this figure only
fig, ax = plt.subplots(figsize=(6, 3))
sns.lineplot(data=journeys_grouped, dashes=False, ax=ax)
ax.set_xlim(start_time, end_time)
ax.set_ylim(0, 1)
ax.set_title('Mode distribution per hour')
ax.set_xlabel('Time of day (hour)')
ax.set_ylabel('Share of journeys')

In [None]:
# Distributions of travel time, distance, cost and perceived cost, split by mode.
# Top row: stacked counts per bin. Bottom row: share of each mode (%) within each bin.

# Divide by 60 to get minutes (presumes travel_time is stored in seconds — TODO confirm)
journeys_df['travel_time_min'] = journeys_df['travel_time'] / 60

n_bins = 40
hist_columns = ['travel_time_min', 'distance', 'cost', 'perceived_cost']
x_labels = ['Travel time (min)', 'Distance (km)', 'Cost (€)', 'Perceived cost (€)']
# Cut every x-axis at the 99.9th percentile so a few extreme journeys do not stretch it
upper_xlim = [journeys_df[col].quantile(0.999) for col in hist_columns]

fig, axs = plt.subplots(2, 4, figsize=(16, 7))
axs = axs.flatten()

for i, col in enumerate(hist_columns):
    count_ax, share_ax = axs[i], axs[i + 4]
    bin_range = (0, upper_xlim[i])

    # Keep strictly positive values below the cut-off; both rows use this same subset
    in_range = (journeys_df[col] > 0) & (journeys_df[col] < upper_xlim[i])
    plot_df = journeys_df.loc[in_range, [col, 'mode']]

    # --- Top row: stacked counts ---
    # binrange pins the bins to (0, cut-off) so they coincide with the bottom row's bins
    sns.histplot(plot_df, x=col, hue='mode', multiple='stack', ax=count_ax,
                 bins=n_bins, binrange=bin_range)
    count_ax.set_title(f'{col} distribution')
    count_ax.set_xlabel(x_labels[i])
    count_ax.set_ylabel('Frequency')
    count_ax.set_xlim(*bin_range)

    # --- Bottom row: 100% stacked ---
    # Common bin edges, then one count vector per mode on those edges
    _, bin_edges = np.histogram(plot_df[col], bins=n_bins, range=bin_range)
    mode_counts = {
        mode: np.histogram(plot_df.loc[plot_df['mode'] == mode, col], bins=bin_edges)[0]
        for mode in plot_df['mode'].unique()
    }

    # Convert counts to percentages of the bin total; empty bins stay at 0
    total = np.sum(list(mode_counts.values()), axis=0)
    mode_shares = {
        mode: np.divide(counts, total, out=np.zeros(n_bins), where=total != 0) * 100
        for mode, counts in mode_counts.items()
    }

    bottom = np.zeros(n_bins)
    bin_widths = np.diff(bin_edges)
    for mode, shares in mode_shares.items():
        # align='edge' anchors each bar at the left edge of its bin; with the
        # default (centred) alignment every bar is drawn half a bin too far left
        share_ax.bar(bin_edges[:-1], shares, bottom=bottom, width=bin_widths,
                     align='edge', label=mode, alpha=0.7)
        bottom += shares

    share_ax.set_title(f'{col} distribution (100% stacked)')
    share_ax.set_xlabel(x_labels[i])
    share_ax.set_ylabel('Percentage')
    share_ax.set_xlim(*bin_range)
    share_ax.legend()

plt.tight_layout()
plt.savefig('img/journeys_data.png', dpi=300, bbox_inches='tight')
plt.show()

### Input data visualization (for comparison)

In [None]:
# Hourly trip chance for a typical weekday: the mean over the first four
# columns (days 0-3, Monday-Thursday), without the "Total" row, restricted to
# the hours between start_time and end_time
trips_by_hour_chance = (
    trips_by_hour_chances
    .iloc[:, :4]
    .mean(axis=1)
    .drop("Total")
    .loc[start_time:end_time]
    .rename('Chance')
    .reset_index()
)
# Store the hour as an integer
trips_by_hour_chance['Hour'] = trips_by_hour_chance['Hour'].astype(int)
trips_by_hour_chance.head(3)

In [None]:
# Bar chart of the weekday trip chance, one bar per hour
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x='Hour', y='Chance', data=trips_by_hour_chance, ax=ax)
ax.set(title='Trip chances per Agent per hour', ylabel='Chance of taking trip')
ax.set_xlabel('Time of day (hour)')

### UXsim data analysis

In [None]:
# Convert the first index level from seconds to hours and shift it by start_time,
# then name the two levels 'time_bin' and 'area'.
# NOTE: this cell rebinds uxsim_df. Running it twice without reloading uxsim_df
# applies the conversion and the start_time shift a second time.
time_bin_hours = uxsim_df.index.get_level_values(0) / 3600 + start_time
area_ids = uxsim_df.index.get_level_values(1)
uxsim_df.index = pd.MultiIndex.from_arrays([time_bin_hours, area_ids], names=['time_bin', 'area'])

# Keep only the rows whose area is listed in city_areas. A boolean mask works on
# an unsorted index and simply ignores listed areas that have no rows.
in_city_area = uxsim_df.index.get_level_values('area').isin(city_areas)
uxsim_df = uxsim_df.loc[in_city_area]

In [None]:
# One panel per UXsim column, plotted against time_bin. Rows that share a
# time_bin are aggregated by seaborn; the band is the 50% percentile interval.
fig, axs = plt.subplots(3, 2, figsize=(12, 12))
axs = axs.flatten()

for panel_idx, variable in enumerate(uxsim_df.columns):
    panel = axs[panel_idx]
    sns.lineplot(data=uxsim_df, x='time_bin', y=variable, ax=panel, errorbar=("pi", 50))  # hue='area', palette='rocket'
    panel.set(
        title=f"{variable} in different areas",
        ylabel=variable,
        xlabel='Time of day (hour)',
        xlim=(start_time, end_time),
    )
    # Anchor the y-axis at zero; the upper limit stays automatic
    panel.set_ylim(bottom=0)

plt.savefig('img/uxsim_data.png', dpi=300, bbox_inches='tight')

### Parking data visualization

In [None]:
# Reshape the parking counts into long form: one row per (time, area) pair
parked_df = pd.DataFrame(parked_dict)
long_parked_df = (
    parked_df
    .stack()
    .rename_axis(index=['area', 'time'])
    .rename('value')
    .reset_index()
    .set_index(['time', 'area'])
)


def _relative_to_first(values):
    """Divide a group's values by the first value of that group."""
    return values / values.iloc[0]


# Per area, express every count relative to that area's first row
long_parked_df['parked_norm'] = long_parked_df.groupby('area')['value'].transform(_relative_to_first)
long_parked_df.head(3)

In [None]:
# Load the pickled areas table; the cells below use its .centroid and .area attributes
gdf_mrdh_65 = pd.read_pickle("../data/areas_mrdh_weighted_centroids.pkl")

In [None]:
# The polygons pickle holds a (city_polygon, area_polygon) pair
with open("../data/polygons.pkl", "rb") as polygon_file:
    city_polygon, area_polygon = pickle.load(polygon_file)

# Reproject the city outline from EPSG:4326 to EPSG:28992 before the spatial test
city_polygon_series = gpd.GeoSeries(city_polygon, crs="epsg:4326").to_crs(epsg=28992)
city_outline = city_polygon_series.geometry[0]

# Flag the areas whose centroid lies within the city outline, then keep only those
gdf_mrdh_65["in_city"] = gdf_mrdh_65.centroid.within(city_outline)
gdf_mrdh_65 = gdf_mrdh_65.loc[gdf_mrdh_65["in_city"]]

In [None]:
# Map each index label of gdf_mrdh_65 to its geometry area divided by 1e6
# (m² -> km², presuming the geometries use a metre-based CRS — TODO confirm)
area_dict = gdf_mrdh_65.area.div(1_000_000).to_dict()

# Parked cars divided by the size of the area each row belongs to
row_area_ids = long_parked_df.index.get_level_values("area")
long_parked_df["parked_density"] = long_parked_df["value"] / row_area_ids.map(area_dict)
long_parked_df.head(3)

In [None]:
# Parked cars per area over time, shown three ways: the raw count, the count
# relative to each area's first time step, and the count divided by the area's size.
# Each column gets its own y-axis label so the ratio and density panels are not
# mislabelled as plain car counts.
y_labels = {
    'value': 'Parked cars',
    'parked_norm': 'Parked cars (relative to first time step)',
    'parked_density': 'Parked cars per unit of area',
}

fig, axs = plt.subplots(1, 3, figsize=(15, 4))
axs = axs.flatten()
for i, (col, y_label) in enumerate(y_labels.items()):
    sns.lineplot(data=long_parked_df, x='time', y=col, hue='area', palette='rocket', ax=axs[i])
    axs[i].set_title(f'Parked cars in different areas ({col})')
    axs[i].set_ylabel(y_label)
    axs[i].set_xlabel('Time of day (hour)')
    axs[i].set_xlim(start_time, end_time)
    # Only the raw count is anchored at zero; the other panels keep automatic limits
    if col == 'value':
        axs[i].set_ylim(bottom=0)

# Save image
plt.savefig('img/parked_data.png', dpi=300, bbox_inches='tight')