In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the enriched stops of each city. The "occasional" and "systematic"
# parquet files are read separately and stacked row-wise into one DataFrame
# (the row index of each file is kept as-is by `pd.concat`).
path_paris = './paris/'
paris_stops = pd.concat([pd.read_parquet(path_paris + 'enriched_occasional.parquet'),
                         pd.read_parquet(path_paris + 'enriched_systematic.parquet')])
display(paris_stops)
# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly: wrapping it in `display()` would render a stray "None".
paris_stops.info()

path_nyc = './nyc/'
nyc_stops = pd.concat([pd.read_parquet(path_nyc + 'enriched_occasional.parquet'),
                       pd.read_parquet(path_nyc + 'enriched_systematic.parquet')])
display(nyc_stops)
nyc_stops.info()

In [None]:
# Number of POIs attached to each stop: within every 'stop_id' group, `count`
# tallies only the non-NaN entries of the 'category' column.
num_pois_stop_paris, num_pois_stop_nyc = (
    stops.groupby('stop_id')['category'].count()
    for stops in (paris_stops, nyc_stops)
)

# Report the mean ± standard deviation of that count for both cities.
for city, pois_per_stop in (('Paris', num_pois_stop_paris), ('NYC', num_pois_stop_nyc)):
    print(f'Average number of POIs per stop in {city}: {pois_per_stop.mean()} ± {pois_per_stop.std()}')

### Histogram of the number of POIs per stop

In [None]:
import matplotlib.pyplot as plt

# Overlaid histograms of the number of POIs per stop in Paris and NYC, drawn
# on shared bins with a log-scaled y-axis, and saved to 'number_pois_stop.pdf'.

# Adjust the sizes of the text used in the plot.
plt.rcParams.update({
    'font.size': 12,        # Base font size.
    'axes.titlesize': 14,   # Title font size.
})

fig, ax = plt.subplots(figsize=(8, 4))

paris = num_pois_stop_paris
nyc   = num_pois_stop_nyc

# Common [min, max] interval so both cities are binned on the same edges.
min_d = min(paris.min(), nyc.min())
max_d = max(paris.max(), nyc.max())

# Common binning: `bins` equal-width bins spanning the full range.
bins = 30
bin_edges = np.linspace(min_d, max_d, bins + 1)

# Give every stop a weight of 1/N so that each bar height is the *fraction* of
# stops falling in that bin, as the y-axis label states. `density=True` would
# instead plot a probability density, i.e. that fraction divided by the bin
# width.
paris_weights = np.full(len(paris), 1 / len(paris))
nyc_weights = np.full(len(nyc), 1 / len(nyc))

# Paris histogram: solid orange bars.
ax.hist(paris,
        bins=bin_edges,
        weights=paris_weights,
        alpha=1,
        label='Paris',
        color='orange')

# NYC histogram: semi-transparent white fill with a black hatch, so the Paris
# bars underneath remain visible.
ax.hist(nyc,
        bins=bin_edges,
        weights=nyc_weights,
        alpha=0.5,
        label='New York City',
        color='white',
        edgecolor='black',
        hatch='xxx')

# Vertical markers at each city's mean; the linestyle tells the cities apart.
ax.axvline(paris.mean(),
           color='blue',
           linestyle='--',
           linewidth=1.5,
           label=f'Avg. # POIs per stop in Paris ({paris.mean():.2f}±{paris.std():.2f})')
ax.axvline(nyc.mean(),
           color='blue',
           linestyle=':',
           linewidth=1.5,
           label=f'Avg. # POIs per stop in NYC ({nyc.mean():.2f}±{nyc.std():.2f})')

# Log-scale on the y-axis.
ax.set_yscale('log')
ax.set_xlim(left=0)

ax.set_xlabel('Number of POIs per stop')
ax.set_ylabel('Fraction of stops (log scale)')
ax.set_title(f'Number of POIs per stop distribution (#bins = {bins})')
ax.legend()

fig.savefig('number_pois_stop.pdf',
            format='pdf',
            bbox_inches='tight')  # trims extra whitespace

### Analysis of stop durations

In [None]:
def _stop_durations(stops):
    """Return one row per stop with its duration in minutes.

    Keeps the 'stop_id', 'datetime' and 'leaving_datetime' columns, retains
    the first row found for each 'stop_id', and adds a 'duration_mins' column
    computed as `leaving_datetime - datetime`, expressed in minutes.

    The new column is built with `.assign`, which returns a new DataFrame:
    setting a column directly on the result of `drop_duplicates` can raise a
    SettingWithCopyWarning.

    NOTE(review): assumes both timestamp columns are datetime-typed, so that
    their difference supports `.dt.total_seconds()` -- confirm.
    """
    return (
        stops.loc[:, ['stop_id', 'datetime', 'leaving_datetime']]
        .drop_duplicates('stop_id')
        .assign(
            duration_mins=lambda s: (s['leaving_datetime'] - s['datetime']).dt.total_seconds() / 60
        )
    )


duration_stop_paris = _stop_durations(paris_stops)
duration_stop_nyc = _stop_durations(nyc_stops)

### Histogram of stop durations

In [None]:
import matplotlib.pyplot as plt

# Overlaid histograms of the stop durations (in minutes) in Paris and NYC,
# drawn on shared bins with a log-scaled y-axis, with reference lines at one
# day and one week, and saved to 'duration_stop.pdf'.

# Adjust the sizes of the text used in the plot.
plt.rcParams.update({
    'font.size': 12,        # Base font size.
    'axes.titlesize': 14,   # Title font size.
})

fig, ax = plt.subplots(figsize=(8, 4))

# Drop missing durations (if any) so that the per-stop weights below sum to 1
# over the stops that are actually binned.
paris = duration_stop_paris['duration_mins'].dropna()
nyc   = duration_stop_nyc['duration_mins'].dropna()

# Common [min, max] interval so both cities are binned on the same edges.
min_d = min(paris.min(), nyc.min())
max_d = max(paris.max(), nyc.max())

# Common binning: `bins` equal-width bins spanning the full range.
bins = 30
bin_edges = np.linspace(min_d, max_d, bins + 1)

# Give every stop a weight of 1/N so that each bar height is the *fraction* of
# stops falling in that bin, as the y-axis label states. `density=True` would
# instead plot a probability density, i.e. that fraction divided by the bin
# width in minutes.
paris_weights = np.full(len(paris), 1 / len(paris))
nyc_weights = np.full(len(nyc), 1 / len(nyc))

# Paris histogram: solid orange bars.
ax.hist(paris,
        bins=bin_edges,
        weights=paris_weights,
        alpha=1,
        label=f'Paris (avg: {paris.mean():.2f}±{paris.std():.2f})',
        color='orange')

# NYC histogram: semi-transparent white fill with a black hatch, so the Paris
# bars underneath remain visible.
ax.hist(nyc,
        bins=bin_edges,
        weights=nyc_weights,
        alpha=0.3,
        label=f'New York City (avg: {nyc.mean():.2f}±{nyc.std():.2f})',
        color='white',
        edgecolor='black',
        hatch='xxx')

# Reference durations. The two lines use different linestyles so that the
# legend entries can be told apart.
ax.axvline(24 * 60,
           color='blue',
           linestyle='--',
           linewidth=1.5,
           label='1 day duration')

ax.axvline(7 * 24 * 60,
           color='blue',
           linestyle=':',
           linewidth=1.5,
           label='1 week duration')

# Log-scale on the y-axis.
ax.set_yscale('log')
ax.set_xlim(left=0)

ax.set_xlabel('Stop duration (minutes)')
ax.set_ylabel('Fraction of stops (log scale)')
ax.set_title(f'Stop duration distribution (#bins = {bins})')
ax.legend()

fig.savefig('duration_stop.pdf',
            format='pdf',
            bbox_inches='tight')  # trims extra whitespace

### Analysis of the categories of the POIs associated with stops

In [None]:
# Share of each POI category among the category-labelled rows of each city:
# count the rows per category, then normalise the counts so they sum to 1.
paris_category_counts = paris_stops.groupby('category').size().sort_values(ascending=False)
categories_poi_stops_paris = paris_category_counts / paris_category_counts.sum()

nyc_category_counts = nyc_stops.groupby('category').size().sort_values(ascending=False)
categories_poi_stops_nyc = nyc_category_counts / nyc_category_counts.sum()

# Put both cities side by side, one column per city, aligned on the category.
df = pd.concat(
    [categories_poi_stops_paris, categories_poi_stops_nyc],
    axis=1,
    keys=['Paris', 'New York City'],
)

# Rank the categories by their combined (Paris + NYC) fraction and keep the
# top-k categories when considering both cities together.
k = 15
order = df.sum(axis=1).sort_values(ascending=False).index
df = df.loc[order].head(k)

# Adjust the sizes of the text used in the plot.
plt.rcParams.update({
    'font.size': 12,        # Base font size.
    'axes.titlesize': 14,   # Title font size.
})

# pandas draws one group of bars per category, one bar per city.
ax = df.plot.bar(figsize=(8, 4), alpha=0.7)

# Rotate the tick labels by 45 degrees around their right end, which stays
# anchored at the tick.
ax.set_xticklabels(df.index, rotation=45, ha='right', rotation_mode='anchor')

ax.set(
    title=f'Distribution of the top-{k} POI Categories in Paris and New York City',
    xlabel='POI Category',
    ylabel='Fraction',
)
ax.legend()
plt.tight_layout()

ax.figure.savefig('categories_stop.pdf',
                  format='pdf',
                  bbox_inches='tight')  # trims extra whitespace