In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

### Auxiliary functions

In [None]:
def compute_distance_moves(df_move):
    """Compute the great-circle distance traveled within each move segment.

    Consecutive points of a move (ordered by ``datetime``) are connected and
    the lengths of the resulting segments are summed per
    ``(uid, tid, move_id)``.

    Distances are computed with the haversine formula on longitude/latitude
    rather than as planar distances in Web Mercator (EPSG:3857): Web Mercator
    stretches lengths by roughly ``1 / cos(latitude)``, which would inflate the
    results and make cities at different latitudes incomparable.  The
    haversine formula treats the Earth as a sphere, so results can differ from
    ellipsoidal geodesic lengths by a few tenths of a percent.

    Parameters
    ----------
    df_move : geopandas.GeoDataFrame or pandas.DataFrame
        Must contain the columns ``uid``, ``tid``, ``move_id`` and
        ``datetime``.  A GeoDataFrame must hold point geometries and have a
        CRS set (it is converted to EPSG:4326 internally).  A plain DataFrame
        must provide ``lng`` and ``lat`` columns in decimal degrees.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``(uid, tid, move_id)`` with the columns ``uid``, ``tid``,
        ``move_id`` and ``distance_traveled_meters``.  Moves made of a single
        point have a distance of 0.
    """
    earth_radius_m = 6_371_008.8  # mean Earth radius in meters
    group_keys = ['uid', 'tid', 'move_id']

    # 1. Get longitude/latitude in degrees.
    if hasattr(df_move, 'to_crs'):
        # Geo-aware input: read the coordinates from the geometry column after
        # converting it to geographic coordinates (EPSG:4326).
        points = df_move.geometry.to_crs(epsg=4326)
        lng_deg = points.x.to_numpy()
        lat_deg = points.y.to_numpy()
    else:
        lng_deg = df_move['lng'].to_numpy()
        lat_deg = df_move['lat'].to_numpy()

    # Work on a fresh frame with a positional index so the caller's data is
    # left untouched and the coordinate arrays line up row by row.
    track = pd.DataFrame(df_move[group_keys + ['datetime']]).reset_index(drop=True)
    track['lat_rad'] = np.radians(np.asarray(lat_deg, dtype=float))
    track['lng_rad'] = np.radians(np.asarray(lng_deg, dtype=float))

    # 2. Sort so that shifts make sense.
    track = track.sort_values(group_keys + ['datetime'])

    # 3. Previous point within the same move (NaN for the first point).
    prev = track.groupby(group_keys)[['lat_rad', 'lng_rad']].shift()
    lat_prev = prev['lat_rad'].to_numpy()
    lat_curr = track['lat_rad'].to_numpy()
    dlat = lat_curr - lat_prev
    dlng = track['lng_rad'].to_numpy() - prev['lng_rad'].to_numpy()

    # 4. Haversine distance of each segment, in meters.
    hav = (np.sin(dlat / 2) ** 2
           + np.cos(lat_prev) * np.cos(lat_curr) * np.sin(dlng / 2) ** 2)
    # Clip guards against rounding pushing the value marginally outside [0, 1].
    track['segment_dist'] = 2 * earth_radius_m * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))

    # 5. Sum up per move_id (NaN of the first point of each move is skipped).
    distance_per_move = (track
                         .groupby(group_keys)['segment_dist']
                         .sum()
                         .reset_index(name='distance_traveled_meters'))

    return distance_per_move

### Main code

#### Read the enriched move-segment datasets.

In [None]:
def _load_city_moves(folder):
    """Read a city's enriched moves file and attach EPSG:4326 point geometries."""
    records = pd.read_parquet(folder + 'enriched_moves.parquet')
    # One point per row, built from the 'lng' / 'lat' columns.
    points = gpd.points_from_xy(records['lng'], records['lat'])
    return gpd.GeoDataFrame(records, geometry=points, crs="EPSG:4326")


path_paris = './paris/'
paris_moves = _load_city_moves(path_paris)

path_nyc = './nyc/'
nyc_moves = _load_city_moves(path_nyc)

#### Compute the distance traveled within each move segment.

In [None]:
# Distance traveled within each (uid, tid, move_id) move segment, per city.
# display() gives the notebook's rich table rendering instead of plain text.
distance_per_move_paris = compute_distance_moves(paris_moves)
display(distance_per_move_paris)

distance_per_move_nyc = compute_distance_moves(nyc_moves)
display(distance_per_move_nyc)

### Plot the histograms of the distances traveled.

In [None]:
import matplotlib.pyplot as plt

# Adjust the sizes of the text used in the plot. This is done before the
# axes are created so that every text element picks the sizes up.
plt.rcParams.update({
    'font.size': 12,        # Base font size.
    'axes.titlesize': 14,   # Size font title.
})

# Per-move distances converted from meters to kilometers.
paris = distance_per_move_paris['distance_traveled_meters'] / 1000
nyc   = distance_per_move_nyc['distance_traveled_meters'] / 1000

# Compute a common min/max interval.
min_d = min(paris.min(), nyc.min())
max_d = max(paris.max(), nyc.max())

# Common binning for both cities: `bins` equal-width bins spanning the full range.
bins = 30
bin_edges = np.linspace(min_d, max_d, bins + 1)

fig, ax = plt.subplots(figsize=(8, 4))

# Each move is weighted by 1/N so that a bar's height is the fraction of the
# city's moves falling in that bin, as stated on the y-axis. (density=True
# would plot fraction / bin width instead, which is not a fraction unless the
# bins happen to be 1 km wide.)
paris_weights = np.ones_like(paris, dtype=float) / len(paris)
nyc_weights = np.ones_like(nyc, dtype=float) / len(nyc)

# Paris histogram
ax.hist(paris,
        bins=bin_edges,
        weights=paris_weights,
        alpha=1,
        label='Paris',
        color='orange')

# NYC histogram, drawn on top as a semi-transparent hatched outline.
ax.hist(nyc,
        bins=bin_edges,
        weights=nyc_weights,
        alpha=0.5,            # semi-transparent
        label='New York City',
        color='white',        # Keep the fill white
        edgecolor='black',
        hatch='xxx')          # Other patterns: '///', '...'

# Vertical markers at each city's mean; the legend reports mean ± std.
ax.axvline(paris.mean(),
           color='blue',
           linestyle='--',
           linewidth=1.5,
           label=f"Avg. distance Paris' moves (in km, {paris.mean():.2f}±{paris.std():.2f})")
ax.axvline(nyc.mean(),
           color='red',
           linestyle='--',
           linewidth=1.5,
           label=f"Avg. distance NYC's moves (in km, {nyc.mean():.2f}±{nyc.std():.2f})")

# Log-scale on the y-axis
ax.set_yscale('log')
ax.set_xlim(left=0)

ax.set_xlabel('Distance covered (in km)')
ax.set_ylabel('Fraction of move segments (log scale)')
ax.set_title(f'Distribution of the distance covered by move segments (#bins = {bins})')
ax.legend()

fig.savefig('distance_move_segments.pdf',
            format='pdf',
            bbox_inches='tight')  # trims extra whitespace

### Transportation means statistics

In [None]:
# Integer label code -> name of the transportation mode.
dic_moves = {
    0: 'walk',
    1: 'bike',
    2: 'bus',
    3: 'car',
    4: 'subway',
    5: 'train',
    6: 'taxi',
}


def _count_moves_per_mode(moves):
    """Count the distinct (uid, tid, move_id, label) combinations per mode name."""
    keys = ['uid', 'tid', 'move_id', 'label']
    # One row per key combination present in the data; the per-combination
    # row count is only a by-product of the grouping and is discarded.
    segments = (moves
                .groupby(keys)
                .size()
                .reset_index(name='n_rows')
                .drop(columns='n_rows'))
    segments['label'] = segments['label'].astype(int).map(dic_moves)
    return segments.groupby('label').size()


trans_stats_paris = _count_moves_per_mode(paris_moves)
display(trans_stats_paris)

trans_stats_nyc = _count_moves_per_mode(nyc_moves)
display(trans_stats_nyc)

In [None]:
# Rescale each city's series so that its values sum to 1.
trans_stats_paris /= trans_stats_paris.sum()
trans_stats_nyc /= trans_stats_nyc.sum()

# Side-by-side table: one row per label, one column per city.
df = pd.concat([trans_stats_paris, trans_stats_nyc], axis=1)
df.columns = ['Paris', 'New York City']

# Text sizes used by the figure.
plt.rcParams.update({'font.size': 12, 'axes.titlesize': 14})

# Grouped bars: pandas draws one bar per column for every row.
ax = df.plot.bar(figsize=(8, 4), alpha=0.7)

# Tilt the tick labels and anchor their right end at the tick.
ax.set_xticklabels(df.index, rotation=45, ha='right', rotation_mode='anchor')

ax.set_title("Distribution transportation means for Paris and New York City")
ax.set_xlabel('Transportation means')
ax.set_ylabel('Fraction')
ax.legend()
plt.tight_layout()

# bbox_inches='tight' trims the extra whitespace around the figure.
plt.savefig('transportation_moves.pdf', format='pdf', bbox_inches='tight')