### Overhead

In [31]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

### Parameters

In [32]:
# Input tables read in Step 1.
INPUT_STATION_LOCATIONS_AND_ACTIVITY_FILE = 'data/input/station-locations-and-activity.csv'
INPUT_DISTANCE_BETWEEN_STATIONS_FILE = 'data/input/distance-between-stations.csv'

# Destination of the synthetic trip table written in Step 4.
OUTPUT_TRIPS_FILE = 'data/output/synthetic-trips.csv'

# Value placed on the diagonal (same origin and destination) of the gravity-model
# matrix before the matrix is scaled to the total daily activity.
INTRAZONAL_TRIPS = 10

# Inclusive bounds, in whole minutes, of the random time added on top of the
# riding time when a trip duration is drawn.
MINIMUM_ADDITIONAL_DURATION_IN_MINUTES = 2
MAXIMUM_ADDITIONAL_DURATION_IN_MINUTES = 120

# Speed used to convert a station-to-station distance (km) into riding minutes.
BIKE_SPEED_IN_KPH = 20

# Calendar day (YYYY-MM-DD) on which every synthetic trip starts.
INPUT_DATE = "2025-06-15"

# Start times and durations are drawn from NumPy's global random generator.
# Seeding it here makes a "Restart Kernel -> Run All" reproduce the same trips.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

### Methods

#### Read Methods

In [33]:
def read_station_locations_and_activity(file_path):
    """Load the station locations-and-activity CSV at ``file_path`` into a DataFrame."""
    stations_df = pd.read_csv(file_path)
    return stations_df

In [34]:
def read_distance_between_stations(file_path):
    """Load the station-to-station distance CSV at ``file_path`` into a DataFrame."""
    distances_df = pd.read_csv(file_path)
    return distances_df

#### Other Methods

In [35]:
def apply_gravity_model(activity_df, distance_df, intrazonal_trips):
    """Estimate the number of trips between every pair of stations with a gravity model.

    For two different stations the unscaled trips are
    ``daily_activity[origin] * daily_activity[destination] / distance ** 2``;
    for a station paired with itself they are ``intrazonal_trips``. Pairs whose
    unscaled trips are not greater than zero are dropped, and the remaining
    values are scaled so that they sum to the total ``daily_activity``.

    Parameters
    ----------
    activity_df : pandas.DataFrame
        One row per station with columns ``station_name`` and ``daily_activity``.
    distance_df : pandas.DataFrame
        One row per ordered station pair with columns ``from``, ``to`` and
        ``distance_in_km``. If a pair appears more than once, the first row is used.
    intrazonal_trips : number
        Unscaled trips assigned to each same-station pair.

    Returns
    -------
    pandas.DataFrame
        Columns ``from``, ``to`` and ``trips`` (float). Rows are ordered
        destination by destination, and the index labels are the positions in
        the full station-by-station listing, so dropped pairs leave gaps.

    Raises
    ------
    ValueError
        If a pair of different stations has no row in ``distance_df`` or has a
        distance of zero.
    """
    station_names = activity_df['station_name'].tolist()
    daily_activity = activity_df['daily_activity'].tolist()

    # Build the pair -> distance lookup once instead of scanning distance_df for
    # every pair. setdefault keeps the first row of a duplicated pair.
    distance_lookup = {}
    for origin, destination, distance in zip(
        distance_df['from'], distance_df['to'], distance_df['distance_in_km']
    ):
        distance_lookup.setdefault((origin, destination), distance)

    # Destination-major order: all origins for the first destination, then all
    # origins for the second destination, and so on.
    records = []
    for destination_position, destination in enumerate(station_names):
        attractions = daily_activity[destination_position]
        for origin_position, origin in enumerate(station_names):
            if origin_position == destination_position:
                records.append((origin, destination, intrazonal_trips))
                continue

            if (origin, destination) not in distance_lookup:
                raise ValueError(
                    f"No distance found from '{origin}' to '{destination}'"
                )
            distance = distance_lookup[(origin, destination)]
            if distance == 0:
                # A zero distance would make the pair's trips infinite and
                # corrupt the scaling of every other pair.
                raise ValueError(
                    f"Distance from '{origin}' to '{destination}' is zero"
                )

            productions = daily_activity[origin_position]
            records.append((origin, destination, productions * attractions / (distance ** 2)))

    trips_df = pd.DataFrame(records, columns=['from', 'to', 'trips'])
    trips_df['trips'] = trips_df['trips'].astype(float)

    # Keep only pairs with a positive number of trips (this also drops NaN).
    trips_df = trips_df[trips_df['trips'] > 0]
    if trips_df.empty:
        # Nothing to scale; avoid dividing by a zero total.
        return trips_df

    # Scale so the trips sum to the total daily activity.
    adjustment_factor = activity_df['daily_activity'].sum() / trips_df['trips'].sum()

    return trips_df.assign(trips=trips_df['trips'] * adjustment_factor)

In [36]:
def _sample_start_time(midnight, windows):
    """Draw one timestamp uniformly from the union of ``(start_hour, end_hour)`` windows."""
    total_hours = sum(end_hour - start_hour for start_hour, end_hour in windows)
    # One draw over the combined length; walk the windows to locate it.
    offset = np.random.uniform(0, total_hours)
    for start_hour, end_hour in windows[:-1]:
        window_hours = end_hour - start_hour
        if offset < window_hours:
            return midnight + timedelta(hours=start_hour) + timedelta(hours=offset)
        offset -= window_hours
    start_hour, end_hour = windows[-1]
    # min() guards against floating-point residue pushing past the last window.
    return midnight + timedelta(hours=start_hour) + timedelta(hours=min(offset, end_hour - start_hour))


def enumerate_trips(trips_df, date):
    """Expand per-pair trip counts into one row per trip with a random start time.

    Each row of ``trips_df`` contributes ``int(trips)`` trips (the fractional
    part is truncated). Every trip gets a start time drawn uniformly from the
    time windows that apply to its station pair.

    Parameters
    ----------
    trips_df : pandas.DataFrame
        Columns ``from``, ``to`` and ``trips``.
    date : str
        Day of the trips, formatted ``YYYY-MM-DD``.

    Returns
    -------
    pandas.DataFrame
        Columns ``start_station_name``, ``end_station_name`` and ``start_date``,
        one row per trip, indexed 0..n-1.
    """
    # Windows are (start_hour, end_hour) pairs; hours may be fractional.
    station_windows = {
        'Spaulding High School': [(7, 9), (15, 19)],
        'Montpelier High School': [(7, 9), (15, 19)],
    }
    from_school_windows = [(14.5, 18)]
    other_windows = [(8, 19)]

    def select_windows(origin, destination):
        if origin in station_windows:
            return station_windows[origin]
        if destination in station_windows:
            return station_windows[destination]
        # NOTE(review): this branch cannot be reached while both schools are keys
        # of station_windows (the first test already matches them). It is kept
        # with its original precedence; confirm the intended rule for trips
        # that leave a school.
        if origin in ['Spaulding High School', 'Montpelier High School']:
            return from_school_windows
        return other_windows

    # Hours are added as timedeltas so fractional hours such as 14.5 are valid.
    midnight = datetime.strptime(date, "%Y-%m-%d")

    # Collect rows in a list and build the frame once; appending to a DataFrame
    # row by row re-allocates it on every trip.
    records = []
    for origin, destination, trips in zip(trips_df['from'], trips_df['to'], trips_df['trips']):
        # The windows depend only on the station pair, so pick them once per pair.
        windows = select_windows(origin, destination)
        for _ in range(int(trips)):
            # Sample across every window of the pair, not only the first one.
            records.append((origin, destination, _sample_start_time(midnight, windows)))

    return pd.DataFrame(records, columns=['start_station_name', 'end_station_name', 'start_date'])

In [37]:
def add_duration(input_df, minimum_additional_minutes=None,
                 maximum_additional_minutes=None, bike_speed_in_kph=None):
    """Add ``duration`` (minutes) and ``end_date`` columns to ``input_df``.

    The duration is the riding time (``distance_in_km`` at the bike speed) plus
    a random whole number of additional minutes drawn from the inclusive range
    ``[minimum_additional_minutes, maximum_additional_minutes]``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain ``start_date`` and ``distance_in_km``. The frame is
        modified in place and also returned.
    minimum_additional_minutes, maximum_additional_minutes : int, optional
        Bounds of the random additional minutes. Default to
        ``MINIMUM_ADDITIONAL_DURATION_IN_MINUTES`` and
        ``MAXIMUM_ADDITIONAL_DURATION_IN_MINUTES``.
    bike_speed_in_kph : number, optional
        Riding speed. Defaults to ``BIKE_SPEED_IN_KPH``.

    Returns
    -------
    pandas.DataFrame
        ``input_df`` with the two new columns. Rows with a missing distance get
        a NaN ``duration`` and a NaT ``end_date`` instead of raising.
    """
    # Resolve defaults at call time so edits to the parameter cell are honoured.
    if minimum_additional_minutes is None:
        minimum_additional_minutes = MINIMUM_ADDITIONAL_DURATION_IN_MINUTES
    if maximum_additional_minutes is None:
        maximum_additional_minutes = MAXIMUM_ADDITIONAL_DURATION_IN_MINUTES
    if bike_speed_in_kph is None:
        bike_speed_in_kph = BIKE_SPEED_IN_KPH

    # randint excludes its upper bound, hence the + 1.
    additional_minutes = np.random.randint(
        minimum_additional_minutes,
        maximum_additional_minutes + 1,
        size=len(input_df))
    riding_minutes = input_df['distance_in_km'] / bike_speed_in_kph * 60

    input_df['duration'] = additional_minutes + riding_minutes

    # Vectorised end time. Durations are rounded to whole microseconds, the
    # resolution of datetime.timedelta; NaN durations become NaT.
    duration_in_microseconds = (input_df['duration'] * 60_000_000).round()
    input_df['end_date'] = (
        pd.to_datetime(input_df['start_date'])
        + pd.to_timedelta(duration_in_microseconds, unit='us')
    )

    return input_df

### Apply Steps

#### Step 1: Read input files, apply gravity model, and enumerate trips

In [38]:
# Load the two input tables from the paths set in the Parameters section.
activity_df = read_station_locations_and_activity(
    file_path=INPUT_STATION_LOCATIONS_AND_ACTIVITY_FILE,
)
distance_df = read_distance_between_stations(
    file_path=INPUT_DISTANCE_BETWEEN_STATIONS_FILE,
)

# Estimate trips per station pair, then expand them into individual timed trips.
trips_df = apply_gravity_model(
    activity_df=activity_df,
    distance_df=distance_df,
    intrazonal_trips=INTRAZONAL_TRIPS,
)
enumerated_trips_df = enumerate_trips(trips_df=trips_df, date=INPUT_DATE)


In [39]:
# Attach the origin-to-destination distance to every enumerated trip. The left
# join keeps every trip; the join keys copied from distance_df are then dropped.
enumerated_trips_df = enumerated_trips_df.merge(
    distance_df[['from', 'to', 'distance_in_km']],
    how='left',
    left_on=['start_station_name', 'end_station_name'],
    right_on=['from', 'to'],
)
enumerated_trips_df = enumerated_trips_df.drop(columns=['from', 'to'])

enumerated_trips_df.head()

Unnamed: 0,start_station_name,end_station_name,start_date,distance_in_km
0,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 16:39:40.163865,1.61
1,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 16:50:19.020543,1.61
2,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 18:57:31.999224,1.61
3,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 14:15:05.906426,1.61
4,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 10:58:59.540547,1.61


#### Step 2: Estimate duration

In [40]:
# Adds the 'duration' and 'end_date' columns, derived from each trip's
# 'distance_in_km' and 'start_date'.
enumerated_trips_df = add_duration(enumerated_trips_df)

#### Step 3: Tidy up with `station_id`

In [41]:
# Look up the station_id for the start station, then for the end station, and
# number the trips from 1.
output_trips_df = (
    enumerated_trips_df
    .merge(
        activity_df[['station_name', 'station_id']],
        how='left',
        left_on='start_station_name',
        right_on='station_name',
    )
    .drop(columns='station_name')
    .rename(columns={'station_id': 'start_station_id'})
    .merge(
        activity_df[['station_name', 'station_id']],
        how='left',
        left_on='end_station_name',
        right_on='station_name',
    )
    .drop(columns='station_name')
    .rename(columns={'station_id': 'end_station_id'})
    .assign(trip_id=lambda merged_df: merged_df.index + 1)
)

output_trips_df.head()

Unnamed: 0,start_station_name,end_station_name,start_date,distance_in_km,duration,end_date,start_station_id,end_station_id,trip_id
0,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 16:39:40.163865,1.61,111.83,2025-06-15 18:31:29.963865,2,1,1
1,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 16:50:19.020543,1.61,18.83,2025-06-15 17:09:08.820543,2,1,2
2,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 18:57:31.999224,1.61,114.83,2025-06-15 20:52:21.799224,2,1,3
3,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 14:15:05.906426,1.61,31.83,2025-06-15 14:46:55.706426,2,1,4
4,Corner of Main and State in Montpelier,Kellogg-Hubbard Library in Montpelier,2025-06-15 10:58:59.540547,1.61,40.83,2025-06-15 11:39:49.340547,2,1,5


#### Step 4: Write to disk

In [42]:
# Columns written to the output file, in file order.
output_columns = [
    'trip_id',
    'start_station_id',
    'start_station_name',
    'start_date',
    'end_station_id',
    'end_station_name',
    'end_date',
]

output_trips_df.loc[:, output_columns].to_csv(OUTPUT_TRIPS_FILE, index=False)