Generate a tensor of dropoff-zone probabilities conditioned on the pickup zone, with one (pickup zone × dropoff zone) slice for each hour of the day (0–23).

In [None]:
import pandas as pd
import numpy as np

def build_transition_zoneMat(parquet_path: str,
                            num_zones: int = 266,
                            datetime_col: str = 'tpep_pickup_datetime',
                            pu_col: str = 'PULocationID',
                            do_col: str = 'DOLocationID',
                            output_path: str = 'zone_transition_probabilities_2024.npy'):
    """
    Build and save hourly pickup -> dropoff transition probabilities.

    Reads trip data from Parquet, counts trips per (hour, pickup, dropoff),
    and normalizes the counts into a (num_zones x num_zones x 24) tensor where
    ``tensor[i-1, j-1, h]`` estimates P(dropoff=j | pickup=i, hour=h).
    The tensor is written to ``output_path`` with ``np.save``.

    For every hour, each pickup row either sums to 1 or, when that zone had
    no pickups in that hour, is all zeros. Trips whose pickup or dropoff ID is
    not one of the integers 1..num_zones are ignored, and probabilities are
    normalized over the remaining in-range dropoffs.

    Parameters
    ----------
    parquet_path : str
        Path to the trip-data Parquet file.
    num_zones : int
        Number of taxi zones (default: 266).
    datetime_col : str
        Name of the column with pickup timestamps.
    pu_col : str
        Name of the column with pickup zone IDs (1..num_zones).
    do_col : str
        Name of the column with dropoff zone IDs (1..num_zones).
    output_path : str
        Where to write the resulting .npy file.

    Returns
    -------
    None
        The result is saved to disk and a one-line summary is printed.
    """
    # Only three columns are used, so project at read time to keep memory low.
    df = pd.read_parquet(parquet_path, engine='pyarrow',
                         columns=[datetime_col, pu_col, do_col])

    # Hour-of-day [0-23] of each pickup
    hour = pd.to_datetime(df[datetime_col]).dt.hour

    # Count trips by (hour, pickup_zone, dropoff_zone); rows with missing keys
    # are dropped by groupby's default dropna=True.
    grouped = (
        df[[pu_col, do_col]]
        .assign(hour=hour)
        .groupby(['hour', pu_col, do_col])
        .size()
        .reset_index(name='count')
    )

    # Keep only zone labels that are exactly one of 1..num_zones.
    zones = np.arange(1, num_zones + 1)
    in_range = grouped[pu_col].isin(zones) & grouped[do_col].isin(zones)
    grouped = grouped[in_range]

    # Zone IDs are 1-based; tensor axes are 0-based.
    pu_idx = grouped[pu_col].to_numpy(dtype=np.int64) - 1
    do_idx = grouped[do_col].to_numpy(dtype=np.int64) - 1
    hr_idx = grouped['hour'].to_numpy(dtype=np.int64)

    # Group keys are unique, so a plain indexed assignment scatters every count.
    counts = np.zeros((num_zones, num_zones, 24), dtype=np.float64)
    counts[pu_idx, do_idx, hr_idx] = grouped['count'].to_numpy(dtype=np.float64)

    # Normalize over the dropoff axis. `out` must be pre-filled: with `where=`
    # and no `out`, NumPy leaves the masked-out entries uninitialized, which
    # would put garbage in the rows of zones that had no pickups.
    row_sums = counts.sum(axis=1, keepdims=True)
    tensor = np.divide(counts, row_sums,
                       out=np.zeros_like(counts),
                       where=(row_sums != 0))

    # Save as .npy for fast loading
    np.save(output_path, tensor)
    print(f"Saved transition tensor to {output_path} "
          f"(shape = {tensor.shape}, dtype = {tensor.dtype})")





   

In [10]:

# Build the hourly pickup -> dropoff probability tensor from the cleaned 2024
# trips and write it next to the notebook.
build_transition_zoneMat(
    parquet_path='yellow_taxi_tripdata_2024_clean.parquet',
    output_path='zone_transition_probabilities_2024.npy',
)



Saved transition tensor to zone_transition_probabilities_2024.npy (shape = (266, 266, 24), dtype = float64)


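Now build the same kind of tensor, but holding the average amount earned per trip (the `total_amount` column by default) for each pickup zone, dropoff zone and hour of day.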

In [None]:


def build_avg_earnings_matrix(
    parquet_path: str,
    num_zones: int = 266,
    datetime_col: str = 'tpep_pickup_datetime',
    pu_col: str = 'PULocationID',
    do_col: str = 'DOLocationID',
    amount_col: str = 'total_amount',
    output_path: str = 'zone_avg_earnings_2024.npy'
):
    """
    Build and save the mean trip amount per (pickup zone, dropoff zone, hour).

    Loads trips from Parquet, averages ``amount_col`` within every
    (hour, pickup, dropoff) group, and stores the means in a
    (num_zones x num_zones x 24) float64 tensor such that
    ``tensor[i-1, j-1, h]`` is E[amount_col | pickup=i, dropoff=j, hour=h].
    Cells with no observed trips are NaN. Zone IDs that are not one of
    1..num_zones are left out. The tensor is written with ``np.save``.

    Parameters
    ----------
    parquet_path : str
        Path to the trip-data Parquet file.
    num_zones : int
        Number of taxi zones (default: 266).
    datetime_col : str
        Column holding pickup timestamps.
    pu_col : str
        Column holding pickup zone IDs (1..num_zones).
    do_col : str
        Column holding dropoff zone IDs (1..num_zones).
    amount_col : str
        Column holding the per-trip amount to average (e.g. 'total_amount').
    output_path : str
        Destination filename for the .npy tensor.

    Returns
    -------
    None
        The result is saved to disk and a one-line summary is printed.
    """
    trips = pd.read_parquet(parquet_path, engine='pyarrow')

    # Hour of day (0-23) of each pickup
    pickup_hour = pd.to_datetime(trips[datetime_col]).dt.hour

    # One row per observed (hour, pickup, dropoff) with its mean amount
    cell_means = (
        trips
        .assign(hour=pickup_hour)
        .groupby(['hour', pu_col, do_col])[amount_col]
        .mean()
        .reset_index(name='avg_amount')
    )

    # Restrict to zone labels that are exactly one of 1..num_zones
    zone_ids = np.arange(1, num_zones + 1)
    valid = cell_means[pu_col].isin(zone_ids) & cell_means[do_col].isin(zone_ids)
    cell_means = cell_means[valid]

    # 1-based zone IDs -> 0-based tensor positions
    pu_pos = cell_means[pu_col].to_numpy(dtype=np.int64) - 1
    do_pos = cell_means[do_col].to_numpy(dtype=np.int64) - 1
    hr_pos = cell_means['hour'].to_numpy(dtype=np.int64)

    # Start from all-NaN so unobserved cells stay NaN, then scatter the means
    # (group keys are unique, so each cell is written at most once).
    tensor = np.full((num_zones, num_zones, 24), np.nan, dtype=np.float64)
    tensor[pu_pos, do_pos, hr_pos] = cell_means['avg_amount'].to_numpy(dtype=np.float64)

    np.save(output_path, tensor)
    print(f"Saved average‐earnings tensor to {output_path} "
          f"(shape={tensor.shape}, dtype={tensor.dtype})")



In [14]:
# Build the hourly average-amount tensor from the same cleaned 2024 trips.
build_avg_earnings_matrix(
    parquet_path='yellow_taxi_tripdata_2024_clean.parquet',
    output_path='zone_avg_earnings_2024.npy',
)


Saved average‐earnings tensor to zone_avg_earnings_2024.npy (shape=(266, 266, 24), dtype=float64)
