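# Augment DF with a sampled next pickup

Each trip gets a `NextPU` column: a next pickup zone drawn at random from the probability transition matrix entry for the trip's drop-off hour and drop-off zone. This is a sample from that distribution, not its single most likely zone.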

In [3]:
import pandas as pd
import numpy as np
import os


# Zone-count constant; no cell visible in this notebook reads it.
# NOTE(review): presumably the size of the zone axes of the transition matrix — confirm.
NUMBER_OF_ZONES = 265

In [4]:
# Trip table, read from the cleaned-data parquet file with the fastparquet engine
df = pd.read_parquet(
    path="all_cleaned_data/all_cleaned_data.parquet",
    engine="fastparquet",
)
# Transition matrix; augment_df indexes it as ptm[hour][drop-off zone]
ptm = np.load(file="probability_transition_matrix.npy")

In [3]:
# Preview the first five rows of the loaded trip table
df.head()


Unnamed: 0,VendorID,PickupDatetime,DropoffDatetime,TripDuration,PassengerCount,TripDistance,PULocationID,DOLocationID,PaymentType,FareAmount,ExtraCharges,MTATax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
0,2.0,2023-10-01 00:57:33,2023-10-01 01:07:58,10.416667,1.0,1.45,166.0,74.0,1.0,12.1,1.0,0.5,2.92,0.0,1.0,17.52,0.0,0
1,2.0,2023-10-01 01:00:16,2023-10-01 01:06:13,5.95,1.0,0.89,74.0,42.0,2.0,7.9,1.0,0.5,0.0,0.0,1.0,10.4,0.0,0
2,2.0,2023-10-01 00:51:52,2023-10-01 01:00:32,8.666667,1.0,2.38,83.0,129.0,2.0,13.5,1.0,0.5,0.0,0.0,1.0,16.0,0.0,0
3,2.0,2023-10-01 00:03:39,2023-10-01 00:11:20,7.683333,1.0,2.26,74.0,263.0,1.0,11.4,1.0,0.5,3.33,0.0,1.0,19.98,2.75,0
4,2.0,2023-10-01 00:27:42,2023-10-01 00:39:10,11.466667,1.0,2.14,74.0,236.0,1.0,13.5,1.0,0.5,2.81,0.0,1.0,21.559999,2.75,0


In [7]:
try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; run without it if tqdm is absent
    tqdm = None


def _sample_next_pickups(hours, zones, ptm, rng):
    """Draw one next-pickup index per row; rows without a usable distribution get 0.

    Args:
        hours: 1-D float array of drop-off hours (NaN marks a missing value).
        zones: 1-D float array of drop-off zone ids, same length as ``hours``.
        ptm: 3-D float array; ``ptm[hour, zone]`` holds the weights over next zones.
        rng: object exposing ``choice(a, size=..., p=...)``.

    Returns:
        int64 array with one sampled index per input row.
    """
    sampled = np.zeros(len(hours), dtype=np.int64)
    n_hours, n_zones = ptm.shape[0], ptm.shape[1]

    zones = np.trunc(zones)  # same truncation int(DOLocationID) would apply
    usable = (
        np.isfinite(hours) & np.isfinite(zones)
        & (hours >= 0) & (hours < n_hours)
        # Reject negative ids explicitly: plain indexing would wrap them around.
        & (zones >= 0) & (zones < n_zones)
    )
    rows = np.flatnonzero(usable)
    if rows.size == 0:
        return sampled

    # Rows sharing (hour, zone) share a distribution, so draw for each group at once.
    keys = hours[rows].astype(np.int64) * n_zones + zones[rows].astype(np.int64)
    order = np.argsort(keys, kind="stable")
    cuts = np.flatnonzero(np.diff(keys[order])) + 1
    for group in np.split(order, cuts):
        hour, zone = divmod(int(keys[group[0]]), n_zones)
        weights = ptm[hour, zone]
        total = weights.sum()
        if not np.isclose(total, 1.0) or weights.min() < 0:
            continue  # not a probability distribution: keep the 0 fallback
        # Renormalise: choice() enforces a much tighter sum-to-1 tolerance than
        # np.isclose, so an almost-normalised row would otherwise raise.
        sampled[rows[group]] = rng.choice(weights.size, size=group.size, p=weights / total)
    return sampled


def augment_df(df, ptm, chunk_size=100_000, seed=None):
    """Add a ``NextPU`` column holding a randomly drawn next pickup zone per trip.

    For every row the distribution ``ptm[hour][DOLocationID]`` is looked up, where
    ``hour`` is the hour of ``DropoffDatetime``, and one index is sampled from it.
    Rows whose hour or zone is missing or outside the matrix, or whose
    distribution does not sum to ~1 or contains negative weights, get ``NextPU = 0``.

    Note that 0 is both the fallback value and a possible sampled index.
    NOTE(review): this assumes ``ptm`` is indexed directly by the raw
    ``DOLocationID`` values — confirm against the code that built the matrix.

    Args:
        df: DataFrame with ``DropoffDatetime`` and ``DOLocationID`` columns. It is
            modified in place (the ``NextPU`` column is added or overwritten).
        ptm: 3-D array-like of probabilities, indexed ``[hour][zone][next zone]``.
        chunk_size: number of rows processed per progress-bar step.
        seed: optional seed for a dedicated ``numpy`` generator. ``None`` keeps
            drawing from NumPy's global random state, as before.

    Returns:
        The same ``df`` object, with the integer ``NextPU`` column set.

    Raises:
        ValueError: if ``chunk_size`` is not positive or ``ptm`` is not 3-D.
        KeyError: if a required column is missing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    missing = [col for col in ("DropoffDatetime", "DOLocationID") if col not in df.columns]
    if missing:
        # Fail loudly instead of silently producing an all-zero column.
        raise KeyError(f"augment_df requires column(s): {missing}")
    ptm = np.asarray(ptm, dtype=float)
    if ptm.ndim != 3:
        raise ValueError("ptm must be a 3-D array indexed as [hour][zone][next zone]")

    rng = np.random if seed is None else np.random.default_rng(seed)
    n_rows = len(df)
    # Filled by position, so duplicate or unordered index labels cannot misroute writes.
    next_pu = np.zeros(n_rows, dtype=np.int64)

    starts = range(0, n_rows, chunk_size)
    if tqdm is not None:
        # len(range) is the exact chunk count, including when n_rows % chunk_size == 0.
        starts = tqdm(starts, total=len(starts), desc="Processing chunks")

    for start in starts:
        stop = min(start + chunk_size, n_rows)
        chunk = df.iloc[start:stop]
        hours = (
            pd.to_datetime(chunk["DropoffDatetime"], errors="coerce")
            .dt.hour.to_numpy(dtype=float)
        )
        zones = pd.to_numeric(chunk["DOLocationID"], errors="coerce").to_numpy(dtype=float)
        next_pu[start:stop] = _sample_next_pickups(hours, zones, ptm, rng)

    df["NextPU"] = next_pu
    return df

In [9]:
output_filename = "all_cleaned_data_augmented.parquet"

# Size of the random subset that gets augmented, and the seed that makes both
# the subset and the drawn NextPU values repeatable across runs.
SAMPLE_SIZE = 10_000
RANDOM_SEED = 42

# Seed NumPy's global random state, which augment_df draws from by default.
np.random.seed(RANDOM_SEED)

# Cap the sample at the frame length: DataFrame.sample raises when asked for
# more rows than exist (without replacement).
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
df_augmented = augment_df(df_sample, ptm)

df_augmented.to_parquet(output_filename, index=False, engine="fastparquet")



Processing chunks: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.11it/s]


In [10]:
# Ask for the first 500 augmented rows; the rendered table below is elided in the middle
df_augmented.head(500)

Unnamed: 0,VendorID,PickupDatetime,DropoffDatetime,TripDuration,PassengerCount,TripDistance,PULocationID,DOLocationID,PaymentType,FareAmount,ExtraCharges,MTATax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee,NextPU
46510781,2.0,2023-02-16 16:07:21,2023-02-16 16:20:42,13.350000,1.0,2.04,238.0,163.0,1.0,14.200000,2.5,0.5,4.14,0.00,1.0,24.840000,2.5,0,0
13747889,2.0,2024-02-23 22:44:39,2024-02-23 23:18:08,33.483333,1.0,4.03,79.0,48.0,1.0,28.900000,1.0,0.5,3.00,0.00,1.0,36.900002,2.5,0,0
32968806,1.0,2023-11-03 19:47:15,2023-11-03 20:06:09,18.900000,1.0,4.90,87.0,170.0,2.0,25.400000,5.0,0.5,0.00,0.00,1.0,31.900000,2.5,0,0
39620116,2.0,2023-06-06 21:28:08,2023-06-06 21:54:20,26.200000,5.0,3.11,161.0,262.0,1.0,24.700001,1.0,0.5,4.46,0.00,1.0,34.160000,2.5,0,0
30622043,1.0,2024-09-11 07:44:45,2024-09-11 08:14:48,30.050000,1.0,2.00,239.0,43.0,1.0,25.400000,2.5,0.5,5.85,0.00,1.0,35.250000,2.5,0,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14558567,2.0,2024-10-01 20:18:43,2024-10-01 20:26:10,7.450000,1.0,2.08,233.0,79.0,1.0,10.700000,1.0,0.5,3.14,0.00,1.0,18.840000,2.5,0,0
28337129,1.0,2024-11-19 17:45:47,2024-11-19 17:52:35,6.800000,1.0,0.80,170.0,234.0,1.0,7.900000,5.0,0.5,3.60,0.00,1.0,18.000000,2.5,0,0
49201614,2.0,2023-04-15 12:38:33,2023-04-15 14:07:49,89.266667,3.0,20.52,132.0,230.0,1.0,70.000000,0.0,0.5,10.00,6.55,1.0,92.300003,2.5,1,0
6882928,2.0,2023-07-14 16:36:24,2023-07-14 16:42:12,5.800000,1.0,0.87,90.0,246.0,1.0,7.200000,2.5,0.5,2.74,0.00,1.0,16.440001,2.5,0,0
