# Augment DF with a next pickup zone sampled from the transition probabilities

In [3]:
import pandas as pd
import numpy as np
import os


# Zone-count constant; it is not referenced by any of the cells shown in this notebook.
# NOTE(review): presumably the number of taxi zones, i.e. the size of the zone axes of
# the transition matrix loaded below -- confirm against the file that produced it.
NUMBER_OF_ZONES = 265

In [4]:
# Load the cleaned trip records from Parquet using the fastparquet engine.
df = pd.read_parquet("all_cleaned_data/all_cleaned_data.parquet", engine='fastparquet')
# Load the saved transition matrix. augment_df indexes it as ptm[hour][drop-off zone]
# and uses the resulting vector as sampling probabilities for the next pickup zone.
ptm = np.load("probability_transition_matrix.npy")

In [3]:
# Preview the first rows of the loaded trips to check the available columns.
df.head()


Unnamed: 0,VendorID,PickupDatetime,DropoffDatetime,TripDuration,PassengerCount,TripDistance,PULocationID,DOLocationID,PaymentType,FareAmount,ExtraCharges,MTATax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
0,2.0,2023-10-01 00:57:33,2023-10-01 01:07:58,10.416667,1.0,1.45,166.0,74.0,1.0,12.1,1.0,0.5,2.92,0.0,1.0,17.52,0.0,0
1,2.0,2023-10-01 01:00:16,2023-10-01 01:06:13,5.95,1.0,0.89,74.0,42.0,2.0,7.9,1.0,0.5,0.0,0.0,1.0,10.4,0.0,0
2,2.0,2023-10-01 00:51:52,2023-10-01 01:00:32,8.666667,1.0,2.38,83.0,129.0,2.0,13.5,1.0,0.5,0.0,0.0,1.0,16.0,0.0,0
3,2.0,2023-10-01 00:03:39,2023-10-01 00:11:20,7.683333,1.0,2.26,74.0,263.0,1.0,11.4,1.0,0.5,3.33,0.0,1.0,19.98,2.75,0
4,2.0,2023-10-01 00:27:42,2023-10-01 00:39:10,11.466667,1.0,2.14,74.0,236.0,1.0,13.5,1.0,0.5,2.81,0.0,1.0,21.559999,2.75,0


In [5]:
import numpy as np
from tqdm import tqdm

def augment_df(df, ptm, chunk_size=100_000, rng=None):
    """Add a ``NextPU`` column holding a sampled next pickup zone for each trip.

    For every row, the hour of ``DropoffDatetime`` and the integer part of
    ``DOLocationID`` select the vector ``ptm[hour, zone]``; the next pickup
    zone is drawn at random using that vector as probabilities. Rows are
    sampled in bulk, one draw call per distinct (hour, zone) pair per chunk,
    instead of one Python-level call per row.

    A row receives the fallback value ``0`` when:
      * its drop-off time or drop-off zone is missing or cannot be parsed,
      * its hour or zone lies outside the first two axes of ``ptm``
        (negative zones are rejected rather than wrapping around), or
      * the selected vector does not sum to 1 (``np.isclose``) or contains
        negative or NaN entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``DropoffDatetime`` and ``DOLocationID`` columns. The frame
        is modified in place: ``NextPU`` is added or overwritten.
    ptm : array-like, 3-D
        Indexed as ``ptm[hour, dropoff_zone, next_pickup_zone]``.
    chunk_size : int, default 100_000
        Number of rows processed per progress-bar step; bounds the size of the
        temporary arrays.
    rng : None, int, or numpy.random.Generator, optional
        Seed or generator passed to ``numpy.random.default_rng``. ``None``
        (the default) gives non-reproducible draws.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying an int64 ``NextPU`` column.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive or ``ptm`` is not 3-dimensional.
    KeyError
        If a required column is absent (previously this silently produced an
        all-zero column).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    ptm = np.asarray(ptm)
    if ptm.ndim != 3:
        raise ValueError("ptm must be a 3-D array indexed as [hour, dropoff_zone, next_zone]")

    generator = np.random.default_rng(rng)
    n_hours, n_zones = ptm.shape[0], ptm.shape[1]
    n_rows = len(df)

    # Results are collected positionally and attached once at the end, so
    # duplicate or shuffled index labels cannot redirect a write to other rows.
    next_pu = np.zeros(n_rows, dtype=np.int64)

    # Ceiling division: exact multiples of chunk_size must not count an extra chunk.
    num_chunks = -(-n_rows // chunk_size)
    for start in tqdm(range(0, n_rows, chunk_size), total=num_chunks, desc="Processing chunks"):
        chunk = df.iloc[start:start + chunk_size]

        # Unparseable values become NaN and are filtered out just below.
        hours = pd.to_datetime(chunk["DropoffDatetime"], errors="coerce").dt.hour.to_numpy(dtype=float)
        zones = pd.to_numeric(chunk["DOLocationID"], errors="coerce").to_numpy(dtype=float)

        rows = np.flatnonzero(np.isfinite(hours) & np.isfinite(zones))
        hour_idx = hours[rows].astype(np.int64)
        zone_idx = zones[rows].astype(np.int64)  # truncates like int() did per row
        in_range = (
            (hour_idx >= 0) & (hour_idx < n_hours)
            & (zone_idx >= 0) & (zone_idx < n_zones)
        )
        rows, hour_idx, zone_idx = rows[in_range], hour_idx[in_range], zone_idx[in_range]

        # Group the chunk's rows by (hour, zone) so each distribution is
        # validated once and sampled with a single vectorised call.
        keys = hour_idx * n_zones + zone_idx
        order = np.argsort(keys, kind="stable")
        unique_keys, first = np.unique(keys[order], return_index=True)
        for key, members in zip(unique_keys, np.split(rows[order], first[1:])):
            dist = np.asarray(ptm[key // n_zones, key % n_zones], dtype=np.float64)
            total = dist.sum()
            if not np.isclose(total, 1.0) or (dist < 0).any():
                continue  # invalid distribution: keep the fallback 0
            # Renormalise: the sampler's sum-to-one check is stricter than
            # np.isclose, so a vector accepted above could otherwise be rejected.
            next_pu[start + members] = generator.choice(
                dist.size, size=members.size, p=dist / total
            )

    df["NextPU"] = next_pu
    return df

In [6]:
# Destination file for the augmented dataset.
output_filename = "all_cleaned_data_augmented.parquet"

# augment_df writes the NextPU column into df in place and returns that same
# object, so df_augmented and df refer to one DataFrame after this call.
df_augmented = augment_df(df, ptm)

# Persist the augmented trips without the index, using the fastparquet engine.
df_augmented.to_parquet(output_filename, index=False, engine="fastparquet")



KeyboardInterrupt: 

In [14]:
# Inspect the first 500 augmented rows, including the new NextPU column.
df_augmented.head(500)

Unnamed: 0,VendorID,PickupDatetime,DropoffDatetime,TripDuration,PassengerCount,TripDistance,PULocationID,DOLocationID,PaymentType,FareAmount,ExtraCharges,MTATax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee,NextPU
19813584,1.0,2024-05-20 14:05:59,2024-05-20 14:32:45,26.766667,1.0,1.100000,141.0,162.0,1.0,21.900000,2.5,0.5,5.15,0.00,1.0,31.049999,2.5,0,
32956254,1.0,2023-11-03 18:56:22,2023-11-03 19:02:02,5.666667,1.0,0.900000,239.0,236.0,1.0,7.900000,5.0,0.5,2.85,0.00,1.0,17.250000,2.5,0,
68650444,2.0,2024-03-08 14:52:06,2024-03-08 14:58:34,6.466667,2.0,0.970000,50.0,68.0,1.0,7.900000,0.0,0.5,6.00,0.00,1.0,17.900000,2.5,0,
10087710,1.0,2023-05-16 13:10:45,2023-05-16 13:44:40,33.916667,1.0,3.100000,186.0,140.0,1.0,28.200001,2.5,0.5,6.40,0.00,1.0,38.599998,2.5,0,
13877830,1.0,2024-02-25 07:57:59,2024-02-25 08:00:47,2.800000,1.0,0.900000,229.0,162.0,1.0,5.800000,2.5,0.5,1.00,0.00,1.0,10.800000,2.5,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5287163,2.0,2023-08-26 09:10:33,2023-08-26 09:29:57,19.400000,1.0,5.540000,90.0,151.0,2.0,26.799999,0.0,0.5,0.00,0.00,1.0,30.799999,2.5,0,
14525600,2.0,2024-10-01 16:56:00,2024-10-01 17:08:45,12.750000,3.0,1.170000,244.0,243.0,2.0,12.800000,2.5,0.5,0.00,0.00,1.0,16.799999,0.0,0,
55329072,1.0,2024-08-24 10:23:29,2024-08-24 10:49:59,26.500000,1.0,6.700000,79.0,190.0,1.0,31.000000,2.5,0.5,7.00,0.00,1.0,42.000000,2.5,0,
41765163,2.0,2023-06-28 10:21:07,2023-06-28 11:40:38,79.516667,2.0,18.389999,132.0,230.0,1.0,70.000000,0.0,0.5,0.00,6.55,1.0,82.300003,2.5,1,
