In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

NUMBER_OF_ZONES = 265

In [4]:
#get distance mat and df
intra_zone_matrix = np.load("intra_zone_matrix.npy")
df = pd.read_parquet("all_cleaned_data/all_cleaned_data.parquet", engine='fastparquet')
df.head()

Unnamed: 0,VendorID,PickupDatetime,DropoffDatetime,TripDuration,PassengerCount,TripDistance,PULocationID,DOLocationID,PaymentType,FareAmount,ExtraCharges,MTATax,TipAmount,TollsAmount,ImprovementSurcharge,TotalAmount,CongestionSurcharge,AirportFee
0,2.0,2023-10-01 00:57:33,2023-10-01 01:07:58,10.416667,1.0,1.45,166.0,74.0,1.0,12.1,1.0,0.5,2.92,0.0,1.0,17.52,0.0,0
1,2.0,2023-10-01 01:00:16,2023-10-01 01:06:13,5.95,1.0,0.89,74.0,42.0,2.0,7.9,1.0,0.5,0.0,0.0,1.0,10.4,0.0,0
2,2.0,2023-10-01 00:51:52,2023-10-01 01:00:32,8.666667,1.0,2.38,83.0,129.0,2.0,13.5,1.0,0.5,0.0,0.0,1.0,16.0,0.0,0
3,2.0,2023-10-01 00:03:39,2023-10-01 00:11:20,7.683333,1.0,2.26,74.0,263.0,1.0,11.4,1.0,0.5,3.33,0.0,1.0,19.98,2.75,0
4,2.0,2023-10-01 00:27:42,2023-10-01 00:39:10,11.466667,1.0,2.14,74.0,236.0,1.0,13.5,1.0,0.5,2.81,0.0,1.0,21.559999,2.75,0


In [6]:
df.size

1277028486

In [26]:
#define functions

#Find all close zones for each zone
def close_zones_fn(intra_zone_matrix):
    close_zones = np.empty(266, dtype=object)
    for p in range(266):
        close_zones[p] = []
        for d in range(266):
            if intra_zone_matrix[p][d][0] < 15:
                close_zones[p].append(int(d))
                
    return close_zones
         
#generate the probability transition matrix
def probability_transition_matrix_fn(df, intra_zone_matrix):
    
    #find all the close zones
    close_zones = close_zones_fn(intra_zone_matrix)
    
    #initialise matrix with shape: (hour of the day, dropoff zone, next likely pickup zone)
    ptm = np.zeros((24,266,266))
    
    #this is only the first 500- if this function seems accurate we can run it on the whole data on a labmachine/desktop 
    for i, row in df.iterrows():

        #Find the Hour and the Pickup Zone
        hour = row["PickupDatetime"].hour
        pu_zone = int(row["PULocationID"])



        # For every zone that is close, we +1 in its relevant slot that this pickup was made, thereby increasing its associate demand
        # Essentially, we accumulate all the pickups that have been made nearby for each zone in each hour.
        for close_zone in close_zones:
            ptm[hour, close_zone, pu_zone]+=1

    row_sums = ptm.sum(axis=2, keepdims=True)

    # normalise but avoid division by zero
    ptm_unshaped = np.divide(ptm, row_sums, where=row_sums != 0)
    
    #row_sums_check = ptm.sum(axis=2) #Veify for sanity that all add to 1
    
    # reshape matrix to shape (24 * 266, 266)
    #ptm = ptm_unshaped.reshape(-1, ptm_unshaped.shape[2])
    
    return ptm_unshaped




In [None]:
# save matrix
filename = "probability_transition_matrix.npy"
#filename_unshaped = "probability_transition_matrix_unshaped.npy"

ptm_result = probability_transition_matrix_fn(df, intra_zone_matrix)
np.save(filename, ptm_result)
#np.save(filename_unshaped, ptm_unshaped)

print("Probability Transition Matrix saved at ", filename)
#print("Unshaped Probability Transition Matrix saved at ", filename_unshaped)

In [13]:
ptm = np.load("probability_transition_matrix.npy")
ptm[12][244]


array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.01754386, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.01754386, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0877193 , 0.07017544, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.01754386, 0.        , 0.        , 0.        ,
       0.05263158, 0.01754386, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.22807