In [29]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# NOTE(review): this constant is not referenced by the function cells in this
# notebook -- confirm whether it is still needed. Presumably zone IDs start at
# 1, in which case zone-indexed arrays need NUMBER_OF_ZONES + 1 slots.
NUMBER_OF_ZONES = 265

In [None]:
# Load the pairwise zone matrix and the cleaned trip records from disk.
intra_zone_matrix = np.load(file="intra_zone_matrix.npy")
df = pd.read_parquet(
    path="all_cleaned_data/all_cleaned_data.parquet",
    engine="fastparquet",
)

# Preview the first rows as the cell output.
df.head()

In [28]:
#define functions

#Find all close zones for each zone
def close_zones(intra_zone_matrix, max_distance=15):
    """List, for every zone, the zones whose first matrix entry is below a threshold.

    Parameters
    ----------
    intra_zone_matrix : array-like, shape (n_zones, n_zones, k)
        Pairwise zone matrix. Only ``intra_zone_matrix[p, d, 0]`` is read.
        NOTE(review): presumably this entry is the distance (or travel time)
        from zone ``p`` to zone ``d``; the units are not visible here --
        confirm against the code that builds the matrix.
    max_distance : float, default 15
        Exclusive upper bound: zone ``d`` counts as close to zone ``p`` when
        the entry is strictly less than this value.

    Returns
    -------
    numpy.ndarray
        Object array with one element per row of the matrix; element ``p`` is
        an ascending list of the integer indices ``d`` that are close to ``p``.
    """
    # Compare the whole first channel at once. Taking the size from the matrix
    # (rather than a hard-coded 266) keeps the result in step with the input.
    is_close = np.asarray(intra_zone_matrix)[:, :, 0] < max_distance

    neighbours = np.empty(is_close.shape[0], dtype=object)
    for zone, row in enumerate(is_close):
        # tolist() yields plain Python ints in ascending order.
        neighbours[zone] = np.flatnonzero(row).tolist()

    return neighbours
         
#generate the probability transition matrix
def probability_transition_matrix(df, intra_zone_matrix, max_rows=500):
    """Estimate, per hour of day, where the next pickup occurs near each zone.

    Every pickup in zone ``p`` during hour ``h`` adds one count to
    ``[h, d, p]`` for each zone ``d`` that ``close_zones`` lists as close to
    ``p``. Each ``[h, d, :]`` row is then normalised to sum to 1; rows with no
    nearby pickups stay all-zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records. Reads the ``"PickupDatetime"`` column (datetime-like)
        and the ``"PULocationID"`` column (integer zone index).
    intra_zone_matrix : array-like
        Pairwise zone matrix, passed straight to ``close_zones``.
    max_rows : int or None, default 500
        Only the first ``max_rows`` rows of ``df`` are used. The default is a
        small sample for quick checks; pass ``None`` to use the whole frame.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(24 * n_zones, n_zones)``, where ``n_zones`` is the
        length of the ``close_zones`` result. Row ``h * n_zones + d`` holds
        the distribution over next pickup zones for dropoff zone ``d`` at
        hour ``h``.
    """
    # Bind the helper's result to a different name: assigning to
    # ``close_zones`` inside this function would make it a local variable and
    # the call itself would raise UnboundLocalError.
    nearby = close_zones(intra_zone_matrix)
    n_zones = len(nearby)

    trips = df if max_rows is None else df.head(max_rows)
    hours = pd.to_datetime(trips["PickupDatetime"]).dt.hour.to_numpy().astype(np.intp)
    pu_zones = trips["PULocationID"].to_numpy().astype(np.intp)

    # pickups[h, p] = number of pickups made in zone p during hour h.
    # np.add.at accumulates correctly when the same (h, p) pair repeats.
    pickups = np.zeros((24, n_zones))
    np.add.at(pickups, (hours, pu_zones), 1)

    # reachable[d, p] is True when zone d is listed as close to pickup zone p.
    # NOTE(review): uses the neighbours of the pickup zone; this equals "zones
    # that have p nearby" only if the closeness relation is symmetric.
    reachable = np.zeros((n_zones, n_zones), dtype=bool)
    for pu_zone, neighbours in enumerate(nearby):
        if len(neighbours):
            reachable[neighbours, pu_zone] = True

    # shape: (hour of the day, dropoff zone, next likely pickup zone)
    ptm = pickups[:, np.newaxis, :] * reachable[np.newaxis, :, :]

    row_sums = ptm.sum(axis=2, keepdims=True)

    # Normalise but avoid division by zero. ``out`` must be supplied: without
    # it, entries skipped by ``where`` are left as uninitialised memory.
    ptm = np.divide(ptm, row_sums, out=np.zeros_like(ptm), where=row_sums != 0)

    # reshape matrix to shape (24 * n_zones, n_zones)
    return ptm.reshape(-1, ptm.shape[2])




In [None]:
# Build the transition matrix from the loaded trips, then write it to disk.
ptm = probability_transition_matrix(df, intra_zone_matrix)

filename = "probability_transition_matrix.npy"
np.save(filename, ptm)

# Report where the file was written.
print("Probability Transition Matrix saved at ", filename)