In [1]:
from scipy.stats import chisquare
from collections import defaultdict
import pandas as pd
import numpy as np
import model_helper_functions as mod

In [2]:
# Build the analysis frame in a single chain:
#   1. read the augmented parquet file and pass it through mod.necessary_fields,
#   2. derive an 'Hour' column from 'DropoffDatetime' via mod.round_time_to_int
#      (NOTE(review): helper semantics live in model_helper_functions -- confirm),
#   3. drop the timestamp / trip-metric columns that are not used further in this notebook section,
#   4. cast both location-ID columns to int.
df = (
    mod.necessary_fields(pd.read_parquet("all_cleaned_data_augmented.parquet"))
    .assign(Hour=lambda frame: frame['DropoffDatetime'].apply(mod.round_time_to_int))
    .drop(columns=[
        'DropoffDatetime',
        'PickupDatetime',
        'TripDuration',
        'TripDistance',
        'FareAmount',
        'TipAmount',
    ])
    .astype({'PULocationID': int, 'DOLocationID': int})
)

df.head()


Unnamed: 0,PULocationID,DOLocationID,NextPU,Hour
0,166,74,74,1
1,74,42,82,1
2,83,129,116,1
3,74,263,7,0
4,74,236,80,1


In [20]:
def run_kl_divergence_unified(df, p_col='DOLocationID', q_col='PULocationID', epsilon=1e-10):
    """Compute KL(P || Q) between the value distributions of two columns.

    P is the empirical distribution of ``df[p_col]`` and Q that of
    ``df[q_col]``; both are expressed over the union of the values that
    occur in either column, so the two probability vectors line up.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``p_col`` and ``q_col``.
    p_col : str, default 'DOLocationID'
        Column whose value frequencies form P.
    q_col : str, default 'PULocationID'
        Column whose value frequencies form Q.
    epsilon : float, default 1e-10
        Floor applied to every probability after normalisation so that
        neither ``log(0)`` nor a division by zero occurs. The floored
        vectors are not renormalised.

    Returns
    -------
    dict
        ``{'KL_Divergence': value, 'N': len(df)}``. The divergence is
        ``nan`` when either column contributes no counts (for example an
        empty frame).
    """
    counts_p = df[p_col].value_counts()
    counts_q = df[q_col].value_counts()

    # Shared, sorted support so that position i refers to the same ID in P and Q.
    all_ids = sorted(set(counts_p.index) | set(counts_q.index))

    # IDs missing from one column get a count of 0 in that distribution.
    dist_p = counts_p.reindex(all_ids, fill_value=0).to_numpy(dtype=float)
    dist_q = counts_q.reindex(all_ids, fill_value=0).to_numpy(dtype=float)

    total_p = dist_p.sum()
    total_q = dist_q.sum()

    if total_p == 0 or total_q == 0:
        # Nothing to normalise: the divergence is undefined.
        kl_div = np.nan
    else:
        # Normalise to probabilities, then floor at epsilon to keep the log finite.
        dist_p = np.maximum(dist_p / total_p, epsilon)
        dist_q = np.maximum(dist_q / total_q, epsilon)

        # KL Divergence (P || Q)
        kl_div = np.sum(dist_p * np.log(dist_p / dist_q))

    return {
        'KL_Divergence': kl_div,
        'N': len(df)
    }

In [11]:
def run_kl_by_hour(df, p_col='DOLocationID', q_col='PULocationID', epsilon=1e-10):
    """Compute KL(P || Q) between two columns separately for each 'Hour' group.

    Within every group of ``df.groupby('Hour')``, P is the empirical
    distribution of ``p_col`` and Q that of ``q_col``, both expressed over
    the union of the values seen in either column of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``'Hour'``, ``p_col`` and ``q_col``.
    p_col : str, default 'DOLocationID'
        Column whose value frequencies form P.
    q_col : str, default 'PULocationID'
        Column whose value frequencies form Q.
    epsilon : float, default 1e-10
        Floor applied to every probability after normalisation so that
        neither ``log(0)`` nor a division by zero occurs. The floored
        vectors are not renormalised.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Hour'``, ``'KL_Divergence'`` and ``'N'`` (group size),
        sorted by ``'Hour'``. The divergence is ``nan`` for a group in which
        either column contributes no counts. When there are no groups an
        empty frame with these three columns is returned.
    """
    result_columns = ['Hour', 'KL_Divergence', 'N']
    results = []

    for hour, group in df.groupby('Hour'):
        # Value frequencies of the two compared columns within this hour.
        counts_p = group[p_col].value_counts()
        counts_q = group[q_col].value_counts()

        # Shared, sorted support so that position i refers to the same ID in P and Q.
        all_ids = sorted(set(counts_p.index) | set(counts_q.index))

        # IDs missing from one column get a count of 0 in that distribution.
        dist_p = counts_p.reindex(all_ids, fill_value=0).to_numpy(dtype=float)
        dist_q = counts_q.reindex(all_ids, fill_value=0).to_numpy(dtype=float)

        total_p = dist_p.sum()
        total_q = dist_q.sum()

        if total_p == 0 or total_q == 0:
            # Nothing to normalise: the divergence is undefined for this hour.
            kl_div = np.nan
        else:
            # Normalise to probabilities, then floor at epsilon to keep the log finite.
            dist_p = np.maximum(dist_p / total_p, epsilon)
            dist_q = np.maximum(dist_q / total_q, epsilon)

            # KL(P || Q)
            kl_div = np.sum(dist_p * np.log(dist_p / dist_q))

        results.append({
            'Hour': hour,
            'KL_Divergence': kl_div,
            'N': len(group)
        })

    # Explicit columns keep sort_values from raising KeyError when no group was produced.
    return pd.DataFrame(results, columns=result_columns).sort_values('Hour')

In [12]:
# Compute the per-hour KL divergence table from the prepared frame and print it as plain text.
result = run_kl_by_hour(df)
print(result)

    Hour  KL_Divergence        N
0      0       0.609162  1207203
1      1       0.588672  1763888
2      2       0.572761  1166909
3      3       0.567741   738832
4      4       0.567362   477572
5      5       0.522358   281212
6      6       0.500149   469089
7      7       0.368311  1143866
8      8       0.193835  2073441
9      9       0.155051  2735951
10    10       0.115210  3086430
11    11       0.090138  3360550
12    12       0.071641  3685443
13    13       0.069369  1954812
