In [3]:
from scipy.stats import chisquare
from collections import defaultdict
import pandas as pd
import numpy as np
import model_helper_functions as mod

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


In [4]:
df =  pd.read_parquet("all_cleaned_data_augmented.parquet")
df = mod.necessary_fields(df)
df['Hour'] = df['DropoffDatetime'].apply(mod.round_time_to_int)
df = df.drop(columns = ['DropoffDatetime', 'PickupDatetime','TripDuration', 'TripDistance', 'FareAmount', 'TipAmount'])
df['PULocationID'] = df['PULocationID'].astype(int)
df['DOLocationID'] = df['DOLocationID'].astype(int)

df.head()


Unnamed: 0,PULocationID,DOLocationID,NextPU,Hour
0,166,74,74,1
1,74,42,82,1
2,83,129,116,1
3,74,263,7,0
4,74,236,80,1


In [38]:
def run_chi_squared(group):
    counts_pu = group['PULocationID'].value_counts().sort_index()
    counts_nextpu = group['DOLocationID'].value_counts().sort_index()
    
    all_ids = sorted(set(counts_pu.index).union(set(counts_nextpu.index)))

    f_obs = counts_nextpu.reindex(all_ids, fill_value=0).values
    f_exp = counts_pu.reindex(all_ids, fill_value=0).values

    # Filter out zero-expected entries
    mask = f_exp > 0
    f_obs = f_obs[mask]
    f_exp = f_exp[mask]

    # Normalize expected to match total observed
    f_exp = f_exp * (f_obs.sum() / f_exp.sum())

    # Only run test if valid
    if len(f_obs) > 0 and np.all(f_exp > 0):
        stat, p = chisquare(f_obs=f_obs, f_exp=f_exp)
    else:
        stat, p = np.nan, np.nan
    return stat, p

In [33]:
def run_chi_squared_by_singular(df):
    

    results = []

    for hour, group in df.groupby('Hour'):
        counts_pu = group['PULocationID'].value_counts().sort_index()
        counts_nextpu = group['DOLocationID'].value_counts().sort_index()

        all_ids = sorted(set(counts_pu.index).union(set(counts_nextpu.index)))

        f_obs = counts_nextpu.reindex(all_ids, fill_value=0).values
        f_exp = counts_pu.reindex(all_ids, fill_value=0).values

        # Filter out zero expected frequencies to avoid divide-by-zero
        mask = f_exp > 0
        f_obs = f_obs[mask]
        f_exp = f_exp[mask]

        # Normalize expected counts to match total observed
        if f_exp.sum() == 0:
            stat, p = np.nan, np.nan
        else:
            f_exp = f_exp * (f_obs.sum() / f_exp.sum())
            stat, p = chisquare(f_obs=f_obs, f_exp=f_exp)

        results.append({
            'Hour': hour,
            'Chi2_Stat': stat,
            'p_value': p,
            'N': len(group)
        })

    return pd.DataFrame(results)

In [47]:
def run_chi_squared_unified(df):
    # Get counts for PULocationID and NextPU
    counts_pu = df['PULocationID'].value_counts().sort_index()
    counts_nextpu = df['NextPU'].value_counts().sort_index()

    # Combine the counts into a single list of all possible values (PULocationID + NextPU)
    all_ids = sorted(set(counts_pu.index).union(set(counts_nextpu.index)))

    # Reindex the counts to make sure both distributions align
    f_obs = counts_nextpu.reindex(all_ids, fill_value=0).values
    f_exp = counts_pu.reindex(all_ids, fill_value=0).values

    # Perform Chi-squared test
    stat, p_value = chisquare(f_obs=f_obs, f_exp=f_exp)

    return {
        'Chi2_Stat': stat,
        'p_value': p_value,
        'N': len(df)
    }

result = run_chi_squared_unified(df)
print(result)


{'Chi2_Stat': 724168083.1631432, 'p_value': 0.0, 'N': 24145198}


In [45]:
grouped = df.groupby(['NextPU', 'Hour'])
results = []
for (dropoff_id, hour), group in grouped:
    if len(group) >= 30:  # Filter small groups
        stat, p = run_chi_squared_unified(group)
        results.append({
            'DOLocationID': dropoff_id,
            'Hour': hour,
            'Chi2_Stat': stat,
            'p_value': p,
            'N': len(group)
        })



ValueError: too many values to unpack (expected 2)

In [40]:
results_df = pd.DataFrame(results)

# Optional: filter to groups where distributions are significantly different
significant = results_df[results_df['p_value'] < 0.05]
print(f"Significant differences in {len(significant)} groups out of {len(results_df)} total.")

Significant differences in 310 groups out of 310 total.


In [34]:
results_by_hour = run_chi_squared_by_singular(df)
print(results_by_hour)

    Hour     Chi2_Stat  p_value        N
0      0  4.754270e+06      0.0  1207203
1      1  6.159877e+06      0.0  1763888
2      2  3.448100e+06      0.0  1166909
3      3  2.045648e+06      0.0   738832
4      4  1.241154e+06      0.0   477572
5      5  9.241075e+05      0.0   281212
6      6  1.471172e+06      0.0   469089
7      7  1.553347e+06      0.0  1143866
8      8  1.163970e+06      0.0  2073441
9      9  1.252724e+06      0.0  2735951
10    10  1.101830e+06      0.0  3086430
11    11  9.916578e+05      0.0  3360550
12    12  8.953423e+05      0.0  3685443
13    13  4.879648e+05      0.0  1954812


In [31]:
from scipy.spatial.distance import jensenshannon

def run_jsd_by_zone_and_hour(df):
    results = []

    # Make sure Hour column exists
    if 'Hour' not in df.columns:
        df['Hour'] = pd.to_datetime(df['tpep_dropoff_datetime']).dt.hour

    # Group by DOLocationID and Hour
    grouped = df.groupby(['DOLocationID', 'Hour'])

    for (zone, hour), group in grouped:
        counts_pu = group['PULocationID'].value_counts().sort_index()
        counts_nextpu = group['NextPU'].value_counts().sort_index()

        all_ids = sorted(set(counts_pu.index).union(set(counts_nextpu.index)))

        dist_pu = counts_pu.reindex(all_ids, fill_value=0).values.astype(float)
        dist_nextpu = counts_nextpu.reindex(all_ids, fill_value=0).values.astype(float)

        # Convert to probability distributions
        if dist_pu.sum() == 0 or dist_nextpu.sum() == 0:
            jsd = np.nan
        else:
            dist_pu /= dist_pu.sum()
            dist_nextpu /= dist_nextpu.sum()
            jsd = jensenshannon(dist_pu, dist_nextpu, base=2)

        results.append({
            'DOLocationID': zone,
            'Hour': hour,
            'JSD': jsd,
            'N': len(group)
        })

    return pd.DataFrame(results)

def run_jsd_by_zone(df):
    results = []

    for zone, group in df.groupby('DOLocationID'):
        counts_pu = group['PULocationID'].value_counts().sort_index()
        counts_nextpu = group['NextPU'].value_counts().sort_index()

        all_ids = sorted(set(counts_pu.index).union(set(counts_nextpu.index)))

        dist_pu = counts_pu.reindex(all_ids, fill_value=0).values.astype(float)
        dist_nextpu = counts_nextpu.reindex(all_ids, fill_value=0).values.astype(float)

        # Convert to probability distributions
        if dist_pu.sum() == 0 or dist_nextpu.sum() == 0:
            jsd = np.nan
        else:
            dist_pu /= dist_pu.sum()
            dist_nextpu /= dist_nextpu.sum()
            jsd = jensenshannon(dist_pu, dist_nextpu, base=2)  # base-2 gives range 0–1

        results.append({
            'DOLocationID': zone,
            'JSD': jsd,
            'N': len(group)
        })

    return pd.DataFrame(results)

In [32]:
results_jsd = run_jsd_by_zone(df)
print(results_jsd.sort_values('JSD', ascending=False).head(10))  # Most different
print(results_jsd.sort_values('JSD').head(10))  # Most similar

     DOLocationID       JSD     N
83             84  1.000000     8
180           187  1.000000    33
101           105  1.000000    13
196           204  1.000000     3
1               2  1.000000    43
43             44  0.994385   123
0               1  0.994341  3437
150           156  0.991031    56
237           245  0.990774    75
243           251  0.987096   117
     DOLocationID       JSD       N
41             42  0.742455  100164
40             41  0.749688  139706
146           152  0.750927   44692
73             74  0.763960  190686
153           159  0.764365    9116
68             69  0.769780   10071
160           166  0.774174  196921
162           168  0.776722   17236
239           247  0.778511   11219
161           167  0.779427    4046


In [20]:
def run_kl_divergence_unified(df):

    counts_pu = df['DOLocationID'].value_counts().sort_index()
    counts_nextpu = df['PULocationID'].value_counts().sort_index()


    all_ids = sorted(set(counts_pu.index).union(set(counts_nextpu.index)))

    # Reindex the counts to make sure both distributions align
    dist_pu = counts_pu.reindex(all_ids, fill_value=0).values.astype(float)
    dist_nextpu = counts_nextpu.reindex(all_ids, fill_value=0).values.astype(float)


    if dist_pu.sum() == 0 or dist_nextpu.sum() == 0:
        kl_div = np.nan
    else:
        dist_pu /= dist_pu.sum()
        dist_nextpu /= dist_nextpu.sum()


        epsilon = 1e-10
        dist_pu = np.maximum(dist_pu, epsilon)
        dist_nextpu = np.maximum(dist_nextpu, epsilon)

        # KL Divergence (P || Q)
        kl_div = np.sum(dist_pu * np.log(dist_pu / dist_nextpu))

    return {
        'KL_Divergence': kl_div,
        'N': len(df)
    }

In [21]:
result = run_kl_divergence_unified(df)
print(result)

{'KL_Divergence': 0.1213840937824795, 'N': 24145198}
