# Preprocessing

In [1]:
# for nyc (New York) and sz (Shenzhen) datasets

import json
from pathlib import Path

import numpy as np
import pandas as pd

dataset = {
    'nyc': {
        'json_file': 'scenario_nyc_brooklyn.json',
        'tstep': 4,
        'scalar': 9
    },
    'sz': {
        'json_file': 'scenario_shenzhen_downtown_west.json',
        'tstep': 3,
        'scalar': 2.5
    }
}

for name, d in dataset.items():
    datapath = Path.cwd() / name / d['json_file']
    # print(datapath)
    
    data = None
    with open(datapath, 'r') as f:
        data = json.load(f)
    
    assert data is not None
    R = data['nlat'] * data['nlon']
    C = data['totalAcc']
    
    adjacency = np.zeros((R, R))
    edges = [tuple(d.values()) for d in data['topology_graph']]
    for e in edges:
        i, j = e
        adjacency[i, j] = 1
        adjacency[j, i] = 1 # comment this line for directed graph
    
    demand_df = pd.DataFrame(data['demand'])
    T = int((demand_df['time_stamp'].max() - demand_df['time_stamp'].min() + 1) / d['tstep'])
    
    plam = np.zeros((T, R, R))
    prices = np.zeros((T, R, R))
    tt_f = np.zeros((T, R, R))
    
    scalar = d['scalar']
    tstep = d['tstep']
    
    demand_df['time_stamp'] = ((demand_df['time_stamp'] - demand_df['time_stamp'].min()) / tstep).astype(int)
    demands = demand_df.groupby('time_stamp')
    for t, t_group in demands:
        s_groups = t_group.groupby('origin')
        for s, s_group in s_groups:
            d_groups = s_group.groupby('destination')
            for d, d_group in d_groups:
                plam[t, s, d] = (d_group['demand'] * scalar).sum()
                prices[t, s, d] = (d_group['demand'] * d_group['price'] * scalar).sum()
                tt_f[t, s, d] = (d_group['demand'] * d_group['travel_time'] * scalar).sum()    
    tt_f /= tstep
    
    prices = np.divide(prices, plam, out=np.zeros_like(prices), where=plam!=0)
    tt_f = np.divide(tt_f, plam, out=np.zeros_like(tt_f), where=plam!=0)
    masked_ttf = np.ma.array(tt_f, mask=tt_f == 0)
    mean_ttf = masked_ttf.mean(axis=0)
    masked_ttf = np .maximum(np.round(masked_ttf).astype(int), 1)
    mean_ttf = np.maximum(np.round(mean_ttf).astype(int), 1)
    tt_f = masked_ttf.filled(999)
    mean_ttf = mean_ttf.filled(999)
    
    ########### rebalance ################
    reb_df = pd.DataFrame(data['rebTime'])
    hour_min = reb_df['time_stamp'].min()
    reb_df = reb_df[reb_df['time_stamp'] == hour_min]
    
    tt_e = np.zeros((R, R))
    for idx, row in reb_df.iterrows():
        s, d = int(row['origin']), int(row['destination'])
        tt_e[s, d] = max(int(round(row['reb_time'] / tstep)), 1)
    
    output_path = Path.cwd() / name
    # print(output_path)

    Path.mkdir(output_path, parents=True, exist_ok=True)
    np.save(output_path / 'plam.npy', plam)
    np.save(output_path / 'prices.npy', prices)
    eps = 1e-7
    np.save(output_path / 'mu_f.npy', np.where(mean_ttf != 0, 1 / (mean_ttf + eps), 1e-3))
    np.save(output_path / 'mu_e.npy', np.where(tt_e != 0, 1 / (tt_e + eps), 1e-3))
    np.save(output_path / 'adjacency.npy', adjacency)
    


## statistics
1. average customer revenue
2. average travel time
3. average revenue per minute

In [2]:
tstep = {
    'nyc': 4,
    'sz': 3,
    'didi9': 10,
    'didi20': 10
}

for path in Path.cwd().iterdir():
    if path.is_dir():
        adjacency = np.load(path / 'adjacency.npy')
        plam = np.load(path / 'plam.npy')
        plam *= adjacency
        travel_time = np.maximum(1, np.round(np.reciprocal(np.load(path / 'mu_f.npy'))).astype(int))
        prices = np.load(path / 'prices.npy')
        
        avg_revenue = (prices * plam).sum() / plam.sum()
        avg_time = (travel_time * plam).sum() / plam.sum()        
        revenue_per_min = avg_revenue / (avg_time * tstep[path.name])
        
        print(path.name, "====================")
        print('avg revenue', avg_revenue)
        print('avg travel time', avg_time)
        print('revenue per min', revenue_per_min)

avg revenue 15.59968726984466
avg travel time 1.1863014353259984
revenue per min 1.314985113000207
avg revenue 2.0544682891637356
avg travel time 1.243455149501661
revenue per min 0.16522254863692543
avg revenue 19.027974366118695
avg travel time 4.530231262190025
revenue per min 1.4000738052770774
avg revenue 15.782570422535212
avg travel time 2.9119718309859155
revenue per min 1.354972793228537
