In [17]:
from DOPU_given_timerange import get_combined_cleaned_one_month_df

get_combined_cleaned_one_month_df()

In [32]:
from zone_coords import get_coord_as_csv
# Required packages
import pandas as pd
import numpy as np
import json

# === Step 1: Load Parquet taxi trip data ===
df = pd.read_parquet('./all_cleaned_data/combined_cleaned_data_2024-01.parquet')

# === Step 2: Load zone coordinates ===
# CSV should contain: LocationID, Latitude, Longitude
get_coord_as_csv()
zone_coords_df = pd.read_csv('zone_coords.csv')
zone_coords = {
    int(row['LocationID']): (row['Latitude'], row['Longitude'])
    for _, row in zone_coords_df.iterrows()
}

# === Step 3: Load zone neighbor dictionary from JSON ===
with open('zone_neighbors.json', 'r') as f:
    neighbor_dict = json.load(f)

# Convert string keys and values to integers
neighbor_dict = {int(k): [int(n) for n in v] for k, v in neighbor_dict.items()}

# === Step 4: Helper functions ===

# Convert a timestamp into a 30-minute time bin index (0 to 47)
def get_time_bin_half_hour(dt):
    return dt.hour * 2 + dt.minute // 30


# Convert a timestamp into a 15-minute time bin index (0 to 95)
def get_time_bin_fifteen_minutes(timestamp):
    return timestamp.hour * 4 + timestamp.minute // 15  # 15min per bin


# Compute Euclidean distance between two (lat, lon) coordinates
def euclidean_distance(coord1, coord2):
    return np.linalg.norm(np.array(coord1) - np.array(coord2))

# Compute reward: fare minus 0.5 * empty travel distance
def compute_reward(fare, empty_distance):
    return fare - 0.5 * empty_distance

# === Step 5: Construct (s, a, r, s′) tuples ===

samples = []

num_time_bins = 96

for _, row in df.iterrows():
    try:
        # Extract key fields
        s_zone = int(row['PULocationID'])
        pickup_time = pd.to_datetime(row['PickupDatetime'])
        fare = float(row['FareAmount'])

        # Skip invalid or missing data
        if s_zone not in zone_coords or s_zone not in neighbor_dict:
            continue

        # Define current state
        t_bin = get_time_bin_fifteen_minutes(pickup_time)
        s = (s_zone, t_bin)

        # Loop over each possible neighbor as an action
        for a_zone in neighbor_dict[s_zone]:
            if a_zone not in zone_coords:
                continue

            # Calculate distance between zones
            dist = euclidean_distance(zone_coords[s_zone], zone_coords[a_zone])

            # Calculate reward for moving to that zone
            reward = compute_reward(fare, dist)

            # Next state: new zone, 30/15 minutes later
            s_prime = (a_zone, (t_bin + 1) % num_time_bins)

            # Save sample
            samples.append({
                's_zone': s_zone,
                't_bin': t_bin,
                'a_zone': a_zone,
                'reward': reward,
                's_prime_zone': s_prime[0],
                's_prime_t_bin': s_prime[1]
            })

    except Exception as e:
        # Silently skip problematic rows
        continue

# Convert to DataFrame and save
samples_df = pd.DataFrame(samples)

KeyboardInterrupt: 

In [10]:
df = pd.read_csv("./data_for_q_learning/q_learning_samples_96.csv")
df

Unnamed: 0,s_zone,t_bin,a_zone,reward,s_prime_zone,s_prime_t_bin
0,236,3,263,12.796885,263,4
1,236,3,43,12.795609,43,4
2,236,3,262,12.794287,262,4
3,236,3,141,12.793130,141,4
4,236,3,140,12.792438,140,4
...,...,...,...,...,...,...
13526230,140,92,141,8.597445,141,93
13526231,140,92,202,8.597010,202,93
13526232,140,92,237,8.594332,237,93
13526233,140,92,263,8.593102,263,93


In [40]:
import os

os.makedirs("./Q_learning", exist_ok=True)
output_path = f'./data_for_q_learning/q_learning_samples_{num_time_bins}.csv'
samples_df.to_csv(output_path, index=False)
print(f"✅ Done! Generated {len(samples_df)} (s, a, r, s′) samples and saved to {output_path}")

✅ Done! Generated 13526235 (s, a, r, s′) samples and saved to ./data_for_q_learning/q_learning_samples_96.csv


In [41]:
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# === Step 1: Load sample data ===
num_time_bins = 96 #15min time bin
input_path = f'./data_for_q_learning/q_learning_samples_{num_time_bins}.csv'
df = pd.read_csv(input_path)
# Encode all zone IDs into integer IDs
all_zone_ids = pd.unique(df[['s_zone', 'a_zone', 's_prime_zone']].values.ravel())
zone_id_map = {z: i for i, z in enumerate(sorted(all_zone_ids))}
num_zones = len(zone_id_map)


# Map original zone IDs to encoded integers
df['s_zone'] = df['s_zone'].map(zone_id_map)
df['a_zone'] = df['a_zone'].map(zone_id_map)
df['s_prime_zone'] = df['s_prime_zone'].map(zone_id_map)

# === Step 2: Create PyTorch Dataset ===
class QLearningDataset(Dataset):
    def __init__(self, df):
        self.s_zone = torch.tensor(df['s_zone'].values, dtype=torch.long)
        self.t_bin = torch.tensor(df['t_bin'].values, dtype=torch.long)
        self.a_zone = torch.tensor(df['a_zone'].values, dtype=torch.long)
        self.reward = torch.tensor(df['reward'].values, dtype=torch.float32)
        self.s_prime_zone = torch.tensor(df['s_prime_zone'].values, dtype=torch.long)
        self.s_prime_t_bin = torch.tensor(df['s_prime_t_bin'].values, dtype=torch.long)

    def __len__(self):
        return len(self.s_zone)

    def __getitem__(self, idx):
        return (self.s_zone[idx], self.t_bin[idx], self.a_zone[idx],
                self.reward[idx], self.s_prime_zone[idx], self.s_prime_t_bin[idx])

dataset = QLearningDataset(df)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)

# === Step 3: Q-network definition ===
class QNetwork(nn.Module):
    def __init__(self, num_zones, num_time_bins, embed_dim=16):
        super().__init__()
        self.zone_embed = nn.Embedding(num_zones, embed_dim)
        self.time_embed = nn.Embedding(num_time_bins, embed_dim)

        self.net = nn.Sequential(
            nn.Linear(embed_dim * 3, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, s_zone, t_bin, a_zone):
        s_embed = self.zone_embed(s_zone)
        t_embed = self.time_embed(t_bin)
        a_embed = self.zone_embed(a_zone)
        x = torch.cat([s_embed, t_embed, a_embed], dim=1)
        return self.net(x).squeeze(1)  # Output shape: (batch,)

# Instantiate networks
q_net = QNetwork(num_zones, num_time_bins)
target_q_net = QNetwork(num_zones, num_time_bins)
target_q_net.load_state_dict(q_net.state_dict())  # Copy initial weights

# === Step 4: Training setup ===
optimizer = optim.Adam(q_net.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()
gamma = 0.99


# === Step 5: Training loop ===
epoch_losses = []
for epoch in range(5):
    q_net.train()
    total_loss = 0
    tqdm_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    for s, t, a, r, s_p, t_p in tqdm_bar:
        # Compute target: r + γ * max_a′ Q(s′, a′)
        with torch.no_grad():
            a_prime = torch.arange(num_zones).repeat(len(s_p), 1).to(torch.long)
            s_prime_rep = s_p.unsqueeze(1).repeat(1, num_zones).flatten()
            t_prime_rep = t_p.unsqueeze(1).repeat(1, num_zones).flatten()
            a_prime_flat = a_prime.flatten()

            q_values = target_q_net(s_prime_rep, t_prime_rep, a_prime_flat)
            q_values = q_values.view(len(s_p), num_zones)
            max_q = q_values.max(dim=1)[0]

        # Use Bellman loss
        target = r + gamma * max_q
        pred_q = q_net(s, t, a)
        loss = loss_fn(pred_q, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        tqdm_bar.set_postfix(batch_loss=loss.item())
    
    epoch_losses.append(total_loss)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# === Step 6: Save trained model and loss history ===
torch.save(q_net.state_dict(), "Q_learning/q_network.pt")
print("✅ Q-network model saved to 'q_network.pt'")

# Save loss history
pd.DataFrame({'epoch': list(range(1, len(epoch_losses)+1)), 'loss': epoch_losses})\
  .to_csv("./Q_learning/q_training_loss.csv", index=False)
print("📄 Training loss saved to 'q_training_loss.csv'")


Epoch 1: 100%|██████████| 13210/13210 [18:17<00:00, 12.04it/s, batch_loss=66.2]


Epoch 1, Loss: 1348152.0779


Epoch 2: 100%|██████████| 13210/13210 [18:51<00:00, 11.67it/s, batch_loss=84.9]


Epoch 2, Loss: 1282722.9682


Epoch 3: 100%|██████████| 13210/13210 [18:03<00:00, 12.20it/s, batch_loss=62.9]


Epoch 3, Loss: 1280436.8355


Epoch 4: 100%|██████████| 13210/13210 [17:50<00:00, 12.34it/s, batch_loss=94.5]


Epoch 4, Loss: 1279344.0014


Epoch 5: 100%|██████████| 13210/13210 [19:55<00:00, 11.05it/s, batch_loss=82.8]

Epoch 5, Loss: 1278608.2782
✅ Q-network model saved to 'q_network.pt'
📄 Training loss saved to 'q_training_loss.csv'





In [2]:
import pandas as pd
import torch
import torch.nn as nn
import json
import random

class QNetwork(nn.Module):
    def __init__(self, num_zones, num_time_bins, embed_dim=16):
        super().__init__()
        self.zone_embed = nn.Embedding(num_zones, embed_dim)
        self.time_embed = nn.Embedding(num_time_bins, embed_dim)
        self.net = nn.Sequential(
            nn.Linear(embed_dim * 3, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, s_zone, t_bin, a_zone):
        s_embed = self.zone_embed(s_zone)
        t_embed = self.time_embed(t_bin)
        a_embed = self.zone_embed(a_zone)
        x = torch.cat([s_embed, t_embed, a_embed], dim=1)
        return self.net(x).squeeze(1)

# === Load the trained model ===
df = pd.read_csv("./data_for_q_learning/q_learning_samples_96.csv")
zone_ids = pd.unique(df[['s_zone', 'a_zone', 's_prime_zone']].values.ravel())
zone_id_map = {z: i for i, z in enumerate(sorted(zone_ids))}
inv_zone_map = {v: k for k, v in zone_id_map.items()}
num_zones = len(zone_id_map)
num_time_bins = 96

df['s_zone'] = df['s_zone'].map(zone_id_map)
df['a_zone'] = df['a_zone'].map(zone_id_map)

# Calculate reward average table (s_zone, a_zone) → avg_reward
reward_lookup = df.groupby(['s_zone', 'a_zone'])['reward'].mean().to_dict()

# Load adjacent zone information
with open("zone_neighbors.json", "r") as f:
    neighbor_dict_raw = json.load(f)
neighbor_dict = {zone_id_map[int(k)]: [zone_id_map[int(n)] for n in v if int(n) in zone_id_map] 
                 for k, v in neighbor_dict_raw.items() if int(k) in zone_id_map}

# Initialize the model
q_net = QNetwork(num_zones, num_time_bins)
q_net.load_state_dict(torch.load("Q_learning/q_network.pt"))
q_net.eval()

# === rollout Simulating functions ===
def rollout(start_zone, start_time_bin, steps=32):
    s_zone = zone_id_map[start_zone]
    t_bin = start_time_bin
    total_reward = 0
    path = [(start_zone, t_bin)]

    for _ in range(steps):
        neighbors = neighbor_dict.get(s_zone, [])
        if not neighbors:
            break

        s_tensor = torch.tensor([s_zone] * len(neighbors), dtype=torch.long)
        t_tensor = torch.tensor([t_bin] * len(neighbors), dtype=torch.long)
        a_tensor = torch.tensor(neighbors, dtype=torch.long)

        with torch.no_grad():
            q_values = q_net(s_tensor, t_tensor, a_tensor)
        best_idx = torch.argmax(q_values).item()
        best_a = neighbors[best_idx]

        reward = reward_lookup.get((s_zone, best_a), 0.0)
        total_reward += reward

        s_zone = best_a
        t_bin = (t_bin + 1) % num_time_bins
        path.append((inv_zone_map[s_zone], t_bin))

    return total_reward, path

# === Run a strategy simulation ===
start_row = df.sample(1).iloc[0]
start_zone = inv_zone_map[start_row['s_zone']]
start_time = int(start_row['t_bin'])

total_income, visited_path = rollout(start_zone, start_time)

print("🧭 Simulated Path:")
for zone, t in visited_path:
    print(f"Zone {zone} at TimeBin {t}")
print(f"\n💰 Total Estimated Income over {len(visited_path)-1} steps: ${total_income:.2f}")


  q_net.load_state_dict(torch.load("Q_learning/q_network.pt"))


🧭 Simulated Path:
Zone 186 at TimeBin 43
Zone 100 at TimeBin 44
Zone 164 at TimeBin 45
Zone 107 at TimeBin 46
Zone 224 at TimeBin 47
Zone 4 at TimeBin 48
Zone 224 at TimeBin 49
Zone 79 at TimeBin 50
Zone 107 at TimeBin 51
Zone 224 at TimeBin 52
Zone 79 at TimeBin 53
Zone 224 at TimeBin 54
Zone 79 at TimeBin 55
Zone 107 at TimeBin 56
Zone 79 at TimeBin 57
Zone 107 at TimeBin 58
Zone 79 at TimeBin 59
Zone 224 at TimeBin 60
Zone 79 at TimeBin 61
Zone 107 at TimeBin 62
Zone 79 at TimeBin 63
Zone 107 at TimeBin 64
Zone 79 at TimeBin 65
Zone 107 at TimeBin 66
Zone 224 at TimeBin 67
Zone 79 at TimeBin 68
Zone 224 at TimeBin 69
Zone 79 at TimeBin 70
Zone 224 at TimeBin 71
Zone 79 at TimeBin 72
Zone 107 at TimeBin 73
Zone 224 at TimeBin 74
Zone 79 at TimeBin 75

💰 Total Estimated Income over 32 steps: $454.35


In [None]:
def evaluate_policy(df, rollout_fn, num_rollouts=100, steps=32):
    """
    Evaluate average income over multiple rollouts from random starting points.
    
    Args:
        df: The DataFrame with original training samples (for zone/time sampling)
        rollout_fn: The rollout function
        num_rollouts: Number of rollouts to simulate
        steps: Steps per rollout (32 = 8 hours with 15min bin)
    
    Returns:
        avg_reward: average total income over all rollouts
        all_rewards: list of individual rollout incomes
    """
    rewards = []

    for _ in range(num_rollouts):
        row = df.sample(1).iloc[0]
        s_zone = inv_zone_map[row['s_zone']]
        t_bin = int(row['t_bin'])
        
        total_r, _ = rollout_fn(s_zone, t_bin, steps=steps)
        rewards.append(total_r)

    avg_reward = sum(rewards) / len(rewards)
    std_dev = np.std(rewards)

    print(f"\n📊 Evaluated over {num_rollouts} random rollouts:")
    print(f"→ Average income: ${avg_reward:.2f}")
    print(f"→ Std deviation: ±${std_dev:.2f}")
    return avg_reward, rewards
evaluate_policy(df, rollout, num_rollouts=100, steps=32)