Data loading section

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
path = "./data/taxi_log_2008_by_id"
os.chdir(path)
files = os.listdir()

total_files = len(files[1:])
target_files = int(total_files * 0.1)

gps_data=pd.read_csv(files[0],names=['taxi_id','time','latitude','longitude'])

for i,file in enumerate(tqdm(files[1:], desc="Processing files")):
    tmp = pd.read_csv(file, names=['taxi_id', 'time', 'longitude', 'latitude'])
    if not tmp.empty:
        gps_data = pd.concat([gps_data, tmp])
    if i + 1 >= target_files:
        break
 
gps_data


Processing files:  10%|▉         | 1034/10356 [00:24<03:38, 42.70it/s]


Unnamed: 0,taxi_id,time,latitude,longitude
0,1766,2008-02-02 13:47:24,116.42342,39.83735
1,1766,2008-02-02 13:57:25,116.42343,39.83725
2,1766,2008-02-02 14:07:24,116.42339,39.83720
3,1766,2008-02-02 14:17:24,116.42334,39.83726
4,1766,2008-02-02 14:27:24,116.42342,39.83728
...,...,...,...,...
1656,2061,2008-02-08 17:23:38,40.31350,116.68677
1657,2061,2008-02-08 17:28:37,40.31353,116.68677
1658,2061,2008-02-08 17:33:37,40.31353,116.68677
1659,2061,2008-02-08 17:38:37,39.96895,116.68678


In [2]:
gps_data1=gps_data.sort_values(by=['taxi_id','time'],ignore_index=True)
gps_data1.drop_duplicates(inplace=True,ignore_index=True)
gps_data1['time'] = pd.to_datetime(gps_data1['time'])
gps_data1

Unnamed: 0,taxi_id,time,latitude,longitude
0,3,2008-02-02 13:39:08,39.88957,116.35743
1,3,2008-02-02 13:44:08,39.89726,116.35732
2,3,2008-02-02 13:49:09,39.90712,116.35060
3,3,2008-02-02 13:54:09,39.91145,116.35171
4,3,2008-02-02 13:59:08,39.89655,116.34366
...,...,...,...,...
1697459,10356,2008-02-07 22:10:49,40.21196,116.24457
1697460,10356,2008-02-07 22:15:51,40.21237,116.25047
1697461,10356,2008-02-07 22:20:53,40.22385,116.23035
1697462,10356,2008-02-07 22:24:45,40.22432,116.23075


In [3]:
gps_data1=gps_data1[(gps_data1['latitude']>39.4)&
                  (gps_data1['latitude']<41.6)&
                  (gps_data1['longitude']>115.7)&
                  (gps_data1['longitude']<117.4)]
gps_data1

Unnamed: 0,taxi_id,time,latitude,longitude
0,3,2008-02-02 13:39:08,39.88957,116.35743
1,3,2008-02-02 13:44:08,39.89726,116.35732
2,3,2008-02-02 13:49:09,39.90712,116.35060
3,3,2008-02-02 13:54:09,39.91145,116.35171
4,3,2008-02-02 13:59:08,39.89655,116.34366
...,...,...,...,...
1697459,10356,2008-02-07 22:10:49,40.21196,116.24457
1697460,10356,2008-02-07 22:15:51,40.21237,116.25047
1697461,10356,2008-02-07 22:20:53,40.22385,116.23035
1697462,10356,2008-02-07 22:24:45,40.22432,116.23075


Data pre-processing section

In [7]:
window_size = 3

total_samples = 0
for _, group in gps_data1.groupby('taxi_id'):
    n_samples = len(group) - window_size
    if n_samples > 0:
        total_samples += n_samples

X = np.zeros((total_samples, window_size, 3))
y = np.zeros((total_samples, 2))

sample_idx = 0

for taxi_id, group in gps_data1.groupby('taxi_id'):
    n_samples = len(group) - window_size
    if n_samples <= 0:
        continue
    
    loc_data = group[['latitude', 'longitude']].values
    time_data = group['time'].values
    
    for i in range(n_samples):
        loc_features = loc_data[i:i+window_size]
        
        time_diff = (time_data[i:i+window_size] - time_data[i]).astype('timedelta64[ms]').astype(float) / 1000
        time_diff = time_diff.reshape(-1, 1)
        
        seq_input = np.hstack([loc_features, time_diff])
        
        target = loc_data[i+window_size]
        
        if sample_idx < total_samples:
            X[sample_idx] = seq_input
            y[sample_idx] = target
            sample_idx += 1
        else:
            print(f"warning: sample_idx={sample_idx} beyond {total_samples}")
            break

In [12]:
n = len(X)
train_X, train_y = X[:int(n*0.8)], y[:int(n*0.8)]
val_X, val_y = X[int(n*0.8):int(n*0.9)], y[int(n*0.8):int(n*0.9)]
test_X, test_y = X[int(n*0.9):], y[int(n*0.9):]

Data loader section

In [13]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np


class TaxiDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TaxiDataset(train_X, train_y)
val_dataset = TaxiDataset(val_X, val_y)
test_dataset = TaxiDataset(test_X, test_y)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

Model section

In [19]:
class TaxiTransformer(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super(TaxiTransformer, self).__init__()
        self.input_linear = nn.Linear(input_dim, d_model)  # 将输入映射到 d_model 维度
        
        # 位置编码
        self.positional_encoding = nn.Parameter(torch.zeros(1, 3, d_model))  # 位置编码参数

        # 设置 batch_first=True 以优化计算
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, 
                                                   dim_feedforward=dim_feedforward, dropout=dropout, 
                                                   batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.output_layer = nn.Linear(d_model, 2)  # 输出经纬度
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        # x: [batch_size, window_size, input_dim]
        x = self.input_linear(x)  # [batch_size, window_size, d_model]
        # 添加位置编码
        x = self.norm(x)
        x = x + self.positional_encoding
        # 直接传入 Transformer（不需要 .transpose(0,1)）
        x = self.transformer_encoder(x)
        # 提取最后一个时间步的输出
        x_last = x[:, -1, :]  # [batch_size, d_model]
        output = self.output_layer(x_last)  # [batch_size, 2]
        return output

# Model hyperparameters
input_dim = 3
d_model = 64
nhead = 8
num_layers = 2
dim_feedforward = 128
model = TaxiTransformer(input_dim, d_model, nhead, num_layers, dim_feedforward)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Training section

In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        preds = model(batch_X)
        loss = criterion(preds, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * batch_X.size(0)
    train_loss /= len(train_loader.dataset)
    
    # 验证阶段
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            preds = model(batch_X)
            loss = criterion(preds, batch_y)
            val_loss += loss.item() * batch_X.size(0)
    val_loss /= len(val_loader.dataset)
    
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


KeyboardInterrupt: 

Evaluation section

In [16]:
model.eval()
test_loss = 0.0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        preds = model(batch_X)
        loss = criterion(preds, batch_y)
        test_loss += loss.item() * batch_X.size(0)
test_loss /= len(test_loader.dataset)
print(f"Test Loss: {test_loss:.4f}")

Test Loss: 0.0218
