In [1]:
import argparse
import csv
import glob
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from GameFormer.predictor import GameFormer
from GameFormer.train_utils import *
from GameFormer.predictor_modules import *

# Pin this process to GPU 3; the later `.cuda()` / `.to('cuda')` calls target
# the current CUDA device selected here.
torch.cuda.set_device(3)

num_neighbors = 20
batch_size = 32

# Set up the data loader: collect every `*.npz` file found one directory
# level below `train_path`.
train_path = '/data/fyy/GameFormer-Planner/nuplan/processed_data/train'
train_files = [f for d in os.listdir(train_path) for f in glob.glob(os.path.join(train_path, d, "*.npz"))]
if not train_files:
    # Fail early with a clear message rather than erroring deeper in the loader.
    raise FileNotFoundError(f"No .npz samples found under {train_path}")
train_set = DrivingData(train_files, num_neighbors)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=os.cpu_count())

# The rest of the notebook only inspects a single batch, so fetch exactly one
# instead of entering (and immediately breaking out of) a training-style loop.
batch = next(iter(train_loader))

# Prepare data: move the model inputs of that batch to the GPU.
inputs = {
    'ego_agent_past': batch[0].to('cuda'),
    'neighbor_agents_past': batch[1].to('cuda'),
    'map_lanes': batch[2].to('cuda'),
    'map_crosswalks': batch[3].to('cuda'),
    'route_lanes': batch[4].to('cuda')
}

ego_future = batch[5].to('cuda')
neighbors_future = batch[6].to('cuda')
# Element-wise flag: True where the first two features of a neighbour's future
# state are non-zero.
neighbors_future_valid = torch.ne(neighbors_future[..., :2], 0)

RuntimeError: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 804: forward compatibility was attempted on non supported HW

In [3]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Agent trajectory encoding

In [33]:
# Past states of the ego vehicle and of the surrounding agents for the batch.
ego = inputs['ego_agent_past']    
neighbors = inputs['neighbor_agents_past']    
# Stack ego (given a new agent axis) and neighbours along dim 1, keeping only
# the first five features of each so the two tensors can be concatenated.
actors = torch.cat([ego[:, None, :, :5], neighbors[..., :5]], dim=1)

# Separate encoders for ego and neighbours because their feature widths differ
# (agent_dim 7 vs 11). Both are created with fresh, untrained weights.
ego_encoder = AgentEncoder(agent_dim=7).cuda() 
encoded_ego = ego_encoder(ego)
agent_encoder = AgentEncoder(agent_dim=11).cuda()
# The same neighbour encoder is applied to every neighbour slot in turn.
encoded_neighbors = [agent_encoder(neighbors[:, i]) for i in range(neighbors.shape[1])]

# One encoding per actor, ego first (recorded output: [32, 21, 256]).
encoded_actors = torch.stack([encoded_ego] + encoded_neighbors, dim=1)  
# An actor is flagged as padding when the features at index -1 of dim 2 sum to
# exactly zero (presumably dim 2 is time, i.e. the latest step; verify).
actors_mask = torch.eq(actors[:, :, -1].sum(-1), 0)

# print(actors.shape)
print(encoded_actors.shape)

torch.Size([32, 21, 21, 5])
torch.Size([32, 21, 256])


## Lane and route encoding

In [12]:
from GameFormer.predictor_modules import *
# Polyline constants; they match the last two axes of the lane / route shapes
# printed below (length, feature width).
_lane_len, _lane_feature = 50, 7
_route_len, _route_feature = 50, 3

# Pull the three map tensors out of the prepared batch inputs.
nbr_lanes, route_lanes, crosswalk = (
    inputs[name] for name in ('map_lanes', 'route_lanes', 'map_crosswalks')
)
print(*(tensor.shape for tensor in (nbr_lanes, route_lanes, crosswalk)))

torch.Size([32, 40, 50, 7]) torch.Size([32, 10, 50, 3]) torch.Size([32, 5, 30, 3])


In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LaneNet(nn.Module):
    """Point-wise MLP encoder for the neighbouring-lane tensor.

    All lane points of a sample are flattened into one sequence, adjacent
    point pairs are averaged (halving the sequence length), and every pooled
    point is mapped 7 -> 64 -> 64 -> 256 by a shared MLP with batch norm.
    """

    def __init__(self):
        super(LaneNet, self).__init__()
        input_size = 7
        output_size = 256
        self.fc1 = nn.Linear(input_size, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, output_size)
        self._lane_feature = input_size

    def forward(self, x):
        """Encode lane points and report which encoded points are padding.

        Args:
            x: tensor whose first axis is the batch and whose last axis holds
                `_lane_feature` values (the notebook passes [32, 40, 50, 7]).

        Returns:
            A tuple `(encoding, mask)`: `encoding` is
            [batch, seq_len, 256] and `mask` is a bool tensor
            [batch, seq_len] that is True where every feature of the pooled
            input point is zero.
        """
        # Take the batch size from the tensor so a smaller (e.g. final) batch works.
        batch_size = x.size(0)
        x = x.reshape(batch_size, -1, self._lane_feature)

        # Pooling: average each pair of neighbouring points along the sequence.
        x = x.transpose(1, 2)
        x = F.avg_pool1d(x, kernel_size=2, stride=2)
        pooled = x.transpose(1, 2)

        # Refresh the sequence length after pooling.
        seq_len = pooled.size(1)

        # The padding mask has to come from the *input* features: after the
        # MLP (biases + batch norm) an output row is essentially never exactly
        # zero, so a mask derived from the output would always be False.
        mask = torch.eq(pooled.abs().sum(-1), 0)

        # `reshape` instead of `view`: the transposed tensor is not guaranteed
        # to be contiguous.
        x = pooled.reshape(-1, self._lane_feature).float()

        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.fc3(x)

        # Restore the [batch, seq_len, 256] layout.
        x = x.view(batch_size, seq_len, -1)
        return x, mask


class RouteNet(nn.Module):
    """1-D convolutional encoder that summarises all route points in one vector.

    The route points of a sample are flattened into one sequence, passed
    through two Conv1d + BatchNorm + ReLU stages (3 -> 64 -> 128 channels),
    projected to 256 features per point and max-pooled over the whole
    sequence.
    """

    def __init__(self):
        super(RouteNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.linear = nn.Linear(128, 256)
        self._route_len = 50
        self._route_feature = 3

    def forward(self, x):
        """Encode the route tensor.

        Args:
            x: tensor whose first axis is the batch and whose last axis holds
                `_route_feature` values (the notebook passes [32, 10, 50, 3]).

        Returns:
            Tensor of shape [batch, 1, 256].
        """
        # Take the batch size from the tensor so a smaller (e.g. final) batch works.
        batch_size = x.size(0)
        x = x.reshape(batch_size, -1, self._route_feature)

        # Conv1d expects [batch, channels, seq_len].
        x = x.permute(0, 2, 1)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))

        _, channels, seq_len = x.shape
        # Project every point from 128 to 256 features.
        x = x.permute(0, 2, 1).reshape(-1, channels)
        x = self.linear(x)
        x = x.view(batch_size, seq_len, -1)

        # Global max pool: use the real sequence length as the kernel instead
        # of a fixed 500, which only worked for exactly 10 x 50 route points.
        x = F.max_pool1d(x.permute(0, 2, 1), kernel_size=seq_len).permute(0, 2, 1)
        return x
 
    
# Instantiate both map encoders with fresh weights on the current CUDA device.
lanenet = LaneNet().cuda()
routenet = RouteNet().cuda()

# Encode the neighbouring lanes (encoding + padding mask) and the route lanes.
encoded_lane, map_mask = lanenet(nbr_lanes)
encoded_route = routenet(route_lanes)
# Recorded output: [32, 1000, 256] for lanes and [32, 1, 256] for the route.
print(encoded_lane.shape, encoded_route.shape)


torch.Size([32, 1000, 256]) torch.Size([32, 1, 256])


In [35]:
class M2M(nn.Module):
    """Fuse every lane encoding with the sample's route encoding.

    Each lane feature vector is concatenated with the route vector of the same
    sample (512 features in total) and passed through a three-layer MLP
    (512 -> 1024 -> 512 -> 256) with batch norm and ReLU after every layer.
    """

    def __init__(self):
        super(M2M, self).__init__()
        self.linear1 = nn.Linear(512, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.linear2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.linear3 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)

        # Initialise the model parameters: Xavier-uniform weights, zero biases.
        for layer in [self.linear1, self.linear2, self.linear3]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, lanes, route):
        """Combine lane and route encodings.

        Args:
            lanes: tensor [batch, num_lanes, lane_features].
            route: one route vector per sample, shaped [batch, 1, route_features]
                or [batch, route_features]; lane_features + route_features
                must equal 512.

        Returns:
            Tensor of shape [batch, num_lanes, 256].
        """
        batch_size, num_lanes, _ = lanes.size()

        # Broadcast each sample's single route vector across all of its lanes
        # (no per-sample Python loop, no copies until the concatenation).
        route_features = route.reshape(batch_size, 1, route.shape[-1]).expand(-1, num_lanes, -1)
        lanes_concat = torch.cat((lanes, route_features), dim=-1)
        # BatchNorm1d needs a 2-D [rows, features] input.
        lanes_concat = lanes_concat.reshape(batch_size * num_lanes, -1)

        x = F.relu(self.bn1(self.linear1(lanes_concat)))
        x = F.relu(self.bn2(self.linear2(x)))
        x = F.relu(self.bn3(self.linear3(x)))
        # Let the last axis follow the MLP's output width rather than reusing
        # the input lane width, which only matched because both were 256.
        x = x.reshape(batch_size, num_lanes, -1)
        return x

# Fuse the lane encodings with the route encoding; the module is created with
# fresh weights and moved to `device`.
m2m = M2M().to(device)
encoded_map = m2m(encoded_lane, encoded_route)
# Recorded output: [32, 1000, 256], the same layout as the lane encoding.
print(encoded_map.size())

torch.Size([32, 1000, 256])


In [13]:
# lane_encoder = VectorMapEncoder(_lane_feature, _lane_len).cuda()
# encoded_nbr_lanes, nbr_lanes_mask = lane_encoder(nbr_lanes)

# route_encoder = VectorMapEncoder(_route_feature, _route_len).cuda()
# encoded_route_lanes, route_lanes_mask = route_encoder(route_lanes)

# print(encoded_nbr_lanes.shape, encoded_route_lanes.shape)

torch.Size([32, 200, 256]) torch.Size([32, 50, 256])


## attention fusion encoding

In [36]:
# Concatenate actor and map tokens along the token axis; the recorded output
# below shows 21 + 1000 = 1021 tokens, i.e. shape (32, 1021, 256).
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the kernel session; consider renaming once no other cell needs it.
input = torch.cat([encoded_actors, encoded_map], dim=1)
# Matching key-padding mask for those tokens.
mask = torch.cat([actors_mask, map_mask], dim=1)

# Fusion transformer hyper-parameters: model width, layer count, heads, dropout.
dim, layers, heads, dropout = 256, 6, 8, 0.1
attention_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=dim*4,
                                                activation='gelu', dropout=dropout, batch_first=True)
fusion_encoder = nn.TransformerEncoder(attention_layer, layers).cuda()
# Self-attention over all tokens; the mask is passed as src_key_padding_mask.
encoding = fusion_encoder(input, src_key_padding_mask=mask)
print(encoding.shape)

torch.Size([32, 1021, 256])


In [39]:
# Bundle what the decoder stage reads from the encoder stage.
encoder_outputs = {}
encoder_outputs['actors'] = actors
encoder_outputs['encoding'] = encoding
encoder_outputs['mask'] = mask

## Decoder

In [40]:
# Show the shape of every encoder output, one per line.
print(*(encoder_outputs[name].shape for name in ('actors', 'encoding', 'mask')), sep='\n')

torch.Size([32, 21, 21, 5])
torch.Size([32, 1021, 256])
torch.Size([32, 1021])


In [41]:
# Ground-truth futures of the ego vehicle and of the neighbours for this batch.
ego_future = batch[5].to('cuda')
neighbors_future = batch[6].to('cuda')

# Read batch size and horizon from the data instead of hard-coding 32 and 80,
# so a smaller (e.g. final) batch or a different horizon still works.
num_samples, horizon = ego_future.shape[:2]

# Move the agent axis behind the time axis and fold it into the feature axis:
# every time step then holds the states of all agents side by side.
ego_future_flat = ego_future.unsqueeze(1).permute(0, 2, 1, 3).reshape(num_samples, horizon, -1)
neighbors_future_flat = neighbors_future.permute(0, 2, 1, 3).reshape(num_samples, horizon, -1)
print(ego_future_flat.shape, neighbors_future_flat.shape)

# Ego features first, then all neighbours.
fut_actors = torch.cat([ego_future_flat, neighbors_future_flat], dim=-1)
print(fut_actors.shape)

torch.Size([32, 80, 3]) torch.Size([32, 80, 60])
torch.Size([32, 80, 63])


In [42]:
# Shapes of the decoder target, the ego ground truth and the encoder memory.
print(*(tensor.shape for tensor in (fut_actors, ego_future, encoding)))

torch.Size([32, 80, 63]) torch.Size([32, 80, 3]) torch.Size([32, 1021, 256])


In [43]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for sequence-first tensors.

    The table is stored as a buffer of shape [max_len, 1, d_model] and indexed
    by dim 0 of the input, so the input must be [seq_len, batch, d_model].
    """

    def __init__(self, d_model, max_len=5000):
        """Precompute the sin/cos table.

        Args:
            d_model: feature width of the tensors to be encoded (even or odd).
            max_len: number of positions in the table.
        """
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        # Even columns hold sines, odd columns hold cosines.
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so trim the frequencies to fit.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # [max_len, d_model] -> [max_len, 1, d_model] so it broadcasts over batch.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of positions 0..x.size(0)-1 to `x`."""
        x = x + self.pe[:x.size(0), :]
        return x

class TransformerDecoder(nn.Module):
    """Transformer decoder over a target sequence attending to an encoder memory.

    Target and memory are first projected to `hidden_dim`; the target receives
    a positional encoding; the result of `num_layers` decoder layers is
    returned together with a linear projection to `output_dim`.
    """

    def __init__(self, input_dim, memory_dim, num_heads, hidden_dim, num_layers, output_dim):
        """Build the projections, positional encoding and decoder stack.

        Args:
            input_dim: feature width of the target sequence.
            memory_dim: feature width of the encoder memory.
            num_heads: attention heads per decoder layer.
            hidden_dim: model width; also used as the feed-forward width.
            num_layers: number of stacked decoder layers.
            output_dim: feature width of the projected output.
        """
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        self.memory_embedding = nn.Linear(memory_dim, hidden_dim)
        self.pos_encoder_tgt = PositionalEncoding(hidden_dim)
        decoder_layer = nn.TransformerDecoderLayer(hidden_dim, num_heads, hidden_dim)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, tgt, memory):
        """Decode `tgt` against `memory`.

        Args:
            tgt: target sequence, [batch, tgt_len, input_dim].
            memory: encoder output, [batch, mem_len, memory_dim].

        Returns:
            A tuple `(decoding, output)` of shapes
            [batch, tgt_len, hidden_dim] and [batch, tgt_len, output_dim].
        """
        tgt = self.embedding(tgt)
        memory = self.memory_embedding(memory)
        # The decoder stack is sequence-first: [sequence_length, batch_size, features].
        tgt = tgt.permute(1, 0, 2)
        memory = memory.permute(1, 0, 2)
        # The positional encoding indexes positions along dim 0, so it must be
        # applied after the permute; on the batch-first tensor it would encode
        # the sample index instead of the time step.
        tgt = self.pos_encoder_tgt(tgt)
        # NOTE(review): no tgt_mask or padding masks are passed, so every
        # target step can attend to all target steps and all memory tokens.
        decoding = self.transformer_decoder(tgt, memory)
        decoding = decoding.permute(1, 0, 2)  # back to [batch_size, sequence_length, features]
        output = self.fc(decoding)
        return decoding, output

# 63 = 3 state values x 21 actors (ego + 20 neighbours), matching the last axis
# of `fut_actors`; the memory width 256 matches the fusion encoder output.
decoder = TransformerDecoder(input_dim=63, memory_dim=256, num_heads=8, hidden_dim=256, num_layers=3, output_dim=63).to(device)

# NOTE(review): the target sequence fed to the decoder is built from the
# ground-truth futures in `batch`; confirm this is intended beyond shape checks.
decoding, output = decoder(fut_actors, encoding)
print(decoding.shape, output.shape)


torch.Size([32, 80, 256]) torch.Size([32, 80, 63])


In [44]:
class Planner(nn.Module):
    """Turn an environment encoding plus route lanes into an ego trajectory.

    The route lanes are encoded and cross-attended from the environment
    encoding; the pooled result is decoded into per-step controls, which a
    simple kinematic roll-out converts into (x, y, yaw) states.
    """

    def __init__(self):
        super(Planner, self).__init__()
        self._future_len = 80
        self.route_fusion = CrossTransformer()
        self.plan_decoder = nn.Sequential(
            nn.Linear(512, 256),
            nn.ELU(),
            nn.Dropout(0.1),
            nn.Linear(256, self._future_len*2),
        )
        self.route_encoder = VectorMapEncoder(3, 50)

    def dynamics_layer(self, controls, initial_state):
        """Integrate clamped controls forward from `initial_state`.

        Args:
            controls: [batch, steps, 2]; channel 0 is treated as acceleration
                and channel 1 as the steering term.
            initial_state: [batch, >=5]; indices 0, 1, 2 are used as x, y, yaw
                and indices 3, 4 as the two velocity components.

        Returns:
            Tensor [batch, steps, 3] holding (x, y, yaw) per step.
        """
        step = 0.1         # discrete time period [s]
        accel_limit = 5    # vehicle's acceleration limit [m/s^2]
        steer_limit = 0.5  # vehicle's steering limit [rad]

        accel = controls[..., 0].clamp(-accel_limit, accel_limit)
        steer = controls[..., 1].clamp(-steer_limit, steer_limit)

        # Speed: initial magnitude plus integrated acceleration, never negative.
        speed0 = torch.hypot(initial_state[..., 3], initial_state[..., 4])
        speed = torch.clamp(speed0[:, None] + torch.cumsum(accel * step, dim=-1), min=0)

        # Heading: integrate the yaw rate, then wrap with fmod by 2*pi.
        heading = initial_state[:, None, 2] + torch.cumsum(steer * speed * step, dim=-1)
        full_turn = 2 * torch.tensor(math.pi)
        heading = torch.fmod(heading, full_turn)

        # Position: integrate the velocity components along the heading.
        pos_x = initial_state[:, None, 0] + torch.cumsum(speed * torch.cos(heading) * step, dim=-1)
        pos_y = initial_state[:, None, 1] + torch.cumsum(speed * torch.sin(heading) * step, dim=-1)

        return torch.stack((pos_x, pos_y, heading), dim=-1)

    def forward(self, env_encoding, route_lanes, initial_state):
        """Plan an ego trajectory of `_future_len` steps.

        Args:
            env_encoding: [batch, tokens, 256] environment features.
            route_lanes: raw route lane tensor for the route encoder.
            initial_state: current ego state, see `dynamics_layer`.

        Returns:
            Tensor [batch, _future_len, 3] holding (x, y, yaw) per step.
        """
        encoded_route, route_mask = self.route_encoder(route_lanes)
        # Always leave the first route token unmasked.
        route_mask[:, 0] = False
        fused_route = self.route_fusion(env_encoding, encoded_route, encoded_route, route_mask)
        joint = torch.cat([env_encoding, fused_route], dim=-1)
        # torch.max returns (values, indices); keep the values, i.e. max-pool
        # over dim 1 of the joint encoding.
        pooled, _ = torch.max(joint, dim=1)
        controls = self.plan_decoder(pooled).reshape(-1, self._future_len, 2)
        return self.dynamics_layer(controls, initial_state)

# Ego is actor 0; take its entry at index -1 of dim 2 as the starting state
# for the roll-out (presumably the most recent past step; verify).
initial_state = encoder_outputs['actors'][:, 0, -1]
planner = Planner().to(device)
# The decoder features serve as the environment encoding for the planner.
ego_plan = planner(decoding, route_lanes, initial_state)

# Recorded output: [32, 80, 3].
print(ego_plan.shape)

torch.Size([32, 80, 3])
