In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; force_remount=True replaces any
# mount left over from an earlier run of this cell.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [2]:
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd GroundingDINO
!pip install -e .

Cloning into 'GroundingDINO'...
remote: Enumerating objects: 463, done.
remote: Counting objects: 100% (240/240), done.
remote: Compressing objects: 100% (103/103), done.
remote: Total 463 (delta 176), reused 137 (delta 137), pack-reused 223 (from 1)
Receiving objects: 100% (463/463), 12.87 MiB | 9.23 MiB/s, done.
Resolving deltas: 100% (241/241), done.
/content/GroundingDINO
Obtaining file:///content/GroundingDINO
  Preparing metadata (setup.py) ... done
Collecting addict (from groundingdino==0.1.0)
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting yapf (from groundingdino==0.1.0)
  Downloading yapf-0.43.0-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting supervision>=0.22.0 (from groundingdino==0.1.0)
  Downloading supervision-0.25.1-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->g

In [3]:
!mkdir weights
!wget -O weights/groundingdino_swint_ogc.pth https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth


--2025-02-21 19:32:34--  https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/611591640/f221e500-c2fc-4fd3-b84e-8ad92a6923f3?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250221%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250221T193234Z&X-Amz-Expires=300&X-Amz-Signature=178a4ea4859b8472691849c00e1f5c946991ab2ff5b0340475d575eb9b629f2d&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dgroundingdino_swint_ogc.pth&response-content-type=application%2Foctet-stream [following]
--2025-02-21 19:32:35--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/611591640/f221e500-c2fc-4fd3-b84e-8ad92a6923f3

In [12]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image
import numpy as np

class SPOTDataLoader(Dataset):
    """Dataset of multi-view observations paired with a goal image and a label.

    Sample ``idx`` is read from the folder ``<root_dir>/<idx zero-padded to 5 digits>/``,
    which must contain ``0.jpg`` ... ``<num_views - 1>.jpg`` plus ``goal.jpg``.
    The label of sample ``idx`` is row ``idx`` of the array stored in ``labels_file``.

    Args:
        root_dir: Directory holding one sub-folder per sample.
        labels_file: Path to a ``.npy`` file loadable with ``np.load``; its length
            defines the number of samples.
        transform: Callable applied to every PIL image. It must return a
            ``torch.Tensor`` because the views are stacked with ``torch.stack``.
        preload: When true, every sample is loaded and cached during construction
            and ``__getitem__`` only reads from that cache.
        num_views: Number of observation images per sample (default 5).

    Each item is a tuple ``(views, goal, label)`` where ``views`` is the stack of
    the transformed observation images along a new leading dimension, ``goal`` is
    the transformed goal image with a leading dimension of size 1, and ``label`` is
    ``torch.tensor(labels[idx])``.
    """

    def __init__(self, root_dir, labels_file, transform=None, preload=False, num_views=5):
        print("Initializing SPOTDataLoader...")
        self.root_dir = root_dir
        self.transform = transform
        self.labels = np.load(labels_file)
        self.num_views = num_views

        self.preload = preload
        self.cached_data = {}

        if self.preload:
            print("Preloading dataset into memory... This may take time.")
            for idx in range(len(self.labels)):
                self.cached_data[idx] = self._load_sample(idx)

    def __len__(self):
        """Return the number of samples, i.e. the number of label rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(views, goal, label)`` for sample ``idx`` (from cache if preloaded)."""
        if self.preload:
            return self.cached_data[idx]
        return self._load_sample(idx)

    def _load_image(self, img_path):
        """Read one image as RGB, apply the transform, and release the file handle."""
        # convert() returns a fully loaded copy, so the file can be closed right away
        # instead of staying open until garbage collection.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image

    def _load_sample(self, idx):
        """Load every view and the goal image of sample ``idx`` from disk.

        Raises:
            TypeError: If the images are not tensors after the transform
                (for example when no transform was supplied).
        """
        folder_path = os.path.join(self.root_dir, format(idx, '05d'))

        input_images = [
            self._load_image(os.path.join(folder_path, f"{i}.jpg"))
            for i in range(self.num_views)
        ]
        goal_image = self._load_image(os.path.join(folder_path, "goal.jpg"))

        # Fail with an actionable message rather than an opaque torch.stack/unsqueeze error.
        if not all(torch.is_tensor(img) for img in (*input_images, goal_image)):
            raise TypeError(
                "SPOTDataLoader requires a transform that returns torch.Tensor "
                "(e.g. one ending in ToTensor()); got non-tensor images."
            )

        return (
            torch.stack(input_images, dim=0),
            goal_image.unsqueeze(0),
            torch.tensor(self.labels[idx]),
        )



In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from groundingdino.util.slconfig import SLConfig
from groundingdino.models import build_model

class CrossAttentionBlock(nn.Module):
    """Multi-head attention wrapper where one sequence attends to another.

    ``forward(query, key_value)`` uses ``key_value`` as both key and value and
    returns only the attended output; the attention weights are discarded.
    Inputs are batch-first because the attention layer is built with
    ``batch_first=True``.
    """

    def __init__(self, embed_dim, num_heads=8):
        super().__init__()
        self.mha = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            batch_first=True,
        )

    def forward(self, query, key_value):
        # The layer returns (output, weights); only the output is needed here.
        attended = self.mha(query, key_value, key_value)[0]
        return attended

class GroundingDinoFeatureExtractor(nn.Module):
    """Expose the output of the last transformer-encoder layer of a wrapped model.

    A forward hook on ``base_model.transformer.encoder.layers[-1]`` stores that
    layer's output each time it runs; ``forward`` runs the full model and returns
    the stored value.

    Args:
        base_model: Model exposing ``transformer.encoder.layers`` and callable as
            ``base_model(images, captions=...)``.
        device: Device the input images are moved to before the forward pass.
    """

    def __init__(self, base_model, device='cuda'):
        super().__init__()
        self.model = base_model
        self.device = device
        self._features = None
        # Keep the handle so the hook can be removed later via hook_handle.remove().
        self.hook_handle = self.model.transformer.encoder.layers[-1].register_forward_hook(self.hook_fn)

    def hook_fn(self, module, input, output):
        """Forward-hook callback: remember the hooked layer's output."""
        self._features = output

    def forward(self, images, text_prompts):
        """Run the wrapped model and return the hooked layer's output.

        Returns ``None`` when the hooked layer did not run during this call.
        """
        # Clear the previous result first; otherwise a call in which the hook never
        # fires would silently return features belonging to an earlier input.
        self._features = None
        images = images.to(self.device)
        _ = self.model(images, captions=text_prompts)
        return self._features

class DINOCrossAttentionMLP(nn.Module):
    """Frozen GroundingDINO features + cross-attention + MLP head with 3 outputs.

    For every camera, features of the current image and of the goal image are
    extracted with the frozen backbone, cross-attended in both directions with a
    residual connection, mean-pooled over dim 1 and concatenated; the result is fed
    through four 1024-wide ReLU layers and a final linear layer with 3 outputs.

    Args:
        config_file: GroundingDINO config path passed to ``SLConfig.fromfile``.
        weight_file: Checkpoint path; either a state dict or a dict with a
            ``'model'`` entry. A leading ``"module."`` is stripped from keys.
        num_cameras: Number of current-image views per sample.
        embed_dim: Width of the extracted features; it sizes the attention block
            and the first linear layer, so it must match the backbone's hooked
            output width (assumed 256 for this config; confirm if it changes).
        device: Device for the backbone and the trainable layers.

    Attribute names (``feature_extractor``, ``cross_attention``, ``fc_layer1`` ...
    ``fc_layer5``) are part of the saved ``state_dict`` layout and must not change.
    """

    def __init__(self, config_file, weight_file, num_cameras=5, embed_dim=256, device='cuda'):
        super().__init__()
        self.device = device
        self.num_cameras = num_cameras

        cfg = SLConfig.fromfile(config_file)
        base_model = build_model(cfg)
        checkpoint = torch.load(weight_file, map_location=device)
        state_dict = checkpoint['model'] if 'model' in checkpoint else checkpoint
        # Drop the "module." prefix that wrapped (e.g. DataParallel) checkpoints carry.
        state_dict = {
            (k[len("module."):] if k.startswith("module.") else k): v
            for k, v in state_dict.items()
        }
        base_model.load_state_dict(state_dict, strict=False)
        base_model.to(device)

        # The backbone is a fixed feature extractor: no gradients, inference mode.
        for param in base_model.parameters():
            param.requires_grad = False
        base_model.eval()

        self.feature_extractor = GroundingDinoFeatureExtractor(base_model, device=device)
        self.cross_attention = CrossAttentionBlock(embed_dim, num_heads=8)

        self.fc_layer1 = nn.Sequential(
            nn.Linear(2 * embed_dim * num_cameras, 1024),
            nn.ReLU()
        )

        self.fc_layer2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU()
        )

        self.fc_layer3 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU()
        )

        self.fc_layer4 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU()
        )

        self.fc_layer5 = nn.Linear(1024, 3)

        self.to(device)

    def train(self, mode=True):
        """Switch the trainable head between train/eval while keeping the backbone in eval.

        ``nn.Module.train`` recurses into every child, which would put the frozen
        backbone back into training mode on each ``model.train()`` call.
        """
        super().train(mode)
        self.feature_extractor.eval()
        return self

    def _extract(self, images, text_prompts, role, cam):
        """Extract backbone features and fail loudly if none were produced."""
        feat = self.feature_extractor(images, text_prompts)
        if feat is None:
            # Skipping the camera would only postpone the failure to a confusing
            # shape mismatch in fc_layer1, so report the real cause here.
            raise RuntimeError(
                f"Feature extraction returned None for the {role} image of camera {cam}"
            )
        return feat

    def forward(self, current_images, goal_images, text_prompts):
        """Predict the 3-value output for a batch.

        Args:
            current_images: Tensor indexed as ``[batch, camera, ...]`` with
                ``num_cameras`` views (5-D input is required by the indexing below).
            goal_images: Same layout; may carry a single view (size 1 on dim 1),
                which is then shared by all cameras.
            text_prompts: Captions forwarded to the backbone, one per batch item.

        Returns:
            Tensor of shape ``(batch, 3)``.

        Raises:
            RuntimeError: If the backbone yields no features for an image.
        """
        shared_goal = goal_images.size(1) == 1 and self.num_cameras > 1
        shared_goal_feat = None
        if shared_goal:
            # One goal view for all cameras: run the frozen backbone on it once
            # instead of once per camera.
            shared_goal_feat = self._extract(
                goal_images[:, 0, :, :, :], text_prompts, "goal", 0
            )

        current_features_list = []
        goal_features_list = []

        for cam in range(self.num_cameras):
            curr_feat = self._extract(
                current_images[:, cam, :, :, :], text_prompts, "current", cam
            )
            if shared_goal:
                goal_feat = shared_goal_feat
            else:
                goal_feat = self._extract(
                    goal_images[:, cam, :, :, :], text_prompts, "goal", cam
                )

            # Bidirectional cross-attention with residual connections.
            curr_attn = curr_feat + self.cross_attention(curr_feat, goal_feat)
            goal_attn = goal_feat + self.cross_attention(goal_feat, curr_feat)
            # Mean-pool over dim 1 to get one vector per image.
            current_features_list.append(curr_attn.mean(dim=1))
            goal_features_list.append(goal_attn.mean(dim=1))

        current_features = torch.cat(current_features_list, dim=1)
        goal_features = torch.cat(goal_features_list, dim=1)
        features = torch.cat([current_features, goal_features], dim=1)
        x = self.fc_layer1(features)
        x = self.fc_layer2(x)
        x = self.fc_layer3(x)
        x = self.fc_layer4(x)
        output = self.fc_layer5(x)
        return output



In [1]:
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision import transforms
from groundingdino.util.slconfig import SLConfig
from groundingdino.models import build_model

# === Paths ===
# Dataset, checkpoints and figures all live under one Google Drive folder.
SPLIT_DATA_PATH = '/content/drive/MyDrive/Spot_IL/Real World Dataset'
LABEL_PATH = os.path.join(SPLIT_DATA_PATH, 'map01_01_train_5_1/labels.npy')
TRAIN_PATH = os.path.join(SPLIT_DATA_PATH, 'map01_01_train_5_1')

WEIGHT_PATH = os.path.join(SPLIT_DATA_PATH, 'weights/map01_01_DinoMlp')
os.makedirs(WEIGHT_PATH, exist_ok=True)

FIGURE_PATH = os.path.join(SPLIT_DATA_PATH, 'Results/map01_01_DinoMlp')
os.makedirs(FIGURE_PATH, exist_ok=True)


# === Data Transforms ===
data_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# === Device Setup ===
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {DEVICE}')


# === Dataset ===
full_dataset = SPOTDataLoader(
    root_dir=TRAIN_PATH,
    labels_file=LABEL_PATH,
    transform=data_transforms,
    preload=True
)
print(f"Total training samples: {len(full_dataset)}")

BATCH_SIZE = 32
train_dataloader = DataLoader(full_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

# === Hyperparameters & Loss ===
LEARNING_RATE = 1e-3
NUM_EPOCHS = 500
START_EPOCH = 100  # Last completed epoch to resume from; training continues at START_EPOCH + 1
SAVE_EVERY = 25    # Checkpoint interval in epochs
LOSS_SCALE = 1e3   # Multiplier applied to the MSE loss
TOLERANCE = 1e-1   # A prediction counts as correct when its L2 error is below this
MAX_GRAD_NORM = 5
TEXT_PROMPT = "green chair."
loss_fn = torch.nn.MSELoss()

# Relative paths: they resolve only while the working directory is the GroundingDINO checkout.
config_file = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
weight_file = "weights/groundingdino_swint_ogc.pth"

# === Model, Optimizer, and Scheduler ===
# Pass the detected device explicitly; the model's own default is 'cuda',
# which would fail on a CPU-only runtime.
model = DINOCrossAttentionMLP(
    config_file=config_file,
    weight_file=weight_file,
    num_cameras=5,
    embed_dim=256,
    device=DEVICE
)
model.to(DEVICE)

# Only the non-frozen parameters are optimized and clipped.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=LEARNING_RATE)

# `verbose` is deprecated in recent PyTorch releases; learning-rate changes are
# reported explicitly after each scheduler step instead.
scheduler = ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=10
)

# --- Resume from the START_EPOCH checkpoint when it exists ---
# NOTE: only model weights are restored; optimizer and scheduler state restart fresh.
checkpoint_path = os.path.join(WEIGHT_PATH, f'epoch_{START_EPOCH}.pth')
if os.path.exists(checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
    # Epoch START_EPOCH is already done, so continue with the next one
    # (otherwise it would be re-run and its checkpoint overwritten).
    first_epoch = START_EPOCH + 1
    print(f"Loaded weights from {checkpoint_path}")
else:
    # Without a checkpoint there is nothing to resume: train the full schedule.
    first_epoch = 1
    print(f"No checkpoint found at {checkpoint_path}, starting from scratch.")

training_losses = []
train_accuracies = []

# === Training Loop ===
for epoch in range(first_epoch, NUM_EPOCHS + 1):
    model.train()
    # model.train() recurses into every submodule; put the frozen backbone back
    # into eval mode so any dropout layers in it stay disabled.
    model.feature_extractor.eval()
    running_loss = 0.0
    train_correct = 0
    train_total = 0

    for current_images, goal_images, labels in train_dataloader:
        current_images = current_images.to(DEVICE)
        goal_images = goal_images.to(DEVICE)
        labels = labels.to(DEVICE).float()
        # The backbone expects one caption per batch item.
        text_prompts = [TEXT_PROMPT for _ in range(current_images.size(0))]

        optimizer.zero_grad()
        output = model(current_images, goal_images, text_prompts)

        # Computing loss
        loss = loss_fn(output, labels) * LOSS_SCALE
        loss.backward()

        # Applying gradient clipping
        torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=MAX_GRAD_NORM)
        optimizer.step()
        running_loss += loss.item()

        # Computing accuracy: fraction of samples whose L2 error is within TOLERANCE
        errors = torch.norm(output - labels, dim=1, p=2)
        train_correct += (errors < TOLERANCE).sum().item()
        train_total += errors.numel()

    # Loss and accuracy per epoch
    epoch_loss = running_loss / len(train_dataloader)
    train_accuracy = (train_correct / train_total) * 100
    training_losses.append(epoch_loss)
    train_accuracies.append(train_accuracy)

    print(f"Epoch {epoch}/{NUM_EPOCHS} -- Training Loss: {epoch_loss:.6f} -- Training Accuracy: {train_accuracy:.2f}%")

    # Adjusting learning rate if model is not improving
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(epoch_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after != lr_before:
        print(f"Learning rate reduced from {lr_before:.2e} to {lr_after:.2e}")

    # --- Saving Weights Every SAVE_EVERY Epochs ---
    if epoch % SAVE_EVERY == 0:
        # Use a dedicated name so the backbone `weight_file` path above is not overwritten.
        epoch_weight_file = os.path.join(WEIGHT_PATH, f'epoch_{epoch}.pth')
        torch.save(model.state_dict(), epoch_weight_file)
        print(f"Weights saved at epoch {epoch}")

# === Final Model Save ===
final_weight_file = os.path.join(WEIGHT_PATH, f'final_epoch_{NUM_EPOCHS}.pth')
torch.save(model.state_dict(), final_weight_file)
print("Training complete. Final model saved.")


ModuleNotFoundError: No module named 'groundingdino'

In [10]:
import matplotlib.pyplot as plt
import os

# Interval (in epochs) at which plot_graph labels data points.
WEIGHT_SAVING_STEP = 10

# Figure resolution and the desired canvas size in pixels (width, height).
DPI = 120
FIGURE_SIZE_PIXEL = [2490, 1490]

# Matplotlib sizes figures in inches, so convert each pixel dimension via the DPI.
FIGURE_SIZE = [FIGURE_SIZE_PIXEL[0] / DPI, FIGURE_SIZE_PIXEL[1] / DPI]

def _annotate_milestones(ax, values, start_plot, end_plot, fmt):
    """Label every WEIGHT_SAVING_STEP-th epoch and the last plotted epoch on ``ax``."""
    for epoch, value in enumerate(values, start=start_plot + 1):
        if (epoch % WEIGHT_SAVING_STEP == 0) or (epoch == end_plot):
            ax.annotate(fmt(value), xy=(epoch, value))


def plot_graph(training_losses, train_accuracies, figure_path=None,
               loss_filename='Training_loss.png', accuracy_filename='Training_accuracy.png',
               start_plot=0, end_plot=None):
    """Plot the training loss and training accuracy curves, one figure each.

    Args:
        training_losses: Per-epoch loss values.
        train_accuracies: Per-epoch accuracy values in percent.
        figure_path: Directory to save both figures into; nothing is saved when None.
        loss_filename: File name of the loss figure inside ``figure_path``.
        accuracy_filename: File name of the accuracy figure inside ``figure_path``.
        start_plot: Zero-based index of the first entry to plot.
        end_plot: Exclusive end index; ``None`` or a value beyond the recorded
            history means "up to the last recorded epoch".

    The x axis is labelled ``start_plot + 1 ... end_plot`` (list position + 1).

    Raises:
        ValueError: If the selected range contains no data (for example because
            training has not produced any history yet).
    """
    # Clamp to what both series actually contain so x and y always have equal length.
    available = min(len(training_losses), len(train_accuracies))
    if end_plot is None or end_plot > available:
        end_plot = available

    if start_plot < 0 or start_plot >= end_plot:
        raise ValueError(
            f"Nothing to plot: start_plot={start_plot}, end_plot={end_plot}, "
            f"recorded epochs={available}"
        )

    epochs = range(start_plot + 1, end_plot + 1)
    losses = training_losses[start_plot:end_plot]
    accuracies = train_accuracies[start_plot:end_plot]

    # ===== Training Loss =====
    loss_fig, loss_ax = plt.subplots(figsize=FIGURE_SIZE, dpi=DPI)
    loss_ax.scatter(epochs, losses, color='blue', label='Training Loss')
    loss_ax.plot(epochs, losses, color='cyan', linestyle='-', label='Loss Trend')
    loss_ax.set_title("Training Loss")
    loss_ax.set_xlabel("Epochs")
    loss_ax.set_ylabel("Loss (scaled)")
    loss_ax.legend()

    _annotate_milestones(loss_ax, losses, start_plot, end_plot,
                         lambda loss: str(round(loss, 6)))

    # Put the best value at the top of the current y range.
    lowest_loss = min(losses)
    loss_ax.text(0, loss_ax.get_ylim()[1], f'Lowest Loss: {lowest_loss:.6f}')

    if figure_path is not None:
        loss_fig.savefig(os.path.join(figure_path, loss_filename))
    plt.show()

    # ===== Training Accuracy =====
    acc_fig, acc_ax = plt.subplots(figsize=FIGURE_SIZE, dpi=DPI)
    acc_ax.plot(epochs, accuracies, color='green', linestyle='-', marker='o',
                label='Training Accuracy')
    acc_ax.set_title("Training Accuracy")
    acc_ax.set_xlabel("Epochs")
    acc_ax.set_ylabel("Accuracy (%)")
    acc_ax.legend()

    _annotate_milestones(acc_ax, accuracies, start_plot, end_plot,
                         lambda acc: f"{round(acc, 2)}")

    if figure_path is not None:
        acc_fig.savefig(os.path.join(figure_path, accuracy_filename))
    plt.show()


In [None]:
# Render and save both training curves for the recorded history (capped at 500 entries).
plot_graph(
    training_losses,
    train_accuracies,
    figure_path=FIGURE_PATH,
    loss_filename='Training_loss.png',
    accuracy_filename='Training_accuracy.png',
    start_plot=0,
    end_plot=500,
)

In [None]:
# Persist the per-epoch loss and accuracy lists next to the figures so the curves
# can be re-plotted later without retraining.
history_file = os.path.join(FIGURE_PATH, 'dinomlp_training_history.pth')
training_history = dict(losses=training_losses, accuracies=train_accuracies)

torch.save(training_history, history_file)
print(f"Training history saved to {history_file}")