Prepare Training Data
1. Setup: prepare and format the data correctly for input into the model.


# -----------------------------------------------------------------------------
# License Notice
# Copyright (c) 2025 Duc Huy Vu, Quan Hieu Tran
#
# This code is for personal and academic use only.
# Only the authors may modify, distribute, or reuse it.
# Viewing for grading purposes is allowed.
# -----------------------------------------------------------------------------


In [None]:
import pickle
import random

import torch
import torch.nn                   as nn
import torch.optim                as optim

import torchvision.transforms     as transforms

import numpy                      as np
import seaborn                    as sns
import matplotlib.pyplot          as plt

from tqdm                         import tqdm

from collections                  import Counter


from sklearn.metrics              import confusion_matrix
from sklearn.metrics              import precision_recall_fscore_support
from torch.utils.data             import Dataset, DataLoader
from torch.optim.lr_scheduler     import CosineAnnealingLR
from sklearn.model_selection      import train_test_split

from google.colab import drive
from google.colab import files

In [None]:
# Mount Google Drive at /content/drive so the dataset paths listed in CONFIG resolve.
# NOTE(review): drive.mount is Colab-specific and presumably prompts for authorization — confirm.
print("Mounting Google Drive...")
drive.mount('/content/drive')

In [None]:
# Central configuration: dataset paths, hyperparameters and feature flags.
CONFIG = {
    # One pickle file per recording (1.pkl ... 9.pkl) on the mounted Drive.
    'path9': '/content/drive/MyDrive/AI Robotic/dataset/9.pkl',
    'path8': '/content/drive/MyDrive/AI Robotic/dataset/8.pkl',
    'path7': '/content/drive/MyDrive/AI Robotic/dataset/7.pkl',
    'path6': '/content/drive/MyDrive/AI Robotic/dataset/6.pkl',
    'path5': '/content/drive/MyDrive/AI Robotic/dataset/5.pkl',
    'path4': '/content/drive/MyDrive/AI Robotic/dataset/4.pkl',
    'path3': '/content/drive/MyDrive/AI Robotic/dataset/3.pkl',
    'path2': '/content/drive/MyDrive/AI Robotic/dataset/2.pkl',
    'path1': '/content/drive/MyDrive/AI Robotic/dataset/1.pkl',

    'batch_size':                                             32,
    'num_epochs':                                             60,  # Maximum number of training epochs
    'learning_rate':                                          1e-3,
    'train_ratio':                                            0.8,
    'num_classes':                                            9,
    'hidden_dim':                                             256,
    # Early-stopping patience in epochs. NOTE(review): 100 exceeds num_epochs (60),
    # so early stopping can never trigger with these values — confirm this is intended.
    'patience':                                               100,
    'seq_len':                                                10,
    # One loss weight per class; the length must equal num_classes.
    'class_weights':                                          [1, 1, 1, 1, 1, 1, 1, 1, 1],
    'debug':                                                  True,
    'use_augmentation':                                       False,  # Consider setting to True
    'plot_loss_curves':                                       True,
    'plot_f1_scores':                                         True,
}

In [None]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 is intended to make CUDA errors surface at the call
# that caused them. NOTE(review): presumably must be set before CUDA is initialised — confirm.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Part II: Load the Data
2. Data loading
  - Use pickle to load `depth_sequences` and `labels` from disk, or
  - create a Dataset class and a DataLoader to feed the data into the model in batches.

- Requirements:
  - A class that inherits from `torch.utils.data.Dataset`:
    - `__getitem__` returns a sequence of images and the corresponding label.
  - A DataLoader to retrieve batches during training.

In [None]:
import pandas as pd

# Integer class id -> action name.
label_map = {
    0: 'forward',
    1: 'backward',
    2: 'up',
    3: 'down',
    4: 'left',
    5: 'right',
    6: 'stop',
    7: 'rotate_left',
    8: 'rotate_right',
}

# Label counts accumulated over every recording file.
total_counter = Counter()

for i in range(1, 10):
    path = f'/content/drive/MyDrive/AI Robotic/dataset/{i}.pkl'
    label_counter = Counter()
    label_str_counter = Counter()
    total = 0
    try:
        with open(path, 'rb') as f:
            # Each file holds a stream of individually pickled frames; read until EOF.
            while True:
                try:
                    frame = pickle.load(f)
                except EOFError:
                    break

                total += 1
                label = frame.get("label")
                label_str = frame.get("label_str")
                if label is not None:
                    label_counter[label] += 1
                    total_counter[label] += 1
                if label_str:
                    label_str_counter[label_str] += 1

        print(f" {i}.pkl | Frames: {total} | Labels: {dict(label_counter)}")
        print(f" Strings: {dict(label_str_counter)}\n")

    except Exception as e:
        # Best effort: report the problem for this file and move on to the next one.
        print(f" {i}.pkl: {e}")

# Ensure all labels 0–8 exist
for i in range(9):
    total_counter.setdefault(i, 0)

# Summary table: one row per label, followed by a grand-total row.
summary = pd.DataFrame({
    'Label': list(range(9)),
    'Label String': [label_map[i] for i in range(9)],
    'Count': [total_counter[i] for i in range(9)],
})
summary.loc[len(summary.index)] = ['', 'Total', summary['Count'].sum()]
print("\n Summary (sorted by label):")

display(summary)


In [None]:
def load_depth_sequences_from_pkl(pkl_path, seq_len=10, debug=False):
    """
    Load a .pkl file of frame dicts and turn it into sliding-window sequences.

    The file is read as a stream of individually pickled objects until EOF.
    Every window of ``seq_len`` consecutive frames (stride 1) in which each
    frame has both a "depth" and a "label" entry becomes one sample; the
    sample's label is the label of the window's last frame.

    Args:
        pkl_path (str): Path to the .pkl file
        seq_len (int): Number of consecutive frames in a sequence (must be >= 1)
        debug (bool): If True, prints stats, the first sample and skipped windows

    Returns:
        depth_sequences (list of np.array): Each of shape (seq_len, 1, H, W)
        labels (list of int): Corresponding labels for each sequence

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    # Load all frames.
    # NOTE: pickle can execute arbitrary code while loading; only open trusted files.
    frames = []
    with open(pkl_path, "rb") as fh:
        while True:
            try:
                frames.append(pickle.load(fh))
            except EOFError:
                break

    # Convert to sequences
    depth_sequences = []
    labels = []

    # "+ 1" so the window ending on the very last frame is included.
    for start in range(len(frames) - seq_len + 1):
        window = frames[start:start + seq_len]
        # Check all frames in the window are valid and labeled
        if any("depth" not in fr or "label" not in fr for fr in window):
            continue
        try:
            seq_depths = np.array([fr["depth"] for fr in window])  # (seq_len, H, W)
            seq_depths = seq_depths[:, np.newaxis, :, :]            # (seq_len, 1, H, W)
        except (ValueError, IndexError, TypeError) as e:
            # Ragged or non-2D depth maps cannot be stacked; skip this window.
            if debug:
                print(f"Skipped sequence at {start} due to error: {e}")
            continue
        # Append both together so the two lists can never get out of step.
        depth_sequences.append(seq_depths)
        labels.append(window[-1]["label"])  # label is based on last frame

    if debug:
        print(f"Loaded {len(depth_sequences)} sequences of length {seq_len}")
        if depth_sequences:
            print("First sequence shape:", depth_sequences[0].shape)
            print("First label:", labels[0])

    return depth_sequences, labels


In [None]:
# Load recording 8 using the window length and verbosity configured in CONFIG
# (previously hard-coded here as seq_len=10 / debug=True).
depth_seqs, labels = load_depth_sequences_from_pkl(
    CONFIG["path8"],
    seq_len=CONFIG["seq_len"],
    debug=CONFIG["debug"],
)


# Split Data


In [None]:
def split_data(depth_sequences, labels, train_ratio=0.8, debug=False):
    """
    Produce a stratified train/validation split of sequences and labels.

    The split is delegated to ``train_test_split`` with a fixed seed (42) and
    stratification on ``labels``.

    NOTE(review): if the sequences come from overlapping sliding windows, a
    random split can put near-identical windows in both sets — confirm this
    is acceptable for the reported validation metrics.

    Args:
        depth_sequences (list of np.array): Input sequences
        labels (list of int): Label of each sequence
        train_ratio (float): Fraction of samples assigned to the training set
        debug (bool): When True, print set sizes and label distributions

    Returns:
        X_train, X_val, y_train, y_val
    """
    parts = train_test_split(
        depth_sequences,
        labels,
        train_size=train_ratio,
        random_state=42,
        stratify=labels,  # keep the label proportions the same in both sets
    )
    X_train, X_val, y_train, y_val = parts

    if not debug:
        return X_train, X_val, y_train, y_val

    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")
    print("Train label distribution:", Counter(y_train))
    print("Val label distribution:", Counter(y_val))
    return X_train, X_val, y_train, y_val


In [None]:
# Split with the ratio and verbosity configured in CONFIG
# (previously hard-coded here as train_ratio=0.8 / debug=True).
train_sequences, val_sequences, train_labels, val_labels = split_data(
    depth_seqs,
    labels,
    train_ratio=CONFIG["train_ratio"],
    debug=CONFIG["debug"],
)


# Data Loader

In [None]:
# 3. Preprocessing
# Data Augmentation for Depth Images
def augment_depth_image(image, debug=False):
    """
    Apply random augmentations to a single depth image.

    The pipeline is: random horizontal flip (p=0.5), random rotation within
    ±10 degrees, random translation of up to 10% of the width/height, and
    additive Gaussian noise with standard deviation 0.01.

    NOTE(review): the horizontal flip mirrors the image but the label is not
    changed here; if labels distinguish left from right, confirm that flipping
    is appropriate.

    Args:
        image (array): Input image [H, W] (or [1, H, W])
        debug (bool): Print debug info if True
    Returns:
        array: Augmented float32 numpy image [H, W]
    Raises:
        ValueError: If the input is not 2D after dropping a leading singleton axis.
    """
    # Handle potential (1, H, W) input from some stages
    if len(image.shape) == 3 and image.shape[0] == 1:
        image = image.squeeze(0)

    if len(image.shape) != 2:
        if debug:
            print(f"ERROR: Augmentation received invalid shape {image.shape}, expected 2D.")
        raise ValueError("Invalid image shape for augmentation, expected 2D.")

    # Convert to tensor and add channel dimension → shape: (1, H, W)
    image = torch.tensor(image, dtype=torch.float32).unsqueeze(0)

    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=10),
        # `translate` is already a fraction of the image width/height, so it must
        # not be rescaled by the image size. The previous 0.1 * (256 / W) formula
        # equals 0.1 only for 256-pixel images, grows for smaller ones and exceeds
        # the allowed [0, 1] range below ~26 pixels.
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        # Add random noise
        transforms.Lambda(lambda x: x + torch.randn_like(x) * 0.01),
    ])

    # Apply transforms
    augmented_image = transform(image)

    # Remove channel dimension → shape: (H, W) and convert back to numpy
    return augmented_image.squeeze(0).numpy()


# Load Data

In [None]:
class UAVDepthDataset(Dataset):
    """
    Dataset of depth-image sequences with one integer label per sequence.

    Each item is a ``(frames, label)`` pair: ``frames`` is a float32 tensor built
    by squeezing every frame of the stored sequence, optionally augmenting it,
    and re-adding a single channel axis; ``label`` is a long tensor.
    """

    def __init__(self, depth_sequences, labels, augment=True, debug=False):
        """
        Store the data and validate it.

        Raises:
            ValueError: If there are no sequences, or if any label lies outside
                ``[0, num_classes)`` where ``num_classes`` is read from CONFIG
                (falling back to 9).
        """
        self.depth_sequences = depth_sequences
        self.labels = labels
        self.augment = augment
        self.debug = debug

        if len(self.depth_sequences) == 0:
            raise ValueError("Empty dataset!")

        # Reject labels that fall outside the configured class range.
        num_classes = CONFIG.get('num_classes', 9)
        invalid = [lab for lab in labels if not (0 <= lab < num_classes)]
        if invalid:
            raise ValueError(f"Invalid labels: {set(invalid)}; expected in [0,{num_classes-1}]")

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.depth_sequences)

    def __getitem__(self, idx):
        """Return the ``idx``-th sequence as a float32 tensor plus its long label."""
        planes = []
        for frame in self.depth_sequences[idx]:
            # Collapse every singleton axis of the frame → (H, W)
            plane = np.squeeze(frame)
            if self.augment:
                plane = augment_depth_image(plane, self.debug)
            # Restore the channel axis → (1, H, W)
            planes.append(plane[np.newaxis, ...])

        # Stack along time → (T, 1, H, W)
        stacked = torch.tensor(np.stack(planes, axis=0), dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return stacked, target


In [None]:
# Sanity-check cell: load one recording, split it, wrap it in datasets and
# print the shape of a single training batch.

# 1) Load sequences from disk
depth_seqs, labels = load_depth_sequences_from_pkl(
    CONFIG["path8"],
    seq_len=CONFIG["seq_len"],
    debug=True
)
print(f"Total sequences: {len(depth_seqs)}, Total labels: {len(labels)}")
print(f"  • One sequence shape: {depth_seqs[0].shape}")   # expect (seq_len, 1, H, W)
print(f"  • First labels: {labels[:5]}")

# 2) Split into train / val (stratified on the labels, fixed seed, 20% validation)
from sklearn.model_selection import train_test_split
train_seqs, val_seqs, train_lbls, val_lbls = train_test_split(
    depth_seqs,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)
print(f"Train seqs: {len(train_seqs)}, Val seqs: {len(val_seqs)}")

# 3) Build datasets (augmentation is only ever enabled for the training set)
train_ds = UAVDepthDataset(
    depth_sequences=train_seqs,
    labels=train_lbls,
    augment=CONFIG["use_augmentation"]
)
val_ds   = UAVDepthDataset(
    depth_sequences=val_seqs,
    labels=val_lbls,
    augment=False
)
print(f"Dataset lengths → train: {len(train_ds)}, val: {len(val_ds)}")

# 4) Build loaders and inspect one batch
# NOTE: a later cell rebinds `train_loader`; this one is only used for the shape check below.
from torch.utils.data import DataLoader
train_loader = DataLoader(train_ds, batch_size=CONFIG["batch_size"], shuffle=True)
for batch_inputs, batch_labels in train_loader:
    print(f"Batch inputs shape: {batch_inputs.shape}")   # expect (B, seq_len, 1, H, W)
    print(f"Batch labels shape: {batch_labels.shape}")   # expect (B,)
    break


# EfficientCNNEncoder

In [None]:
from torch.utils.data import DataLoader

# 1) Create the Dataset objects from the split produced by split_data
train_dataset = UAVDepthDataset( # training set; augmentation follows CONFIG
    depth_sequences = train_sequences,
    labels          = train_labels,
    augment         = CONFIG['use_augmentation'],
    debug           = CONFIG['debug']
)
val_dataset = UAVDepthDataset(   # validation set; never augmented
    depth_sequences = val_sequences,
    labels          = val_labels,
    augment         = False,
    debug           = CONFIG['debug']
)

# 2) Wrap them in DataLoaders (these are the loaders handed to the Trainer)
train_loader = DataLoader(       # shuffled every epoch
    train_dataset,               # dataset to draw batches from
    batch_size  = CONFIG['batch_size'],
    shuffle     = True,
    num_workers = 4,
    pin_memory  = True
)
val_loader = DataLoader(         # fixed order for evaluation
    val_dataset,                 # dataset to draw batches from
    batch_size  = CONFIG['batch_size'],
    shuffle     = False,
    num_workers = 2,
    pin_memory  = True
)

In [None]:
import torch
import torch.nn as nn

class DepthwiseSeparableConv(nn.Module):
    """
    Depthwise-separable convolution followed by batch norm and ReLU.

    A per-channel (grouped) spatial convolution is followed by a 1x1
    convolution that mixes channels; both convolutions are bias-free.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1):
        super().__init__()
        # groups == in_channels gives every input channel its own spatial filter.
        spatial_kwargs = dict(
            kernel_size=kernel_size,
            padding=padding,
            groups=in_channels,
            bias=False,
        )
        self.depthwise = nn.Conv2d(in_channels, in_channels, **spatial_kwargs)
        # 1x1 convolution that maps in_channels to out_channels.
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Run depthwise conv → pointwise conv → batch norm → ReLU."""
        mixed = self.pointwise(self.depthwise(x))
        return self.relu(self.bn(mixed))

In [None]:
class EfficientCNNEncoder(nn.Module):
    """
    Two stacked depthwise-separable conv blocks: in_channels → 16 → out_channels.
    """

    def __init__(self, in_channels=1, out_channels=8):
        super().__init__()
        stem = DepthwiseSeparableConv(in_channels, 16)
        head = DepthwiseSeparableConv(16, out_channels)
        self.encoder = nn.Sequential(stem, head)

    def forward(self, x):
        """Apply both blocks in order and return the resulting feature map."""
        encoded = self.encoder(x)
        return encoded


# Efficient Model


In [None]:
class EfficientUAVNavigationModel(nn.Module):
    """
    Per-frame CNN features followed by a GRU and attention pooling.

    Pipeline for an input of shape (B, T, C, H, W):
     - Depthwise separable convs to reduce FLOPs (encoder + two pooled blocks)
     - Global avg pooling to compress spatial dims to one 128-d vector per frame
     - Learned linear projection, then a GRU over the T frames
     - Softmax attention over time and a final linear classifier

    Args:
        num_classes: Size of the output logit vector.
        hidden_dim: GRU hidden size (also the attention/classifier input size).
        encoder_out: Number of channels produced by the encoder stage.
        image_size: Accepted for interface compatibility; not used by the model,
            which is size-agnostic thanks to global average pooling.
        debug: Stored on the instance; not otherwise used by this class.
    """
    def __init__(
        self,
        num_classes: int = 9,
        hidden_dim: int = 128,            # reduced hidden dimension
        encoder_out: int = 8,
        image_size: tuple = (256, 256),
        debug: bool = False
    ):
        super().__init__()
        self.debug = debug

        # Width of the per-frame feature vector (channels after the CNN stage).
        feature_dim = 128
        self.feature_dim = feature_dim

        # Encoder: lightweight depthwise separable convs
        self.encoder = EfficientCNNEncoder(in_channels=1, out_channels=encoder_out)

        # Feature extractor: two separable-conv blocks, each halving H and W
        self.cnn = nn.Sequential(
            DepthwiseSeparableConv(encoder_out, 64),
            nn.MaxPool2d(2),
            DepthwiseSeparableConv(64, feature_dim),
            nn.MaxPool2d(2),
        )

        # Global pooling to reduce spatial dims to channel vector
        self.global_pool = nn.AdaptiveAvgPool2d(1)

        # Learned linear projection applied to every frame feature
        self.proj = nn.Linear(feature_dim, feature_dim)

        # GRU for temporal modeling
        self.gru = nn.GRU(
            input_size=feature_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )

        # Simple attention: weighted sum over time
        self.attn = nn.Linear(hidden_dim, 1)
        self.softmax = nn.Softmax(dim=1)

        # Final classification layer
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits of shape (B, num_classes) for x of shape (B, T, 1, H, W)."""
        if x.dim() != 5:
            raise ValueError(f"Expected input of shape (B, T, C, H, W), got {tuple(x.shape)}")
        B, T, C, H, W = x.shape
        # Merge batch and time for per-frame processing. reshape (rather than
        # view) also accepts non-contiguous inputs such as transposed batches.
        x = x.reshape(B * T, C, H, W)

        # Encode spatial features
        x = self.encoder(x)            # (B*T, encoder_out, H, W)
        x = self.cnn(x)                # (B*T, feature_dim, H', W')
        x = self.global_pool(x)        # (B*T, feature_dim, 1, 1)
        x = x.reshape(B, T, self.feature_dim)

        # Learned projection of the frame features
        x = self.proj(x)               # (B, T, feature_dim)

        # Temporal modeling
        x, _ = self.gru(x)             # (B, T, hidden_dim)

        # Attention pooling: softmax over the time axis, then weighted sum
        weights = self.attn(x)         # (B, T, 1)
        weights = self.softmax(weights)
        context = torch.sum(weights * x, dim=1)  # (B, hidden_dim)

        # Classification
        return self.fc(context)

# Part IV: Loss Function and Optimization
- Set up:
  - Loss function: e.g., CrossEntropyLoss for classification tasks.
  - Optimizer: e.g., Adam, SGD.
  - Hyperparameters: learning rate, number of epochs, batch size.

In [None]:
# Initialize model
# Only num_classes and hidden_dim come from CONFIG; encoder_out, image_size and
# debug keep the constructor defaults.
print("Initializing model...")
model = EfficientUAVNavigationModel(
    num_classes=CONFIG['num_classes'],
    hidden_dim=CONFIG['hidden_dim'],
)

In [None]:
class Trainer:
    """
    Training loop with validation, early stopping, checkpointing and reporting.

    Args:
        model: The ``nn.Module`` to train; it is moved to CUDA when available.
        train_loader: DataLoader yielding ``(inputs, labels)`` training batches.
        val_loader: DataLoader yielding ``(inputs, labels)`` validation batches.
        config: Dict with at least ``class_weights``, ``num_classes``,
            ``learning_rate``, ``num_epochs`` and (for ``train``) ``patience``.
            Optional keys: ``checkpoint_path`` (default ``"best_model.pt"``)
            and ``plot_loss_curves`` (default ``True``).

    Raises:
        ValueError: If either loader is empty or ``class_weights`` does not have
            ``num_classes`` entries.
    """

    # Display names indexed by integer label. The order follows the notebook's
    # label_map (0=forward, 1=backward, 2=up, ...) so that metrics and
    # confusion-matrix ticks are attached to the right class.
    ACTION_NAMES = [
        "Forward", "Backward", "Up", "Down", "Left", "Right",
        "Stop", "Rotate Left", "Rotate Right",
    ]

    def __init__(self, model, train_loader, val_loader, config):
        self.device       = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model        = model.to(self.device)
        self.train_loader = train_loader
        self.val_loader   = val_loader
        self.config       = config
        # Where the best (lowest validation loss) weights are written.
        self.checkpoint_path = config.get('checkpoint_path', 'best_model.pt')

        # checks
        if len(train_loader) == 0 or len(val_loader) == 0:
            raise ValueError("Empty DataLoader!")
        if len(config['class_weights']) != config['num_classes']:
            raise ValueError("class_weights length mismatch")

        cw = torch.tensor(config['class_weights'], device=self.device, dtype=torch.float32)
        self.criterion = nn.CrossEntropyLoss(weight=cw)
        self.optimizer = optim.Adam(self.model.parameters(), lr=config['learning_rate'])
        self.scheduler = CosineAnnealingLR(self.optimizer, T_max=config['num_epochs'])

        # Per-epoch history used by plot_learning_curves.
        self.train_losses     = []
        self.val_losses       = []
        self.train_accuracies = []
        self.val_accuracies   = []

    def _class_name(self, index):
        """Return the display name for a class index, or ``Class_<i>`` if unnamed."""
        if index < len(self.ACTION_NAMES):
            return self.ACTION_NAMES[index]
        return f"Class_{index}"

    def train_epoch(self):
        """Run one optimisation pass; return (mean batch loss, accuracy in %)."""
        self.model.train()
        total_loss, correct, total = 0., 0, 0
        for inputs, labels in self.train_loader:
            inputs, labels = inputs.to(self.device), labels.to(self.device)
            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss    = self.criterion(outputs, labels)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()

            total_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

        avg_loss = total_loss / len(self.train_loader)
        acc      = 100 * correct / max(total, 1)
        return avg_loss, acc

    def validate_epoch(self):
        """Evaluate on the validation loader.

        Returns:
            (mean batch loss, accuracy in %, predicted labels, true labels),
            the last two as flat Python lists.
        """
        self.model.eval()
        total_loss, correct, total = 0., 0, 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for inputs, labels in self.val_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.model(inputs)
                loss    = self.criterion(outputs, labels)

                total_loss += loss.item()
                preds = outputs.argmax(dim=1)
                correct += (preds == labels).sum().item()
                total   += labels.size(0)

                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

        avg_loss = total_loss / len(self.val_loader)
        acc      = 100 * correct / max(total, 1)
        return avg_loss, acc, all_preds, all_labels

    def plot_learning_curves(self):
        """Plot the recorded train/validation loss and accuracy per epoch."""
        epochs = range(1, len(self.train_losses) + 1)
        plt.figure(figsize=(12, 4))
        # Loss
        plt.subplot(1, 2, 1)
        plt.plot(epochs, self.train_losses, label='Train Loss')
        plt.plot(epochs, self.val_losses,   label='Val Loss')
        plt.title('Loss Curves')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        # Accuracy
        plt.subplot(1, 2, 2)
        plt.plot(epochs, self.train_accuracies, label='Train Acc')
        plt.plot(epochs, self.val_accuracies,   label='Val Acc')
        plt.title('Accuracy Curves')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy (%)')
        plt.legend()
        plt.tight_layout()
        plt.show()

    def evaluate(self, labels, preds):
        """Print per-class metrics, draw the confusion matrix and return both.

        Args:
            labels: True class ids.
            preds: Predicted class ids, aligned with ``labels``.

        Returns:
            Dict with ``precision``, ``recall`` and ``f1`` (lists, one entry per
            class), ``macro_f1`` and ``confusion_matrix`` (nested list).
        """
        num_classes = self.config['num_classes']
        class_ids = list(range(num_classes))

        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average=None,
            labels=class_ids,
            zero_division=0
        )
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            labels, preds, average='macro', zero_division=0
        )
        cm = confusion_matrix(labels, preds, labels=class_ids)

        # One name per class, so the tick labels always match the matrix size.
        names = [self._class_name(i) for i in class_ids]

        print("\nPer-class metrics:")
        for i, name in enumerate(names):
            print(f"{name}: P={precision[i]:.2f}, R={recall[i]:.2f}, F1={f1[i]:.2f}")
        print(f"Macro F1: {macro_f1:.2f}")

        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=names,
                    yticklabels=names)
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.show()

        return {
            'precision': precision.tolist(),
            'recall': recall.tolist(),
            'f1': f1.tolist(),
            'macro_f1': macro_f1,
            'confusion_matrix': cm.tolist()
        }

    def train(self):
        """Train for up to ``num_epochs`` epochs with early stopping.

        The weights with the lowest validation loss are saved to
        ``self.checkpoint_path``. After training, those best weights are
        reloaded so the final evaluation describes the checkpoint that was
        saved rather than whatever the last epoch produced.

        Returns:
            The metrics dict produced by ``evaluate`` on the validation set.
        """
        best_loss = float('inf')
        no_improve = 0
        saved_best = False
        for epoch in tqdm(range(self.config['num_epochs']), desc="Epochs"):
            train_l, train_acc = self.train_epoch()
            val_l, val_acc, _, _ = self.validate_epoch()

            self.train_losses.append(train_l)
            self.val_losses.append(val_l)
            self.train_accuracies.append(train_acc)
            self.val_accuracies.append(val_acc)
            self.scheduler.step()

            print(f"Epoch {epoch+1}: Train L={train_l:.3f}, A={train_acc:.1f}% | "
                  f"Val L={val_l:.3f}, A={val_acc:.1f}%")

            if val_l < best_loss:
                best_loss = val_l
                torch.save(self.model.state_dict(), self.checkpoint_path)
                saved_best = True
                no_improve = 0
            else:
                no_improve += 1
            if no_improve >= self.config['patience']:
                print("Early stopping.")
                break

        # final evaluation & plots
        print("\n--- Final Evaluation ---")
        if saved_best:
            # Evaluate the checkpointed best weights, not the last-epoch weights.
            best_state = torch.load(self.checkpoint_path, map_location=self.device)
            self.model.load_state_dict(best_state)
        _, _, preds, labels = self.validate_epoch()
        results = self.evaluate(labels, preds)
        if self.config.get('plot_loss_curves', True):
            self.plot_learning_curves()
        return results

In [None]:
# 3) Build the Trainer from the DataLoaders (not the raw sequence lists) and CONFIG
trainer = Trainer(model, train_loader, val_loader, CONFIG)

# 4) Train; `results` holds the metrics dict returned by the final evaluation
results = trainer.train()
print("Final eval results:", results)

In [None]:
# Download the model
# NOTE(review): the Trainer writes "best_model.pt" relative to the working directory;
# this absolute path assumes that directory is /content — confirm.
print("Training complete. Downloading best_model.pt...")
files.download('/content/best_model.pt')