# Continuation of the previous notebook, addressing some training issues encountered in the first one

In [80]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch.nn.functional as F
import numpy as np
import os
from PIL import Image
import torchvision.models.video as video_models
import torch.nn as nn
import tqdm
import matplotlib.pyplot as plt

### Dataset

In [81]:
# NOTE: this rebinds the name `Dataset`, shadowing `torch.utils.data.Dataset` imported in the
# first cell. The alias is kept because later cells refer to `Dataset.VideoDataset`.
import video_dataset as Dataset

# The pretrained r3d_18 weights loaded later in this notebook were trained on Kinetics-400
# (not ImageNet), so the inputs are normalised with the Kinetics-400 statistics that
# torchvision documents for those weights.
KINETICS_MEAN = [0.43216, 0.394666, 0.37645]
KINETICS_STD = [0.22803, 0.22145, 0.216989]

transform = transforms.Compose([
    transforms.Lambda(lambda x: Dataset.correct_num_frames(x, 16)),  # (T, C, H, W)
    transforms.Lambda(lambda x: x.float() / 255.0),  # Convert to float and scale to [0, 1]
    transforms.Lambda(lambda x: F.interpolate(x, size=(112, 112), mode='bilinear', align_corners=False)),  # Resize after scaling
    transforms.Lambda(lambda x: Dataset.normalise(x, mean=KINETICS_MEAN, std=KINETICS_STD)),  # Normalise per channel
    transforms.Lambda(lambda x: x.permute(1, 0, 2, 3)),  # (T, C, H, W) -> (C, T, H, W)
])  # Preprocessing aligned with the Kinetics-400 pretrained backbone
raw_path = '../data/WLASL2000'
instances_path = './preprocessed_labels/asl100/train_instances_fixed_bboxes_short.json'
classes_path = './wlasl_class_list.json'
train_set = Dataset.VideoDataset(
    root=raw_path,
    instances_path=instances_path,
    classes_path=classes_path,
    transform=transform
)

print(f"Number of training samples: {len(train_set)}")

Number of training samples: 1442


### Setup Loader

In [82]:
# Fix the global torch RNG so the shuffling order below is repeatable.
torch.manual_seed(42)

# Training batches: shuffled, incomplete final batch discarded.
train_loader = DataLoader(
  dataset=train_set,
  batch_size=2,   # target is 32; kept tiny while trialling
  num_workers=0,  # single-process loading for reproducibility (4 would be faster)
  shuffle=True,
  drop_last=True,
)

train_loader
    

<torch.utils.data.dataloader.DataLoader at 0x771d13f50550>

### Model setup: r3d_18

In [83]:
# Load the Kinetics-400 pretrained R3D-18 backbone. The `weights=` enum replaces the
# deprecated `pretrained=True` flag (requires torchvision >= 0.13) and selects the same
# checkpoint explicitly.
model = video_models.r3d_18(weights=video_models.R3D_18_Weights.KINETICS400_V1)
num_classes = 100  # size of the classification head for the asl100 label files used here
# Swap the pretrained classification head for a freshly initialised one with `num_classes` outputs.
model.fc = nn.Linear(model.fc.in_features, num_classes)



### Training setup

In [84]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Move the network's parameters and buffers onto the selected device.
model = model.to(device)



Using device: cuda


In [85]:
# Dane recommended unfreezing only the last conv layer and the last fully connected layer.
# This cell currently does NOT do that: every parameter of the network is marked trainable
# (full fine-tuning). The partial-unfreeze variant is kept below, commented out, for reference.
for weight in model.parameters():
  weight.requires_grad_(True)

# --- Alternative (disabled): train only `layer4` and `fc` ---------------------------------

# model.fc.requires_grad = True  # Unfreeze the final fully connected layer

# for layer_name in ['layer4', 'fc']:
#   if hasattr(model, layer_name):
#     for param in getattr(model, layer_name).parameters():
#       param.requires_grad = True

# for name, param in model.named_parameters():
#   if param.requires_grad:
#     print(f"Training parameter: {name}")
#   else:
#     print(f"Freezing parameter: {name}")

# for name, module in model.named_modules():
#   if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
#     # Check if this BatchNorm is in a frozen layer
#     is_in_frozen_layer = not any(unfreeze_layer in name for unfreeze_layer in ['layer4', 'fc'])

#     if is_in_frozen_layer:
#       module.eval()
#       module.track_running_stats = False
#       print(f"Set {name} to eval mode (frozen layer)")


In [86]:
# Gather every parameter tensor that is flagged for gradient updates.
trainable_params = list(filter(lambda p: p.requires_grad, model.parameters()))

# Standard multi-class classification objective.
loss_func = nn.CrossEntropyLoss()

# Single learning rate for all trainable tensors; adjust as needed.
optimizer = torch.optim.Adam(trainable_params, lr=1e-4)
# Alternative (disabled): per-group learning rates.
# optimizer = torch.optim.Adam([
#     {'params': model.layer4.parameters(), 'lr': 1e-4},
#     {'params': model.fc.parameters(), 'lr': 1e-3}  # Higher LR for new classifier
# ])

# Note: this is the number of parameter *tensors*, not the number of scalar weights.
print(len(trainable_params), "trainable parameters")

62 trainable parameters


In [87]:
from torch.utils.tensorboard import SummaryWriter
import json

In [88]:
def train_model_3(model, train_loader, optimizer, loss_func, epochs=10,val_loader=None,
                  output='runs/exp_0', logs='logs', save='checkpoints', save_every=1):
  if os.path.exists(output) and output[-1].isdigit():
    output = output[:-1] + str(int(output[-1])+ 1) #enumerate file name
  if save:
    save_path = os.path.join(output, save)
    os.makedirs(save_path,exist_ok=True)
  logs_path = os.path.join(output, logs)
  writer = SummaryWriter(logs_path) #watching loss
  train_losses = []
  val_losses = []
  best_val_loss = float('inf')
  
  model.train()
  for epoch in tqdm.tqdm(range(epochs), desc="Training R3D"):
    #Training phase
    running_loss = 0.0
    train_samples = 0
    
    for data, target in train_loader:
      data, target = data.to(device), target.to(device)
      
      optimizer.zero_grad()
      model_output = model(data)
      loss = loss_func(model_output, target)
      loss.backward()
      optimizer.step()
      
      running_loss += loss.item() * data.size(0) #weight by batch size
      train_samples += data.size(0)
      
    avg_train_loss = running_loss / train_samples
    train_losses.append(avg_train_loss)
    writer.add_scalar('Loss/Train', avg_train_loss, epoch)
    #Validation phase
    if val_loader:
      model.eval()
      val_loss = 0.0
      val_samples = 0
      
      with torch.no_grad():
        for data, target in val_loader:
          data, target = data.to(device), target.to(device)
          
          model_output = model(data)
          loss = loss_func(model_output, target)
          
          val_loss += loss.item() * data.size(0) #weight by batch size
          val_samples += data.size(0)
          
      avg_val_loss = val_loss / val_samples
      val_losses.append(avg_val_loss)
      writer.add_scalar('Loss/Val', avg_val_loss, epoch)
      
      if save and avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(),
                   os.path.join(save_path, 'best.pth'))
      
      print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
      model.train() # return back to train
    else:
      print(f'Epoch [{epoch+1}/{epochs}], Average Loss: {avg_train_loss:.4f}')
    
    if save and epoch % save_every == 0:
      avg_train_loss = avg_train_loss if avg_train_loss else 'N/A'
      avg_val_loss = avg_val_loss if avg_val_loss else 'N/A'
      torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train loss': avg_train_loss,
        'val loss': avg_val_loss,
        'train losses': train_losses,
        'val losses': val_losses
        }, os.path.join(save_path, f'checkpoint_{epoch}.pth'))
    
    with open(os.path.join(logs_path, 'train_losses.json'), "w") as f:
      json.dump(train_losses, f)
    if val_loader:
      with open(os.path.join(logs_path, 'val_losses.json'), "w") as f:
        json.dump(val_losses, f)
    
  return train_losses, val_losses

### Validation loader


In [89]:
# Use a dedicated name for the validation split so the training `instances_path` defined
# earlier is not overwritten (re-running the training-set cell afterwards would otherwise
# silently load the validation instances).
val_instances_path = './preprocessed_labels/asl100/val_instances_fixed_bboxes_short.json'
val_set = Dataset.VideoDataset(
  root=raw_path,
  instances_path=val_instances_path,
  classes_path=classes_path,
  transform=transform
)
val_loader = DataLoader(
  val_set,
  batch_size=2,  # may be larger than the training batch size (no gradients are stored)
  shuffle=False,  # fixed order keeps validation loss comparable across epochs
  drop_last=False,  # evaluate on every validation sample
  num_workers=0  # single-process loading, consistent with the training loader
)
val_loader

<torch.utils.data.dataloader.DataLoader at 0x771d105c0790>

In [90]:
# Quick environment sanity check before starting the (long) training run.
print(
  f"Using device: {device}",
  f"CUDA available: {torch.cuda.is_available()}",
  f"cuDNN enabled: {torch.backends.cudnn.enabled}",
  sep="\n",
)

Using device: cuda
CUDA available: True
cuDNN enabled: True


In [91]:
# Launch the fine-tuning run: 30 epochs with per-epoch validation.
# Logs and checkpoints are written under the given run directory.
train_losses, val_losses = train_model_3(
  model=model,
  optimizer=optimizer,
  loss_func=loss_func,
  train_loader=train_loader,
  val_loader=val_loader,
  epochs=30,
  output='runs/exp1_r3d18',
)

Training R3D:   0%|          | 0/30 [00:00<?, ?it/s]


RuntimeError: GET was unable to find an engine to execute this computation