# PyTorch 3D ResNet

In [1]:
import torch
from tqdm import tqdm
from time import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

from torch.utils.data import DataLoader, Subset, random_split

from models.ThreeDResNet import get_3dResNet, get_resnet_transformer
from colorVideoDataset import ColorVideoDataset



In [None]:
MODEL = get_3dResNet()

In [2]:
DATASET = ColorVideoDataset('./colors')
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Train, Test, Val Sets

In [3]:
train_dataset, test_dataset = random_split(DATASET, [int(0.8 * len(DATASET)), len(DATASET) - int(0.8 * len(DATASET))])
test_dataset, val_dataset = random_split(test_dataset, [int(0.5 * len(test_dataset)), len(test_dataset) - int(0.5 * len(test_dataset))])

## Dataloader and Training

In [None]:
def get_dataloader(dataset, subset_ratio : float | None = 0.1, batch_size : int = 2):
    transform = get_resnet_transformer()

    def collate_fn(batch):
        videos = []
        labels = []
        for video, label, _ in batch:
            video = video.permute(1, 0, 2, 3)
            video = transform({"video": video})["video"]
            videos.append(video)
            labels.append(torch.tensor(label, dtype=torch.long))
        
        videos = torch.stack(videos)
        labels = torch.stack(labels)
        return videos, labels
    
    if subset_ratio is not None:
        num_samples = int(len(dataset) * subset_ratio) 
        subset_indices = list(range(num_samples))
        subset = Subset(dataset, subset_indices)
        dataloader = DataLoader(subset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    else:
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    return dataloader

In [None]:
def infer_loop(model, dataloader, criterion = None) -> tuple:
    if criterion is None:
        criterion = nn.CrossEntropyLoss()
    
    running_loss = 0.0
    all_preds = []
    all_labels = []
    model.eval()
    for videos, labels in dataloader:
        videos, labels = videos.to(DEVICE), labels.to(DEVICE)

        outputs = model(videos)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        all_preds.extend(outputs.argmax(dim=1).cpu().detach().numpy())
        all_labels.extend(labels.cpu().detach().numpy())
        
    val_accuracy = accuracy_score(all_preds, all_labels)
    avg_loss = running_loss / len(dataloader)

    return (val_accuracy, avg_loss)

In [None]:
def training_loop(model, train_dataloader, val_dataloader: DataLoader | None = None, 
                  epochs=5, learning_rate=2e-4, early_stopping_patience=10,
                  checkpoint_path='best_model_3DResNet.pth', lr_patience=2):

    results_dict = {
        'time_per_batch': [],
        'time_per_epoch': [],
        'train_accuracy': [],
        'train_loss': [],
        'val_accuracy': [],
        'val_loss': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'learning_rate': [],
    }
    
    model = model.to(DEVICE)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # ReduceLROnPlateau - tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=2)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.1, patience=lr_patience, verbose=True, min_lr=1e-7
    )
    
    # EarlyStopping - tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
    best_val_loss = float('inf')
    epochs_without_improvement = 0
    
    epoch_pbar = tqdm(range(epochs), desc="Training Epochs")

    for epoch in epoch_pbar:
        # Training phase
        model.train()
        running_loss = 0.0
        epoch_start = time()
        all_preds = []
        all_labels = []
        
        for videos, labels in train_dataloader:
            videos, labels = videos.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()
            outputs = model(videos)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            all_preds.extend(outputs.argmax(dim=1).cpu().detach().numpy())
            all_labels.extend(labels.cpu().detach().numpy())
        
        epoch_time = time() - epoch_start
        results_dict['time_per_epoch'].append(epoch_time)
        
        # MetricsCallback - tracks precision, recall, f1
        results_dict['train_accuracy'].append(accuracy_score(all_labels, all_preds))
        results_dict['precision'].append(precision_score(all_labels, all_preds, average='weighted', zero_division=0))
        results_dict['recall'].append(recall_score(all_labels, all_preds, average='weighted', zero_division=0))
        results_dict['f1'].append(f1_score(all_labels, all_preds, average='weighted', zero_division=0))
        
        avg_train_loss = running_loss / len(train_dataloader)
        results_dict['train_loss'].append(avg_train_loss)
        
        val_accuracy, val_loss = infer_loop(model, val_dataloader, criterion)
        
        results_dict['val_accuracy'].append(val_accuracy)
        results_dict['val_loss'].append(val_loss)
        results_dict['learning_rate'].append(optimizer.param_groups[0]['lr'])
        
        # ReduceLROnPlateau callback
        scheduler.step(val_loss)
        
        # ModelCheckpoint - tf.keras.callbacks.ModelCheckpoint('best_model_TVN.h5', monitor='val_loss', save_best_only=True)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), checkpoint_path)
            epoch_pbar.write(f"✓ Epoch {epoch+1}: Saved best model with val_loss: {val_loss:.4f}")
        else:
            epochs_without_improvement += 1
        
        # Update progress bar
        epoch_pbar.set_postfix({
            'train_loss': f"{avg_train_loss:.4f}",
            'val_loss': f"{val_loss:.4f}",
            'val_acc': f"{val_accuracy:.4f}",
            'lr': f"{optimizer.param_groups[0]['lr']:.2e}"
        })
        
        # EarlyStopping check
        if epochs_without_improvement >= early_stopping_patience:
            epoch_pbar.write(f"\n⚠ Early stopping triggered after {epoch + 1} epochs")
            break
    
    # Load best model
    model.load_state_dict(torch.load(checkpoint_path))
    epoch_pbar.write(f"\n✓ Loaded best model from {checkpoint_path}")
    
    return results_dict

In [None]:
dataloader = get_dataloader(DATASET, subset_ratio=0.1, batch_size=16)
x, y = next(iter(dataloader))
print(x.shape, y.shape)
del x, y

In [None]:
train_dataloader =  get_dataloader(train_dataset, subset_ratio=None, batch_size=16)
val_dataloader =  get_dataloader(val_dataset, subset_ratio=None, batch_size=16)
test_dataloader =  get_dataloader(test_dataset, subset_ratio=None, batch_size=16)

In [None]:
results = training_loop(
    MODEL, 
    train_dataloader, 
    val_dataloader, 
    epochs=100, 
    learning_rate=2e-4,
    early_stopping_patience=10,              
    checkpoint_path='best_model_3DResNet.pth',  
    lr_patience=2
)

In [None]:
model_save_path = './models/trained/3dResnet_Trained.pth'
torch.save(MODEL.state_dict(), model_save_path)

In [None]:
results_df = pd.DataFrame.from_dict(results, orient="index").T
results_df.to_csv("3D_ResNet_Results.csv")

In [None]:
model = torch.load(model_save_path)

model.eval()
test_acc, test_loss = infer_loop(model, test_loader)
test_acc, test_loss

# TensorFlow TinyVideoNet

In [4]:
import tensorflow as tf
import tensorflow.keras as k
import numpy as np
from models.TinyVideoNet import TinyVideoNetTransfer

  from pkg_resources import parse_version






In [5]:
def tf_data_generator(torch_dataset):
    def generator():
        for i in range(len(torch_dataset)):
            video, label, _ = torch_dataset[i]
            yield video, label
    return generator

def create_dataset(torch_ds, batch_size=4):
    ds = tf.data.Dataset.from_generator(
        tf_data_generator(torch_ds),
        output_signature=(
            tf.TensorSpec(shape=(30, 3, 480, 640), dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.int64)
        )
    )
    return ds.batch(batch_size)

In [6]:
# Create TF Datasets
tf_train = create_dataset(train_dataset, batch_size=4)
tf_val = create_dataset(val_dataset, batch_size=4)
tf_test = create_dataset(test_dataset, batch_size=4)

In [None]:
def to_one_hot(x, y, num_classes=8):
    y_one_hot = tf.one_hot(y, depth=num_classes)
    return x, y_one_hot

tf_train = tf_train.map(lambda x, y: to_one_hot(x, y, 8))
tf_val = tf_val.map(lambda x, y: to_one_hot(x, y, 8))
tf_test = tf_test.map(lambda x, y: to_one_hot(x, y, 8))

In [8]:
model_handle = 'https://kaggle.com/models/google/tiny-video-net/frameworks/TensorFlow1/variations/tvn1/versions/1'
tf_model = TinyVideoNetTransfer(model_handle, num_classes=8)

In [None]:
x, y = next(iter(tf_train))
print("x.shape:", x.shape)  # Should be (B, 30, 3, 480, 640)
print("y.shape:", y.shape)  # Should be (B,)
print("y.dtype:", y.dtype)  # Should be int32 or int64
print("Sample label:", y[0].numpy())  # Should be integer in [0, 7]

del x, y

x.shape: (4, 30, 3, 480, 640)
y.shape: (4, 8)
y.dtype: <dtype: 'float32'>
Sample label: [0. 0. 0. 0. 1. 0. 0. 0.]


In [10]:
class MetricsCallback(k.callbacks.Callback):
    def __init__(self):
        self.time_per_epoch = []  # Track epoch times
        self.epoch_start_time = None
        
    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_start_time = time()
        
    def on_epoch_end(self, epoch, logs=None):
        # Calculate epoch time
        epoch_time = time() - self.epoch_start_time
        self.time_per_epoch.append(epoch_time)
        logs['time_per_epoch'] = epoch_time

# Create the callback
metrics_callback = MetricsCallback()

In [13]:
tf_model.compile(
    optimizer=k.optimizers.Adam(learning_rate=2e-4), 
    loss='categorical_crossentropy', 
    metrics=[
        'accuracy',
        k.metrics.Precision(),
        k.metrics.Recall(), 
        k.metrics.F1Score(average='macro')
    ])

In [14]:
history = tf_model.fit(
    tf_test,
    validation_data=tf_val_oh, 
    epochs=5, 
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10), 
        tf.keras.callbacks.ModelCheckpoint('./models/trained_models/TVN_best_model.weights.h5', monitor='val_loss', save_best_only=True, save_weights_only=True), 
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=2), 
        metrics_callback
        ]
    )

Epoch 1/5
      8/Unknown [1m8s[0m 859ms/step - accuracy: 0.3141 - f1_score: 0.1696 - loss: 2.4431 - precision: 0.1079 - recall: 0.0381   



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2s/step - accuracy: 0.2812 - f1_score: 0.1873 - loss: 2.2561 - precision: 0.1250 - recall: 0.0312 - val_accuracy: 0.1875 - val_f1_score: 0.0691 - val_loss: 2.1706 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 2.0000e-04 - time_per_epoch: 15.1407
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2s/step - accuracy: 0.2500 - f1_score: 0.1849 - loss: 2.4227 - precision: 0.2857 - recall: 0.0625 - val_accuracy: 0.0938 - val_f1_score: 0.0460 - val_loss: 2.1545 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 2.0000e-04 - time_per_epoch: 13.6725
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2s/step - accuracy: 0.1875 - f1_score: 0.1184 - loss: 2.1613 - precision: 0.2222 - recall: 0.0625 - val_accuracy: 0.0938 - val_f1_score: 0.0420 - val_loss: 2.1684 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 2.0000e-04 - time_

In [17]:
history.history

{'accuracy': [0.28125, 0.25, 0.1875, 0.15625, 0.25],
 'f1_score': [0.18733972311019897,
  0.18492060899734497,
  0.11837119609117508,
  0.09940474480390549,
  0.16011902689933777],
 'loss': [2.256065845489502,
  2.422661066055298,
  2.161315441131592,
  2.2509493827819824,
  2.0161006450653076],
 'precision': [0.125,
  0.2857142984867096,
  0.2222222238779068,
  0.4285714328289032,
  0.3333333432674408],
 'recall': [0.03125, 0.0625, 0.0625, 0.09375, 0.09375],
 'val_accuracy': [0.1875, 0.09375, 0.09375, 0.09375, 0.09375],
 'val_f1_score': [0.06905370205640793,
  0.045955877751111984,
  0.041958037763834,
  0.045454539358615875,
  0.04417292773723602],
 'val_loss': [2.170637845993042,
  2.1545023918151855,
  2.168384075164795,
  2.115834951400757,
  2.214994192123413],
 'val_precision': [0.0, 0.0, 0.0, 0.0, 0.0],
 'val_recall': [0.0, 0.0, 0.0, 0.0, 0.0],
 'learning_rate': [0.00019999999494757503,
  0.00019999999494757503,
  0.00019999999494757503,
  0.00019999999494757503,
  0.0001999999

In [18]:
results_df = pd.DataFrame.from_dict(history.history, orient="index").T
results_df.to_csv("TVN_results.csv")

In [19]:
results_df

Unnamed: 0,accuracy,f1_score,loss,precision,recall,val_accuracy,val_f1_score,val_loss,val_precision,val_recall,learning_rate,time_per_epoch
0,0.28125,0.18734,2.256066,0.125,0.03125,0.1875,0.069054,2.170638,0.0,0.0,0.0002,15.140651
1,0.25,0.184921,2.422661,0.285714,0.0625,0.09375,0.045956,2.154502,0.0,0.0,0.0002,13.672521
2,0.1875,0.118371,2.161315,0.222222,0.0625,0.09375,0.041958,2.168384,0.0,0.0,0.0002,14.063581
3,0.15625,0.099405,2.250949,0.428571,0.09375,0.09375,0.045455,2.115835,0.0,0.0,0.0002,13.223387
4,0.25,0.160119,2.016101,0.333333,0.09375,0.09375,0.044173,2.214994,0.0,0.0,0.0002,13.032985
