In [1]:
import os
path = 'path/to/video_folder/' # video_folder contains video.mp4
from collections import Counter
from torchvision.io.video import read_video
from torchvision.models.video import r3d_18 , R3D_18_Weights
import cv2
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.utils.data.distributed import DistributedSampler

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

from torchlars import LARS
from torch.optim.lr_scheduler import CosineAnnealingLR

import numpy as np
import pandas as pd
import av
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, average_precision_score, balanced_accuracy_score

import pickle
import random

In [2]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators and pin cuDNN.

    Args:
        seed: Integer seed handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Ask cuDNN for deterministic kernels and switch off its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed everything once for the whole notebook, then report CUDA availability
seed = 7
set_seed(seed)
torch.cuda.is_available()

True

In [13]:
# Primary device: first GPU when CUDA is usable, otherwise the CPU.
device0 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# 'cuda:1' only exists when at least two GPUs are visible; on a single-GPU or
# CPU-only machine reuse device0 instead of naming a device that is not there.
device1 = torch.device('cuda:1') if torch.cuda.device_count() > 1 else device0
print(device0, device1)

cuda:0 cuda:1


In [None]:
# Set the default CUDA device to GPU 0 (for example).
# Guarded so the cell is a no-op on CPU-only machines, matching the CPU
# fallback used when the devices are selected.
if torch.cuda.is_available():
    torch.cuda.set_device(0)  # Change the index to the desired GPU

# load video encoder: Video ResNet

In [45]:
# Handle to the published R3D-18 weights. It is not passed to r3d_18() below.
weights = R3D_18_Weights.DEFAULT

# Instantiate the network and keep every child module except the last one
# (the classification head), so only the backbone remains.
full_model = r3d_18()
encoder_anchor = torch.nn.Sequential(*list(full_model.children())[:-1])
del full_model

# Every backbone parameter takes part in training.
for backbone_param in encoder_anchor.parameters():
    backbone_param.requires_grad = True

# load data

In [None]:
# Label table for the videos.
events = pd.read_csv('Example_video_label.csv') # change to your label file

# os.listdir() returns entries in arbitrary, platform-dependent order. Sort them so the
# seeded train/val/test split made from this list is reproducible across machines.
# NOTE(review): the files are later paired with the rows of `events` purely by position;
# confirm that the CSV rows follow this same (sorted) file order.
file_names = sorted(os.listdir(path))
len(file_names)

In [8]:
# Integer class id assigned to each severity category
y = {'Crash': 2, 'Baseline': 0, 'Near-Crash': 1}

# Translate the severity string of every event into its class id
label = [y[severity] for severity in events['EVENTSEVERITY1'].tolist()]

In [9]:
# Initialize the inference transforms that ship with the R3D-18 weights
weights = R3D_18_Weights.DEFAULT
preprocess = weights.transforms()


def process_video(file, num_frames=77):
    """Read one video from the ``path`` folder and preprocess its leading frames.

    Args:
        file: File name of the video inside the module-level ``path`` directory.
        num_frames: How many frames to keep from the start of the clip. Defaults
            to 77, the value this notebook previously hard-coded.

    Returns:
        The output of ``preprocess`` applied to the first ``num_frames`` frames.
    """
    # os.path.join works whether or not `path` ends with a separator.
    vid, _, _ = read_video(os.path.join(path, file), output_format="TCHW", pts_unit='sec')
    return preprocess(vid[:num_frames])

In [10]:
# First cut: keep 70% of the clips for training, hold out the remaining 30%
train_files, temp_files, train_labels, temp_labels = train_test_split(
    file_names,
    label,
    test_size=0.3,
    random_state=7,
)

# Second cut: 66.7% of the hold-out becomes the test set, the rest is validation
val_files, test_files, val_labels, test_labels = train_test_split(
    temp_files,
    temp_labels,
    test_size=0.667,
    random_state=7,
)

In [12]:
# Deal the training files and labels round-robin into `num_splits` interleaved shards
num_splits = 4
train_files_split = []
train_labels_split = []
for offset in range(num_splits):
    train_files_split.append(train_files[offset::num_splits])
    train_labels_split.append(train_labels[offset::num_splits])

In [13]:
# Decode and preprocess every validation clip on a thread pool
with ThreadPoolExecutor() as executor:
    val_clips = list(executor.map(process_video, val_files))

# Stack the clips into one tensor and pair them with their labels; the
# intermediate list is dropped once the dataset owns the data.
val_dataset = TensorDataset(torch.stack(val_clips), torch.as_tensor(val_labels))
del val_clips

# Batched, order-preserving iteration over the validation set
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

# loss function

In [None]:
class ContrastiveLoss(nn.Module):
    """Supervised contrastive loss with a single embedding (view) per sample.

    Within a batch, samples that share a label act as positives for one another
    and all remaining samples act as negatives. Embeddings are L2-normalised
    before pairwise similarities are computed.

    Args:
        temperature: Initial value of the divisor applied to the similarities.
            It is stored as an ``nn.Parameter``; it is only updated if this
            module's parameters are handed to an optimizer.
        base_temperature: Constant used to rescale the loss by
            ``temperature / base_temperature``.
    """

    def __init__(self, temperature=10.0, base_temperature=10.0): # temperature can be adjusted
        super().__init__()
        self.temperature = nn.Parameter(torch.tensor(temperature))  # registered so it can be learned
        self.base_temperature = base_temperature

    def forward(self, features, labels):
        """Return the scalar contrastive loss for one batch.

        Args:
            features: 2-D tensor of embeddings, one row per sample.
            labels: Tensor with one class id per sample.

        Returns:
            A 0-dim tensor holding the loss averaged over the batch.
        """
        # Build every helper tensor on the device of the incoming features
        # rather than on a notebook-level global device.
        device = features.device
        features = torch.nn.functional.normalize(features, p=2, dim=1)
        batch_size = features.shape[0]

        # mask[i, j] == 1 when samples i and j carry the same label
        labels = labels.contiguous().view(-1, 1).to(device)
        mask = torch.eq(labels, labels.T).float()

        # compute logits: pairwise similarities scaled by the temperature
        anchor_dot_contrast = torch.div(torch.matmul(features, features.T), self.temperature)
        # for numerical stability subtract the row-wise maximum (no gradient through it)
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # mask-out self-contrast cases: zeros on the diagonal, ones elsewhere
        logits_mask = torch.ones_like(mask) - torch.eye(batch_size, device=device)
        mask = mask * logits_mask

        # compute log_prob of each pair against all non-self pairs of the anchor
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positives; anchors without any
        # positive get a divisor of 1 so they contribute 0 instead of dividing by 0
        mask_pos_pairs = mask.sum(1)
        mask_pos_pairs = torch.where(
            mask_pos_pairs < 1e-6, torch.ones_like(mask_pos_pairs), mask_pos_pairs
        )
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask_pos_pairs

        # loss
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        return loss.mean()

# train encoder

In [None]:
# Number of passes over all training splits for the encoder
num_epochs = 500
# Place the encoder on device1, the device the training loop moves its batches to
encoder_anchor = encoder_anchor.to(device1)

# Put the encoder in training mode
encoder_anchor.train()

In [17]:
# Define loss function and optimizer
# NOTE(review): only encoder_anchor's parameters are handed to Adam here, so any
# parameter owned by ContrastLoss itself is not updated by this optimizer.
ContrastLoss = ContrastiveLoss(temperature=10.0, base_temperature=10.0)
optimizer_encoder = optim.Adam(params = encoder_anchor.parameters(), lr=1e-3)

In [9]:
# Number of saved training shards to load
num_splits = 4

def process_split(split):
    """Load the saved input and label tensors of training shard ``split``.

    Returns:
        A ``(inputs, labels)`` pair as read from the two ``.pt`` files.
    """
    shard_inputs = torch.load(f'Processed_video_r3d18/train/X_train_frame77_{split}.pt', weights_only=True)
    shard_labels = torch.load(f'Processed_video_r3d18/train/Y_train_frame77_{split}.pt', weights_only=True)
    return shard_inputs, shard_labels

# Read the shards concurrently; the mapped results come back in shard order
with ThreadPoolExecutor() as executor:
    results = executor.map(process_split, range(num_splits))

# One TensorDataset per shard
train_datasets = [TensorDataset(X_split, Y_split) for X_split, Y_split in results]

In [19]:
# Mini-batch size shared by every training shard
BatchSize = 64
# One shuffling DataLoader per shard, in shard order
train_loaders = [
    DataLoader(dataset=train_datasets[split], batch_size=BatchSize, shuffle=True)
    for split in range(num_splits)
]

In [None]:
#### start training ####
# Silhouette scores lie in [-1, 1]; starting below that range guarantees that the
# first epoch with a defined score writes a checkpoint.
best_val_ss = float('-inf')
best_val_loss = 99999.9
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer_encoder, T_max=num_epochs, eta_min=0)

for epoch in range(num_epochs):
    # ---- Training phase ----
    # Validation below switches to eval mode, so re-enable train mode every epoch.
    encoder_anchor.train()
    running_loss = 0.0
    for split in range(num_splits):
        for data, targets in train_loaders[split]:
            optimizer_encoder.zero_grad()
            data = data.to(device1)
            targets = targets.to(device1)

            h = encoder_anchor(data)
            h = h.view(h.size(0), -1)  # flatten to (batch, features)
            contrastive_loss = ContrastLoss(h, targets)

            contrastive_loss.backward()
            optimizer_encoder.step()
            # Accumulate a Python float so the autograd graph of each batch can be freed.
            running_loss += contrastive_loss.item()

        # Drop the last batch of this split before loading the next one
        del data, targets

    # ---- Validation phase ----
    # Evaluate in eval mode: this is how the saved encoder is used afterwards, and it
    # keeps validation batches from updating normalisation statistics.
    encoder_anchor.eval()
    ss_result = []
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_Y in val_loader:
            batch_X = batch_X.to(device1)
            batch_Y = batch_Y.to(device1)

            h_val = encoder_anchor(batch_X)
            h_val = h_val.view(h_val.size(0), -1)
            val_loss += ContrastLoss(h_val, batch_Y).item()

            # silhouette_score needs between 2 and n_samples - 1 distinct labels;
            # skip batches where it is undefined instead of raising.
            batch_labels = batch_Y.cpu().numpy()
            n_labels = len(np.unique(batch_labels))
            if 2 <= n_labels <= len(batch_labels) - 1:
                ss_result.append(silhouette_score(h_val.cpu().numpy(), batch_labels))

    # Mean per-batch silhouette score; NaN when no batch produced a score
    mean_ss = sum(ss_result) / len(ss_result) if ss_result else float('nan')

    # Append this epoch's record; the context manager closes the file even on error.
    with open('results_encoder_ScratchLearn.txt', 'a') as results_file:
        results_file.write(f'Epoch [{epoch+1}/{num_epochs}], Contrastive Loss: {running_loss:.5f}, Val loss: {val_loss:.5f}, Val Silht score: {mean_ss:.5f}\n')

        if mean_ss > best_val_ss:
            best_val_ss = mean_ss

            torch.save(encoder_anchor.state_dict(), 'best_encoder_ScratchLearn.pth') # save the best video encoder
            results_file.write(f'New best encoder saved with Silht scores: {best_val_ss}\n')

    del batch_X, batch_Y
    torch.cuda.empty_cache()
    scheduler.step()

In [19]:
# Release cached GPU memory before starting the classification-head stage
torch.cuda.empty_cache()

# train classification head

In [None]:
weights = R3D_18_Weights.DEFAULT
preprocess = weights.transforms()

# Rebuild the architecture: a fresh R3D-18 supplies the backbone layers and the
# input width for a new 3-class linear head.
trained_encoder = r3d_18()
fc_layer = torch.nn.Linear(trained_encoder.fc.in_features, 3)

# Keep everything except the last child (the original classification head)
trained_encoder = torch.nn.Sequential(*(list(trained_encoder.children())[:-1]))
# The checkpoint is a plain state_dict, so load it with weights_only=True: this avoids
# unpickling arbitrary objects and matches how the same file is loaded at test time.
trained_encoder.load_state_dict(torch.load(
    'best_encoder_ScratchLearn.pth',
    weights_only=True,
    map_location=device1))

for param in trained_encoder.parameters():
    param.requires_grad = False # don't require gradient for encoder

for param in fc_layer.parameters():
    param.requires_grad = True

# eval mode
trained_encoder.eval()

In [84]:
# Define loss function and optimizer
# One weight per class id (index 0, 1, 2), created directly on device1.
# NOTE(review): presumably these follow the Baseline/Near-Crash/Crash ids used for the
# labels and were derived from class frequencies — confirm where the values come from.
class_weights = torch.tensor([0.667, 0.854, 3.1], device=device1) # weighted cross entropy loss
CELoss = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')

# Only the linear head's parameters are optimised
optimizer_fc = optim.Adam(fc_layer.parameters(), lr=1e-3) 

In [None]:
# Number of epochs for training the classification head
num_epochs = 500
# Both modules live on device1, the device the loops below move their batches to
trained_encoder = trained_encoder.to(device1)
fc_layer = fc_layer.to(device1)

# Put the linear head in training mode
fc_layer.train()

In [9]:
# Training-set representations produced by the encoder, and their labels.
# pickle executes code on load: only open files this project wrote itself.
with open('Output/SCL/train_X_encoder.pkl', 'rb') as features_file:
    X_train = pickle.load(features_file)

with open('Output/SCL/train_Y_encoder.pkl', 'rb') as labels_file:
    Y_train = pickle.load(labels_file)

# float32 inputs and int64 targets
X = torch.tensor(X_train, dtype=torch.float32)
Y = torch.tensor(Y_train, dtype=torch.long)

In [70]:
# Mini-batch size for training the classification head
BatchSize = 64

# Pair the encoder representations with their labels and iterate in shuffled batches
train_dataset = TensorDataset(X, Y)
train_loader = DataLoader(train_dataset, batch_size=BatchSize, shuffle=True)

In [11]:
# Validation-set representations produced by the encoder, and their labels.
# pickle executes code on load: only open files this project wrote itself.
with open('Output/SCL/val_X_encoder.pkl', 'rb') as features_file:
    val_X = pickle.load(features_file)

with open('Output/SCL/val_Y_encoder.pkl', 'rb') as labels_file:
    val_Y = pickle.load(labels_file)

# float32 inputs and int64 targets
val_X = torch.tensor(val_X, dtype=torch.float32)
val_Y = torch.tensor(val_Y, dtype=torch.long)

# Replaces the earlier video-based validation loader with one over the embeddings
val_dataset = TensorDataset(val_X, val_Y)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [None]:
best_accuracy = 0.0
best_val_loss = 99999.9
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer_fc, T_max=num_epochs, eta_min=0)

for epoch in range(num_epochs):
    # ---- Training phase ----
    fc_layer.train()
    running_loss = 0.0

    for data, targets in train_loader:
        data = data.to(device1)
        targets = targets.to(device1)
        pred = fc_layer(data)
        supervised_loss = CELoss(pred, targets)
        optimizer_fc.zero_grad()
        supervised_loss.backward()
        optimizer_fc.step()
        # Accumulate a Python float so no autograd graph is kept alive across batches.
        running_loss += supervised_loss.item()

    # ---- Validation phase ----
    # Model selection must use the validation loader: the test loader is created in a
    # later cell and has to stay unseen until the final evaluation.
    fc_layer.eval()
    total_val_correct = 0
    total_val = 0
    val_loss = 0.0

    with torch.no_grad():
        for batch_X, batch_Y in val_loader:
            batch_X = batch_X.to(device1)
            batch_Y = batch_Y.to(device1)

            # One forward pass feeds both the prediction and the loss
            val_logits = fc_layer(batch_X)
            val_scores = val_logits.softmax(-1)
            _, val_predicted = torch.max(val_scores, 1)
            total_val_correct += (val_predicted == batch_Y).sum().item()
            total_val += batch_Y.size(0)
            val_loss += CELoss(val_logits, batch_Y).item()

    # Guard against an empty validation loader
    val_accuracy = total_val_correct / total_val if total_val else 0.0

    # Append this epoch's record; the context manager closes the file even on error.
    with open('results_SCL_fc_layer.txt', 'a') as results_file:
        results_file.write(f'Epoch [{epoch+1}/{num_epochs}], Training loss: {running_loss:.4f}, Val loss: {val_loss:.4f}, Val accuracy: {val_accuracy:.3f}\n')

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(fc_layer.state_dict(), 'best_fc_layer.pth')
            print(f'New best fc_layer saved with val accuracy: {val_accuracy}')
            results_file.write(f'New best fc_layer saved with val accuracy: {val_accuracy}\n')

    torch.cuda.empty_cache()
    scheduler.step()

# test

In [31]:
# Test-set representations produced by the encoder, and their labels.
# pickle executes code on load: only open files this project wrote itself.
with open('Output/SCL/test_X_encoder.pkl', 'rb') as features_file:
    X_test = pickle.load(features_file)

with open('Output/SCL/test_Y_encoder.pkl', 'rb') as labels_file:
    Y_test = pickle.load(labels_file)

# float32 inputs and int64 targets
test_X = torch.tensor(X_test, dtype=torch.float32)
test_Y = torch.tensor(Y_test, dtype=torch.long)

# Order-preserving batches so predictions line up with Y_test
test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)

In [32]:
# Recreate the architecture: backbone without its last child, plus a 3-class linear head
trained_encoder = r3d_18()
fc_layer = torch.nn.Linear(trained_encoder.fc.in_features, 3)
trained_encoder = torch.nn.Sequential(*list(trained_encoder.children())[:-1])

# Restore the best encoder checkpoint, then place it on device1 in eval mode
trained_encoder.load_state_dict(
    torch.load('best_encoder_ScratchLearn.pth', weights_only=True, map_location=device1)
)
trained_encoder = trained_encoder.to(device1).eval()

# Restore the best linear head the same way
fc_layer.load_state_dict(
    torch.load('best_fc_layer.pth', weights_only=True, map_location=device1)
)
fc_layer = fc_layer.to(device1).eval()
type(test_loader)

torch.utils.data.dataloader.DataLoader

In [None]:
# Running counters and per-batch outputs collected over the test set
total_test_correct, total_test = 0, 0
pred_score, pred_label = [], []

with torch.no_grad():
    for batch_X, batch_Y in test_loader:
        batch_X, batch_Y = batch_X.to(device1), batch_Y.to(device1)

        # Class probabilities from the linear head
        test_scores = fc_layer(batch_X).softmax(-1)
        # Index of the highest-probability class for every sample
        test_predicted = torch.max(test_scores.data, 1)[1]

        hits = (test_predicted == batch_Y).sum().item()
        total_test_correct += hits
        total_test += batch_Y.size(0)

        pred_score.append(test_scores)
        pred_label.append(test_predicted)

In [34]:
def unpack_tensors(tensor_list):
    """Concatenate per-batch tensors along their first axis into one NumPy array.

    Args:
        tensor_list: Iterable of tensors with at least one dimension whose
            trailing dimensions agree (e.g. one tensor per batch).

    Returns:
        A NumPy array holding all rows of all tensors in order. An empty input
        yields an empty 1-D array.
    """
    batches = list(tensor_list)
    if not batches:
        return np.array([])
    # Move each batch to the CPU once and concatenate, instead of converting
    # (and synchronising with the device for) every single element.
    return torch.cat([batch.detach().cpu() for batch in batches], dim=0).numpy()

# Flatten the per-batch outputs of the test loop into single arrays:
# y_score holds the softmax scores, y_label the predicted class ids.
y_score = unpack_tensors(pred_score)
y_label = unpack_tensors(pred_label)

In [None]:
# Compute Accuracy
accuracy = accuracy_score(np.array(Y_test), y_label)
# Compute Confusion Matrix (rows: true class, columns: predicted class).
# Passing the full label set keeps the matrix 3x3 even when a class is missing from
# both the targets and the predictions, so the indexing below cannot go out of range.
cm = confusion_matrix(np.array(Y_test), y_label, labels=[0, 1, 2])

# Extract values for category y=1
TP_y1 = cm[1, 1]  # True Positives for y=1
FP_y1 = cm[0, 1] + cm[2, 1]  # False Positives for y=1
FN_y1 = cm[1, 0] + cm[1, 2]  # False Negatives for y=1
# Calculate Precision and Recall for y=1 (0 when the denominator is empty)
precision_y1 = TP_y1 / (TP_y1 + FP_y1) if (TP_y1 + FP_y1) > 0 else 0
recall_y1 = TP_y1 / (TP_y1 + FN_y1) if (TP_y1 + FN_y1) > 0 else 0

# Extract values for category y=2
TP_y2 = cm[2, 2]  # True Positives for y=2
FP_y2 = cm[0, 2] + cm[1, 2]  # False Positives for y=2
FN_y2 = cm[2, 0] + cm[2, 1]  # False Negatives for y=2
# Calculate Precision and Recall for y=2 (0 when the denominator is empty)
precision_y2 = TP_y2 / (TP_y2 + FP_y2) if (TP_y2 + FP_y2) > 0 else 0
recall_y2 = TP_y2 / (TP_y2 + FN_y2) if (TP_y2 + FN_y2) > 0 else 0

# Compute F1 Score for Near-Crash
f1_y1 = 2 * (precision_y1 * recall_y1) / (precision_y1 + recall_y1) if (precision_y1 + recall_y1) > 0 else 0
# Compute F1 Score for Crash
f1_y2 = 2 * (precision_y2 * recall_y2) / (precision_y2 + recall_y2) if (precision_y2 + recall_y2) > 0 else 0

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall for Crash, Near-Crash: {recall_y2:.3f}, {recall_y1:.3f}")
print(f"Precision for Crash, Near-Crash: {precision_y2:.3f}, {precision_y1:.3f}")
print(f"F1 Score for Crash, Near-Crash: {f1_y2:.3f}, {f1_y1:.3f}")
print(cm)