In [1]:
import os
path = 'path/to/video_folder/' # video_folder contains video.mp4
from collections import Counter
from torchvision.io.video import read_video
from torchvision.models.video import r3d_18 , R3D_18_Weights
import cv2
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.utils.data.distributed import DistributedSampler

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

from torchlars import LARS
from torch.optim.lr_scheduler import CosineAnnealingLR

import numpy as np
import pandas as pd
import av
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, average_precision_score, balanced_accuracy_score

import pickle
import random

In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed value
seed = 7
set_seed(seed)
torch.cuda.is_available()

True

In [13]:
device0 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device1 = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(device0, device1)

cuda:0 cuda:1


## load model

In [20]:
# Load the weights
weights = R3D_18_Weights.DEFAULT
#model = r3d_18(weights=weights)
model = r3d_18()
# model.load_state_dict(torch.load('r3d_18-b3b3357e.pth', weights_only=True))

#import torch.nn.init as init

model.fc = torch.nn.Linear(model.fc.in_features, 3)
#init.kaiming_uniform_(model.fc.weight, nonlinearity='relu')  # Initialize weights
#model.fc.bias.data.fill_(0)  # Initialize biases to zero

# require gradient for training
for param in model.parameters():
    param.requires_grad = True

# Set the model to eval mode and move to desired device.
# model.eval()

## load data

In [5]:
# load labels
events = pd.read_csv('EventType.csv')
# events

In [6]:
events['EVENTSEVERITY1'].value_counts()

EVENTSEVERITY1
Baseline      7860
Near-Crash    6174
Crash         1686
Name: count, dtype: int64

In [7]:
file_names = os.listdir(path)
len(file_names)

15720

In [8]:
import random
# Set the seed
random.seed(7)
np.random.seed(7)

# Your code that generates random values
# label = pd.get_dummies(events, columns=['EVENTSEVERITY1'])
# label = label.drop(columns=['EVENT_ID']).astype(int)
label = events['EVENTSEVERITY1'].tolist()
# Create a mapping from characters to integers
#y = {char: idx for idx, char in enumerate(set(label))}
y = {'Crash': 2, 'Baseline': 0, 'Near-Crash': 1}

# Convert character elements to integers
label = [y[char] for char in label]

In [9]:
from collections import Counter
Counter(label)

Counter({0: 7860, 1: 6174, 2: 1686})

In [10]:
del events
len(label)

15720

In [22]:
# Initialize the inference transforms
weights = R3D_18_Weights.DEFAULT
preprocess = weights.transforms()

R3D_18_Weights.KINETICS400_V1.transforms() perform the following preprocessing operations: 

Accepts batched (B, T, C, H, W) and single (T, C, H, W) video frame torch.Tensor objects. 

The frames are resized to resize_size=[128, 171] using interpolation=InterpolationMode.BILINEAR, followed by a central crop of crop_size=[112, 112]. 

The values are first rescaled to [0.0, 1.0] and then normalized using mean=[0.43216, 0.394666, 0.37645] and std=[0.22803, 0.22145, 0.216989]. 

The output dimensions are permuted to (..., C, T, H, W) tensors.

In [21]:
from concurrent.futures import ThreadPoolExecutor

def process_video(file):
    vid, _, _ = read_video(path + file, output_format="TCHW", pts_unit='sec')
    if vid.size(0) == 0:  # Check if the first dimension (time) is zero
        return torch.ones(3, 76, 112, 112)  # Return a zero tensor with the specified shape
    return preprocess(vid[:76]) 

In [13]:
# torch.save(X, '/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/All_video_tensor_ResNet3D_18.pt')

torch.Size([13, 3, 77, 112, 112])


In [None]:
# X = torch.load('/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/All_video_tensor_ResNet3D_18.pt')

In [13]:
# Gives easier dataset managment and creates mini batches
'''from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
'''

In [13]:
from sklearn.model_selection import train_test_split

# Split the data
train_files, temp_files, train_labels, temp_labels = train_test_split(file_names, label, test_size=0.3, 
                                                                      #stratify=label, 
                                                                      random_state=7)

val_files, test_files, val_labels, test_labels = train_test_split(temp_files, temp_labels, test_size=0.667, 
                                                                  #stratify=temp_labels, 
                                                                  random_state=7)

# train_files, train_labels for training
# val_files, val_labels for validation
# test_files, test_labels for testing

In [14]:
from collections import Counter
print(len(train_labels), Counter(train_labels))
print(len(val_labels), Counter(val_labels))
print(len(test_labels), Counter(test_labels))

11004 Counter({0: 5485, 1: 4337, 2: 1182})
1570 Counter({0: 802, 1: 609, 2: 159})
3146 Counter({0: 1573, 1: 1228, 2: 345})


In [15]:
# split training set for DataLoader
num_splits = 4
train_files_split = [train_files[i::num_splits] for i in range(num_splits)]
train_labels_split = [train_labels[i::num_splits] for i in range(num_splits)]

In [16]:
# load validation sets
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    X_val = list(executor.map(process_video, val_files))

X_val = torch.stack(X_val)#.to(device1)
Y_val = torch.as_tensor(val_labels)#.to(device1)


from torch.utils.data import DataLoader, TensorDataset
# Create a DataLoader for batching
val_dataset = TensorDataset(X_val, Y_val)
del X_val, Y_val
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

In [18]:
type(val_loader)

torch.utils.data.dataloader.DataLoader

## train

In [19]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1) # 1e-1

In [None]:
results_file = open('/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/results_SL_ScratchLearn.txt', 'a')
num_epochs = 100
model = model.to(device1)
model.train()
BatchSize = 32 # 32 is feasible
best_val_accuracy = 0 #
#best_val_ss = 0

for epoch in range(num_epochs):
    #correct = 0  # Initialize correct predictions counter
    #total = 0    # Initialize total predictions counter
    running_loss = 0
    for split in range(num_splits):
        # Process each split
        with ThreadPoolExecutor() as executor:
            X_split = list(executor.map(process_video, train_files_split[split]))
            
        X_split = torch.stack(X_split)
        Y_split = torch.as_tensor(train_labels_split[split])
    
        train_dataset = TensorDataset(X_split, Y_split)
        del X_split, Y_split
        train_loader = DataLoader(dataset=train_dataset, batch_size=BatchSize, shuffle=True)
        del train_dataset
   
        for data, targets in train_loader:
            # Get data to cuda
            data = data.to(device1) #.float()
            targets = targets.to(device1)

            # forward
            scores = model(data) # !!!no softmax needed! Already in the CrossEntropyLoss
            loss = criterion(scores, targets) ## can be changed
            running_loss += loss

            # backward
            optimizer.zero_grad()
            loss.backward()
            # gradient descent or adam step
            optimizer.step()

            # Calculate accuracy
            #_, predicted = torch.max(scores.data, 1)
            #total += targets.size(0)
            #correct += (predicted == targets).sum().item()
            
        
        del train_loader, data, targets

    #accuracy = correct / total  # Calculate accuracy
    
    # Validation phase
    total_val_correct = 0
    total_val = 0
    val_loss = 0
    with torch.no_grad():
        for batch_X, batch_Y in val_loader:
            batch_X = batch_X.to(device1)
            batch_Y = batch_Y.to(device1)
            val_scores = model(batch_X).softmax(-1)  # Forward pass on validation set
            _, val_predicted = torch.max(val_scores.data, 1)  # Get predictions
            total_val_correct += (val_predicted == batch_Y).sum().item()  # Count correct predictions
            total_val += batch_Y.size(0)  # Total predictions
            val_loss += criterion(model(batch_X), batch_Y)

    val_accuracy = total_val_correct / total_val  # Calculate validation accuracy
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss.item():.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.3f}')
    results_file.write(
        f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss.item():.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.3f}\n')
    
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/best_ScratchLearn_weights.pth')  # Save the model weights
        #torch.save(model, f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/Epoch {epoch+1}.pth')  # Save the entire model
        print(f'New best model saved with val accuracy: {best_val_accuracy}')
        results_file.write(f'New best model saved with val accuracy: {best_val_accuracy}\n')
        
        
    torch.cuda.empty_cache()
    
results_file.close()

Epoch [1/100], Training Loss: 402.6621, Val Loss: 148.8505, Val Accuracy: 0.511
New best model saved with val accuracy: 0.510828025477707
Epoch [2/100], Training Loss: 329.6675, Val Loss: 150.3648, Val Accuracy: 0.511
Epoch [3/100], Training Loss: 330.1610, Val Loss: 149.8094, Val Accuracy: 0.388
Epoch [4/100], Training Loss: 330.2533, Val Loss: 148.0592, Val Accuracy: 0.511
Epoch [5/100], Training Loss: 329.3377, Val Loss: 148.9544, Val Accuracy: 0.511


In [None]:
results_file.close()

In [22]:
# Free up unused memory
torch.cuda.empty_cache()

On validation set

In [21]:
# check the quality of learned latent representations
model_test = r3d_18()
model_test.fc = torch.nn.Linear(model_test.fc.in_features, 3)
model_test.load_state_dict(torch.load(f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/best_ScratchLearn_weights.pth', weights_only=True))

model_test = torch.nn.Sequential(*(list(model_test.children())[:-1]))

model_test = model_test.to(device1)
model_test.eval()
type(val_loader)

torch.utils.data.dataloader.DataLoader

In [26]:
ss_result = []
#val_loss = 0
with torch.no_grad():
    for batch_X, batch_Y in val_loader:
        batch_X = batch_X.to(device1)
        batch_Y = batch_Y.to(device1)
        h_val = model_test(batch_X)
        h_val = h_val.view(h_val.size(0), -1)
        #val_loss += ContrastLoss(h_val, batch_Y)
        ss = silhouette_score(h_val.cpu().numpy(), batch_Y.cpu().numpy())
        #print(ss)
        ss_result.append(ss)
        #val_scores = fc_layer(h_val).softmax(-1)  # Forward pass on validation set
        #_, val_predicted = torch.max(val_scores.data, 1)  # Get predictions
        #total_val_correct += (val_predicted == batch_Y).sum().item()  # Count correct predictions
        #total_val += batch_Y.size(0)  # Total predictions

        #val_accuracy = total_val_correct / total_val  # Calculate validation accuracy
mean_ss = sum(ss_result) / len(ss_result)
#print(f'Epoch [{epoch+1}/{num_epochs}], Val loss: {val_loss:.3f}, Val Silht score: {mean_ss:.3f}')
print(f'Val loss: {val_loss:.3f}, Val Silht score: {mean_ss:.3f}')
        #results_file.write(f'Epoch [{epoch+1}/{num_epochs}], Val loss: {val_loss:.3f}, Val Silht score: {mean_ss:.3f}\n')


Val loss: 0.000, Val Silht score: 0.269


on test set

In [27]:
# load test sets
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    X_test = list(executor.map(process_video, test_files))

X_test = torch.stack(X_test)#.to(device1)
Y_test = torch.as_tensor(test_labels)#.to(device1)


from torch.utils.data import DataLoader, TensorDataset
# Create a DataLoader for batching
test_dataset = TensorDataset(X_test, Y_test)
del X_test, Y_test
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

In [28]:
model_test = r3d_18()
model_test.fc = torch.nn.Linear(model_test.fc.in_features, 3)
model_test.load_state_dict(torch.load(f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/best_ScratchLearn_weights.pth', weights_only=True))

model_test = torch.nn.Sequential(*(list(model_test.children())[:-1]))

model_test = model_test.to(device1)
model_test.eval()
type(test_loader)

torch.utils.data.dataloader.DataLoader

In [29]:
ss_result = []
test_loss = 0
with torch.no_grad():
    for batch_X, batch_Y in test_loader:
        batch_X = batch_X.to(device1)
        batch_Y = batch_Y.to(device1)
        h_test = model_test(batch_X)
        h_test = h_test.view(h_test.size(0), -1)
        #test_loss += ContrastLoss(h_test, batch_Y)
        ss = silhouette_score(h_test.cpu().numpy(), batch_Y.cpu().numpy())
        ss_result.append(ss)
        
mean_ss = sum(ss_result) / len(ss_result)
print(f'Test loss: {test_loss:.3f}, Test Silht score: {mean_ss:.3f}')

Test loss: 0.000, Test Silht score: 0.242


On test set, t-SNE plot

In [52]:
trained_encoder = r3d_18()
trained_encoder.fc = torch.nn.Linear(trained_encoder.fc.in_features, 3)
trained_encoder.load_state_dict(torch.load(f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/best_ScratchLearn_weights.pth', weights_only=True))
trained_encoder = torch.nn.Sequential(*(list(trained_encoder.children())[:-1]))
trained_encoder = trained_encoder.to(device1)

h_result = []
with torch.no_grad():
    for batch_X, batch_Y in test_loader:
        batch_X = batch_X.to(device1)
        #batch_Y = batch_Y.to(device1)
        h_test = trained_encoder(batch_X)
        h_test = h_test.view(h_test.size(0), -1)
        h_result.append(h_test)

In [53]:
def unpack_tensors(tensor_list):
    unpacked_tensors = []
    for tensor in tensor_list:
        for element in tensor:
            unpacked_tensors.append(element.cpu().numpy())
    return np.array(unpacked_tensors)

h_result = unpack_tensors(h_result)
tSNE_labels = np.array(test_labels)

In [54]:
## t-SNE plot
# Fit t-SNE 
random.seed(7)

perplexities = [5, 10, 20, 30, 40, 50]
learning_rates = [10, 50, 100, 250, 500, 1000]

# Create a mask for each category
mask0 = tSNE_labels == 0 #'Baseline'
mask1 = tSNE_labels == 1 #'Near-Crash'
mask2 = tSNE_labels == 2 #'Crash'

for i in perplexities:
    for j in learning_rates:
        tsne = TSNE(n_components=2, perplexity=i, learning_rate=j)
        X_tsne = tsne.fit_transform(h_result)
        # Create separate scatter plots for each category
        plt.scatter(X_tsne[mask0, 0], X_tsne[mask0, 1], c='orange', label='Baseline', alpha=0.5, marker='o')
        plt.scatter(X_tsne[mask1, 0], X_tsne[mask1, 1], c='green', label='Near-Crash', alpha=0.5, marker='^')
        plt.scatter(X_tsne[mask2, 0], X_tsne[mask2, 1], c='red', label='Crash', alpha=0.5, marker='^')

        # Add labels and title
        plt.xlabel('t-SNE 1')
        plt.ylabel('t-SNE 2')
        plt.title(f'Supervised learning_CE loss')
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
        plt.tight_layout()
        plt.savefig(f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/t-SNE plot/Perplexity_{i}_LearningRate_{j}.jpg')
        #plt.show()
        plt.close()

## test

In [14]:
weights = R3D_18_Weights.DEFAULT
preprocess = weights.transforms()

In [26]:
del X_test,Y_test

In [19]:
# load test set
# test_files, test_labels for testing
from concurrent.futures import ThreadPoolExecutor
from torch.utils.data import DataLoader, TensorDataset

with ThreadPoolExecutor() as executor:
    X_test = list(executor.map(process_video, test_files)) #val_files))

X_test = torch.stack(X_test)#.to(device1)
Y_test = torch.as_tensor(test_labels)#.to(device1) #val_labels).to(device1)

In [19]:
# Create a DataLoader for batching
test_dataset = TensorDataset(X_test, Y_test)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

In [27]:
del batch_X, batch_Y
torch.cuda.empty_cache()

In [31]:
# load best model to GPU

model_test = r3d_18()
model_test.fc = torch.nn.Linear(model_test.fc.in_features, 3)
model_test.load_state_dict(torch.load(f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/best_ScratchLearn_weights.pth', weights_only=True))

# model_test = torch.load(f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/Epoch 27.pth',weights_only=False)  # Load the entire model

model_test = model_test.to(device1)
model_test.eval()
type(test_loader)

torch.utils.data.dataloader.DataLoader

In [32]:
total_test_correct = 0
total_test = 0
pred_score = []
pred_label = []

with torch.no_grad():
    for batch_X, batch_Y in test_loader:
        batch_X = batch_X.to(device1)
        batch_Y = batch_Y.to(device1)
        test_scores = model_test(batch_X).softmax(-1)  # Forward pass on test set
        _, test_predicted = torch.max(test_scores.data, 1)  # Get predictions
        total_test_correct += (test_predicted == batch_Y).sum().item()  # Count correct predictions
        total_test += batch_Y.size(0)  # Total predictions
        pred_score.append(test_scores)
        pred_label.append(test_predicted)
        

test_accuracy = total_test_correct / total_test  # Calculate test accuracy

In [33]:
test_accuracy

0.7930705657978385

In [59]:
def unpack_tensors(tensor_list):
    unpacked_tensors = []
    for tensor in tensor_list:
        for element in tensor:
            unpacked_tensors.append(element.cpu().numpy())
    return np.array(unpacked_tensors)

y_score = unpack_tensors(pred_score)
y_label = unpack_tensors(pred_label)

In [70]:
# Compute Accuracy
accuracy = accuracy_score(np.array(test_labels), y_label)#test_y_true
# Compute Balanced Accuracy
balanced_accuracy = balanced_accuracy_score(np.array(test_labels), y_label)
# Compute Precision
precision = precision_score(np.array(test_labels), y_label, average='macro')
# Compute Recall
#recall = recall_score(np.array(test_labels), y_label, average='macro')
# Compute F1 Score
f1 = f1_score(np.array(test_labels), y_label, average='macro')#test_y_true
# Compute AUC
auc = roc_auc_score(np.array(test_labels), y_score, multi_class='ovo')
# Compute Confusion Matrix
cm = confusion_matrix(np.array(test_labels), y_label)#test_y_true
# Compute Mean Average Precision
mAP = average_precision_score(np.array(test_labels), y_score, average='macro')


# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Mean Average Precision:", mAP)
print("one-vs-one AUC:", auc)
print("Balanced Accuracy:", balanced_accuracy)
print("macro Precision:", precision)
#print("macro Recall:", recall)
print("macro F1:", f1)


cm

Accuracy: 0.7930705657978385
Mean Average Precision: 0.7715611990371394
one-vs-one AUC: 0.883597294615564
Balanced Accuracy: 0.6800913470848196
macro Precision: 0.7301562322883474
macro F1: 0.6953502031443901


array([[1376,  159,   38],
       [ 182,  997,   49],
       [  68,  155,  122]])

In [3]:
device0 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device1 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device0, device1)

cuda:0 cuda:0


## load model

In [4]:
# Load the weights
weights = R3D_18_Weights.DEFAULT
#model = r3d_18(weights=weights)
model = r3d_18()
# model.load_state_dict(torch.load('r3d_18-b3b3357e.pth', weights_only=True))

#import torch.nn.init as init

model.fc = torch.nn.Linear(model.fc.in_features, 3)
#init.kaiming_uniform_(model.fc.weight, nonlinearity='relu')  # Initialize weights
#model.fc.bias.data.fill_(0)  # Initialize biases to zero

# require gradient for training
for param in model.parameters():
    param.requires_grad = True

# Set the model to eval mode and move to desired device.
# model.eval()

## load data

In [9]:
# load labels
events = pd.read_csv('EventType.csv')
path = 'Original_video/'
file_names = os.listdir(path)
len(file_names)


15720

In [10]:
import random
# Set the seed
random.seed(7)
np.random.seed(7)

# Your code that generates random values
# label = pd.get_dummies(events, columns=['EVENTSEVERITY1'])
# label = label.drop(columns=['EVENT_ID']).astype(int)
label = events['EVENTSEVERITY1'].tolist()
# Create a mapping from characters to integers
#y = {char: idx for idx, char in enumerate(set(label))}
y = {'Crash': 2, 'Baseline': 0, 'Near-Crash': 1}

# Convert character elements to integers
label = [y[char] for char in label]

In [11]:
# Initialize the inference transforms
weights = R3D_18_Weights.DEFAULT
preprocess = weights.transforms()

In [12]:
from concurrent.futures import ThreadPoolExecutor

def process_video(file):
    vid, _, _ = read_video(path + file, output_format="TCHW", pts_unit='sec')
    if vid.size(0) == 0:  # Check if the first dimension (time) is zero
        return torch.ones(3, 77, 112, 112)  # Return a zero tensor with the specified shape
    return preprocess(vid[:77]) 

In [13]:
from sklearn.model_selection import train_test_split

# Split the data
train_files, temp_files, train_labels, temp_labels = train_test_split(file_names, label, test_size=0.3, 
                                                                      #stratify=label, 
                                                                      random_state=7)

val_files, test_files, val_labels, test_labels = train_test_split(temp_files, temp_labels, test_size=0.667, 
                                                                  #stratify=temp_labels, 
                                                                  random_state=7)

# train_files, train_labels for training
# val_files, val_labels for validation
# test_files, test_labels for testing

In [19]:
'''import pickle
with open('train_X.pkl', 'wb') as f:
    pickle.dump(train_files, f)
with open('train_labels.pkl', 'wb') as f:
    pickle.dump(train_labels, f)

with open('val_X.pkl', 'wb') as f:
    pickle.dump(val_files, f)
with open('val_labels.pkl', 'wb') as f:
    pickle.dump(val_labels, f)

with open('test_X.pkl', 'wb') as f:
    pickle.dump(test_files, f)
with open('test_labels.pkl', 'wb') as f:
    pickle.dump(test_labels, f)
    '''

In [10]:
# split training set for DataLoader
num_splits = 4
train_files_split = [train_files[i::num_splits] for i in range(num_splits)]
train_labels_split = [train_labels[i::num_splits] for i in range(num_splits)]


In [10]:
# load validation sets
import av
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    X_val = list(executor.map(process_video, val_files))

X_val = torch.stack(X_val)#.to(device1)
Y_val = torch.as_tensor(val_labels)#.to(device1)

In [11]:
#torch.save(X_val, 'Processed_video_r3d18/X_val_frame77.pt')
#torch.save(Y_val, 'Processed_video_r3d18/Y_val_frame77.pt')

In [11]:
X_val = torch.load('Processed_video_r3d18/X_val_frame77.pt', weights_only=True)
Y_val = torch.load('Processed_video_r3d18/Y_val_frame77.pt', weights_only=True)

In [12]:
from torch.utils.data import DataLoader, TensorDataset
# Create a DataLoader for batching
val_dataset = TensorDataset(X_val, Y_val)
del X_val, Y_val
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

In [13]:
type(val_loader)

torch.utils.data.dataloader.DataLoader

## train

In [10]:
'''
for split in range(num_splits):
    # Process each split
    with ThreadPoolExecutor() as executor:
        X_split = list(executor.map(process_video, train_files_split[split]))

    X_split = torch.stack(X_split)
    Y_split = torch.as_tensor(train_labels_split[split])
    torch.save(X_split, f'Processed_video_r3d18/train/X_train_frame77_{split}.pt')
    torch.save(Y_split, f'Processed_video_r3d18/train/Y_train_frame77_{split}.pt')
    del X_split, Y_split
''' 

In [27]:
import sklearn
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)

class_weights

array([0.66873291, 0.84574591, 3.10321489])

In [14]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR

# in whole dataset: 0= 7860, 1= 6174, 2= 1686; 1,686 crash, 6,174 near-crash, 7,860 baseline; weight = [2.0, 2.6, 9.3]
# in training dataset: 0: 5485, 1: 4337, 2: 1182; weight = [0.7, 0.8, 3.1]

class_weights = torch.tensor([0.7, 0.8, 3.1], device=device1)
#class_weights = torch.tensor([1.0, 1.0, 1.0], device=device1)

criterion = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=1e-3) # 1e-3

In [None]:
num_epochs = 200
model = model.to(device1)
model.train()
BatchSize = 32 # 32 is feasible
best_val_accuracy = 0.511 #
#best_val_ss = 0
#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0)

for epoch in range(num_epochs):
    results_file = open('Output/Supervised L_WLoss/results_SL_WLoss_ScratchLearn.txt', 'a')
    #correct = 0  # Initialize correct predictions counter
    #total = 0    # Initialize total predictions counter
    running_loss = 0
    for split in range(num_splits):
        # Process each split
        #with ThreadPoolExecutor() as executor:
            #X_split = list(executor.map(process_video, train_files_split[split]))
            
        #X_split = torch.stack(X_split)
        #Y_split = torch.as_tensor(train_labels_split[split])
        
        X_split = torch.load(f'Processed_video_r3d18/train/X_train_frame77_{split}.pt', weights_only=True)
        Y_split = torch.load(f'Processed_video_r3d18/train/Y_train_frame77_{split}.pt', weights_only=True)
        train_dataset = TensorDataset(X_split, Y_split)
        del X_split, Y_split
        train_loader = DataLoader(dataset=train_dataset, batch_size=BatchSize, shuffle=True)
        del train_dataset
   
        for data, targets in train_loader:
            # Get data to cuda
            data = data.to(device1) #.float()
            targets = targets.to(device1)
            # forward
            scores = model(data) # !!!no softmax needed! Already in the CrossEntropyLoss
            loss = criterion(scores, targets) ## can be changed
            running_loss += loss
            # backward
            optimizer.zero_grad()
            loss.backward()
            # gradient descent or adam step
            optimizer.step()
            # Calculate accuracy
            #_, predicted = torch.max(scores.data, 1)
            #total += targets.size(0)
            #correct += (predicted == targets).sum().item()
        del train_loader, data, targets
    #accuracy = correct / total  # Calculate accuracy
    
    # Validation phase
    total_val_correct = 0
    total_val = 0
    val_loss = 0
    with torch.no_grad():
        for batch_X, batch_Y in val_loader:
            batch_X = batch_X.to(device1)
            batch_Y = batch_Y.to(device1)
            val_scores = model(batch_X).softmax(-1)  # Forward pass on validation set
            _, val_predicted = torch.max(val_scores.data, 1)  # Get predictions
            total_val_correct += (val_predicted == batch_Y).sum().item()  # Count correct predictions
            total_val += batch_Y.size(0)  # Total predictions
            val_loss += criterion(model(batch_X), batch_Y)

    val_accuracy = total_val_correct / total_val  # Calculate validation accuracy
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss.item():.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.3f}')
    results_file.write(
        f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss.item():.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.3f}\n')
    
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), f'Output/Supervised L_WLoss/best_ScratchLearn_weights.pth')  # Save the model weights
        #torch.save(model, f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/Epoch {epoch+1}.pth')  # Save the entire model
        print(f'New best model saved with val accuracy: {best_val_accuracy}')
        results_file.write(f'New best model saved with val accuracy: {best_val_accuracy}\n')
        
        
    torch.cuda.empty_cache()
    results_file.close()
    #scheduler.step()

results_file.close()

on test set

In [10]:
# load test sets
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    X_test = list(executor.map(process_video, test_files))

X_test = torch.stack(X_test)#.to(device1)
Y_test = torch.as_tensor(test_labels)#.to(device1)


from torch.utils.data import DataLoader, TensorDataset
# Create a DataLoader for batching
test_dataset = TensorDataset(X_test, Y_test)
del X_test, Y_test
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

In [20]:
model_test = r3d_18()
model_test.fc = torch.nn.Linear(model_test.fc.in_features, 3)
model_test.load_state_dict(torch.load(f'Output/Supervised L_WLoss/best_ScratchLearn_weights.pth', weights_only=True))

model_test = torch.nn.Sequential(*(list(model_test.children())[:-1]))

model_test = model_test.to(device1)
model_test.eval()
type(test_loader)

torch.utils.data.dataloader.DataLoader

In [21]:
ss_result = []
DB_index = []
CH_index = []
test_loss = 0
h_result = []

with torch.no_grad():
    for batch_X, batch_Y in test_loader:
        batch_X = batch_X.to(device1)
        batch_Y = batch_Y.to(device1)
        h_test = model_test(batch_X)
        h_test = h_test.view(h_test.size(0), -1)
        #test_loss += ContrastLoss(h_test, batch_Y)
        ss = silhouette_score(h_test.cpu().numpy(), batch_Y.cpu().numpy())
        ch = calinski_harabasz_score(h_test.cpu().numpy(), batch_Y.cpu().numpy())
        db = davies_bouldin_score(h_test.cpu().numpy(), batch_Y.cpu().numpy())

        ss_result.append(ss)
        CH_index.append(ch)
        DB_index.append(db)
        
mean_ss = sum(ss_result) / len(ss_result)
mean_ch = sum(CH_index) / len(CH_index)
mean_db = sum(DB_index) / len(DB_index)

print(f'Test loss: {test_loss:.3f}, Test Silht score: {mean_ss:.3f}, Test CH index: {mean_ch}, Test DB index: {mean_db}')

Test loss: 0.000, Test Silht score: 0.336, Test CH index: 10.692019894376559, Test DB index: 0.804744944225678


## test

In [None]:
model_test = r3d_18()
model_test.fc = torch.nn.Linear(model_test.fc.in_features, 3)
model_test.load_state_dict(torch.load(f'Output/Supervised L_WLoss/best_ScratchLearn_weights.pth', weights_only=True))

#model_test = torch.nn.Sequential(*(list(model_test.children())[:-1]))

model_test = model_test.to(device1)
model_test.eval()

In [5]:
X_test = torch.load('Processed_video_r3d18/X_test_frame77.pt', weights_only=True)
Y_test = torch.load('Processed_video_r3d18/Y_test_frame77.pt', weights_only=True)

from torch.utils.data import DataLoader, TensorDataset
# Create a DataLoader for batching
test_dataset = TensorDataset(X_test, Y_test)
del X_test, Y_test
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

type(test_loader)

torch.utils.data.dataloader.DataLoader

In [6]:
total_test_correct = 0
total_test = 0
pred_score = []
pred_label = []

with torch.no_grad():
    for batch_X, batch_Y in test_loader:
        batch_X = batch_X.to(device1)
        batch_Y = batch_Y.to(device1)
        test_scores = model_test(batch_X).softmax(-1)  # Forward pass on test set
        _, test_predicted = torch.max(test_scores.data, 1)  # Get predictions
        total_test_correct += (test_predicted == batch_Y).sum().item()  # Count correct predictions
        total_test += batch_Y.size(0)  # Total predictions
        pred_score.append(test_scores)
        pred_label.append(test_predicted)
        

test_accuracy = total_test_correct / total_test  # Calculate test accuracy

In [7]:
def unpack_tensors(tensor_list):
    unpacked_tensors = []
    for tensor in tensor_list:
        for element in tensor:
            unpacked_tensors.append(element.cpu().numpy())
    return np.array(unpacked_tensors)

y_score = unpack_tensors(pred_score)
y_label = unpack_tensors(pred_label)

In [14]:
# Compute Accuracy
accuracy = accuracy_score(np.array(test_labels), y_label)#test_y_true
# Compute Balanced Accuracy
balanced_accuracy = balanced_accuracy_score(np.array(test_labels), y_label)
# Compute Precision
precision = precision_score(np.array(test_labels), y_label, average='macro')
# Compute Recall
#recall = recall_score(np.array(test_labels), y_label, average='macro')
# Compute F1 Score
f1 = f1_score(np.array(test_labels), y_label, average='macro')#test_y_true
# Compute AUC
auc = roc_auc_score(np.array(test_labels), y_score, multi_class='ovo')
# Compute Confusion Matrix
cm = confusion_matrix(np.array(test_labels), y_label)#test_y_true
# Compute Mean Average Precision
mAP = average_precision_score(np.array(test_labels), y_score, average='macro')


# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Mean Average Precision:", mAP)
print("one-vs-one AUC:", auc)
print("Balanced Accuracy:", balanced_accuracy)
print("macro Precision:", precision)
#print("macro Recall:", recall)
print("macro F1:", f1)


cm

Accuracy: 0.871900826446281
Mean Average Precision: 0.8933728005063225
one-vs-one AUC: 0.9511113180135401
Balanced Accuracy: 0.839929903967962
macro Precision: 0.8299935171835077
macro F1: 0.834456726191314


array([[1444,   85,   44],
       [ 124, 1038,   66],
       [  35,   49,  261]])

## load model

## load data

## train

In [None]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision.ops import sigmoid_focal_loss

# Define loss function and optimizer
# in whole dataset: 0= 7860, 1= 6174, 2= 1686
# 1,686 crash, 6,174 near-crash, 7,860 baseline
# 0: 2.0, 1: 2.55, 2: 9.32
#class_weights = torch.tensor([2.0, 2.55, 9.32], device=device1)
#criterion = nn.CrossEntropyLoss(weight=class_weights)

criterion = sigmoid_focal_loss
optimizer = optim.Adam(model.parameters(), lr=1e-4) # 1e-1

In [4]:
device0 = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device1 = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(device0, device1)

cuda:1 cuda:1


## load model

In [5]:
# Load the weights
#weights = Swin3D_B_Weights.DEFAULT
#model = r3d_18(weights=weights)
model = swin3d_b()
# model.load_state_dict(torch.load('r3d_18-b3b3357e.pth', weights_only=True))
#import torch.nn.init as init

In [5]:
model

SwinTransformer3d(
  (patch_embed): PatchEmbed3d(
    (proj): Conv3d(3, 128, kernel_size=(2, 4, 4), stride=(2, 4, 4))
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (features): Sequential(
    (0): Sequential(
      (0): SwinTransformerBlock(
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): ShiftedWindowAttention3d(
          (qkv): Linear(in_features=128, out_features=384, bias=True)
          (proj): Linear(in_features=128, out_features=128, bias=True)
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=512, out_features=128, bias=True)
          (4): Dropout(p=0.0, inplace=False)
      

In [6]:
model.head = torch.nn.Linear(model.head.in_features, 3)

model.load_state_dict(torch.load('Output/Supervised L_SwinTrans/best_ScratchLearn_weights.pth', weights_only=True))
#model.head = torch.nn.Linear(model.head.in_features, 512)
#model.fc = torch.nn.Linear(512, 3)

#init.kaiming_uniform_(model.fc.weight, nonlinearity='relu')  # Initialize weights
#model.fc.bias.data.fill_(0)  # Initialize biases to zero

# require gradient for training
for param in model.parameters():
    param.requires_grad = True

# Set the model to eval mode and move to desired device.
# model.eval()

In [7]:
model

SwinTransformer3d(
  (patch_embed): PatchEmbed3d(
    (proj): Conv3d(3, 128, kernel_size=(2, 4, 4), stride=(2, 4, 4))
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (features): Sequential(
    (0): Sequential(
      (0): SwinTransformerBlock(
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): ShiftedWindowAttention3d(
          (qkv): Linear(in_features=128, out_features=384, bias=True)
          (proj): Linear(in_features=128, out_features=128, bias=True)
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=512, out_features=128, bias=True)
          (4): Dropout(p=0.0, inplace=False)
      

## load data

In [8]:
# load labels
events = pd.read_csv('EventType.csv')

In [9]:
file_names = os.listdir(path)
len(file_names)

15720

In [10]:
import random

# Your code that generates random values
# label = pd.get_dummies(events, columns=['EVENTSEVERITY1'])
# label = label.drop(columns=['EVENT_ID']).astype(int)
label = events['EVENTSEVERITY1'].tolist()
# Create a mapping from characters to integers
#y = {char: idx for idx, char in enumerate(set(label))}
y = {'Crash': 2, 'Baseline': 0, 'Near-Crash': 1}

# Convert character elements to integers
label = [y[char] for char in label]

In [11]:
# Initialize the inference transforms
weights = Swin3D_B_Weights.DEFAULT
preprocess = weights.transforms()

from concurrent.futures import ThreadPoolExecutor

def process_video(file):
    vid, _, _ = read_video(path + file, output_format="TCHW", pts_unit='sec')
    if vid.size(0) == 0:  # Check if the first dimension (time) is zero
        return torch.ones(3, 77, 224, 224)  # Return a zero tensor with the specified shape
    return preprocess(vid[:77]) 

In [12]:
from sklearn.model_selection import train_test_split

# Split the data
train_files, temp_files, train_labels, temp_labels = train_test_split(file_names, label, test_size=0.3, 
                                                                      #stratify=label, 
                                                                      random_state=7)

val_files, test_files, val_labels, test_labels = train_test_split(temp_files, temp_labels, test_size=0.667, 
                                                                  #stratify=temp_labels, 
                                                                  random_state=7)

# train_files, train_labels for training
# val_files, val_labels for validation
# test_files, test_labels for testing

In [13]:
# split training set for DataLoader
num_splits = 8 #4
train_files_split = [train_files[i::num_splits] for i in range(num_splits)]
train_labels_split = [train_labels[i::num_splits] for i in range(num_splits)]

In [14]:
# load validation sets
'''from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    X_val = list(executor.map(process_video, val_files))

X_val = torch.stack(X_val)#.to(device1)
Y_val = torch.as_tensor(val_labels)#.to(device1)


#from torch.utils.data import DataLoader, TensorDataset
#val_dataset = TensorDataset(X_val, Y_val)
#del X_val, Y_val
#val_loader = DataLoader(val_dataset, batch_size=5, shuffle=False)  # batch_size = 10
#del val_dataset
'''

In [15]:
#torch.save(X_val, 'Processed_video_SwinTransformer/X_val_frame77.pt')
#torch.save(Y_val, 'Processed_video_SwinTransformer/Y_val_frame77.pt')

In [14]:
X_val = torch.load('Processed_video_SwinTransformer/X_val_frame77.pt', weights_only=True)
Y_val = torch.load('Processed_video_SwinTransformer/Y_val_frame77.pt', weights_only=True)
val_dataset = TensorDataset(X_val, Y_val)
del X_val, Y_val
val_loader = DataLoader(val_dataset, batch_size=5, shuffle=False) #, num_workers=16
#del val_dataset

In [15]:
type(val_loader)

torch.utils.data.dataloader.DataLoader

In [16]:
'''
for split in range(num_splits):
    # Process each split
    with ThreadPoolExecutor() as executor:
        X_split = list(executor.map(process_video, train_files_split[split]))

    X_split = torch.stack(X_split)
    Y_split = torch.as_tensor(train_labels_split[split])
    torch.save(X_split, f'Processed_video_SwinTransformer/train/X_train_frame77_{split}.pt')
    torch.save(Y_split, f'Processed_video_SwinTransformer/train/Y_train_frame77_{split}.pt')
    del X_split, Y_split
'''

## train

In [None]:
'''num_splits = 8
train_datasets = []

def process_split(split):
    X_split = torch.load(f'Processed_video_SwinTransformer/train/X_train_frame77_{split}.pt', weights_only=True)
    Y_split = torch.load(f'Processed_video_SwinTransformer/train/Y_train_frame77_{split}.pt', weights_only=True)
    return X_split, Y_split

with ThreadPoolExecutor() as executor:
    results = executor.map(process_split, range(num_splits))

for X_split, Y_split in results:
    #Xs.append(X_split)
    #Ys.append(Y_split)
    train_datasets.append(TensorDataset(X_split, Y_split))
    '''

In [16]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr= 1e-3) # 1e-1


In [None]:
#results_file = open('/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L_SwinTrans/results_SL_SwinTrans_ScratchLearn.txt', 'a')
num_epochs = 100
model = model.to(device1)
model.train()
BatchSize = 5 # 32
best_val_accuracy = 0.51146 #
#best_val_ss = 0
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0)


for epoch in range(num_epochs):
    results_file = open('Output/Supervised L_SwinTrans/results_SL_SwinTrans_ScratchLearn.txt', 'a')
    #correct = 0  # Initialize correct predictions counter
    #total = 0    # Initialize total predictions counter
    running_loss = 0
    for split in range(num_splits):
        # Process each split
        #with ThreadPoolExecutor() as executor:
         #   X_split = list(executor.map(process_video, train_files_split[split]))
        
        X_split = torch.load(f'Processed_video_SwinTransformer/train/X_train_frame77_{split}.pt', weights_only=True)
        Y_split = torch.load(f'Processed_video_SwinTransformer/train/Y_train_frame77_{split}.pt', weights_only=True)
    
        train_dataset = TensorDataset(X_split, Y_split)
        del X_split, Y_split
        train_loader = DataLoader(dataset=train_dataset, batch_size=BatchSize, shuffle=True)
        del train_dataset
   
        for data, targets in train_loader:
            # Get data to cuda
            data = data.to(device1) #.float()
            targets = targets.to(device1)

            # forward
            scores = model(data)
            loss = criterion(scores, targets) ## can be changed
            running_loss += loss

            # backward
            optimizer.zero_grad()
            loss.backward()
            # gradient descent or adam step
            optimizer.step()
        
        del train_loader, data, targets
        torch.cuda.empty_cache()

    # Validation phase
    total_val_correct = 0
    total_val = 0
    val_loss = 0
    with torch.no_grad():
        for batch_X, batch_Y in val_loader:
            batch_X = batch_X.to(device1)
            batch_Y = batch_Y.to(device1)
            val_scores = model(batch_X).softmax(-1)  # Forward pass on validation set
            _, val_predicted = torch.max(val_scores.data, 1)  # Get predictions
            total_val_correct += (val_predicted == batch_Y).sum().item()  # Count correct predictions
            total_val += batch_Y.size(0)  # Total predictions
            val_loss += criterion(model(batch_X), batch_Y)
            del batch_X, batch_Y
            torch.cuda.empty_cache()

    val_accuracy = total_val_correct / total_val  # Calculate validation accuracy
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss.item():.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.3f}')
    results_file.write(
        f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss.item():.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.3f}\n')
    
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), f'Output/Supervised L_SwinTrans/best_ScratchLearn_weights.pth')  # Save the model weights
        #torch.save(model, f'/home/boyuj/VTTI_Boyu/Working/Boyu/SHRP 2 video/Output/Supervised L/Epoch {epoch+1}.pth')  # Save the entire model
        print(f'New best model saved with val accuracy: {best_val_accuracy}')
        results_file.write(f'New best model saved with val accuracy: {best_val_accuracy}\n')
        
    torch.cuda.empty_cache()
    results_file.close()
    scheduler.step()
    
    
results_file.close()

## test

In [8]:
device0 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device1 = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(device0, device1)

cuda:0 cuda:1


In [3]:
# Set the default CUDA device to GPU 0 (for example)
torch.cuda.set_device(0)  # Change the index to the desired GPU

# load video encoder: Video ResNet

In [45]:
# Load the weights 
weights = R3D_18_Weights.DEFAULT

encoder_anchor = r3d_18()

# remove the classification head. Keep the backbone
encoder_anchor = torch.nn.Sequential(*(list(encoder_anchor.children())[:-1]))

# require gradient for training
for param in encoder_anchor.parameters():
    param.requires_grad = True

# load data

In [None]:
events = pd.read_csv('Example_video_label.csv') # change to your label file

file_names = os.listdir(path)
len(file_names)

In [8]:
label = events['EVENTSEVERITY1'].tolist()
y = {'Crash': 2, 'Baseline': 0, 'Near-Crash': 1}

# Convert character elements to integers
label = [y[char] for char in label]

In [9]:
# Initialize the inference transforms
weights = R3D_18_Weights.DEFAULT
preprocess = weights.transforms()


def process_video(file):
    vid, _, _ = read_video(path + file, output_format="TCHW", pts_unit='sec')
    return preprocess(vid[:77]) # extract first 77 frames

In [10]:
# Split the data
train_files, temp_files, train_labels, temp_labels = train_test_split(file_names, label, test_size=0.3, 
                                                                      random_state=7)

val_files, test_files, val_labels, test_labels = train_test_split(temp_files, temp_labels, test_size=0.667, 
                                                                  random_state=7)

In [12]:
# split training set for DataLoader
num_splits = 4
train_files_split = [train_files[i::num_splits] for i in range(num_splits)]
train_labels_split = [train_labels[i::num_splits] for i in range(num_splits)]

In [13]:
# load validation sets
with ThreadPoolExecutor() as executor:
    X_val = list(executor.map(process_video, val_files))

X_val = torch.stack(X_val)#.to(device1)
Y_val = torch.as_tensor(val_labels)#.to(device1)

# Create a DataLoader for batching
val_dataset = TensorDataset(X_val, Y_val)
del X_val, Y_val
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

# loss function

In [None]:
weights = R3D_18_Weights.DEFAULT
encoder_anchor = r3d_18(weights=weights)
encoder_anchor = torch.nn.Sequential(*(list(encoder_anchor.children())[:-1]))
encoder_anchor.eval()

In [19]:
val_files[:6]

['151610518_b.mp4',
 '151590941_b.mp4',
 '138350656.mp4',
 '131795642.mp4',
 '134035084_b.mp4',
 '142007427_b.mp4']

In [20]:
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    example = list(executor.map(process_video, val_files[:6]))

example = torch.stack(example)#.to(device1)

In [24]:
with torch.no_grad():
    features = encoder_anchor(example)
    features = features.view(features.size(0), -1)

In [25]:
features.shape

torch.Size([6, 512])

In [26]:
features

tensor([[0.5386, 0.7802, 0.3420,  ..., 2.6961, 0.3758, 0.4522],
        [1.3586, 0.3065, 0.9339,  ..., 1.0016, 0.6122, 0.3963],
        [0.5657, 0.3217, 0.8942,  ..., 0.8140, 0.5058, 0.1867],
        [1.0516, 0.9417, 0.6237,  ..., 1.2344, 1.2062, 0.5748],
        [1.8407, 0.5805, 0.4934,  ..., 0.4045, 1.4958, 0.2209],
        [1.0796, 0.4901, 0.9105,  ..., 2.0791, 1.2935, 0.4218]])

In [27]:

features = torch.nn.functional.normalize(features, p=2, dim=1)

In [28]:
torch.manual_seed(7)
# features = torch.rand(6, 512)
features = features.unsqueeze(1)
features.shape
# features: [bsz, n_views, f_dim]
# `n_views` is the number of crops from each image
# better be L2 normalized in f_dim dimension

torch.Size([6, 1, 512])

In [29]:
features

tensor([[[0.0213, 0.0309, 0.0136,  ..., 0.1068, 0.0149, 0.0179]],

        [[0.0676, 0.0153, 0.0465,  ..., 0.0499, 0.0305, 0.0197]],

        [[0.0390, 0.0222, 0.0616,  ..., 0.0561, 0.0349, 0.0129]],

        [[0.0572, 0.0512, 0.0339,  ..., 0.0671, 0.0656, 0.0312]],

        [[0.1043, 0.0329, 0.0279,  ..., 0.0229, 0.0847, 0.0125]],

        [[0.0522, 0.0237, 0.0440,  ..., 0.1005, 0.0625, 0.0204]]])

In [30]:
labels = torch.as_tensor([1, 0, 2, 1, 2 ,1])

In [31]:
batch_size = features.shape[0]
labels = labels.contiguous().view(-1, 1)
mask = torch.eq(labels, labels.T).float()#.to(device)

contrast_count = features.shape[1]
# Unbinds the features tensor along dimension 1 and concatenates the resulting tensors along dimension 0, resulting in a tensor of shape (5, 512).
contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)

anchor_feature = contrast_feature
anchor_count = contrast_count

temperature=10
base_temperature=10

In [32]:
mask

tensor([[1., 0., 0., 1., 0., 1.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 1., 0.],
        [1., 0., 0., 1., 0., 1.],
        [0., 0., 1., 0., 1., 0.],
        [1., 0., 0., 1., 0., 1.]])

In [33]:
anchor_feature

tensor([[0.0213, 0.0309, 0.0136,  ..., 0.1068, 0.0149, 0.0179],
        [0.0676, 0.0153, 0.0465,  ..., 0.0499, 0.0305, 0.0197],
        [0.0390, 0.0222, 0.0616,  ..., 0.0561, 0.0349, 0.0129],
        [0.0572, 0.0512, 0.0339,  ..., 0.0671, 0.0656, 0.0312],
        [0.1043, 0.0329, 0.0279,  ..., 0.0229, 0.0847, 0.0125],
        [0.0522, 0.0237, 0.0440,  ..., 0.1005, 0.0625, 0.0204]])

In [34]:
anchor_count

1

In [35]:
contrast_feature.shape

torch.Size([6, 512])

In [36]:
# compute logits
anchor_dot_contrast = torch.div(
    torch.matmul(anchor_feature, contrast_feature.T),
    temperature)

# for numerical stability
logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
logits = anchor_dot_contrast - logits_max.detach()

# tile mask
#Repeats the mask tensor anchor_count times along the first dimension and contrast_count times along the second dimension.
mask = mask.repeat(anchor_count, contrast_count)

In [37]:
anchor_dot_contrast

tensor([[0.1000, 0.0788, 0.0748, 0.0772, 0.0759, 0.0881],
        [0.0788, 0.1000, 0.0821, 0.0766, 0.0798, 0.0835],
        [0.0748, 0.0821, 0.1000, 0.0806, 0.0872, 0.0820],
        [0.0772, 0.0766, 0.0806, 0.1000, 0.0810, 0.0828],
        [0.0759, 0.0798, 0.0872, 0.0810, 0.1000, 0.0805],
        [0.0881, 0.0835, 0.0820, 0.0828, 0.0805, 0.1000]])

In [38]:
logits

tensor([[ 0.0000, -0.0212, -0.0252, -0.0228, -0.0241, -0.0119],
        [-0.0212,  0.0000, -0.0179, -0.0234, -0.0202, -0.0165],
        [-0.0252, -0.0179,  0.0000, -0.0194, -0.0128, -0.0180],
        [-0.0228, -0.0234, -0.0194,  0.0000, -0.0190, -0.0172],
        [-0.0241, -0.0202, -0.0128, -0.0190,  0.0000, -0.0195],
        [-0.0119, -0.0165, -0.0180, -0.0172, -0.0195,  0.0000]])

In [39]:
mask

tensor([[1., 0., 0., 1., 0., 1.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 1., 0.],
        [1., 0., 0., 1., 0., 1.],
        [0., 0., 1., 0., 1., 0.],
        [1., 0., 0., 1., 0., 1.]])

In [40]:
# mask-out self-contrast cases, Creates a mask to zero out the diagonal elements (self-contrast cases) in the logits tensor.
logits_mask = torch.scatter(
    torch.ones_like(mask),
    1,
    torch.arange(batch_size * anchor_count).view(-1, 1),#.to(device),
    0
)

In [41]:
logits_mask

tensor([[0., 1., 1., 1., 1., 1.],
        [1., 0., 1., 1., 1., 1.],
        [1., 1., 0., 1., 1., 1.],
        [1., 1., 1., 0., 1., 1.],
        [1., 1., 1., 1., 0., 1.],
        [1., 1., 1., 1., 1., 0.]])

In [42]:
mask = mask * logits_mask

In [43]:
mask
# labels = torch.as_tensor([0, 1, 2, 2 ,1])

tensor([[0., 0., 0., 1., 0., 1.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0.],
        [1., 0., 0., 1., 0., 0.]])

In [44]:
# compute log_prob by exponentiating the logits, applying the mask, and normalizing.
exp_logits = torch.exp(logits) * logits_mask
log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

In [45]:
exp_logits

tensor([[0.0000, 0.9790, 0.9751, 0.9774, 0.9762, 0.9882],
        [0.9790, 0.0000, 0.9823, 0.9769, 0.9800, 0.9837],
        [0.9751, 0.9823, 0.0000, 0.9808, 0.9873, 0.9821],
        [0.9774, 0.9769, 0.9808, 0.0000, 0.9812, 0.9829],
        [0.9762, 0.9800, 0.9873, 0.9812, 0.0000, 0.9807],
        [0.9882, 0.9837, 0.9821, 0.9829, 0.9807, 0.0000]])

In [46]:
exp_logits.sum(1, keepdim=True)

tensor([[4.8959],
        [4.9019],
        [4.9077],
        [4.8993],
        [4.9055],
        [4.9177]])

In [47]:
log_prob

tensor([[-1.5884, -1.6096, -1.6136, -1.6112, -1.6125, -1.6003],
        [-1.6109, -1.5896, -1.6075, -1.6130, -1.6098, -1.6061],
        [-1.6160, -1.6087, -1.5908, -1.6102, -1.6036, -1.6088],
        [-1.6119, -1.6125, -1.6084, -1.5891, -1.6081, -1.6063],
        [-1.6144, -1.6105, -1.6031, -1.6093, -1.5903, -1.6098],
        [-1.6047, -1.6093, -1.6108, -1.6101, -1.6123, -1.5928]])

In [48]:
# compute mean of log-likelihood over positive
# modified to handle edge cases when there is no positive pair for an anchor point. 
# Edge case e.g.:- 
# features of shape: [4,1,...]
# labels:            [0,1,1,2]
# loss before mean:  [nan, ..., ..., nan] 

#Sums the mask along dimension 1 to count positive pairs and ensures no division by zero by replacing small values with 1.
mask_pos_pairs = mask.sum(1)

In [49]:
mask_pos_pairs

tensor([2., 0., 1., 2., 1., 2.])

In [50]:
mask_pos_pairs = torch.where(mask_pos_pairs < 1e-6, 1, mask_pos_pairs)
#mask_pos_pairs = torch.where(mask_pos_pairs < 1e-6, torch.tensor(1, dtype=mask_pos_pairs.dtype), mask_pos_pairs)

In [51]:
mask_pos_pairs

tensor([2., 1., 1., 2., 1., 2.])

In [52]:
#Computes the mean log probability of positive pairs.
mean_log_prob_pos = (mask * log_prob).sum(1) / mask_pos_pairs

In [53]:
mask * log_prob

tensor([[-0.0000, -0.0000, -0.0000, -1.6112, -0.0000, -1.6003],
        [-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000],
        [-0.0000, -0.0000, -0.0000, -0.0000, -1.6036, -0.0000],
        [-1.6119, -0.0000, -0.0000, -0.0000, -0.0000, -1.6063],
        [-0.0000, -0.0000, -1.6031, -0.0000, -0.0000, -0.0000],
        [-1.6047, -0.0000, -0.0000, -1.6101, -0.0000, -0.0000]])

In [54]:
(mask * log_prob).sum(1)

tensor([-3.2115,  0.0000, -1.6036, -3.2182, -1.6031, -3.2148])

In [55]:
mean_log_prob_pos

tensor([-1.6057,  0.0000, -1.6036, -1.6091, -1.6031, -1.6074])

In [56]:
# Computes the loss by scaling the mean log probability and reshapes it to (anchor_count, batch_size).
loss = - (temperature / base_temperature) * mean_log_prob_pos
loss = loss.view(anchor_count, batch_size)

In [57]:
loss

tensor([[1.6057, -0.0000, 1.6036, 1.6091, 1.6031, 1.6074]])

In [58]:
#Averages the loss over the batch.
loss = loss.mean()

In [59]:
loss

tensor(1.3382)

In [None]:
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=10.0, base_temperature=10.0): # temperature can be adjusted
        super(ContrastiveLoss, self).__init__()
        self.temperature = nn.Parameter(torch.tensor(temperature))# Make temperature learnable
        self.base_temperature = base_temperature

    def forward(self, features, labels):
        features = torch.nn.functional.normalize(features, p=2, dim=1)
        features = features.unsqueeze(1)
        batch_size = features.shape[0]
        labels = labels.contiguous().view(-1, 1)
        mask = torch.eq(labels, labels.T).float().to(device1)

        contrast_count = features.shape[1]
        # Unbinds the features tensor along dimension 1 and concatenates the resulting tensors along dimension 0.
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)

        anchor_feature = contrast_feature
        anchor_count = contrast_count

        # compute logits
        anchor_dot_contrast = torch.div(torch.matmul(anchor_feature, contrast_feature.T), self.temperature)
        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)
        # mask-out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count).view(-1, 1).to(device1), 
            0
        )
        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positive
        mask_pos_pairs = mask.sum(1)
        mask_pos_pairs = torch.where(mask_pos_pairs < 1e-6, 1, mask_pos_pairs)
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask_pos_pairs

        # loss
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()

        return loss

# train encoder

In [7]:
num_epochs = 500
encoder_anchor = encoder_anchor.to(device1)

encoder_anchor.train()

Sequential(
  (0): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNor

In [17]:
# Define loss function and optimizer
ContrastLoss = ContrastiveLoss(temperature=10.0, base_temperature=10.0)
optimizer_encoder = optim.Adam(params = encoder_anchor.parameters(), lr=1e-3)

In [9]:
num_splits = 4
train_datasets = []

def process_split(split):
    # load preprocessed video file
    X_split = torch.load(f'Processed_video_r3d18/train/X_train_frame77_{split}.pt', weights_only=True)
    Y_split = torch.load(f'Processed_video_r3d18/train/Y_train_frame77_{split}.pt', weights_only=True)
    return X_split, Y_split

with ThreadPoolExecutor() as executor:
    results = executor.map(process_split, range(num_splits))

for X_split, Y_split in results:
    train_datasets.append(TensorDataset(X_split, Y_split))

In [19]:
BatchSize = 64 
train_loaders = []

for split in range(num_splits):
    train_loaders.append(DataLoader(dataset=train_datasets[split], batch_size=BatchSize, shuffle=True))

In [None]:
#### start training ####
best_val_ss = 0.0
best_val_loss = 99999.9 
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer_encoder, T_max=num_epochs, eta_min=0)

for epoch in range(num_epochs):
    results_file = open('results_encoder_ScratchLearn.txt', 'a') # create a txt file for saving the training records
    running_loss = 0
    for split in range(num_splits):
        for data, targets in train_loaders[split]:
            optimizer_encoder.zero_grad()
            data = data.to(device1)
            targets = targets.to(device1)

            h = encoder_anchor(data)
            h = h.view(h.size(0), -1)
            contrastive_loss = ContrastLoss(h, targets)
            running_loss += contrastive_loss

            contrastive_loss.backward()
            optimizer_encoder.step()
        
        del data, targets

    # Validation phase
    ss_result = []
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_Y in val_loader:
            batch_X = batch_X.to(device1)
            batch_Y = batch_Y.to(device1)

            h_val = encoder_anchor(batch_X)
            h_val = h_val.view(h_val.size(0), -1)
            val_loss += ContrastLoss(h_val, batch_Y)
            ss = silhouette_score(h_val.cpu().numpy(), batch_Y.cpu().numpy())
            ss_result.append(ss)


    mean_ss = sum(ss_result) / len(ss_result)
    results_file.write(f'Epoch [{epoch+1}/{num_epochs}], Contrastive Loss: {running_loss.item():.5f}, Val loss: {val_loss:.5f}, Val Silht score: {mean_ss:.5f}\n')
    
    if mean_ss > best_val_ss:
        best_val_ss = mean_ss
        
        torch.save(encoder_anchor.state_dict(), 'best_encoder_ScratchLearn.pth') # save the best video encoder
        results_file.write(f'New best encoder saved with Silht scores: {best_val_ss}\n')

    del batch_X, batch_Y
    torch.cuda.empty_cache()
    results_file.close()
    scheduler.step()
    
results_file.close()

In [19]:
torch.cuda.empty_cache()

# train classification head

In [None]:
weights = R3D_18_Weights.DEFAULT
preprocess = weights.transforms()

trained_encoder = r3d_18()
fc_layer = torch.nn.Linear(trained_encoder.fc.in_features, 3)

trained_encoder = torch.nn.Sequential(*(list(trained_encoder.children())[:-1]))
trained_encoder.load_state_dict(torch.load(
    'best_encoder_ScratchLearn.pth',
    weights_only = False,
    map_location = device1))

for param in trained_encoder.parameters():
    param.requires_grad = False # don't require gradient for encoder
    
for param in fc_layer.parameters():
    param.requires_grad = True

# eval mode
trained_encoder.eval()

In [84]:
# Define loss function and optimizer
class_weights = torch.tensor([0.667, 0.854, 3.1], device=device1)
CELoss = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')

optimizer_fc = optim.Adam(fc_layer.parameters(), lr=1e-3) 

In [None]:
num_epochs = 500
trained_encoder = trained_encoder.to(device1)
fc_layer = fc_layer.to(device1)

fc_layer.train()

In [9]:
with open('Output/SCL/train_X_encoder.pkl', 'rb') as f:  # load the learned representations from the encoder
    X_train = pickle.load(f) 

with open('Output/SCL/train_Y_encoder.pkl', 'rb') as f:
    Y_train = pickle.load(f)
    
X = torch.tensor(X_train, dtype=torch.float32)
Y = torch.tensor(Y_train, dtype=torch.long) 

In [70]:
BatchSize = 64

train_dataset = TensorDataset(X, Y)
train_loader = DataLoader(train_dataset, batch_size=BatchSize, shuffle=True)

In [11]:
with open('Output/SCL/val_X_encoder.pkl', 'rb') as f: # load the learned representations from the encoder
    val_X = pickle.load(f)

with open('Output/SCL/val_Y_encoder.pkl', 'rb') as f:
    val_Y = pickle.load(f)
    
val_X = torch.tensor(val_X, dtype=torch.float32)
val_Y = torch.tensor(val_Y, dtype=torch.long) 

val_dataset = TensorDataset(val_X, val_Y)

val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False) 

In [None]:
best_accuracy = 0.0
best_val_loss = 99999.9
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer_fc, T_max=num_epochs, eta_min=0)

for epoch in range(num_epochs):
    results_file = open('results_SCL_fc_layer.txt', 'a') # create a txt file to save the training records
    running_loss = 0

    for data, targets in train_loader:
        data = data.to(device1)
        targets = targets.to(device1)
        pred = fc_layer(data)
        supervised_loss = CELoss(pred, targets)
        optimizer_fc.zero_grad()
        supervised_loss.backward()
        optimizer_fc.step()
        running_loss += supervised_loss
    
    # Validation phase
    total_val_correct = 0
    total_val = 0
    val_loss = 0

    with torch.no_grad():
        for batch_X, batch_Y in test_loader:
            batch_X = batch_X.to(device1)
            batch_Y = batch_Y.to(device1)

            val_scores = fc_layer(batch_X).softmax(-1) 
            _, val_predicted = torch.max(val_scores.data, 1)
            total_val_correct += (val_predicted == batch_Y).sum().item()
            total_val += batch_Y.size(0) 
            val_loss += CELoss(fc_layer(batch_X), batch_Y)

    val_accuracy = total_val_correct / total_val 
    results_file.write(f'Epoch [{epoch+1}/{num_epochs}], Training loss: {running_loss.item():.4f}, Val loss: {val_loss:.4f}, Val accuracy: {val_accuracy:.3f}\n')
    
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(fc_layer.state_dict(), f'best_fc_layer.pth')
        print(f'New best fc_layer saved with val accuracy: {val_accuracy}')
        results_file.write(f'New best fc_layer saved with val accuracy: {val_accuracy}\n')
        
    
    torch.cuda.empty_cache()
    results_file.close()
    scheduler.step()

results_file.close()

# test

In [31]:
with open('Output/SCL/test_X_encoder.pkl', 'rb') as f: # load the learned representations from the encoder
    X_test = pickle.load(f)

with open('Output/SCL/test_Y_encoder.pkl', 'rb') as f:
    Y_test = pickle.load(f)
    
test_X = torch.tensor(X_test, dtype=torch.float32)
test_Y = torch.tensor(Y_test, dtype=torch.long) 

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)

In [32]:
trained_encoder = r3d_18()
fc_layer = torch.nn.Linear(trained_encoder.fc.in_features, 3)

trained_encoder = torch.nn.Sequential(*(list(trained_encoder.children())[:-1]))
trained_encoder.load_state_dict(torch.load(
    'best_encoder_ScratchLearn.pth', weights_only=True,
                                          map_location=device1
                                          ))
trained_encoder = trained_encoder.to(device1)
trained_encoder.eval()


fc_layer.load_state_dict(torch.load(
    'best_fc_layer.pth', weights_only=True,
                                   map_location=device1
                                   ))
fc_layer = fc_layer.to(device1)
fc_layer.eval()
type(test_loader)

torch.utils.data.dataloader.DataLoader

In [None]:
total_test_correct = 0
total_test = 0
pred_score = []
pred_label = []

with torch.no_grad():
    for batch_X, batch_Y in test_loader:
        batch_X = batch_X.to(device1)
        batch_Y = batch_Y.to(device1)

        test_scores = fc_layer(batch_X).softmax(-1)
        _, test_predicted = torch.max(test_scores.data, 1)  # Get predictions
        total_test_correct += (test_predicted == batch_Y).sum().item()  # Count correct predictions
        total_test += batch_Y.size(0)  # Total predictions
        pred_score.append(test_scores)
        pred_label.append(test_predicted)
        

In [34]:
def unpack_tensors(tensor_list):
    unpacked_tensors = []
    for tensor in tensor_list:
        for element in tensor:
            unpacked_tensors.append(element.cpu().numpy())
    return np.array(unpacked_tensors)

y_score = unpack_tensors(pred_score)
y_label = unpack_tensors(pred_label)

In [None]:
# Compute Accuracy
accuracy = accuracy_score(np.array(Y_test), y_label)
# Compute Confusion Matrix
cm = confusion_matrix(np.array(Y_test), y_label)#test_y_true

# Extract values for category y=1
TP_y1 = cm[1, 1]  # True Positives for y=1
FP_y1 = cm[0, 1] + cm[2, 1]  # False Positives for y=1
FN_y1 = cm[1, 0] + cm[1, 2]  # False Negatives for y=1 
# Calculate Precision and Recall for y=1
precision_y1 = TP_y1 / (TP_y1 + FP_y1) if (TP_y1 + FP_y1) > 0 else 0
recall_y1 = TP_y1 / (TP_y1 + FN_y1) if (TP_y1 + FN_y1) > 0 else 0

# Extract values for category y=2
TP_y2 = cm[2, 2]  # True Positives for y=2
FP_y2 = cm[0, 2] + cm[1, 2]  # False Positives for y=2
FN_y2 = cm[2, 0] + cm[2, 1]  # False Negatives for y=2
# Calculate Precision and Recall for y=2
precision_y2 = TP_y2 / (TP_y2 + FP_y2) if (TP_y2 + FP_y2) > 0 else 0
recall_y2 = TP_y2 / (TP_y2 + FN_y2) if (TP_y2 + FN_y2) > 0 else 0

# Compute F1 Score for Near-Crash
f1_y1 = 2 * (precision_y1 * recall_y1) / (precision_y1 + recall_y1) if (precision_y1 + recall_y1) > 0 else 0
# Compute F1 Score for Crash
f1_y2 = 2 * (precision_y2 * recall_y2) / (precision_y2 + recall_y2) if (precision_y2 + recall_y2) > 0 else 0

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall for Crash, Near-Crash: {recall_y2:.3f}, {recall_y1:.3f}")
print(f"Precision for Crash, Near-Crash: {precision_y2:.3f}, {precision_y1:.3f}")
print(f"F1 Score for Crash, Near-Crash: {f1_y2:.3f}, {f1_y1:.3f}")
print(cm)

In [89]:
77/15

5.133333333333334

In [17]:
# Define input transform
side_size = 256 ## The size of the shorter side of the frame after scaling. 
mean = [0.45, 0.45, 0.45] # Mean & Standard deviation values used for normalization, one per color channel (RGB)
std = [0.225, 0.225, 0.225]
crop_size = 256 ## The size to which the video frames will be center-cropped.

# The number of frames to extract from the video.
# The input video is temporally subsampled to this number of frames before being passed to the model.
num_frames = 32

sampling_rate = 4.7/2 # tune this!!!!! # The temporal stride of the fast pathway
frames_per_second = 30/2 # Frames per second (FPS) for the video.

slowfast_alpha = 4 # This parameter controls the relative speed between the slow and fast pathways.
# For instance, `SLOWFAST_ALPHA = 4` means the slow pathway samples frames at a rate 4 times less frequent than the fast pathway, i.e., every 8 frames.

# The duration of the input clip is also specific to the model.
clip_duration = (num_frames * sampling_rate)/frames_per_second

#num_clips = 10
#num_crops = 3


class PackPathway(torch.nn.Module):
    """Transform for converting video frames as a list of tensors"""
    def __init__(self):
        super().__init__()
        
    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list

transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size),
            PackPathway()
        ]
    ),
)



In [18]:
clip_duration

5.013333333333334

In [19]:
# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

In [20]:
# Initialize an EncodedVideo helper class and load the video
video_path = path + train_files[2]
video = EncodedVideo.from_path(video_path)
# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data = transform(video_data)
inputs = [i for i in video_data["video"]]

In [21]:
print(len(inputs))
print(inputs[0].shape)
print(type(inputs[0]))
print(inputs[1].shape)
print(type(inputs[1]))

2
torch.Size([3, 8, 256, 256])
<class 'torch.Tensor'>
torch.Size([3, 32, 256, 256])
<class 'torch.Tensor'>


In [60]:
# Initialize an EncodedVideo helper class and load the video
video_path_1 = path + train_files[3]
video_1 = EncodedVideo.from_path(video_path_1)
# Load the desired clip
video_data_1 = video_1.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data_1 = transform(video_data_1)
inputs_1 = [i for i in video_data_1["video"]]

In [61]:
print(len(inputs_1))
print(inputs_1[0].shape)
print(type(inputs_1[0]))
print(inputs_1[1].shape)
print(type(inputs_1[1]))

2
torch.Size([3, 8, 256, 256])
<class 'torch.Tensor'>
torch.Size([3, 32, 256, 256])
<class 'torch.Tensor'>


In [64]:
tensor0 = torch.stack([inputs_1[0], inputs[0]])
tensor1 = torch.stack([inputs_1[1], inputs[1]])
inputs = [tensor0, tensor1]

In [66]:
type(inputs)
len(inputs)

2

In [67]:
model(inputs).shape

torch.Size([2, 512])

In [84]:
def process_video(file):
    # Initialize an EncodedVideo helper class and load the video
    video_path = path + file
    try:
        video = EncodedVideo.from_path(video_path)
        # Load the desired clip
        # {'video_data': Tensor of shape (T, C, H, W)} where T = number of frames, C = channels, H = height, W = width
        video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
        # Apply a transform to normalize the video input
        video_data = transform(video_data)

        #prepare the input for the model
        #inputs = [i[None, ...] for i in video_data["video"]]
        inputs = [i for i in video_data["video"]]  # Shape: (1, T, C, H, W)
        #inputs = torch.stack([i for i in video_data["video"]])
        return inputs
        
    except FileNotFoundError:
        print(f"No video found at {video_path}")
        #video = None 
    except Exception as e:
        print(f"An error occurred: {e}")

tensor0 = []
tensor1 = []
'''        
for file in train_files[:5]:
    inputs = process_video(file)
    tensor0.append(inputs[0])
    tensor1.append(inputs[1])
'''    

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor() as executor:
    results = list(executor.map(process_video, train_files[:7]))

tensor0 = torch.stack([inputs[0] for inputs in results])
tensor1 = torch.stack([inputs[1] for inputs in results])

example = [tensor0, tensor1]


In [85]:
print(len(example))
print(example[0].shape)
print(type(example[0]))
print(example[1].shape)
print(type(example[1]))

2
torch.Size([7, 3, 8, 256, 256])
<class 'torch.Tensor'>
torch.Size([7, 3, 32, 256, 256])
<class 'torch.Tensor'>


In [86]:
pred = model(example)

In [89]:
pred

tensor([[ 4.7389e-03,  4.2112e-02, -5.6869e-02,  ..., -1.5899e-02,
         -1.7572e-03,  4.5431e-02],
        [ 9.0630e-03,  5.3385e-02, -6.8349e-02,  ..., -1.7457e-02,
          2.2629e-05,  6.3389e-02],
        [ 6.2081e-03,  4.5502e-02, -6.5099e-02,  ..., -2.5830e-02,
          6.4489e-04,  3.1176e-02],
        ...,
        [ 1.2675e-02,  6.6793e-02, -8.1811e-02,  ..., -2.0216e-02,
         -3.7418e-03,  7.7426e-02],
        [ 1.4328e-02,  4.5447e-02, -5.6876e-02,  ..., -2.0222e-02,
          1.0108e-03,  6.0961e-02],
        [ 1.8613e-03,  4.8267e-02, -6.5689e-02,  ..., -2.2184e-02,
          3.7392e-03,  3.8830e-02]], grad_fn=<ViewBackward0>)

In [94]:
features = torch.nn.functional.normalize(pred, p=2, dim=1)
torch.manual_seed(7)
# features = torch.rand(6, 512)
features = features.unsqueeze(1)
features.shape
# features: [bsz, n_views, f_dim]
# `n_views` is the number of crops from each image
# better be L2 normalized in f_dim dimension

labels = torch.as_tensor([1, 0, 2, 1, 2 ,1, 0])
batch_size = features.shape[0]
labels = labels.contiguous().view(-1, 1)
mask = torch.eq(labels, labels.T).float()#.to(device)

contrast_count = features.shape[1]
# Unbinds the features tensor along dimension 1 and concatenates the resulting tensors along dimension 0, resulting in a tensor of shape (5, 512).
contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)

anchor_feature = contrast_feature
anchor_count = contrast_count

temperature=10
base_temperature=10

# compute logits
anchor_dot_contrast = torch.div(
    torch.matmul(anchor_feature, contrast_feature.T),
    temperature)

# for numerical stability
logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
logits = anchor_dot_contrast - logits_max.detach()

# tile mask
#Repeats the mask tensor anchor_count times along the first dimension and contrast_count times along the second dimension.
mask = mask.repeat(anchor_count, contrast_count)

# mask-out self-contrast cases, Creates a mask to zero out the diagonal elements (self-contrast cases) in the logits tensor.
logits_mask = torch.scatter(
    torch.ones_like(mask),
    1,
    torch.arange(batch_size * anchor_count).view(-1, 1),#.to(device),
    0
)

mask = mask * logits_mask

# compute log_prob by exponentiating the logits, applying the mask, and normalizing.
exp_logits = torch.exp(logits) * logits_mask
log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

mask_pos_pairs = mask.sum(1)
mask_pos_pairs = torch.where(mask_pos_pairs < 1e-6, 1, mask_pos_pairs)
#mask_pos_pairs = torch.where(mask_pos_pairs < 1e-6, torch.tensor(1, dtype=mask_pos_pairs.dtype), mask_pos_pairs)

#Computes the mean log probability of positive pairs.
mean_log_prob_pos = (mask * log_prob).sum(1) / mask_pos_pairs

# Computes the loss by scaling the mean log probability and reshapes it to (anchor_count, batch_size).
loss = - (temperature / base_temperature) * mean_log_prob_pos
loss = loss.view(anchor_count, batch_size)


In [103]:
anchor_dot_contrast

tensor([[0.1000, 0.0986, 0.0958, 0.0965, 0.0960, 0.0986, 0.0979],
        [0.0986, 0.1000, 0.0927, 0.0944, 0.0990, 0.0965, 0.0985],
        [0.0958, 0.0927, 0.1000, 0.0995, 0.0884, 0.0929, 0.0958],
        [0.0965, 0.0944, 0.0995, 0.1000, 0.0904, 0.0931, 0.0975],
        [0.0960, 0.0990, 0.0884, 0.0904, 0.1000, 0.0936, 0.0968],
        [0.0986, 0.0965, 0.0929, 0.0931, 0.0936, 0.1000, 0.0945],
        [0.0979, 0.0985, 0.0958, 0.0975, 0.0968, 0.0945, 0.1000]],
       grad_fn=<DivBackward0>)

In [95]:
loss

tensor([[1.7914, 1.7899, 1.7975, 1.7922, 1.7974, 1.7908, 1.7901]],
       grad_fn=<ViewBackward0>)

In [96]:
loss.mean()

tensor(1.7927, grad_fn=<MeanBackward0>)

In [192]:
#original_tensor = torch.randn(6, 512)
tensor1 = torch.randn(6, 512)
tensor2 = torch.randn(6, 512)

In [194]:
tensor1

tensor([[ 1.7813, -1.8431,  0.0471,  ...,  1.2512, -0.2868, -1.3121],
        [-0.6395, -1.1887,  0.1142,  ..., -1.1530, -0.4722,  0.9251],
        [-2.4173,  0.6274,  0.9625,  ...,  1.5833, -0.6150,  1.4210],
        [-0.4424, -1.1124,  0.3907,  ..., -0.4711,  0.3590, -0.4634],
        [ 0.1884,  0.4230, -0.6830,  ...,  0.5008, -0.2486,  0.2541],
        [ 1.1854, -1.0656,  0.1663,  ..., -1.1010,  0.3486,  0.8192]])

In [193]:
tensor2

tensor([[-0.7217,  0.3723,  1.0594,  ..., -0.8908, -0.6278,  0.4420],
        [ 1.8054, -0.6130, -1.3816,  ..., -0.6997,  0.4745, -2.1738],
        [ 0.1287, -0.4778,  1.9450,  ...,  0.3311, -0.7177, -0.6761],
        [-1.9065,  0.7391, -0.5156,  ..., -0.0477, -0.5712, -0.1730],
        [ 1.2424, -0.6269,  0.0848,  ..., -0.5125,  1.1627, -0.1825],
        [-1.2697,  0.3114,  0.2778,  ..., -0.0835, -0.4085,  0.6941]])

In [163]:
#tensor1, tensor2 = torch.split(original_tensor, int(original_tensor.shape[1]/2), dim=1)

#tensor1 = torch.nn.functional.pad(tensor1, (0, int(original_tensor.shape[1]/2), 0, 0), mode='constant', value=0).unsqueeze(1)
#tensor2 = torch.nn.functional.pad(tensor2, (int(original_tensor.shape[1]/2), 0, 0, 0), mode='constant', value=0).unsqueeze(1)


In [195]:
res_tensor = torch.cat((tensor1.unsqueeze(1), tensor2.unsqueeze(1)), dim=1)
res_tensor

tensor([[[ 1.7813, -1.8431,  0.0471,  ...,  1.2512, -0.2868, -1.3121],
         [-0.7217,  0.3723,  1.0594,  ..., -0.8908, -0.6278,  0.4420]],

        [[-0.6395, -1.1887,  0.1142,  ..., -1.1530, -0.4722,  0.9251],
         [ 1.8054, -0.6130, -1.3816,  ..., -0.6997,  0.4745, -2.1738]],

        [[-2.4173,  0.6274,  0.9625,  ...,  1.5833, -0.6150,  1.4210],
         [ 0.1287, -0.4778,  1.9450,  ...,  0.3311, -0.7177, -0.6761]],

        [[-0.4424, -1.1124,  0.3907,  ..., -0.4711,  0.3590, -0.4634],
         [-1.9065,  0.7391, -0.5156,  ..., -0.0477, -0.5712, -0.1730]],

        [[ 0.1884,  0.4230, -0.6830,  ...,  0.5008, -0.2486,  0.2541],
         [ 1.2424, -0.6269,  0.0848,  ..., -0.5125,  1.1627, -0.1825]],

        [[ 1.1854, -1.0656,  0.1663,  ..., -1.1010,  0.3486,  0.8192],
         [-1.2697,  0.3114,  0.2778,  ..., -0.0835, -0.4085,  0.6941]]])

In [196]:
res_tensor.shape

torch.Size([6, 2, 512])

In [203]:
weights = R3D_18_Weights.DEFAULT
encoder_anchor = r3d_18(weights=weights)
encoder_anchor = torch.nn.Sequential(*(list(encoder_anchor.children())[:-1]))
encoder_anchor.eval()
val_files[:6]

['151859367.mp4',
 '151600338.mp4',
 '151568557.mp4',
 '151569020_b.mp4',
 '151864990_b.mp4',
 '151574090_b.mp4']

In [229]:
preprocess = weights.transforms()

def process_video0(file):
    vid, _, _ = read_video(path + file, output_format="TCHW", pts_unit='sec')
    if vid.size(0) == 0:  # Check if the first dimension (time) is zero
        return torch.ones(3, 76, 112, 112)  # Return a zero tensor with the specified shape
    return preprocess(vid[:76])

def process_video1(file):
    vid, _, _ = read_video(path + file, output_format="TCHW", pts_unit='sec')
    if vid.size(0) == 0:  # Check if the first dimension (time) is zero
        return torch.ones(3, 38, 112, 112)  # Return a zero tensor with the specified shape
    return preprocess(vid[:38])

def process_video2(file):
    vid, _, _ = read_video(path + file, output_format="TCHW", pts_unit='sec')
    if vid.size(0) == 0:  # Check if the first dimension (time) is zero
        return torch.ones(3, 38, 112, 112)  # Return a zero tensor with the specified shape
    return preprocess(vid[38:76])

In [233]:
with ThreadPoolExecutor() as executor:
    example0 = list(executor.map(process_video0, val_files[:6]))
example0 = torch.stack(example0)#.to(device1)
print(example0.shape)

with torch.no_grad():
    raw_features0 = encoder_anchor(example0)
    raw_features0 = raw_features0.view(raw_features0.size(0), -1)
print(raw_features0.shape)

torch.Size([6, 3, 76, 112, 112])
torch.Size([6, 512])


In [205]:
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
    example1 = list(executor.map(process_video1, val_files[:6]))
example1 = torch.stack(example1)#.to(device1)
    
with ThreadPoolExecutor() as executor:
    example2 = list(executor.map(process_video2, val_files[:6]))
example2 = torch.stack(example2)#.to(device1)

with torch.no_grad():
    raw_features1 = encoder_anchor(example1)
    raw_features1 = raw_features1.view(raw_features1.size(0), -1)
    
    raw_features2 = encoder_anchor(example2)
    raw_features2 = raw_features2.view(raw_features2.size(0), -1)

features = torch.cat((raw_features1.unsqueeze(1), raw_features2.unsqueeze(1)), dim=1)

In [206]:
raw_features1

tensor([[0.5803, 1.3810, 0.6058,  ..., 0.6929, 0.8700, 1.5157],
        [0.9929, 0.3172, 1.2320,  ..., 0.1994, 1.0364, 0.3778],
        [1.5709, 0.6001, 0.6412,  ..., 2.0286, 0.6618, 0.5057],
        [0.8595, 0.4736, 0.6543,  ..., 0.9596, 1.8751, 0.1583],
        [0.3284, 0.7914, 1.0755,  ..., 1.3423, 0.2285, 0.0482],
        [1.0450, 0.2499, 0.5111,  ..., 1.2609, 0.8196, 0.0361]])

In [207]:
raw_features2

tensor([[0.6389, 1.4686, 0.4565,  ..., 0.6705, 0.8350, 0.9534],
        [1.2422, 0.1993, 1.1996,  ..., 0.3300, 0.8838, 0.3698],
        [1.3286, 0.4192, 0.7180,  ..., 1.7197, 0.5852, 0.8142],
        [1.1361, 0.4766, 0.4306,  ..., 1.1108, 1.4538, 0.2129],
        [0.3378, 0.4026, 1.7973,  ..., 1.4295, 0.4509, 0.0473],
        [0.6738, 0.2376, 0.5889,  ..., 1.0103, 1.0471, 0.0174]])

In [208]:
features

tensor([[[0.5803, 1.3810, 0.6058,  ..., 0.6929, 0.8700, 1.5157],
         [0.6389, 1.4686, 0.4565,  ..., 0.6705, 0.8350, 0.9534]],

        [[0.9929, 0.3172, 1.2320,  ..., 0.1994, 1.0364, 0.3778],
         [1.2422, 0.1993, 1.1996,  ..., 0.3300, 0.8838, 0.3698]],

        [[1.5709, 0.6001, 0.6412,  ..., 2.0286, 0.6618, 0.5057],
         [1.3286, 0.4192, 0.7180,  ..., 1.7197, 0.5852, 0.8142]],

        [[0.8595, 0.4736, 0.6543,  ..., 0.9596, 1.8751, 0.1583],
         [1.1361, 0.4766, 0.4306,  ..., 1.1108, 1.4538, 0.2129]],

        [[0.3284, 0.7914, 1.0755,  ..., 1.3423, 0.2285, 0.0482],
         [0.3378, 0.4026, 1.7973,  ..., 1.4295, 0.4509, 0.0473]],

        [[1.0450, 0.2499, 0.5111,  ..., 1.2609, 0.8196, 0.0361],
         [0.6738, 0.2376, 0.5889,  ..., 1.0103, 1.0471, 0.0174]]])

In [227]:
features.shape

torch.Size([6, 2, 512])

In [228]:
ys = torch.randn(6)

fakedataset = TensorDataset(features, ys)
fakeloader = DataLoader(fakedataset, batch_size=10, shuffle=False)  # Adjust batch_size as needed

In [None]:
#raw_features = torch.nn.functional.normalize(raw_features, p=2, dim=2)
#features = features.unsqueeze(1)
#raw_features.shape

In [None]:
#tensor1, tensor2 = torch.split(raw_features, int(raw_features.shape[1]/2), dim=1)

#tensor1 = torch.nn.functional.pad(tensor1, (0, int(raw_features.shape[1]/2), 0, 0), mode='constant', value=0).unsqueeze(1)
#tensor2 = torch.nn.functional.pad(tensor2, (int(raw_features.shape[1]/2), 0, 0, 0), mode='constant', value=0).unsqueeze(1)
#features = torch.cat((tensor1, tensor2), dim=1)
#features.shape

In [210]:
batch_size = features.shape[0]
mask = torch.eye(batch_size, dtype=torch.float32)

contrast_count = features.shape[1]
# Unbinds the features tensor along dimension 1 and concatenates the resulting tensors along dimension 0, resulting in a tensor of shape (5, 512).
contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)

anchor_feature = contrast_feature
anchor_count = contrast_count

temperature=10
base_temperature=10

In [211]:
mask

tensor([[1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1.]])

In [212]:
contrast_count

2

In [213]:
anchor_count

2

In [214]:
# compute logits
anchor_dot_contrast = torch.div(
    torch.matmul(anchor_feature, contrast_feature.T),
    temperature)

# for numerical stability
logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
logits = anchor_dot_contrast - logits_max.detach()

# tile mask
#Repeats the mask tensor anchor_count times along the first dimension and contrast_count times along the second dimension.
mask = mask.repeat(anchor_count, contrast_count)

In [215]:
logits

tensor([[  0.0000,  -7.4198,  -2.5497,  -7.3351, -11.8793,  -6.4295,  -1.9893,
          -7.3531,  -2.0151,  -6.6593, -10.8924,  -8.3623],
        [ -9.8684,   0.0000,  -5.6126,  -5.0045, -12.0003,  -6.0410, -10.7717,
          -0.5895,  -6.1009,  -5.8015, -11.4308,  -7.8125],
        [-13.1465, -13.7609,   0.0000, -11.4468, -19.6793, -11.5500, -14.6907,
         -14.1504,  -2.2684, -11.2548, -17.7309, -13.5254],
        [ -9.4456,  -4.6665,  -2.9605,   0.0000, -10.6302,  -4.5949, -10.2319,
          -4.2290,  -4.3877,  -2.3372,  -9.9660,  -6.6995],
        [ -3.7758,  -1.4482,  -0.9789,  -0.4161,   0.0000,  -2.0873,  -4.1295,
          -1.4917,  -0.8156,  -1.0051,  -1.8562,  -3.8388],
        [ -8.7188,  -5.8817,  -3.2425,  -4.7737, -12.4802,   0.0000,  -9.6851,
          -5.9796,  -4.9169,  -5.0875, -12.1852,  -2.9672],
        [ -1.4291,  -7.7629,  -3.5336,  -7.5611, -11.6728,  -6.8355,   0.0000,
          -7.8035,  -2.6586,  -6.7828, -10.7477,  -8.8981],
        [-11.4042,  -2.1919

In [216]:
mask

tensor([[1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.]])

In [217]:
# mask-out self-contrast cases, Creates a mask to zero out the diagonal elements (self-contrast cases) in the logits tensor.
logits_mask = torch.scatter(
    torch.ones_like(mask),
    1,
    torch.arange(batch_size * anchor_count).view(-1, 1),#.to(device),
    0)
mask = mask * logits_mask

In [218]:
mask

tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]])

In [219]:
# compute log_prob by exponentiating the logits, applying the mask, and normalizing.
exp_logits = torch.exp(logits) * logits_mask
log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))
mask_pos_pairs = mask.sum(1)
mask_pos_pairs = torch.where(mask_pos_pairs < 1e-6, 1, mask_pos_pairs)

In [220]:
mean_log_prob_pos = (mask * log_prob).sum(1) / mask_pos_pairs

In [221]:
loss = - (temperature / base_temperature) * mean_log_prob_pos
loss = loss.view(anchor_count, batch_size)


In [222]:
loss = loss.mean()

In [223]:
loss

tensor(0.9051)