In [18]:
import torch
import torch.nn as nn
import numpy as np
import random
from torch.nn import functional as F

import torchvision.transforms.functional
import torch.nn as nn
from torchvision import transforms
import torchvision.io
from torch.utils.data import DataLoader, Dataset

# to create the resnet layers
class block(nn.Module):

    # identity_downsample --> conv layer
    def __init__(self, input_channel, output_channel, identity_downsample=None, stride=1):
        super(block,self).__init__()

        # number of output channel is always 4 times the number of input channel in a block
        self.expansion = 4

        """ ---- 1st conv layer (kernel_size = 1) ---- """
        # 1st convolution layer
        self.conv1 = nn.Conv3d(in_channels=input_channel,
                               out_channels=output_channel,
                               kernel_size=1,
                               stride=1,
                               padding=0)

        # 1st batch norm layer
        self.bn1 = nn.BatchNorm3d(output_channel)

        """ ---- 2nd conv layer (kernel_size = 3)---- """
        # 2nd convolution layer
        # the stride is from the init
        # the in_channels will be the output from the previous layer
        self.conv2 = nn.Conv3d(in_channels=output_channel,
                               out_channels=output_channel,
                               kernel_size=3,
                               stride=stride,
                               padding=1)

        # 2nd batch norm layer
        self.bn2 = nn.BatchNorm3d(output_channel)

        """ ---- 3rd layer conv (kernel_size = 1)---- """
        # 3rd convolution layer
        # the output channel will be 4 times the number of the input channel from the previous layer
        # the in_channels will be the output from the previous layer
        self.conv3 = nn.Conv3d(in_channels=output_channel,
                               out_channels=output_channel * self.expansion,
                               kernel_size=1,
                               stride=1,
                               padding=0)

        # 3rd batch norm layer
        self.bn3 = nn.BatchNorm3d(output_channel * self.expansion)

        """ ---- ReLU layer ---- """
        self.relu = nn.ReLU()

        """ ---- identity mapping ----"""
        # conv layer that do the identity mapping
        # to ensure same shape in the later layers
        # If their sizes mismatch, then the input goes into an identity
        # this is for the skipped connection
        self.identity_downsample = identity_downsample

    # forward pass
    def forward(self, x):
        identity = x


        # A basic ResNet block is composed by two layers of 3x3 conv/batchnorm/relu
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)

        # use the identity downsample if there is a need to change the shape
        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        # current output plus the residual skipped connection
        x = x + identity
        x = self.relu(x)
        return x

# [3,4,6,3] --> layers per block
class ResNet(nn.Module):
    # block --> from the block class
    # layers --> number of times to use the block class [3,4,6,3]
    # image_channels --> number of channels of the input (normally is 3, RGB)
    # num_classes --> number of classes in the data (remove it for CVLR)
    def __init__(self, block, layers, image_channels):
        super(ResNet,self).__init__()

        # first layer
        self.input_channel = 64
        # kernel size is 5 as mentioned in the paper
        # original kernel size is 7
        self.conv1 = nn.Conv3d(in_channels=image_channels, out_channels=64, kernel_size=5, stride=2, padding=3)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU()

        # max pooling layer
        self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1)

        # Resnet layers
        # out_channels * 4 at the end
        # all data layers are stride 2 as mentioned in the paper
        #self.layer1 = self._make_layers(block, layers[0], out_channels=64, stride=1)
        self.layer1 = self._make_layers(block, layers[0], out_channels=64, stride=2)
        self.layer2 = self._make_layers(block, layers[1], out_channels=128, stride=2)
        self.layer3 = self._make_layers(block, layers[2], out_channels=256, stride=2)
        self.layer4 = self._make_layers(block, layers[3], out_channels=512, stride=2)

        # the features
        self.avgpool = nn.AdaptiveAvgPool3d((1,1,1))

        # fc layer
        #self.fc = nn.Linear(512 * 4, num_classes)

        # MLP layer
        # output channel * 4 from the layer 4
        self.l1 = nn.Linear(512*4, 512*4)
        self.l2 = nn.Linear(512*4, 128)


    # get the features from the CNN layers
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # get the correct shape to make the output 1x1
        """ Feature extraction """
        # to be use for downstream tasks
        h = self.avgpool(x)

        # reshape to send to the fully connected layer
        #h = x.reshape(h.shape[0], -1)

        """ MLP layer """
        h = h.squeeze()

        # Projection --> (Dense --> Relu --> Dense)
        x = self.l1(h)
        x = self.relu(x)
        x = self.l2(x)

        # original resnet FC
        #x = self.fc(x)

        return h, x


    # create ResNet Layers
    # num_residual_blocks --> number of times the block class will be used
    # out_channels --> number of channels for the output of the layer
    # calls the block multiple times
    def _make_layers(self, block, num_residual_blocks, out_channels, stride):
        identity_downsample = None
        # layers that changes the number of channels for each input channel in subsequent blocks
        layers = []

        # if the stride changes or the input channel into the nxt block changes
        if stride != 1 or self.input_channel != out_channels * 4:
            # only want to change the channel so the kernel size will remain as 1
            identity_downsample = nn.Sequential(nn.Conv3d(in_channels = self.input_channel,
                                                          out_channels=out_channels*4,
                                                          kernel_size = 1,
                                                          stride = stride),
                                                nn.BatchNorm3d(out_channels * 4))

        # first block --> the only changes in stride and channels
        # out_channels will be multiplied by 4 at the end of each block
        # identity mapping is the addition of the skipped connection and the output of the layer
        # need to do identity downsample due to the difference in input channel in the first layer and output layer in the first block to do identity mapping
        # it will downsample the identity via passed convolution layer to successfully perform addition
        layers.append(block(self.input_channel, out_channels, identity_downsample, stride))

        # need to change the number of input channels to match the output channels of the previous block
        self.input_channel = out_channels * 4

        # -1 because one residual block have been calculated in
        # 'layers.append(block(self.input_channel, out_channels, identity_downsample, stride))' that changes the num of channels
        for i in range(num_residual_blocks - 1):
            # out_channels will be 256 after the end of the first block
            # for this first layer, the in_channels will be 256 and the out_channels will be 64
            # therefore, need to map 256 (in_channels) to 64 (out_channels) --> at the end of the block, 64 * 4 = 256 again
            # stride will be one as well
            layers.append(block(self.input_channel, out_channels))

        # unpack the list of layers and pytorch will know that each layer will come after each other
        return (nn.Sequential(*layers))


def ResNet_3D_50(img_channels = 3, num_classes=1000):
    return ResNet(block, layers=[3,4,6,3], image_channels=img_channels)


def ResNet_3D_101(img_channels = 3, num_classes=1000):
    return ResNet(block, layers=[3,4,23,3], image_channels=img_channels)


def ResNet_3D_152(img_channels = 3, num_classes=1000):
    return ResNet(block, layers=[3,8,36,3], image_channels=img_channels)

def test():
    model = ResNet_3D_50()
    x = torch.randn(10, 3, 3, 224, 224)
    # get the representations and the projections
    ris, zis = model(x)
    #print(y.shape)
    print(ris.shape, zis.shape)

#test()

In [19]:
#!pip install av


In [20]:
!pip install pyav



In [21]:
import os

ROOT_FOLDER = "/kaggle/working/"

DATA_FOLDER = "/kaggle/input/cat-vs-dog-video/data"

#DATA_LIST_FOLDER = os.path.join(ROOT_FOLDER, 'ucfTrainTestlist')

#CLASS_LIST_TEXT_FILE = os.path.join(DATA_LIST_FOLDER, 'classInd.txt')

#TRAIN_FOLDER_PATH = os.path.join(DATA_FOLDER, 'train')

#TEST_FOLDER_PATH = os.path.join(DATA_FOLDER, 'test')

#VAL_FOLDER_PATH = os.path.join(DATA_FOLDER, 'val')

NUM_OF_EPOCH = 10

BATCH_SIZE = 4

BATCH_SIZE_TEST = 2

LENGTH_OF_CLIP = 16

RESIZED_FRAME = 224

DATALOADER_NUM_WORKERS = 3

CONTRASTIVE_LOSS_TEMP = 2

SAVED_MODEL_FOLDER = "/kaggle/working/"

SAVED_MODEL_CHECKPOINT_PATH = os.path.join(SAVED_MODEL_FOLDER, 'highest_val_acc_model.pt')

#TENSORBOARD_ROOT_LOGDIR = os.path.join(ROOT_FOLDER,'tensorboard_logs')

In [22]:
import numpy as np

import os
import torchvision.io
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms.functional
import pickle
from torchvision import transforms


import torch
import torch.nn as nn

In [23]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("There are {} GPUs available".format(torch.cuda.device_count()))

There are 1 GPUs available


In [25]:
class_labeling = [['0', 'dog'], ['1', 'cat']]
class_name_labelling = ['dog', 'cat']
class_num_labelling = ['0', '1']


In [26]:

# Inicialización de listas para train y test
train_class, train_videos, train_num_label = [], [], []
test_class, test_videos, test_num_label = [], [], []

# Diccionario para etiquetar clases
class_mapping = {"cat": 0, "dog": 1}

# Función para procesar el conjunto de datos (train/test)
def process_data(split, class_array, videos_array, num_label_array):
    split_dir = os.path.join(DATA_FOLDER, split)  # Ruta a train o test
    for class_name in os.listdir(split_dir):
        class_path = os.path.join(split_dir, class_name)
        if os.path.isdir(class_path):  # Si es una carpeta (clase)
            label = class_mapping[class_name]  # Obtener etiqueta de la clase
            for video_file in os.listdir(class_path):
                if video_file.endswith(".mp4"):  # Filtrar videos
                    video_path = os.path.join(class_path, video_file)
                    class_array.append(class_name)  # Agregar clase
                    videos_array.append(video_path)  # Agregar ruta del video
                    num_label_array.append(label)  # Agregar etiqueta numérica

# Procesar train y test
process_data("train", train_class, train_videos, train_num_label)
process_data("test", test_class, test_videos, test_num_label)

# Verificar resultados
print("Train data:")
print(f"Classes: {train_class[:5]}")  # Muestra de las clases
print(f"Videos: {train_videos[:5]}")  # Muestra de las rutas de los videos
print(f"Labels: {train_num_label[:5]}")  # Muestra de las etiquetas

print("\nTest data:")
print(f"Classes: {test_class[:5]}")
print(f"Videos: {test_videos[:5]}")
print(f"Labels: {test_num_label[:5]}")


Train data:
Classes: ['dog', 'dog', 'dog', 'dog', 'dog']
Videos: ['/kaggle/input/cat-vs-dog-video/data/train/dog/dog33.mp4', '/kaggle/input/cat-vs-dog-video/data/train/dog/dog15.mp4', '/kaggle/input/cat-vs-dog-video/data/train/dog/dog12.mp4', '/kaggle/input/cat-vs-dog-video/data/train/dog/dog19.mp4', '/kaggle/input/cat-vs-dog-video/data/train/dog/dog37.mp4']
Labels: [1, 1, 1, 1, 1]

Test data:
Classes: ['dog', 'dog', 'dog', 'dog', 'dog']
Videos: ['/kaggle/input/cat-vs-dog-video/data/test/dog/dog7.mp4', '/kaggle/input/cat-vs-dog-video/data/test/dog/dog1.mp4', '/kaggle/input/cat-vs-dog-video/data/test/dog/dog5.mp4', '/kaggle/input/cat-vs-dog-video/data/test/dog/dog3.mp4', '/kaggle/input/cat-vs-dog-video/data/test/dog/dog2.mp4']
Labels: [1, 1, 1, 1, 1]


In [27]:
import numpy as np

# Listas para el conjunto de validación
val_class = []
val_videos = []
val_num_label = []

# Copias de las listas originales
train_class_copy = train_class.copy()
train_num_label_copy = train_num_label.copy()
train_videos_copy = train_videos.copy()


def get_validation_set(set_num_per_class, original_class, original_num_label, original_videos,
                       val_class_array, val_num_label_array, val_videos_array):
    """
    Genera un conjunto de validación seleccionando una cantidad específica de elementos de cada clase.
    """
    # Mapeo para contar elementos seleccionados por clase
    class_count = {cls: 0 for cls in set(original_class)}

    # Índices de elementos seleccionados para validación
    val_indices = []

    # Recorrer los datos de entrenamiento
    for idx, cls in enumerate(original_class):
        # Verificar si se puede agregar más elementos de esta clase al conjunto de validación
        if class_count[cls] < set_num_per_class:
            # Agregar datos al conjunto de validación
            val_class_array.append(cls)
            val_num_label_array.append(original_num_label[idx])
            val_videos_array.append(original_videos[idx])
            
            # Marcar índice como seleccionado
            val_indices.append(idx)
            
            # Incrementar el contador de la clase
            class_count[cls] += 1

    # Filtrar listas originales para eliminar los elementos seleccionados
    remaining_indices = set(range(len(original_class))) - set(val_indices)
    remaining_indices = sorted(remaining_indices)  # Asegurar orden

    original_class[:] = [original_class[i] for i in remaining_indices]
    original_num_label[:] = [original_num_label[i] for i in remaining_indices]
    original_videos[:] = [original_videos[i] for i in remaining_indices]


# Generar el conjunto de validación seleccionando 10 elementos por clase
get_validation_set(
    set_num_per_class=4,
    original_class=train_class,
    original_num_label=train_num_label,
    original_videos=train_videos,
    val_class_array=val_class,
    val_num_label_array=val_num_label,
    val_videos_array=val_videos
)

# Verificar resultados
print("Clases en conjunto de validación:", val_class[:10])  # Muestra de clases
print("Videos en conjunto de validación:", val_videos[:10])  # Muestra de videos
print("Etiquetas numéricas en validación:", val_num_label[:10])  # Muestra de etiquetas
print("Total de elementos en validación:", len(val_class))

print("Clases restantes en conjunto de entrenamiento:", len(train_class))
print("Videos restantes en conjunto de entrenamiento:", len(train_videos))


Clases en conjunto de validación: ['dog', 'dog', 'dog', 'dog', 'cat', 'cat', 'cat', 'cat']
Videos en conjunto de validación: ['/kaggle/input/cat-vs-dog-video/data/train/dog/dog33.mp4', '/kaggle/input/cat-vs-dog-video/data/train/dog/dog15.mp4', '/kaggle/input/cat-vs-dog-video/data/train/dog/dog12.mp4', '/kaggle/input/cat-vs-dog-video/data/train/dog/dog19.mp4', '/kaggle/input/cat-vs-dog-video/data/train/cat/cat26.mp4', '/kaggle/input/cat-vs-dog-video/data/train/cat/cat25.mp4', '/kaggle/input/cat-vs-dog-video/data/train/cat/cat30.mp4', '/kaggle/input/cat-vs-dog-video/data/train/cat/cat9.mp4']
Etiquetas numéricas en validación: [1, 1, 1, 1, 0, 0, 0, 0]
Total de elementos en validación: 8
Clases restantes en conjunto de entrenamiento: 73
Videos restantes en conjunto de entrenamiento: 73


In [28]:
class VideoDataset(Dataset):

    def __init__(self, class_labels, vid, transform = None):
        super().__init__()
        self.class_labels = class_labels
        self.vid = vid
        self.transform = transform

    def __getitem__(self, index: int):

        # get one video and its label
        vid_path, class_num_label = self.vid[index], self.class_labels[index]

        # can also use torch vision
        video, audio, info = torchvision.io.read_video(filename=vid_path)
        #print(video.size())

         # Shuffle the frames randomly
        shuffled_indices = torch.randperm(video.size()[0])
        video = video[shuffled_indices]
        
        total_vid_frames = video.size()[0]
        #print(total_vid_frames)

        # random selection of 5 - 10 frames ahead
        t = random.randint(5, 10)

        # number of frames to be saved into the frame folder for each clip
        # 16 frames to be put into the model
        length_of_separated_clip_in_frames = LENGTH_OF_CLIP

        # formula to get the sample distribution of P, which is the end point for clip 1 and, P + t for starting point of clip 2
        # P = L-(2*W+t)
        # allow the a few frames of overlap if clip 1 does not have enough frames for 16 frames for value p
        p = int((total_vid_frames - (2 * length_of_separated_clip_in_frames + t))/2)
        #print(p)

        # p - 16 to get 16 frames as stated in the paper
        # extend the clip 1 array
        start_frame_clip_1_idx = 0
        end_frame_clip_1_idx = 0

        # if p is a value that will result in a negative frame, start from frame 0
        if p - length_of_separated_clip_in_frames <= -1:
            start_frame_clip_1_idx = start_frame_clip_1_idx + 0
            end_frame_clip_1_idx = end_frame_clip_1_idx + LENGTH_OF_CLIP

        else:
            # p - 16 to get 16 frames as stated in the paper
            start_frame_clip_1_idx = start_frame_clip_1_idx + p - length_of_separated_clip_in_frames

            end_frame_clip_1_idx = end_frame_clip_1_idx + p



        #print('start_frame_clip_1', start_frame_clip_1_idx)
        #print('end_frame_clip_1', end_frame_clip_1_idx)

        tensor_clip_1 = video[start_frame_clip_1_idx: end_frame_clip_1_idx]

        tensor_clip_1 = torch.reshape(tensor_clip_1,
                                      [tensor_clip_1.size()[0],
                                       tensor_clip_1.size()[3],
                                       tensor_clip_1.size()[1],
                                       tensor_clip_1.size()[2]])
        #print(len(clip_1))
        #print('clip_1 size: ',clip_1.size())

        # P + t for starting point of clip 2 as said in the paper
        # int(p) + t + length_of_separated_clip_in_frames to get 16 frames for clip 2
        # extend the clip 2 array
        start_frame_clip_2_idx = p + t
        end_frame_clip_2_idx = p + t + length_of_separated_clip_in_frames

        tensor_clip_2 = video[start_frame_clip_2_idx: end_frame_clip_2_idx]
        tensor_clip_2 = torch.reshape(tensor_clip_2,
                                      [tensor_clip_2.size()[0],
                                       tensor_clip_2.size()[3],
                                       tensor_clip_2.size()[1],
                                       tensor_clip_2.size()[2]])
        #print(len(clip_2))
        #print(clip_1.size())

        #if len(clip_1) == len(clip_2):
        #sample = torch.stack([clip_1, clip_2], dim=0)



        if self.transform is not None:

            # do transformation as PIL images on clip 1 using the TrainTransform class
            # returns a list of transformed PIL images
            transformed_clip_1 = self.transform(tensor_clip_1)

            # do transformation as PIL images on clip 2 the TrainTransform class
            # returns a list of transformed PIL images
            transformed_clip_2 = self.transform(tensor_clip_2)


            # convert the clip_1 list to tensor
            # convert the PIL images to tensor then stack
            tensor_clip_1 = torch.stack([transforms.functional.to_tensor(pic) for pic in transformed_clip_1])

            # convert the clip_2 list to tensor
            # convert the PIL images to tensor then stack
            tensor_clip_2 = torch.stack([transforms.functional.to_tensor(pic) for pic in transformed_clip_2])

            tensor_clip_1 = torch.reshape(tensor_clip_1,
                                          [tensor_clip_1.size()[0],
                                           tensor_clip_1.size()[3],
                                           tensor_clip_1.size()[1],
                                           tensor_clip_1.size()[2]])

            tensor_clip_2 = torch.reshape(tensor_clip_2,
                                          [tensor_clip_2.size()[0],
                                           tensor_clip_2.size()[3],
                                           tensor_clip_2.size()[1],
                                           tensor_clip_2.size()[2]])



            # stack by columns and return a tensor
            #sample = torch.stack([tensor_clip_1, tensor_clip_2], dim=0)


        # returns a tuple of clip_1, clip_2 and the its label
        return tensor_clip_1, tensor_clip_2, class_num_label

    # get the length of total dataset
    def __len__(self):
        return len(self.vid)


In [29]:
""" --------- Create TrainTransform class ----------- """
class CVLRTrainTransform(object):

    def __init__(self):

        data_transforms = [
            transforms.RandomResizedCrop(size=RESIZED_FRAME, scale=(0.3, 1), ratio=(0.5, 2)),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.5),
            transforms.RandomApply(torch.nn.ModuleList([transforms.ColorJitter(brightness=0.8 * 0.3,
                                                                               contrast=0.8 * 0.3,
                                                                               saturation=0.8 * 0.3,
                                                                               hue=0.8 * 0.2)]), p=0.8),
            transforms.RandomGrayscale(p=0.2),
            transforms.GaussianBlur(kernel_size=1, sigma=(0.1, 2.0)),
            #transforms.ToTensor()

        ]
        self.train_transform = transforms.Compose(data_transforms)

    # sample refers to one clip
    def __call__(self, sample):

        transform = self.train_transform

        transformed_clip = []

        for frame in sample:
            # takes in the frames as numpy array and convert to PIL image to do the transformation
            #im_pil = Image.fromarray(frame)
            im_pil = transforms.ToPILImage()(frame).convert("RGB")
            # do the transformation which will then convert to tensors
            transf_img = transform(im_pil)

            # append it to the list, which will be called by the dataset class in the '__getitem__' function
            transformed_clip.append(transf_img)

        return transformed_clip


In [30]:
""" --------- Create TestTransform class ----------- """
class CVLRTestTransform(object):

    def __init__(self):

        data_transforms = [
            transforms.RandomResizedCrop(size=RESIZED_FRAME, scale=(0.3, 1), ratio=(0.5, 2)),
            #transforms.ToTensor()

        ]
        self.train_transform = transforms.Compose(data_transforms)

    # sample refers to one clip
    def __call__(self, sample):

        # call the train_transform
        transform = self.train_transform

        # get the list of transformed frames
        transformed_clip = []

        for frame in sample:
            # takes in the frames as numpy array and convert to PIL image to do the transformation
            #im_pil = Image.fromarray(frame)
            im_pil = transforms.ToPILImage()(frame).convert("RGB")
            # do the transformation which will then convert to tensors
            transf_img = transform(im_pil)

            # append it to the list, which will be called by the dataset class in the '__getitem__' function
            transformed_clip.append(transf_img)

        return transformed_clip

In [31]:
""" ----- Train Dataloader ----- """
train_transformed_dataset = VideoDataset(class_labels=train_num_label, vid=train_videos, transform=CVLRTrainTransform())

train_dataloader = DataLoader(train_transformed_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              # uncomment when using server
                              # num_workers=2
                              )

""" ----- Test Dataloader ----- """
test_transformed_dataset = VideoDataset(class_labels=test_num_label, vid=test_videos, transform=CVLRTestTransform())

test_dataloader = DataLoader(test_transformed_dataset,
                              batch_size=BATCH_SIZE_TEST,
                              shuffle=True,
                              # uncomment when using server
                              # num_workers=2
                              )


""" ----- val Dataloader ----- """
val_transformed_dataset = VideoDataset(class_labels=val_num_label, vid=val_videos, transform=CVLRTestTransform())

val_dataloader = DataLoader(val_transformed_dataset,
                              batch_size=BATCH_SIZE_TEST,
                              shuffle=True,
                              # uncomment when using server
                              # num_workers=2
                              )

In [32]:

""" --------- Loss function ----------- """
# normalized temperature-scaled cross entropy loss
# output 1 and output 2 is the 2 different versions of the same input image
def nt_xent_loss(output1, output2, temperature):
    # concatenate v1 img and v2 img via the rows, stacking vertically
    out = torch.cat([output1, output2], dim=0)
    n_samples = len(out)

    # Full similarity matrix
    # torch.mm --> matrix multiplication for tensors
    # when a transposed is done on a tensor, PyTorch doesn't generate new tensor with new layout,
    # it just modifies meta information in Tensor object so the offset and stride are for the new shape --> its memory
    # layout is different than a tensor of same shape made from scratch
    # contiguous --> makes a copy of tensor so the order of elements would be same as if tensor of same shape created from scratch
    # --> https://discuss.pytorch.org/t/contigious-vs-non-contigious-tensor/30107
    # the diagonal of the matrix is the square of each vector element in the out vector, which shows the similarity between the same elements
    cov = torch.mm(out, out.t().contiguous())
    sim = torch.exp(cov/temperature)

    # Negative similarity
    # creates a 2-D tensor with True on the diagonal for the size of n_samples and False elsewhere
    mask = ~torch.eye(n_samples, device=sim.device).bool()
    # Returns a new 1-D tensor which indexes the input tensor (sim) according to the boolean mask (mask) which is a BoolTensor.
    # returns a tensor with 1 row and n columns and sum it with the last dimension
    neg = sim.masked_select(mask).view(n_samples,-1).sum(dim=-1)

    # Positive similarity
    # exp --> exponential of the sum of the last dimension after output1 * output2 divided by the temp
    pos = torch.exp(torch.sum(output1 * output2, dim=-1)/temperature)
    # concatenate via the rows, stacking vertically
    pos = torch.cat([pos,pos], dim=0)

    # 2 copies of the numerator as the loss is symmetric but the denominator is 2 different values --> 1 for x, 1 for y
    # the loss will be a scalar value
    loss = -torch.log(pos/neg).mean()
    return loss

# make a directory to save the model
saved_model_folder = SAVED_MODEL_FOLDER
if not os.path.exists(saved_model_folder):
    os.mkdir(saved_model_folder)

# create a function for the 3D ResNet 50 architecture
def ResNet_3D_50(img_channels = 3):
    return ResNet(block, layers=[3,4,6,3], image_channels=img_channels)


In [33]:
class CVLR(object):

    def __init__(self):

        #self.writer = SummaryWriter()
        self.device = self._get_device()
        # predefined above
        self.nt_xent_loss = nt_xent_loss
        self.encoder = ResNet_3D_50()

    # use GPU if available
    def _get_device(self):
        device = torch.device("cuda:{}".format(0) if torch.cuda.is_available() else "cpu")
        return device

    def _step(self, model, xis, xjs, n_iter):
        # get the representations and the projections
        ris, zis = model(xis)  # [N,C]

        # get the representations and the projections
        rjs, zjs = model(xjs)  # [N,C]

        # Verificar dimensiones
        print(f"Shape of zis before normalization: {zis.shape}")
        print(f"Shape of zjs before normalization: {zjs.shape}")
    
        # Asegurarse de que zis y zjs tengan al menos dos dimensiones
        if zis.dim() < 2:
            zis = zis.unsqueeze(0)
        if zjs.dim() < 2:
            zjs = zjs.unsqueeze(0)


        # normalize projection feature vectors
        zis = F.normalize(zis, dim=1)
        zjs = F.normalize(zjs, dim=1)

        loss = nt_xent_loss(zis, zjs, temperature=0.5)

        return loss

    def train(self):

        # get the mean batch loss
        def get_mean_of_list(L):

            return sum(L) / len(L)

        model = ResNet_3D_50().to(self.device)
        model = self._load_pre_trained_weights(model)

        optimizer = torch.optim.SGD(model.parameters(), lr=1.0, weight_decay=1e-6)

        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_videos), eta_min=0,
                                                               last_epoch=-1)

        n_iter = 0
        valid_n_iter = 0
        best_valid_loss = np.inf
        best_mean_batch_loss = np.inf

        """
        batch, clip 1 and clip 2, class label
        sample_batched

        clip_1 with batch size --> torch.Size([4, 16, 3, 224, 224])
        sample_batched[0]

        clip_2 with batch size --> torch.Size([4, 16, 3, 224, 224])
        sample_batched[1]

        class label for clip 1 and clip 2 --> ('4', '1', '89', '44')
        sample_batched[2]

        clip_1 -> torch.Size([16, 3, 224, 224])
        16 -> frames
        3 -> colour channels
        224 -> height of frames
        224 -> width of frames
        sample_batched[0][0]

        clip_2 -> torch.Size([16, 3, 224, 224])
        16 -> frames
        3 -> colour channels
        224 -> height of frames
        224 -> width of frames
        sample_batched[1][0]
        """
        loss_lst = []
        val_loss_lst = []
        for epoch_counter in range(NUM_OF_EPOCH):

            # a list to store losses for each epoch
            epoch_losses_train = []

            for i_batch, sample_batched in enumerate(train_dataloader):
                #print(i_batch)
                optimizer.zero_grad()
                # print(sample_batched[1][0].size())

                xis = sample_batched[0].to(device)
                # the number of channels must be in the 2nd position else there will be an error
                print(xis.size())
                # xis.size()[0] -> 64 (batch size)
                # xis.size()[3] -> 3 (colour channels)
                # xis.size()[1] -> 16 (number of frames)
                # xis.size()[2] -> 224 (height of frame)
                # xis.size()[4] -> 224 (width of frame)
                xis = torch.reshape(xis, [xis.size()[0], xis.size()[3], xis.size()[1], xis.size()[2], xis.size()[4]])
                print(xis.size())

                xjs = sample_batched[1].to(device)
                xjs = torch.reshape(xjs, [xjs.size()[0], xjs.size()[3], xjs.size()[1], xjs.size()[2], xjs.size()[4]])

                loss = self._step(model, xis, xjs, n_iter)

                # put that loss value in the epoch losses list
                epoch_losses_train.append(loss.to(self.device).data.item())

                # back propagation
                loss.backward()

                optimizer.step()
                n_iter += 1
            # print("Epoch:{}".format(epoch_counter))
            print("Entrando a validation")
            print("-" * 5)
            valid_loss = self._validate(model, val_dataloader)

            #mean of epoch losses, essentially this will reflect mean batch loss for each epoch
            mean_batch_loss_training = get_mean_of_list(epoch_losses_train)
            loss_lst.append(mean_batch_loss_training)
            val_loss_lst.append(valid_loss)
            print("Epoch:{} ------ Mean Batch Loss ({}) ------ Validation_loss: ({})".format(epoch_counter, mean_batch_loss_training, valid_loss))
            model_path = os.path.join(saved_model_folder, 'epoch_{}_model.pt'.format(epoch_counter))
            torch.save(model.state_dict(), model_path)

            """
            if mean_batch_loss_training < best_mean_batch_loss:
                # save the model weights
                best_mean_batch_loss = mean_batch_loss_training
                torch.save(model.state_dict(), config.SAVED_MODEL_PATH_2)
                file = os.path.join(config.MODEL_CHECKPOINT_FOLDER, 'mean_batch_loss.txt')
                with open(file, 'w') as filetowrite:
                    filetowrite.write(
                        "Epoch:{} ------ Mean Batch Loss ({}) ------ Validation_loss: ({})".format(epoch_counter,
                                                                                                 best_mean_batch_loss,
                                                                                                   valid_loss))
            """

            """
            if valid_loss < best_valid_loss:
                # save the model weights
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), config.SAVED_MODEL_PATH)
                file = os.path.join(config.MODEL_CHECKPOINT_FOLDER, 'validation_loss.txt')
                with open(file, 'w') as filetowrite:
                    filetowrite.write(
                        "Epoch:{} ------ Mean Batch Loss ({}) ------ Validation_loss: ({})".format(epoch_counter,
                                                                                                   mean_batch_loss_training,
                                                                                                   best_valid_loss))
            """


            valid_n_iter += 1

            # warmup for the first 10 epochs
            if epoch_counter >= 10:
                scheduler.step()
        
        return loss_lst, val_loss_lst

    # validation step
    def _validate(self, model, val_dataloader):
        # validation steps
        with torch.no_grad():
            model.eval()

            valid_loss = 0.0
            counter = 0

            for i_batch, sample_batched in enumerate(val_dataloader):
                xis = sample_batched[0].to(device)
                #print(xis.size())
                # xis.size()[0] -> 64 (batch size)
                # xis.size()[3] -> 3 (colour channels)
                # xis.size()[1] -> 16 (number of frames)
                # xis.size()[2] -> 224 (height of frame)
                # xis.size()[4] -> 224 (width of frame)
                xis = torch.reshape(xis, [xis.size()[0], xis.size()[3], xis.size()[1], xis.size()[2], xis.size()[4]])

                xjs = sample_batched[1].to(device)
                xjs = torch.reshape(xjs, [xjs.size()[0], xjs.size()[3], xjs.size()[1], xjs.size()[2], xjs.size()[4]])

                loss = self._step(model, xis, xjs, counter)
                valid_loss += loss.item()
                counter += 1
            valid_loss = valid_loss / counter
        model.train()
        return valid_loss


    def _load_pre_trained_weights(self, model):
        try:
            state_dict = torch.load(SAVED_MODEL_CHECKPOINT_PATH)
            model.load_state_dict(state_dict)
            print("Loaded pre-trained model with success.")
        except FileNotFoundError:
            print("Pre-trained weights not found. Training from scratch.")

        return model

In [None]:
CVLR = CVLR()
CVLR.train()

Pre-trained weights not found. Training from scratch.
torch.Size([4, 16, 224, 3, 224])
torch.Size([4, 3, 16, 224, 224])
Shape of zis before normalization: torch.Size([4, 128])
Shape of zjs before normalization: torch.Size([4, 128])
torch.Size([4, 16, 224, 3, 224])
torch.Size([4, 3, 16, 224, 224])
Shape of zis before normalization: torch.Size([4, 128])
Shape of zjs before normalization: torch.Size([4, 128])
torch.Size([4, 16, 224, 3, 224])
torch.Size([4, 3, 16, 224, 224])
Shape of zis before normalization: torch.Size([4, 128])
Shape of zjs before normalization: torch.Size([4, 128])
torch.Size([4, 16, 224, 3, 224])
torch.Size([4, 3, 16, 224, 224])
Shape of zis before normalization: torch.Size([4, 128])
Shape of zjs before normalization: torch.Size([4, 128])
torch.Size([4, 16, 224, 3, 224])
torch.Size([4, 3, 16, 224, 224])
Shape of zis before normalization: torch.Size([4, 128])
Shape of zjs before normalization: torch.Size([4, 128])
torch.Size([4, 16, 224, 3, 224])
torch.Size([4, 3, 16, 2

In [None]:

# Inicialización de listas para train y test
train_class, train_videos, train_num_label = [], [], []
test_class, test_videos, test_num_label = [], [], []

# Diccionario para etiquetar clases
class_mapping = {"cat": 0, "dog": 1}

# Función para procesar el conjunto de datos (train/test)
def process_data(split, class_array, videos_array, num_label_array):
    split_dir = os.path.join(DATA_FOLDER, split)  # Ruta a train o test
    for class_name in os.listdir(split_dir):
        class_path = os.path.join(split_dir, class_name)
        if os.path.isdir(class_path):  # Si es una carpeta (clase)
            label = class_mapping[class_name]  # Obtener etiqueta de la clase
            for video_file in os.listdir(class_path):
                if video_file.endswith(".mp4"):  # Filtrar videos
                    video_path = os.path.join(class_path, video_file)
                    class_array.append(class_name)  # Agregar clase
                    videos_array.append(video_path)  # Agregar ruta del video
                    num_label_array.append(label)  # Agregar etiqueta numérica

# Procesar train y test
process_data("train", train_class, train_videos, train_num_label)
process_data("test", test_class, test_videos, test_num_label)

# Verificar resultados
print("Train data:")
print(f"Classes: {train_class[:5]}")  # Muestra de las clases
print(f"Videos: {train_videos[:5]}")  # Muestra de las rutas de los videos
print(f"Labels: {train_num_label[:5]}")  # Muestra de las etiquetas

print("\nTest data:")
print(f"Classes: {test_class[:5]}")
print(f"Videos: {test_videos[:5]}")
print(f"Labels: {test_num_label[:5]}")


In [None]:
""" Load the entire video instead of the 16 frames"""

""" --------- Create Dataset class ----------- """

class VideoDataset(Dataset):

    def __init__(self, class_labels, vid, transform = None):
        super().__init__()
        self.class_labels = class_labels
        self.vid = vid
        self.transform = transform

    def __getitem__(self, index: int):

        # get one video and its label
        vid_path, class_num_label = self.vid[index], self.class_labels[index]

        # can also use torch vision
        video, audio, info = torchvision.io.read_video(filename=vid_path)
        #print(video.size())

        total_vid_frames = video.size()[0]
        #print(total_vid_frames)




        tensor_clip = torch.reshape(video,
                                      [video.size()[0],
                                       video.size()[3],
                                       video.size()[1],
                                       video.size()[2]])




        if self.transform is not None:

            # do transformation as PIL images on the entire clip using the TrainTransform class
            # returns a list of transformed PIL images
            transformed_clip = self.transform(tensor_clip)



            # convert the entire clip list to tensor
            # convert the PIL images to tensor then stack
            tensor_clip = torch.stack([transforms.functional.to_tensor(pic) for pic in transformed_clip])


            tensor_clip = torch.reshape(tensor_clip,
                                          [tensor_clip.size()[0],
                                           tensor_clip.size()[3],
                                           tensor_clip.size()[1],
                                           tensor_clip.size()[2]])





        # returns a tuple of clip_1, clip_2 and the its label
        return tensor_clip, class_num_label

    # get the length of total dataset
    def __len__(self):
        return len(self.vid)

"""
# Test the dataset class

dataset = VideoDataset(class_labels=train_num_label, vid=train_videos)
print(dataset.__len__())
first_data = dataset[3]
#print(first_data)
"""

""" --------- Create Transform class ----------- """
# only do resizing because there is no transformation needed for downstream tasks
class CVLRTransform(object):

    def __init__(self):

        data_transforms = [
            transforms.RandomResizedCrop(size=224, scale=(0.3, 1), ratio=(0.5, 2)),
            #transforms.ToTensor()

        ]
        self.train_transform = transforms.Compose(data_transforms)

    # sample refers to one clip
    def __call__(self, sample):

        # call the train_transform
        transform = self.train_transform

        # get the list of transformed frames
        transformed_clip = []

        for frame in sample:
            # takes in the frames as numpy array and convert to PIL image to do the transformation
            #im_pil = Image.fromarray(frame)
            im_pil = transforms.ToPILImage()(frame).convert("RGB")
            # do the transformation which will then convert to tensors
            transf_img = transform(im_pil)

            # append it to the list, which will be called by the dataset class in the '__getitem__' function
            transformed_clip.append(transf_img)

        return transformed_clip



""" ----- Train Dataloader ----- """
train_transformed_dataset = VideoDataset(class_labels=train_num_label, vid=train_videos, transform=CVLRTransform())

# batch size 1 to get the features of one video in the training set
train_dataloader = DataLoader(train_transformed_dataset,
                              batch_size=1,
                              shuffle=True,
                              # uncomment when using server
                              #num_workers=3
                              )


""" ----- Test Dataloader ----- """
test_transformed_dataset = VideoDataset(class_labels=test_num_label, vid=test_videos, transform=CVLRTransform())

# batch size 1 to get the features of one video in the test set
test_dataloader = DataLoader(test_transformed_dataset,
                              batch_size=1,
                              shuffle=True,
                              # uncomment when using server
                              #num_workers=3
                              )

In [None]:
class ResNet_3D_50(ResNet):
    def __init__(self):
        super(ResNet_3D_50, self).__init__(block, layers=[3,4,6,3], image_channels=3)
        #self.fc = nn.Linear(512 * 4, num_classes)

    # get the features from the CNN layers
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)


        h = self.avgpool(x)
        h = h.squeeze()
        return h

"""
# Check the output size of the model

def test():
    model = ResNet_3D_50()
    x = torch.randn(10, 3, 3, 224, 224)
    # get the representations and the projections
    output = model(x)
    print(output.size())

#test()
"""

In [None]:
#model = ResNet_3D_50()
model = ResNet_3D_50()
#model = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
model.eval()
state_dict = torch.load("/kaggle/working/epoch_9_model.pt")

In [None]:
model.load_state_dict(state_dict)
model = model.to(device)

In [None]:
# freeze all the layers in the resnet model
for param in model.parameters():
    param.requires_grad = False
    #print(param)

In [None]:
train_features_path = os.path.join(ROOT_FOLDER, 'train_features')

if not os.path.exists(train_features_path):
    os.mkdir(train_features_path)

In [None]:
x_train_pkl_path = os.path.join(train_features_path, 'x_train.pkl')
y_train_pkl_path = os.path.join(train_features_path, 'y_train.pkl')

In [None]:
# run only if the pickle file does not exist
if os.path.isfile(x_train_pkl_path) == False or os.path.isfile(y_train_pkl_path) == False:

    x_train = []
    y_train = []

    for i_batch, sample_batched in enumerate(train_dataloader):
        img = sample_batched[0].to(device)
        # reshape to fit into the model
        img = torch.reshape(img, [img.size()[0], img.size()[3], img.size()[1], img.size()[2], img.size()[4]]).to(device)
        outputs = model(img).to(device)
        x_train.append(outputs)
        y_train.extend(sample_batched[1])

    # save x train features in a pickle file
    with open(x_train_pkl_path, 'wb') as f:
        pickle.dump(x_train, f)
    # save y train labels in a pickle file
    with open(y_train_pkl_path, 'wb') as f:
        pickle.dump(y_train, f)

else:
    print("'x_train.pkl' and 'y_train.pkl' exists.")


In [None]:
test_features_path = os.path.join(ROOT_FOLDER, 'test_features')

if not os.path.exists(test_features_path):
    os.mkdir(test_features_path)

x_test_pkl_path = os.path.join(test_features_path, 'x_test.pkl')
y_test_pkl_path = os.path.join(test_features_path, 'y_test.pkl')

# run only if the pickle file does not exist
if os.path.isfile(x_test_pkl_path) == False or os.path.isfile(y_test_pkl_path) == False:

    x_test = []
    y_test = []

    for i_batch, sample_batched in enumerate(test_dataloader):
        img = sample_batched[0].to(device)
        # reshape to fit into the model
        img = torch.reshape(img, [img.size()[0], img.size()[3], img.size()[1], img.size()[2], img.size()[4]]).to(device)
        outputs = model(img).to(device)
        x_test.append(outputs)
        y_test.extend(sample_batched[1])

    # save x test features in a pickle file
    with open(x_test_pkl_path, 'wb') as f:
        pickle.dump(x_test, f)
    # save y test labels in a pickle file
    with open(y_test_pkl_path, 'wb') as f:
        pickle.dump(y_test, f)

else:
    print("'x_test.pkl' and 'y_test.pkl' exists.")

In [None]:
with open(os.path.join(train_features_path, 'x_train.pkl'), 'rb') as f:
    x_train = pickle.load(f)

with open(os.path.join(train_features_path, 'y_train.pkl'), 'rb') as f:
    y_train = pickle.load(f)

with open(os.path.join(test_features_path, 'x_test.pkl'), 'rb') as f:
    x_test = pickle.load(f)

with open(os.path.join(test_features_path, 'y_test.pkl'), 'rb') as f:
    y_test = pickle.load(f)

In [None]:
# need to convert tensors to numpy to load into regression model
x_train_numpy_pkl_path = os.path.join(train_features_path, 'x_train_numpy.pkl')

if os.path.isfile(x_train_numpy_pkl_path) == False:
    x_train_numpy = []

    for i in x_train:
        i = i.detach().cpu().numpy()
        x_train_numpy.append(i)


    with open(x_train_numpy_pkl_path, 'wb') as f:
        pickle.dump(x_train_numpy, f)

else:
    print("'x_train_numpy.pkl' exists.")


# need to convert tensors to numpy to load into regression model
x_test_numpy_pkl_path = os.path.join(test_features_path, 'x_test_numpy.pkl')

if os.path.isfile(x_test_numpy_pkl_path) == False:

    x_test_numpy = []

    for i in x_test:
        i = i.detach().cpu().numpy()
        x_test_numpy.append(i)

    with open(x_test_numpy_pkl_path, 'wb') as f:
        pickle.dump(x_test_numpy, f)

else:
    print("'x_test_numpy.pkl' exists.")


with open(os.path.join(train_features_path, 'x_train_numpy.pkl'), 'rb') as f:
    x_train_numpy = pickle.load(f)

with open(os.path.join(test_features_path, 'x_test_numpy.pkl'), 'rb') as f:
    x_test_numpy = pickle.load(f)


logistic_regression = LogisticRegression(random_state=0, max_iter=5000, solver='lbfgs', C=1.0)
logistic_regression.fit(x_train_numpy, y_train)

y_predict = logistic_regression.predict(x_test_numpy)

acc = accuracy_score(y_test, y_predict)
print("Accuracy is {}%".format(acc * 100))

In [None]:
print("Hello")