# Import Library

In [None]:
import sys
print(sys.version, sys.platform, sys.executable)

import math
import time

import torch.nn as nn
from torch.nn.modules.utils import _triple

from torchsummary import summary
from torch import nn, optim

import cv2
import numpy as np
import torch
import torch.nn.functional as F

from torchviz import make_dot
import matplotlib.pyplot as plt
from matplotlib import *
import pylab

import os
from os import listdir
from os.path import isfile, join, isdir

from pathlib import Path

import cv2
import numpy as np
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm

torch.cuda.empty_cache()

In [None]:
# Report the installed PyTorch version.
print(torch.__version__)

# Implement Network

In [None]:
import math

import torch.nn as nn
from torch.nn.modules.utils import _triple


class SpatioTemporalConv(nn.Module):
    r"""Thin wrapper around a single ``nn.Conv3d``.

    Each int-or-tuple argument is expanded to a three-element form and handed
    unchanged to one full 3D convolution over the time axis and both spatial
    axes. No (2+1)D factorisation, batch normalisation or activation is applied
    by this module.

    Args:
        in_channels (int): Number of channels in the input tensor
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of each convolved axis. Default: 0
        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
        super().__init__()

        def _first_three(value):
            # Broadcast a scalar to three entries (1 -> (1, 1, 1)) and keep
            # exactly the first three components of an iterable.
            expanded = _triple(value)
            return expanded[0], expanded[1], expanded[2]

        kernel_3d = _first_three(kernel_size)
        stride_3d = _first_three(stride)
        padding_3d = _first_three(padding)

        # The attribute name ``spatial_conv`` is part of the state_dict keys
        # written to checkpoints, so it must not be renamed.
        self.spatial_conv = nn.Conv3d(
            in_channels,
            out_channels,
            kernel_size=kernel_3d,
            stride=stride_3d,
            padding=padding_3d,
            bias=bias,
        )

    def forward(self, x):
        return self.spatial_conv(x)




In [None]:
class SpatioTemporalResBlock(nn.Module):
    r"""Single residual block built from two SpatioTemporalConv layers.

    Layout: conv -> batchnorm -> ReLU -> conv -> batchnorm -> sum with the
    shortcut -> ReLU. No pooling is used; when ``downsample`` is set, the first
    conv runs with stride 2 and the shortcut goes through a 1x1x1 conv with
    stride 2 followed by batchnorm so both branches have matching shapes.

    When ``downsample`` is ``False`` the input is added to the residual
    unchanged, so ``in_channels`` must equal ``out_channels`` in that case.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels in the output produced by the block.
        kernel_size (int or tuple): Size of the convolving kernels.
        downsample (bool, optional): If ``True``, the output size is to be smaller than the input. Default: ``False``
    """
    def __init__(self, in_channels, out_channels, kernel_size, downsample=False):
        super(SpatioTemporalResBlock, self).__init__()

        self.downsample = downsample

        # "SAME" padding, computed per axis so that a tuple kernel_size works
        # as documented (a bare ``kernel_size // 2`` raises TypeError on
        # tuples). For an int this yields the same padding as before.
        padding = tuple(k // 2 for k in _triple(kernel_size))

        if self.downsample:
            # shortcut branch: 1x1x1 conv with stride 2 halves the input x
            self.downsampleconv = SpatioTemporalConv(in_channels, out_channels, 1, stride=2)
            self.downsamplebn = nn.BatchNorm3d(out_channels)

            # residual branch: stride 2 in the first conv halves the residual
            self.conv1 = SpatioTemporalConv(in_channels, out_channels, kernel_size, padding=padding, stride=2)
        else:
            self.conv1 = SpatioTemporalConv(in_channels, out_channels, kernel_size, padding=padding)

        self.bn1 = nn.BatchNorm3d(out_channels)
        self.relu1 = nn.ReLU()

        # second conv -> batchnorm; the ReLU is applied after the sum
        self.conv2 = SpatioTemporalConv(out_channels, out_channels, kernel_size, padding=padding)
        self.bn2 = nn.BatchNorm3d(out_channels)
        self.outrelu = nn.ReLU()

    def forward(self, x):
        res = self.relu1(self.bn1(self.conv1(x)))
        res = self.bn2(self.conv2(res))

        if self.downsample:
            # bring the shortcut to the residual's shape before summing
            x = self.downsamplebn(self.downsampleconv(x))

        return self.outrelu(x + res)




In [None]:
class SpatioTemporalResLayer(nn.Module):
    r"""One ResNet stage: ``layer_size`` blocks with the same output size, applied in sequence.

    Only the first block receives ``in_channels`` and the ``downsample`` flag;
    every following block maps ``out_channels`` to ``out_channels`` with the
    block's default (no downsampling).

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels in the output produced by the layer.
        kernel_size (int or tuple): Size of the convolving kernels.
        layer_size (int): Number of blocks to be stacked to form the layer
        block_type (Module, optional): Type of block that is to be used to form the layer. Default: SpatioTemporalResBlock.
        downsample (bool, optional): If ``True``, the first block in layer will implement downsampling. Default: ``False``
    """

    def __init__(self, in_channels, out_channels, kernel_size, layer_size, block_type=SpatioTemporalResBlock, downsample=False):
        super().__init__()

        # entry block: handles the channel change and the optional downsampling
        self.block1 = block_type(in_channels, out_channels, kernel_size, downsample)

        # the remaining (layer_size - 1) blocks are identical to each other
        trailing_blocks = [
            block_type(out_channels, out_channels, kernel_size)
            for _ in range(layer_size - 1)
        ]
        self.blocks = nn.ModuleList(trailing_blocks)

    def forward(self, x):
        out = self.block1(x)
        for blk in self.blocks:
            out = blk(out)
        return out




In [None]:
class R3DNet(nn.Module):
    r"""ResNet-style 3D feature extractor: a stem convolution, four residual
    stages whose depths come from ``layer_sizes``, and a global average pool
    that leaves one 512-dimensional vector per batch element.

    Args:
        layer_sizes (tuple): An iterable containing the number of blocks in each layer
        block_type (Module, optional): Type of block that is to be used to form the layers. Default: SpatioTemporalResBlock.
    """
    def __init__(self, layer_sizes, block_type=SpatioTemporalResBlock):
        super().__init__()

        # stem: kernel 3x7x7, stride 1x2x2, padding 1x3x3
        self.conv1 = SpatioTemporalConv(3, 64, [3, 7, 7], stride=[1, 2, 2], padding=[1, 3, 3])

        # first residual stage keeps the stem's size and channel count (kernel 3x3x3)
        self.conv2 = SpatioTemporalResLayer(64, 64, 3, layer_sizes[0], block_type=block_type)

        # each later stage doubles the channel count and downsamples in its first block
        shrinking = dict(block_type=block_type, downsample=True)
        self.conv3 = SpatioTemporalResLayer(64, 128, 3, layer_sizes[1], **shrinking)
        self.conv4 = SpatioTemporalResLayer(128, 256, 3, layer_sizes[2], **shrinking)
        self.conv5 = SpatioTemporalResLayer(256, 512, 3, layer_sizes[3], **shrinking)

        # global average pooling down to a single value per channel
        self.pool = nn.AdaptiveAvgPool3d(1)

    def forward(self, x):
        pipeline = (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5, self.pool)
        for stage in pipeline:
            x = stage(x)

        # drop the pooled 1x1x1 axes: one 512-vector per sample
        return x.view(-1, 512)



In [None]:
class R3DClassifier(nn.Module):
    r"""Complete classifier: an R3DNet feature extractor (512-dimensional output
    per batch element) followed by a single Linear layer that produces
    ``num_classes`` scores.

    Args:
        num_classes(int): Number of classes in the data
        layer_sizes (tuple): An iterable containing the number of blocks in each layer
        block_type (Module, optional): Type of block that is to be used to form the layers. Default: SpatioTemporalResBlock.
    """
    def __init__(self, num_classes, layer_sizes, block_type=SpatioTemporalResBlock):
        super().__init__()

        # attribute names are part of the checkpoint's state_dict keys
        self.res2plus1d = R3DNet(layer_sizes, block_type)
        self.linear = nn.Linear(512, num_classes)

    def forward(self, x):
        features = self.res2plus1d(x)
        return self.linear(features)

# Dataset creation

In [None]:
# Input properties and compute device:

CLIP_LEN = 10
RESIZE_HEIGHT = 128
CROP_SIZE = 112

# lower-case aliases read by the helpers in this file
clip_len, resize_height, crop_size = CLIP_LEN, RESIZE_HEIGHT, CROP_SIZE

# first CUDA GPU when one is available, CPU otherwise
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

def center_crop(image, size=None):
    """Return the central ``size`` x ``size`` patch of ``image`` as uint8.

    Args:
        image: Array whose first two axes are height and width; any further
            axes (e.g. channels) are kept unchanged.
        size (int, optional): Side length of the square crop. Defaults to the
            module-level ``crop_size``.

    Returns:
        numpy.ndarray: A new ``uint8`` array holding the cropped patch.

    Raises:
        ValueError: If the image is smaller than ``size`` along height or
            width. Without this check the negative start index would silently
            select a sliver from the far edge of the image.
    """
    if size is None:
        size = crop_size

    height, width = image.shape[0], image.shape[1]
    if height < size or width < size:
        raise ValueError(
            f"cannot center-crop a {height}x{width} image to {size}x{size}"
        )

    # floor division gives the same offsets as math.floor((dim - size) / 2)
    top = (height - size) // 2
    left = (width - size) // 2
    patch = image[top:top + size, left:left + size]
    return np.array(patch).astype(np.uint8)

In [None]:
class VideoDataset(Dataset):
    """In-memory dataset of video clips stored as folders of frame images.

    Expected layout: ``directory/<label>/<sample>/<frame image files>``. Every
    sample folder is read eagerly in ``__init__``; each kept clip is stored as
    a numpy array of shape ``(channels, frames, height, width)`` exactly as
    returned by ``cv2.imread`` (no resizing, cropping or normalisation).

    Samples with no readable frame, or with exactly one frame, are skipped, so
    ``len(dataset)`` can be smaller than ``len(self.fnames)``.

    Args:
        directory (str): Root folder containing one sub-folder per class.
        mode (str, optional): Accepted for API compatibility.
            NOTE(review): it is currently ignored, so 'train' and 'val' load
            exactly the same samples.
        clip_len (int, optional): Stored (minus one) as ``self.clip_len``; not
            used when loading.
        num_sec (int, optional): Stored as ``self.num_sec``; not used when
            loading.
    """

    def __init__(self, directory, mode='train', clip_len=8, num_sec=8):
        folder = Path(directory)

        self.clip_len = clip_len-1
        self.num_sec = num_sec

        # Halved versions of the 128 / 171 / 112 sizes. They are only stored
        # here; nothing in this class applies them to the frames.
        self.resize_height = 128/2
        self.resize_width = 171/2
        self.crop_size = 112/2

        # Collect every sample folder, one class folder at a time. Entries that
        # are not directories (stray files) are skipped, and both levels are
        # sorted so the sample order is reproducible.
        self.fnames, labels = [], []
        for label in sorted(os.listdir(folder)):
            label_dir = os.path.join(folder, label)
            if not os.path.isdir(label_dir):
                continue
            for fname in sorted(os.listdir(label_dir)):
                sample_dir = os.path.join(label_dir, fname)
                if os.path.isdir(sample_dir):
                    self.fnames.append(sample_dir)
                    labels.append(label)

        # mapping between the label names (strings) and indices (ints)
        self.label2index = {label: index for index, label in enumerate(sorted(set(labels)))}

        # label index of every discovered sample folder (before filtering)
        self.label_array1 = np.array([self.label2index[label] for label in labels], dtype=int)

        # clips and labels that were actually kept, index-aligned
        self.data_array = []
        self.label_array = []
        for sample_dir, label_index in zip(self.fnames, self.label_array1):
            buffer = self._load_clip(sample_dir)
            # skip unreadable/empty samples and single-frame clips
            if buffer is None or buffer.shape[1] == 1:
                continue
            self.label_array.append(label_index)
            self.data_array.append(buffer)

    @staticmethod
    def _load_clip(sample_dir):
        """Read all frames of one sample; return a (C, T, H, W) array or None if no frame could be read."""
        # os.listdir gives no ordering guarantee; sort so frames are stacked in
        # a deterministic (lexicographic) order.
        # NOTE(review): lexicographic order matches temporal order only for
        # zero-padded frame names - confirm against the dataset.
        frame_names = sorted(
            f for f in os.listdir(sample_dir)
            if os.path.isfile(os.path.join(sample_dir, f))
        )

        frames = []
        for name in frame_names:
            frame = cv2.imread(os.path.join(sample_dir, name))
            # cv2.imread returns None for files it cannot decode
            if frame is not None:
                frames.append(frame)

        if not frames:
            return None

        # (T, H, W, C) -> (C, T, H, W)
        return np.stack(frames).transpose(3, 0, 1, 2)

    def __getitem__(self, index):
        # a [clip, label] pair, unpacked by the DataLoader as (inputs, labels)
        return [self.data_array[index], self.label_array[index]]

    def __len__(self):
        return len(self.label_array)

# Training

In [None]:

# Train on the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device being used:", device)

def _run_phase(model, loader, criterion, optimizer, training):
    """Run one full pass over ``loader`` and return (summed loss, correct-prediction count).

    Weights are updated only when ``training`` is true; otherwise the pass runs
    without gradient tracking.
    """
    # train()/eval() mode primarily affects layers such as BatchNorm or Dropout
    model.train(training)

    running_loss = 0.0
    # kept as a tensor so that `.double()` works even when the loader is empty
    running_corrects = torch.zeros((), dtype=torch.long, device=device)

    for inputs, labels in loader:
        # the model runs in half precision, so the inputs must match
        inputs = inputs.to(device).half()
        labels = labels.to(device)
        optimizer.zero_grad()

        # keep intermediate states only if backpropagation will be performed
        with torch.set_grad_enabled(training):
            outputs = model(inputs)
            # indices of the max scores are the predicted classes
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

        # loss is a batch mean; weight it by the batch size
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels)

    return running_loss, running_corrects


def train_model(num_classes, directory, layer_sizes=(2, 2, 2, 2), num_epochs=45, save=True, path="model_data.pth.tar"):
    """Initializes and trains the model for a fixed number of epochs, using dataloaders built from the
    specified directory, with SGD, a StepLR scheduler and cross-entropy loss. Supports resuming from and
    saving to a checkpoint.
    Adapted from the PyTorch tutorial found here: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
        Args:
            num_classes (int): Number of classes in the data
            directory (str): Directory where the data is to be loaded from
            layer_sizes (sequence, optional): Number of blocks in each layer. Defaults to (2, 2, 2, 2), equivalent to ResNet18.
            num_epochs (int, optional): Number of epochs to train for. Defaults to 45.
            save (bool, optional): If true, the model will be saved to path. Defaults to True.
            path (str, optional): The file to load a model checkpoint from, and if save == True, save to. Defaults to "model_data.pth.tar".
        Returns:
            The trained model (half precision, on ``device``).
    """

    # build the network and switch it to half precision
    model = R3DClassifier(num_classes=num_classes, layer_sizes=layer_sizes).to(device)
    model.half()

    criterion = nn.CrossEntropyLoss()  # standard crossentropy loss for classification
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    # the scheduler divides the lr by 10 every 10 epochs
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    # NOTE(review): VideoDataset currently ignores `mode`, so the 'val' loader
    # iterates over the same samples as the 'train' loader.
    dataloaders = {
        'train': DataLoader(VideoDataset(directory), batch_size=6, shuffle=True, num_workers=4),
        'val': DataLoader(VideoDataset(directory, mode='val'), batch_size=4, num_workers=4),
    }
    dataset_sizes = {phase: len(loader.dataset) for phase, loader in dataloaders.items()}

    # saves the time the process was started, to compute total time at the end
    start = time.time()
    epoch_resume = 0
    # defined up front so saving works even when no epoch is run
    epoch_acc = None

    # check if there was a previously saved checkpoint
    if os.path.exists(path):
        # map_location lets a checkpoint written on a GPU be restored on CPU
        checkpoint = torch.load(path, map_location=device)
        print("Reloading from previously saved checkpoint")

        # restores the model and optimizer state_dicts
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['opt_dict'])
        # older checkpoints carry no scheduler state; newer ones do
        if 'sched_dict' in checkpoint:
            scheduler.load_state_dict(checkpoint['sched_dict'])

        # obtains the epoch the training is to resume from
        epoch_resume = checkpoint["epoch"]
        epoch_acc = checkpoint.get('acc')

    epochs_done = epoch_resume
    for epoch in tqdm(range(epoch_resume, num_epochs), unit="epochs", initial=epoch_resume, total=num_epochs):
        # each epoch has a training and validation step, in that order
        for phase in ['train', 'val']:
            training = phase == 'train'
            running_loss, running_corrects = _run_phase(
                model, dataloaders[phase], criterion, optimizer, training
            )

            if training:
                # step the schedule once per epoch, after the optimizer updates
                scheduler.step()

            # guard against an empty dataset (division by zero)
            size = max(dataset_sizes[phase], 1)
            epoch_loss = running_loss / size
            epoch_acc = running_corrects.double() / size

            print(f"{phase} Loss: {epoch_loss} Acc: {epoch_acc}")

        epochs_done = epoch + 1

    # save the model if save=True
    if save:
        torch.save({
            'epoch': epochs_done,
            'state_dict': model.state_dict(),
            'acc': epoch_acc,
            'opt_dict': optimizer.state_dict(),
            'sched_dict': scheduler.state_dict(),
        }, path)

    # print the total time needed, HH:MM:SS format
    time_elapsed = time.time() - start
    print(f"Training complete in {time_elapsed//3600}h {(time_elapsed%3600)//60}m {time_elapsed %60}s")

    return model

### Launch training and load weights

In [None]:
torch.cuda.empty_cache()
# path = dataset path
path = "dataset_path"

# path_rete = where the weights of the network will be stored
path_rete = "NameNN.pth.tar"

# NOTE(review): `fps` is not defined anywhere in the visible source; it must be
# set before this cell runs, otherwise this line raises NameError.
net = train_model(4, path+str(fps), layer_sizes=[1, 1, 1, 1], num_epochs=10, save=True, path= path_rete)

# Rebuild the same architecture and restore the saved weights into it.
# NOTE(review): train_model saves a half-precision model, while this freshly
# built model stays float32 - confirm that is intended.
model = R3DClassifier(num_classes=4, layer_sizes=[1, 1, 1, 1]).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)

# map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine
x = torch.load(path_rete, map_location=device)

model.load_state_dict(x['state_dict'])
optimizer.load_state_dict(x['opt_dict'])
acc= x['acc']