# Torch

In [55]:
import torch
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # resolves to "cuda:0" or "cpu"
print(DEVICE)

cuda:0


# transforms functions

In [56]:
import random
import math
import numbers
import collections
import numpy as np
import torch
from PIL import Image, ImageOps
try:
    import accimage
except ImportError:
    accimage = None


class Compose(object):
    """Chain several transforms and apply them one after another.

    Every transform is invoked as ``t(img, inv, flow)`` and must also provide
    a ``randomize_parameters()`` method.

    Args:
        transforms (list of ``Transform`` objects): transforms to chain, in
            the order they should be applied.
    Example:
        >>> transforms.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.ToTensor(),
        >>> ])
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, inv=False, flow=False):
        # Feed the output of each step into the next one.
        result = img
        for step in self.transforms:
            result = step(result, inv, flow)
        return result

    def randomize_parameters(self):
        # Ask every step to re-draw its random state (no-op for fixed ones).
        for step in self.transforms:
            step.randomize_parameters()


class ToTensor(object):
    """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor.

    A ``numpy.ndarray`` of layout (H x W x C) or a byte-valued PIL.Image is
    converted to a ``torch.FloatTensor`` of shape (C x H x W) whose values are
    divided by ``norm_value`` (255 by default, i.e. [0, 255] -> [0.0, 1.0]).
    PIL images of mode ``'I'``, ``'I;16'`` and ``'F'`` keep their native
    integer/float values and are NOT divided.

    Args:
        norm_value (number): divisor applied to byte-valued inputs.
    """

    def __init__(self, norm_value=255):
        self.norm_value = norm_value

    def __call__(self, pic, inv, flow):
        """
        Args:
            pic (PIL.Image or numpy.ndarray): Image to be converted to tensor.
            inv: unused; accepted so every transform shares one call signature.
            flow: unused; accepted so every transform shares one call signature.
        Returns:
            Tensor: Converted image.
        """
        if isinstance(pic, np.ndarray):
            # handle numpy array: HWC -> CHW
            img = torch.from_numpy(pic.transpose((2, 0, 1)))
            # backward compatibility
            return img.float().div(self.norm_value)

        if accimage is not None and isinstance(pic, accimage.Image):
            nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32)
            pic.copyto(nppic)
            return torch.from_numpy(nppic)

        # handle PIL Image
        # np.asarray copies only when it has to. The previous
        # np.array(..., copy=False) raises ValueError under NumPy >= 2 whenever
        # a dtype conversion makes a copy unavoidable.
        if pic.mode == 'I':
            img = torch.from_numpy(np.asarray(pic, dtype=np.int32))
        elif pic.mode == 'I;16':
            img = torch.from_numpy(np.asarray(pic, dtype=np.int16))
        elif pic.mode == 'F':
            # 32-bit float pixels: reading them as raw bytes would yield four
            # bytes per pixel and break the view() below.
            img = torch.from_numpy(np.asarray(pic, dtype=np.float32))
        else:
            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
        if pic.mode == 'YCbCr':
            nchannel = 3
        elif pic.mode == 'I;16':
            nchannel = 1
        else:
            nchannel = len(pic.mode)
        # pic.size is (width, height)
        img = img.view(pic.size[1], pic.size[0], nchannel)
        # put it from HWC to CHW format
        # yikes, this transpose takes 80% of the loading time/CPU
        img = img.transpose(0, 1).transpose(0, 2).contiguous()
        if isinstance(img, torch.ByteTensor):
            return img.float().div(self.norm_value)
        return img

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass


class Normalize(object):
    """Normalize a tensor image channel by channel, in place.

    For every channel: ``channel = (channel - mean) / std``.
    When called with ``flow=True`` a single mean/std pair is used instead: the
    average of ``mean`` and the average of ``std``.

    Args:
        mean (sequence): Sequence of per-channel means (R, G, B).
        std (sequence): Sequence of per-channel standard deviations (R, G, B).
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor, inv, flow):
        """
        Args:
            tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
            inv: unused by this transform.
            flow: when ``True``, normalize with the averaged mean/std pair.
        Returns:
            Tensor: the same tensor object, normalized in place.
        """
        # TODO: make efficient
        if flow is True:
            # one-element statistics: only the first channel gets normalized
            stats = zip([np.mean(self.mean)], [np.mean(self.std)])
        else:
            stats = zip(self.mean, self.std)
        # zip stops at the shortest input, exactly like a three-way zip would
        for channel, (mu, sigma) in zip(tensor, stats):
            channel.sub_(mu).div_(sigma)
        return tensor

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass


class Scale(object):
    """Rescale the input PIL.Image to the given size.
    Args:
        size (sequence or int): Desired output size. If size is a sequence like
            (w, h), output size will be matched to this. If size is an int,
            smaller edge of the image will be matched to this number.
            i.e, if height > width, then image will be rescaled to
            (size, size * height / width) expressed as (w, h).
        interpolation (int, optional): Desired interpolation. Default is
            ``PIL.Image.BILINEAR``
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
        # ``collections.abc``. Imported locally so the block does not rely on
        # the submodule having been loaded elsewhere.
        from collections.abc import Iterable
        assert isinstance(size, int) or (isinstance(size, Iterable) and len(size) == 2)
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img, inv, flow):
        """
        Args:
            img (PIL.Image): Image to be scaled.
            inv: unused by this transform.
            flow: unused by this transform.
        Returns:
            PIL.Image: Rescaled image.
        """
        if not isinstance(self.size, int):
            # explicit (w, h) target
            return img.resize(self.size, self.interpolation)

        w, h = img.size
        # smaller edge already has the requested length: nothing to do
        if (w <= h and w == self.size) or (h <= w and h == self.size):
            return img
        if w < h:
            ow = self.size
            oh = int(self.size * h / w)
        else:
            oh = self.size
            ow = int(self.size * w / h)
        return img.resize((ow, oh), self.interpolation)

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass


class CenterCrop(object):
    """Cut a centred rectangle out of the given PIL.Image.

    Args:
        size (sequence or int): Desired output size of the crop. A sequence is
            read as (h, w); a single number produces a square crop
            (size, size).
    """

    def __init__(self, size):
        self.size = (int(size), int(size)) if isinstance(size, numbers.Number) else size

    def __call__(self, img, inv, flow):
        """
        Args:
            img (PIL.Image): Image to be cropped.
            inv: unused by this transform.
            flow: unused by this transform.
        Returns:
            PIL.Image: Cropped image.
        """
        width, height = img.size
        crop_h, crop_w = self.size
        # top-left corner of the centred box
        left = int(round((width - crop_w) / 2.))
        top = int(round((height - crop_h) / 2.))
        box = (left, top, left + crop_w, top + crop_h)
        return img.crop(box)

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass


class RandomHorizontalFlip(object):
    """Horizontally flip the given PIL.Image randomly with a probability of 0.5.

    The flip decision is driven by ``self.p``, which is drawn by
    ``randomize_parameters()``. All calls made between two draws therefore
    share the same decision.
    """

    def __init__(self):
        # Draw an initial value so the transform can be called right after
        # construction; previously ``self.p`` did not exist until
        # randomize_parameters() had been invoked (AttributeError).
        self.randomize_parameters()

    def __call__(self, img, inv, flow):
        """
        Args:
            img (PIL.Image): Image to be flipped.
            inv: when ``True`` and the image is flipped, the flipped image is
                additionally inverted with ``ImageOps.invert``.
            flow: unused by this transform.
        Returns:
            PIL.Image: Randomly flipped image.
        """
        if self.p < 0.5:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            if inv is True:
                img = ImageOps.invert(img)
        return img

    def randomize_parameters(self):
        # uniform draw in [0, 1); values below 0.5 trigger the flip
        self.p = random.random()


class MultiScaleCornerCrop(object):
    """Crop the given PIL.Image to randomly selected size.
    A crop of size is selected from scales of the original size.
    A position of cropping is randomly selected from 4 corners and 1 center.
    This crop is finally resized to given size.

    The scale and the position are drawn by ``randomize_parameters()``; every
    call made between two draws uses the same scale and position.

    Args:
        scales: cropping scales of the original size
        size: edge length of the (square) output image
        interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, scales, size, interpolation=Image.BILINEAR):
        self.scales = scales
        self.size = size
        self.interpolation = interpolation

        self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br']
        # Draw an initial scale/position so the transform can be called right
        # after construction (``self.scale`` / ``self.crop_position`` used to
        # be undefined until randomize_parameters() had been invoked).
        self.randomize_parameters()

    def __call__(self, img, inv, flow):
        """
        Args:
            img (PIL.Image): Image to be cropped.
            inv: unused by this transform.
            flow: unused by this transform.
        Returns:
            PIL.Image: the crop, resized to (size, size).
        """
        image_width = img.size[0]
        image_height = img.size[1]
        # the crop edge is a fraction of the shorter image edge
        crop_size = int(min(image_width, image_height) * self.scale)

        if self.crop_position == 'c':
            center_x = image_width // 2
            center_y = image_height // 2
            box_half = crop_size // 2
            x1 = center_x - box_half
            y1 = center_y - box_half
            x2 = center_x + box_half
            y2 = center_y + box_half
        elif self.crop_position == 'tl':
            x1 = 0
            y1 = 0
            x2 = crop_size
            y2 = crop_size
        elif self.crop_position == 'tr':
            x1 = image_width - crop_size
            # was 1: the box was one pixel short and therefore not square
            y1 = 0
            x2 = image_width
            y2 = crop_size
        elif self.crop_position == 'bl':
            # was 1: the box was one pixel short and therefore not square
            x1 = 0
            y1 = image_height - crop_size
            x2 = crop_size
            y2 = image_height
        elif self.crop_position == 'br':
            x1 = image_width - crop_size
            y1 = image_height - crop_size
            x2 = image_width
            y2 = image_height
        else:
            raise ValueError("unknown crop position: {!r}".format(self.crop_position))

        img = img.crop((x1, y1, x2, y2))

        return img.resize((self.size, self.size), self.interpolation)

    def randomize_parameters(self):
        # index-based draws (rather than random.choice) so any indexable
        # container, including numpy arrays, keeps working
        self.scale = self.scales[random.randint(0, len(self.scales) - 1)]
        self.crop_position = self.crop_positions[random.randint(0, len(self.crop_positions) - 1)]




class FiveCrops(object):
    """Take the centre crop and the four corner crops of a PIL.Image.

    Every crop is resized to (size, size), converted with ``ToTensor`` and
    normalized with ``Normalize``; the results are stacked along a new first
    dimension. With ``tenCrops=True`` the horizontally flipped version of each
    crop is appended as well (five crops first, then their five flips).

    Args:
        size: edge length of the square crops
        mean: per-channel means handed to ``Normalize``
        std: per-channel standard deviations handed to ``Normalize``
        interpolation: Default: PIL.Image.BILINEAR
        tenCrops: also return the flipped crops when ``True``
    """

    def __init__(self, size, mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0], interpolation=Image.BILINEAR, tenCrops=False):
        self.size = size
        self.interpolation = interpolation
        # copy so the shared default lists (or a caller's list) are never aliased
        self.mean = list(mean)
        self.std = list(std)
        self.to_Tensor = ToTensor()
        self.normalize = Normalize(self.mean, self.std)
        self.tenCrops = tenCrops

    def __call__(self, img, inv, flow):
        """
        Args:
            img (PIL.Image): Image to be cropped.
            inv: forwarded to ``ToTensor``/``Normalize``; when ``True`` the
                flipped crops are additionally inverted with ``ImageOps.invert``.
            flow: forwarded to ``ToTensor``/``Normalize``.
        Returns:
            Tensor: the stacked crops (5 of them, or 10 with ``tenCrops``).
        """
        crop_size = self.size

        image_width = img.size[0]
        image_height = img.size[1]

        center_x = image_width // 2
        center_y = image_height // 2
        box_half = crop_size // 2
        right = image_width - crop_size
        bottom = image_height - crop_size

        # boxes are [x1, y1, x2, y2]; the 'tr' and 'bl' boxes used to start at
        # 1 instead of 0, which made them one pixel short on one side
        crop_positions = [
            [center_x - box_half, center_y - box_half, center_x + box_half, center_y + box_half],  # center
            [0, 0, crop_size, crop_size],                                                          # tl
            [right, 0, image_width, crop_size],                                                    # tr
            [0, bottom, crop_size, image_height],                                                  # bl
            [right, bottom, image_width, image_height],                                            # br
        ]
        cropped_imgs = [img.crop(box).resize((self.size, self.size), self.interpolation)
                        for box in crop_positions]

        if self.tenCrops is True:
            flipped_imgs = [crop.transpose(Image.FLIP_LEFT_RIGHT) for crop in cropped_imgs]
            if inv is True:
                flipped_imgs = [ImageOps.invert(flipped) for flipped in flipped_imgs]
            cropped_imgs += flipped_imgs

        normalized_imgs = [self.normalize(self.to_Tensor(crop, inv, flow), inv, flow)
                           for crop in cropped_imgs]
        return torch.stack(normalized_imgs, 0)

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass

class TenCrops(object):
    """Five crops of a PIL.Image plus their horizontally flipped versions.

    Thin wrapper that delegates to ``FiveCrops`` built with ``tenCrops=True``.

    Args:
        size: crop size handed to ``FiveCrops``
        mean: per-channel means handed to ``FiveCrops``
        std: per-channel standard deviations handed to ``FiveCrops``
        interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, size, mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0], interpolation=Image.BILINEAR):
        self.size = size
        self.interpolation = interpolation
        self.mean = mean
        self.std = std
        # all the actual work happens in the wrapped FiveCrops instance
        self.fiveCrops = FiveCrops(
            self.size,
            mean=self.mean,
            std=self.std,
            interpolation=self.interpolation,
            tenCrops=True,
        )

    def __call__(self, img, inv, flow):
        crops = self.fiveCrops(img, inv, flow)
        return crops

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass


class FlippedImagesTest(object):
    """Return an image together with its horizontally flipped version.

    Both versions are converted with ``ToTensor``, normalized with
    ``Normalize`` and stacked along a new first dimension (original first,
    flipped second).
    """

    def __init__(self, mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]):
        self.mean = mean
        self.std = std
        self.to_Tensor = ToTensor()
        self.normalize = Normalize(self.mean, self.std)

    def __call__(self, img, inv, flow):
        mirrored = img.transpose(Image.FLIP_LEFT_RIGHT)
        if inv is True:
            # only the flipped copy is inverted
            mirrored = ImageOps.invert(mirrored)

        views = []
        for view in (img, mirrored):
            as_tensor = self.to_Tensor(view, inv, flow)
            views.append(self.normalize(as_tensor, inv, flow))
        return torch.stack(views, 0)

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass
    

class DownSampling(object):
    """Down-sample a map by picking ``num_x`` x ``num_y`` evenly spaced values.

    Only the first element along dimension 0 of the input is used. The picked
    values are truncated with ``int()`` before being packed into the result.

    Args:
        len_x: extent of the first indexed dimension of ``tensor[0]``
        len_y: extent of the second indexed dimension of ``tensor[0]``
        num_x: number of sampling positions along the first dimension
        num_y: number of sampling positions along the second dimension
    """

    def __init__(self, len_x = 224, len_y = 224, num_x = 7, num_y = 7):

        self.len_x = len_x
        self.num_x = num_x

        self.len_y = len_y
        self.num_y = num_y

    def __call__(self, tensor, inv, flow):
        """
        Args:
            tensor: indexable whose first element is a 2-D map.
            inv: unused by this transform.
            flow: unused by this transform.
        Returns:
            Tensor: (num_x x num_y) tensor holding the sampled values.
        Raises:
            ValueError: if more positions are requested than the length allows.
        """
        tensor = tensor[0]

        pos_x = self.__getPositions(self.len_x, self.num_x)
        pos_y = self.__getPositions(self.len_y, self.num_y)

        new_tensor = [[int(tensor[x][y]) for y in pos_y] for x in pos_x]

        return torch.Tensor(new_tensor)

    def __getPositions(self, length, num):
        """Return ``num`` evenly spaced indices inside ``range(length)``."""
        step = int(length/num)
        if step < 1:
            # a zero/negative step used to make the sampling loop spin forever
            raise ValueError(
                "cannot pick {} positions out of a length of {}".format(num, length))

        # centre the grid: split the remainder, or use half a step when the
        # length divides evenly
        start = int(np.ceil((length%num)/2))
        if start == 0:
            start = int(length/(2*num))

        # cap at ``num``: when length % num >= 2 one extra position would
        # otherwise still fit before ``length``
        return list(range(start, length, step))[:num]

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass

    
class KNN_DownSampling(object):
    """Down-sample a map by averaging a window around evenly spaced positions.

    Only the first element along dimension 0 of the input is used.

    * Default mode: a (2K+1) x (2K+1) window centred on each of the
      ``num_x`` x ``num_y`` sampling positions is averaged.
    * ``full224=True``: the map is tiled with step x step windows where
      ``step = int(224 / num_x)`` and every tile is averaged. This mode is
      hard-wired to a 224-wide grid and uses ``num_x`` for both dimensions.

    Unless ``regression`` is set, the window sum is truncated with ``int()``
    and the average is rounded to the nearest whole number.

    Args:
        len_x: extent of the first indexed dimension of ``tensor[0]``
        len_y: extent of the second indexed dimension of ``tensor[0]``
        num_x: number of sampling positions along the first dimension
        num_y: number of sampling positions along the second dimension
        K: half-width of the averaging window (0 -> single value)
        regression: keep the un-rounded average when ``True``
        full224: use the tiling mode described above
    """

    def __init__(self, len_x = 224, len_y = 224, num_x = 7, num_y = 7, K = 0, regression = False, full224 = False):

        self.len_x = len_x
        self.num_x = num_x

        self.len_y = len_y
        self.num_y = num_y

        self.K = K

        self.regression = regression
        self.full224 = full224

    def __call__(self, tensor, inv, flow):
        """
        Args:
            tensor: indexable whose first element is a 2-D tensor.
            inv: unused by this transform.
            flow: unused by this transform.
        Returns:
            Tensor: the down-sampled map.
        Raises:
            ValueError: if a window leaves the map, or if more positions are
                requested than the length allows.
        """
        K = self.K

        tensor = tensor[0]

        new_tensor = []

        if not self.full224:

            pos_x = self.__getPositions(self.len_x, self.num_x)
            pos_y = self.__getPositions(self.len_y, self.num_y)
            window_area = (2*K+1)**2

            for x in pos_x:

                start_x = x - K
                end_x = x + K + 1

                # ValueError is still caught by callers using ``except Exception``
                if start_x < 0 or end_x > self.len_x:
                    raise ValueError("ERROR - x out of bounds")

                row = []
                for y in pos_y:

                    start_y = y - K
                    end_y = y + K + 1

                    if start_y < 0 or end_y > self.len_y:
                        raise ValueError("ERROR - y out of bounds")

                    window_sum = tensor[start_x:end_x, start_y:end_y].sum()
                    if not self.regression:
                        value = round(int(window_sum)/window_area, 0)
                    else:
                        value = window_sum/window_area

                    row.append(value)
                new_tensor.append(row)

        else:

            step = int(224/self.num_x)
            jumps = np.arange(0, 224, step)

            for i in jumps:

                row = []
                for j in jumps:

                    tile_sum = tensor[i:i+step, j:j+step].sum()
                    if not self.regression:
                        value = round(int(tile_sum)/(step**2), 0)
                    else:
                        value = tile_sum/(step**2)

                    row.append(value)
                new_tensor.append(row)

        return torch.Tensor(new_tensor)

    def __getPositions(self, length, num):
        """Return ``num`` evenly spaced indices inside ``range(length)``."""
        step = int(length/num)
        if step < 1:
            # a zero/negative step used to make the sampling loop spin forever
            raise ValueError(
                "cannot pick {} positions out of a length of {}".format(num, length))

        # centre the grid: split the remainder, or use half a step when the
        # length divides evenly
        start = int(np.ceil((length%num)/2))
        if start == 0:
            start = int(length/(2*num))

        # cap at ``num``: when length % num >= 2 one extra position would
        # otherwise still fit before ``length``
        return list(range(start, length, step))[:num]

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass


class To1Dimension(object):
    """Flatten a 2-D tensor row by row into a 1-D tensor.

    Every value is truncated with ``int()`` before being stored.
    """

    def __init__(self):
        pass

    def __call__(self, tensor, inv, flow):
        # row-major walk over the two dimensions
        flat = [int(value) for row in tensor for value in row]
        return torch.Tensor(flat)

    def randomize_parameters(self):
        # deterministic transform: nothing to randomize
        pass

# dataset definition

In [57]:
from torchvision.datasets import VisionDataset
from PIL import Image
from math import ceil
import numpy as np
import random
import os
import sys
import torch
from random import randrange

# presumably positional indices into (image, label) pairs; not referenced in
# the visible part of the file. TODO confirm against the callers
IMAGE = 0
LABEL = 1
# name of the user folder used as the test split; the remaining users form the train split
TEST_USER = 'S2'
# directory containing the x-flows frames
FLOW_X_FOLDER = "flow_x_processed"
# directory containing the y-flows frames
FLOW_Y_FOLDER = "flow_y_processed"
# directory containing the rgb frames
FRAME_FOLDER = "processed_frames2"
# sub-folder of each video directory that holds its rgb frames
RGB_FOLDER = 'rgb'
# part of an rgb frame's file name that is swapped for MMAP_FILENAME to get the matching map file
RGB_FILENAME = 'rgb'
# sub-folder of each video directory that holds its mmaps
MMAPS_FOLDER = 'mmaps'
# replacement for RGB_FILENAME in the file names of the mmaps
MMAP_FILENAME = 'map'


def pil_loader(path):
    """Load the image stored at *path* and return it as an RGB PIL image."""
    # The file is opened explicitly (instead of handing the path to PIL) to
    # avoid a ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as handle:
        rgb_image = Image.open(handle).convert('RGB')
    return rgb_image


def grey_scale_pil_loader(path):
    """Load the image stored at *path* and return it as a grey-scale ('L') PIL image."""
    # The file is opened explicitly (instead of handing the path to PIL) to
    # avoid a ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as handle:
        grey_image = Image.open(handle).convert('L')
    return grey_image


class GTEA61(VisionDataset):
    """RGB frames of the dataset, optionally paired with their mmaps.

    The expected layout below ``root`` is
    ``processed_frames2/<user>/<action>/<video>/rgb`` (frames) and
    ``.../<video>/mmaps`` (maps). User ``S2`` is the test split; every other
    user belongs to the train split.

    Args:
        root: dataset root directory.
        split: ``'test'`` loads the test user, anything else the train users.
        seq_len: number of uniformly spaced frames taken from each video.
        transform: ``Compose``-like pipeline for the rgb frames; it must
            contain a ``ToTensor`` step.
        target_transform: forwarded to ``VisionDataset``.
        label_map: optional ``{action name: class id}`` dictionary; built from
            the sorted action folder names when omitted.
        mmaps: also load the mmaps when true.
        mmaps_transform: pipeline for the mmaps (must contain ``ToTensor``);
            required when ``mmaps`` is true.
        static_frames: additionally return a sequence made of one randomly
            chosen frame repeated ``seq_len`` times.

    Raises:
        ValueError: if a required transform is missing or has no ``ToTensor``.
        FileNotFoundError: if the directory layout is not the expected one.
    """

    def __init__(self, root, split='train', seq_len=16, transform=None, target_transform=None,
                 label_map=None, mmaps=False, mmaps_transform=None, static_frames=False):
        super(GTEA61, self).__init__(root, transform=transform, target_transform=target_transform)
        self.datadir = root
        # split indicates whether we should load the train or test split
        self.split = split
        self.get_mmaps = mmaps
        # seq len tells us how many frames for each video we are going to consider
        # frames will be taken uniformly spaced
        self.seq_len = seq_len
        self.label_map = label_map
        self.mmaps = mmaps
        self.mmaps_transform = mmaps_transform
        self.static_frames = static_frames

        if label_map is None:
            # if the label map dictionary is not provided, we are going to build it
            self.label_map = {}
        # videos is a list containing for each video, its path where you can find all its frames
        # whereas mmaps contains the path of the mmaps
        self.videos = []
        if mmaps:
            self.mmaps = []
        # labels[i] contains the class ID of the i-th video
        self.labels = []
        # n_frames[i] contains the number of frames available for i-th video
        self.n_frames = []
        # check if ToTensor is among the transformations; a missing pipeline
        # (transform=None) is reported with the same ValueError instead of
        # crashing with an AttributeError on ``None.transforms``
        self.has_to_tensor = self._contains_to_tensor(self.transform)
        if not self.has_to_tensor:
            raise ValueError("you did NOT provide ToTensor as a transformation for rgbs")

        if mmaps:
            self.mmaps_has_to_tensor = self._contains_to_tensor(self.mmaps_transform)
            if not self.mmaps_has_to_tensor:
                raise ValueError("you did NOT provide ToTensor as a transformation for mmaps")

        # we expect datadir to be GTEA61, so we add FRAME_FOLDER to get to the frames
        frame_dir = os.path.join(self.datadir, FRAME_FOLDER)
        users = [user for user in os.listdir(frame_dir) if user != '.DS_Store']

        print(users)
        if len(users) < 4:
            raise FileNotFoundError("you specified the wrong directory")
        if TEST_USER not in users:
            raise FileNotFoundError("S2 folder not found")
        if self.split == 'test':
            folders = [TEST_USER]
        else:
            users.remove(TEST_USER)
            folders = users

        # folders is a list that contains either :
        #   - 1 element -> the folder of the user S2 if split == 'test'
        #   - the folders of all the other users otherwise

        if label_map is None:
            # now we build the label map; we take folders[0] just to get all class names
            # since it is GUARANTEED that all users have same classes
            classes = self._visible_entries(os.path.join(frame_dir, folders[0]))
            self.label_map = {act: i for i, act in enumerate(classes)}
        for user in sorted(folders):
            if ".DS_Store" in user:
                continue
            user_dir = os.path.join(frame_dir, user)
            # user dir it's gonna be ../GTEA61/processed_frames2/S1 or any other user
            for action in self._visible_entries(user_dir):
                action_dir = os.path.join(user_dir, action)
                # inside an action dir we can have 1 or more videos
                for element in self._visible_entries(action_dir):
                    # we add rgb to the path since there is an additional folder inside S1/1/rgb
                    # before the frames
                    frames = os.path.join(action_dir, element, RGB_FOLDER)
                    if self.get_mmaps:
                        mmap = os.path.join(action_dir, element, MMAPS_FOLDER)
                        self.mmaps.append(mmap)
                    # we append in videos the path
                    self.videos.append(frames)
                    # in labels the label, using the label map
                    self.labels.append(self.label_map[action])
                    # in frames its length in number of frames
                    self.n_frames.append(len(os.listdir(frames)))

    @staticmethod
    def _contains_to_tensor(transform):
        """Return True if *transform* is a pipeline holding a ``ToTensor`` step."""
        steps = getattr(transform, 'transforms', None)
        if steps is None:
            return False
        return any(isinstance(tr, ToTensor) for tr in steps)

    @staticmethod
    def _visible_entries(directory):
        """Return the sorted entries of *directory*, skipping '.DS_Store' files."""
        return [name for name in sorted(os.listdir(directory)) if '.DS_Store' not in name]

    def __getitem__(self, index):
        """Return the frames (and optional extras) of the ``index``-th video.

        Returns, depending on the configuration:
            * ``(sequence, label)``
            * ``(sequence, maps_sequence, label)`` when mmaps are enabled
            * ``(sequence, static_sequence, label)`` with ``static_frames``
            * ``(sequence, static_sequence, maps_sequence, label)`` with both
        """
        # firstly we retrieve the video path, label and num of frames
        vid = self.videos[index]
        label = self.labels[index]
        length = self.n_frames[index]
        if self.transform is not None:
            # this is needed to randomize the parameters of the random transformations
            self.transform.randomize_parameters()

        # NOTE(review): the two pipelines draw their random parameters
        # independently, so a random flip/crop in both may not be aligned
        # between frames and maps. Confirm this is intended.
        if self.mmaps and self.mmaps_transform is not None:
            self.mmaps_transform.randomize_parameters()

        # sort the list of frames since the name is like rgb002.png
        # so we use the last number as an ordering
        frames = np.array(sorted(os.listdir(vid)))
        # now we take seq_len equally spaced frames between 0 and length
        # linspace with the option int will give us the indices to take
        select_indices = np.linspace(0, length, self.seq_len, endpoint=False, dtype=int)
        # we then select the frames using numpy fancy indexing
        # note that the numpy arrays are arrays of strings, containing the file names
        select_frames = frames[select_indices]
        # append to each file its path
        select_files = [os.path.join(vid, frame) for frame in select_frames]
        # use pil_loader to get pil objects
        sequence = [pil_loader(file) for file in select_files]
        if self.get_mmaps:
            # Use the mmaps directory recorded in __init__ rather than a
            # textual 'rgb' -> 'mmaps' replacement over the whole path, which
            # would also rewrite any other "rgb" occurring in it. Only the
            # first 'rgb' of the file name is swapped for 'map'.
            mmap_dir = self.mmaps[index]
            select_map = [
                os.path.join(mmap_dir, os.path.basename(file).replace(RGB_FILENAME, MMAP_FILENAME, 1))
                for file in select_files
            ]
            maps_sequence = [grey_scale_pil_loader(file) for file in select_map]

        # Applies preprocessing when accessing the image
        transformed = self.transform is not None
        if transformed:
            sequence = [self.transform(image) for image in sequence]
            # now, if the ToTensor transformation is applied
            # we have in sequence a list of tensor, so we use stack along dimension 0
            # to create a tensor with one more dimension that contains them all
            if self.has_to_tensor:
                sequence = torch.stack(sequence, 0)

        static_sequence = None
        if self.static_frames:
            # one randomly chosen frame repeated seq_len times
            random_number = randrange(self.seq_len)
            static_sequence = [sequence[random_number] for _ in range(self.seq_len)]
            if transformed and self.has_to_tensor:
                static_sequence = torch.stack(static_sequence, 0)

        if transformed and self.get_mmaps:
            maps_sequence = [self.mmaps_transform(mmap) for mmap in maps_sequence]
            if self.has_to_tensor:
                maps_sequence = torch.stack(maps_sequence, 0)
            # the maps are grey-scale: drop the single-channel dimension
            maps_sequence = maps_sequence.squeeze(1)

            if self.static_frames:
                return sequence, static_sequence, maps_sequence, label
            return sequence, maps_sequence, label

        if self.static_frames:
            return sequence, static_sequence, label
        return sequence, label

    def __len__(self):
        return len(self.videos)


class GTEA61_flow(VisionDataset):
    """Optical-flow frames of the dataset.

    The expected layout below ``root`` is
    ``flow_x_processed/<user>/<action>/<video>`` for the x flows and the same
    tree under ``flow_y_processed`` for the y flows. User ``S2`` is the test
    split; the other users form the train split.

    Directory listings are sorted and '.DS_Store' entries are skipped, so the
    video order and the generated label ids are deterministic and follow the
    same convention as ``GTEA61``.

    Args:
        root: dataset root directory.
        split: ``'test'`` loads the test user, anything else the train users.
        seq_len: number of consecutive flow frames taken from each video.
        transform: ``Compose``-like pipeline; it must contain ``ToTensor``.
        target_transform: forwarded to ``VisionDataset``.
        label_map: optional ``{action name: class id}`` dictionary; built from
            the sorted action folder names when omitted.
        n_seq: when > 0, return ``n_seq`` evenly spaced segments per video.

    Raises:
        ValueError: if ``transform`` is missing or has no ``ToTensor`` step.
        FileNotFoundError: if the directory layout is not the expected one.
    """

    def __init__(self, root, split='train', seq_len=5, transform=None, target_transform=None,
                 label_map=None, n_seq=-1):
        super(GTEA61_flow, self).__init__(root, transform=transform, target_transform=target_transform)
        # we expect datadir to be ../GTEA61
        self.datadir = root
        # split indicates whether we should load the train or test split
        self.split = split
        self.n_seq = n_seq
        # seq len here tells us how many optical frames for each video
        # we are going to consider; note that now
        # frames will be sequential and not uniformly spaced
        self.seq_len = seq_len
        self.label_map = label_map
        if label_map is None:
            # if the label map dictionary is not provided, we are going to build it
            self.label_map = {}
        # x_frames is a list containing for each flow video, its path, where you can find all its frames
        # it will contain the ones under flow_x_processed
        self.x_frames = []
        # y_frames is the same as x_frames, but contains the ones under flow_y_processed
        self.y_frames = []
        # labels[i] contains the class ID of the i-th video
        self.labels = []
        # n_frames[i] contains the number of frames available for i-th video
        self.n_frames = []
        # check if ToTensor is among the transformations; a missing pipeline
        # (transform=None) is reported with the same ValueError instead of
        # crashing with an AttributeError on ``None.transforms``
        steps = getattr(self.transform, 'transforms', None)
        self.has_to_tensor = steps is not None and any(isinstance(tr, ToTensor) for tr in steps)
        if not self.has_to_tensor:
            raise ValueError("you did NOT provide ToTensor as a transformation")

        # we expect datadir to be GTEA61, so we add the flow folder to get to the flow frames
        flow_dir = os.path.join(self.datadir, FLOW_X_FOLDER)
        flow_y_dir = os.path.join(self.datadir, FLOW_Y_FOLDER)
        users = self._visible_entries(flow_dir)
        if len(users) != 4:
            raise FileNotFoundError("you specified the wrong directory")
        if TEST_USER not in users:
            raise FileNotFoundError("S2 folder not found")
        if self.split == 'test':
            folders = [TEST_USER]
        else:
            users.remove(TEST_USER)
            folders = users

        # folders is a list that contains either :
        #   - 1 element -> the folder of the user S2 if split == 'test'
        #   - 3 elements -> the folders of the other users if split == 'train'

        if label_map is None:
            # now we build the label map; we take folders[0] just to get all class names
            # since it is GUARANTEED that all users have same classes
            classes = self._visible_entries(os.path.join(flow_dir, folders[0]))
            self.label_map = {act: i for i, act in enumerate(classes)}

        for user in folders:
            # user dir it's gonna be ../GTEA61/flow_x_processed/S1 or any other user
            user_dir = os.path.join(flow_dir, user)
            for action in self._visible_entries(user_dir):
                # inside an action dir we can have 1 or more videos
                action_dir = os.path.join(user_dir, action)
                for element in self._visible_entries(action_dir):
                    frames = os.path.join(action_dir, element)
                    # we put in x_frames the path to the folder with all the flow frames
                    self.x_frames.append(frames)
                    # the y frames live in the same sub-tree under flow_y_processed;
                    # it is GUARANTEED that for each action we have the same number
                    # of x and y frames
                    self.y_frames.append(os.path.join(flow_y_dir, user, action, element))
                    # put the label in label using the label map dictionary
                    self.labels.append(self.label_map[action])
                    # put here the number of flow frames
                    self.n_frames.append(len(os.listdir(frames)))

    @staticmethod
    def _visible_entries(directory):
        """Return the sorted entries of *directory*, skipping '.DS_Store' files."""
        return [name for name in sorted(os.listdir(directory)) if '.DS_Store' not in name]

    def get_selected_files(self, vid_x, frames_x, frames_y, select_indices, vid_y=None):
        """Load and transform the selected x/y flow frames of one video.

        Args:
            vid_x: directory holding the x flow frames.
            frames_x: numpy array with the sorted x frame file names.
            frames_y: numpy array with the sorted y frame file names.
            select_indices: indices of the frames to load.
            vid_y: directory holding the y flow frames. When omitted it is
                derived from ``vid_x`` by swapping the flow folder name.

        Returns:
            The transformed frames, x and y interleaved (x at even positions,
            y at odd positions); a single stacked tensor when ``ToTensor`` is
            part of the pipeline.
        """
        if vid_y is None:
            vid_y = vid_x.replace(FLOW_X_FOLDER, FLOW_Y_FOLDER)
        # select the frames using numpy fancy indexing
        # note these are arrays of strings, containing the file names
        select_x_frames = frames_x[select_indices]
        select_y_frames = frames_y[select_indices]
        # Interleave the x and y files. Each name is joined with its own
        # directory; the previous ``replace('x', 'y')`` over the full path
        # rewrote EVERY 'x' in it and broke any path containing another 'x'.
        select_files = []
        for x_name, y_name in zip(select_x_frames, select_y_frames):
            select_files.append(os.path.join(vid_x, x_name))
            select_files.append(os.path.join(vid_y, y_name))
        # create pil objects
        sequence = [grey_scale_pil_loader(file) for file in select_files]
        # Applies preprocessing when accessing the image
        if self.transform is not None:
            # inv=True will create the negative image for x frames
            sequence[::2] = [self.transform(image, inv=True, flow=True) for image in sequence[::2]]
            sequence[1::2] = [self.transform(image, inv=False, flow=True) for image in sequence[1::2]]
            # if the ToTensor transformation is applied
            # 'sequence' is a list of tensors, so we stack along dimension 0 in a single tensor
            # then we apply squeeze along the 1 dimension, because the images are grey-scale,
            # so there is only one channel and we eliminate that dimension
            if self.has_to_tensor:
                sequence = torch.stack(sequence, 0).squeeze(1)
        return sequence

    def __getitem__(self, index):
        """Return ``(sequence, label)``, or ``(segments, label)`` when ``n_seq > 0``."""
        # get the paths of the x video, y, label and length
        vid_x = self.x_frames[index]
        vid_y = self.y_frames[index]
        label = self.labels[index]
        length = self.n_frames[index]
        # needed to randomize the parameters of the custom transformations
        self.transform.randomize_parameters()
        # sort list of frames since the name is like flow_x_002.png, last number as ordering
        frames_x = np.array(sorted(os.listdir(vid_x)))
        # do the same for y
        frames_y = np.array(sorted(os.listdir(vid_y)))
        if self.n_seq > 0:
            segments = []
            starting_frames = np.linspace(1, length-self.seq_len+1, self.n_seq, endpoint=False, dtype=int)
            for start_frame in starting_frames:
                select_indices = start_frame + np.arange(0, self.seq_len)
                sequence = self.get_selected_files(vid_x, frames_x, frames_y, select_indices, vid_y)
                segments.append(sequence)
            segments = torch.stack(segments, 0)

            return segments, label

        if self.split == 'train':
            # if we are training, we take a random starting frame
            start_frame = random.randint(0, length - self.seq_len)
        else:
            # if we are testing, we take a centered interval
            start_frame = np.ceil((length - self.seq_len) / 2).astype('int')
        # the frames will be sequential, so the select indices are
        # from start_frame to start_frame + seq_len
        select_indices = start_frame + np.arange(0, self.seq_len)
        sequence = self.get_selected_files(vid_x, frames_x, frames_y, select_indices, vid_y)

        return sequence, label

    def __len__(self):
        return len(self.x_frames)


class GTEA61_2Stream(VisionDataset):
    """Two-stream view of the dataset: RGB frames together with flow frames.

    The class inherits from VisionDataset and wraps one ``GTEA61`` instance
    (RGB frames) and one ``GTEA61_flow`` instance (flow frames) that receive
    the same label map. Each item is the triple ``(flow_seq, frame_seq, label)``.
    """

    def __init__(self, root, split='train', seq_len=7, stack_size=5, transform=None, target_transform=None):
        super(GTEA61_2Stream, self).__init__(root, transform=transform, target_transform=target_transform)
        # root is expected to be ../GTEA61
        self.datadir = root
        # 'test' selects TEST_USER only, any other value selects the remaining users
        self.split = split
        # number of rgb frames (uniformly spaced)
        self.seq_len = seq_len
        # number of flow frames (sequential)
        self.stack_size = stack_size

        # sanity check on the directory layout: four user folders, TEST_USER among them
        frame_dir = os.path.join(self.datadir, FRAME_FOLDER)
        users = os.listdir(frame_dir)
        if len(users) != 4:
            raise FileNotFoundError("you specified the wrong directory")
        if TEST_USER not in users:
            raise FileNotFoundError("S2 folder not found")
        if self.split == 'test':
            folders = [TEST_USER]
        else:
            folders = [user for user in users if user != TEST_USER]

        # label map shared by both wrapped datasets, built from the first selected user folder
        # NOTE(review): os.listdir order is arbitrary, so the class indices depend on the
        # filesystem listing - confirm this matches the label map used elsewhere
        first_folder = os.path.join(frame_dir, folders[0])
        self.label_map = {action: idx for idx, action in enumerate(os.listdir(first_folder))}

        shared = dict(split=self.split, transform=self.transform, label_map=self.label_map)
        # rgb stream
        self.frame_dataset = GTEA61(self.datadir, seq_len=self.seq_len, **shared)
        # flow stream
        self.flow_dataset = GTEA61_flow(self.datadir, seq_len=self.stack_size, **shared)

    def __getitem__(self, index):
        """Fetch the same index from both wrapped datasets; the label comes from the rgb one."""
        frame_seq, label = self.frame_dataset[index]
        flow_seq, _ = self.flow_dataset[index]
        return flow_seq, frame_seq, label

    def __len__(self):
        """Return the length reported by the rgb dataset."""
        return len(self.frame_dataset)

# imports

In [58]:
import os
import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.backends import cudnn
import torchvision
from colorama import init
from colorama import Fore, Back, Style

from torchvision.models import resnet34
from PIL import Image
from tqdm import tqdm

import sys


#**Learning without Temporal information** (avgpool)

#MAIN PARAMs

In [59]:
# Experiment selector - set exactly one value:
#   0 --> Learning without Temporal information (avgpool)
#   1 --> Learning with Temporal information (LSTM)
#   2 --> Learning with Spatio-Temporal information (ConvLSTM)
# (the original cell assigned 0, 1 and 2 in sequence, so 2 was the effective value)
homework_step = 2


DATA_DIR = 'GTEA61/' #path dataset
# path to save model; the empty last component keeps the trailing separator so that
# both os.path.join(model_folder, name) and model_folder + name keep working
model_folder = os.path.join('/content/saved_models', 'homework_step' + str(homework_step), '')
# exist_ok avoids the check-then-create race of isdir() followed by makedirs()
os.makedirs(model_folder, exist_ok=True)


# All these parameters can be changed!

NUM_CLASSES = 61
BATCH_SIZE = 64
LR = 0.001            # The initial Learning Rate
MOMENTUM = 0.9       # Hyperparameter for SGD, keep this at 0.9 when using SGD
WEIGHT_DECAY = 4e-5  # Regularization, you can keep this at the default
NUM_EPOCHS = 200     # Total number of training epochs (iterations over dataset)
STEP_SIZE = [25, 75, 150] # Epochs at which the learning rate is decreased (scheduler milestones)
GAMMA = 0.1          # Multiplicative factor for learning rate step-down
MEM_SIZE = 512       # Dim of internal state of LSTM or ConvLSTM
SEQ_LEN = 3          # Num Frames

# this dictionary is needed for the logger class
parameters = {'DEVICE':DEVICE, 'NUM_CLASSES':NUM_CLASSES, 'BATCH_SIZE':BATCH_SIZE,
             'LR':LR, 'MOMENTUM':MOMENTUM, 'WEIGHT_DECAY':WEIGHT_DECAY, 'NUM_EPOCHS':NUM_EPOCHS,
             'STEP_SIZE':STEP_SIZE, 'GAMMA':GAMMA, 'MEM_SIZE':MEM_SIZE, 'SEQ_LEN':SEQ_LEN}

#Dataloaders & Preprocessing

In [60]:
# Per-channel normalisation statistics applied after the tensor conversion
normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Training pipeline: rescale, random horizontal flip, multi-scale corner crop to 224,
# tensor conversion, normalisation
spatial_transform = Compose([
    Scale(256),
    RandomHorizontalFlip(),
    MultiScaleCornerCrop([1, 0.875, 0.75, 0.65625], 224),
    ToTensor(),
    normalize,
])

# Evaluation pipeline: rescale, centre crop to 224, tensor conversion, normalisation
spatial_transform_val = Compose([
    Scale(256),
    CenterCrop(224),
    ToTensor(),
    normalize,
])


In [61]:
# Build the train and test datasets, each with its own transform pipeline
train_dataset = GTEA61(
    DATA_DIR,
    split='train',
    seq_len=SEQ_LEN,
    transform=spatial_transform,
)
test_dataset = GTEA61(
    DATA_DIR,
    split='test',
    seq_len=SEQ_LEN,
    transform=spatial_transform_val,
)

# Report how many videos each split contains
print('Train Dataset: {}'.format(len(train_dataset)))
print('Test Dataset: {}'.format(len(test_dataset)))

# Training batches are shuffled and the last incomplete batch is dropped;
# evaluation batches keep the dataset order and include every sample
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)
val_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)


['S1', 'S2', 'S3', 'S4']
['S1', 'S2', 'S3', 'S4']
Train Dataset: 341
Test Dataset: 116


#Models - different resnet implementations

In [62]:
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo


# Public names of the ResNet section: the model class and its factory functions
__all__ = [
    'ResNet',
    'resnet18',
    'resnet34',
    'resnet50',
    'resnet101',
    'resnet152',
]


# Download location of the pretrained checkpoint for each architecture name
model_urls = dict(
    resnet18='https://download.pytorch.org/models/resnet18-5c106cde.pth',
    resnet34='https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    resnet50='https://download.pytorch.org/models/resnet50-19c8e357.pth',
    resnet101='https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    resnet152='https://download.pytorch.org/models/resnet152-b121ed2d.pth',
)


def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with a padding of 1."""
    conv = nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
    return conv


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    With ``noBN`` left at ``False`` the forward pass returns the activated
    block output. Otherwise it returns a pair: that same output and the
    residual sum computed from the second convolution before batch
    normalisation and ReLU.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, noBN=False):
        super(BasicBlock, self).__init__()
        self.noBN = noBN
        # first conv (carries the stride) + BN
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        # second conv + BN
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        # optional projection applied to the shortcut branch
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # main branch up to the second convolution (kept separately for the noBN output)
        pre_bn = self.conv2(self.relu(self.bn1(self.conv1(x))))
        post_bn = self.bn2(pre_bn)

        # shortcut branch: identity unless a downsample module was given
        shortcut = x if self.downsample is None else self.downsample(x)

        activated = self.relu(post_bn + shortcut)
        if self.noBN is False:
            return activated
        return activated, pre_bn + shortcut


class Bottleneck(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack.

    The last 1x1 convolution outputs ``planes * 4`` channels; the stride is
    applied by the 3x3 convolution.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        wide_planes = planes * 4
        # 1x1 reduction
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # 3x3 convolution carrying the stride
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # 1x1 expansion
        self.conv3 = nn.Conv2d(planes, wide_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(wide_planes)
        self.relu = nn.ReLU(inplace=True)
        # optional projection applied to the shortcut branch
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # shortcut branch: identity unless a downsample module was given
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """ResNet backbone whose forward pass also exposes the layer4 feature maps.

    ``forward`` returns ``(logits, layer4_output)``. When ``noBN`` is true the
    last block of layer4 is built with ``noBN=True`` and the second value that
    block returns is appended as a third element of the result.
    """

    def __init__(self, block, layers, num_classes=1000, noBN=False):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.noBN = noBN
        # stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # four residual stages; only the last one may receive the noBN flag
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2, noBN=self.noBN)
        # classification head
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # re-initialise every convolution and batch-norm layer created above
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan_out = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, noBN=False):
        """Stack ``block`` instances into one stage and update ``self.inplanes``.

        When ``noBN`` is not ``False`` the stage always ends with one block
        built with ``noBN=True``, preceded by ``blocks - 2`` regular blocks
        after the first one.
        """
        out_planes = planes * block.expansion

        # a projection is needed on the shortcut when resolution or width changes
        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_planes

        if noBN is False:
            layers.extend(block(self.inplanes, planes) for _ in range(1, blocks))
        else:
            # the range below is empty for blocks <= 2, leaving only the noBN block
            layers.extend(block(self.inplanes, planes) for _ in range(1, blocks - 1))
            layers.append(block(self.inplanes, planes, noBN=True))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        x = self.layer3(self.layer2(self.layer1(x)))

        layer4_out = self.layer4(x)
        if self.noBN:
            # the last block returned a pair
            conv_layer4BN, conv_layer4NBN = layer4_out
        else:
            conv_layer4BN = layer4_out

        pooled = self.avgpool(conv_layer4BN)
        logits = self.fc(pooled.view(pooled.size(0), -1))

        if self.noBN:
            return logits, conv_layer4BN, conv_layer4NBN
        return logits, conv_layer4BN


def resnet18(pretrained=False, noBN=False, **kwargs):
    """Build a ResNet-18 (``BasicBlock`` with stage sizes [2, 2, 2, 2]).

    Args:
        pretrained (bool): If True, load the ImageNet checkpoint listed in
            ``model_urls``; loading is non-strict.
        noBN (bool): forwarded to ``ResNet``.
        **kwargs: forwarded to ``ResNet``.
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], noBN=noBN, **kwargs)
    if not pretrained:
        return net
    checkpoint = model_zoo.load_url(model_urls['resnet18'])
    net.load_state_dict(checkpoint, strict=False)
    return net


def resnet34(pretrained=False, noBN=False, **kwargs):
    """Build a ResNet-34 (``BasicBlock`` with stage sizes [3, 4, 6, 3]).

    Args:
        pretrained (bool): If True, load the ImageNet checkpoint listed in
            ``model_urls``; loading is non-strict.
        noBN (bool): forwarded to ``ResNet``.
        **kwargs: forwarded to ``ResNet``.
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], noBN=noBN, **kwargs)
    if not pretrained:
        return net
    checkpoint = model_zoo.load_url(model_urls['resnet34'])
    net.load_state_dict(checkpoint, strict=False)
    return net


def resnet50(pretrained=False, **kwargs):
    """Build a ResNet-50 (``Bottleneck`` with stage sizes [3, 4, 6, 3]).

    Args:
        pretrained (bool): If True, load the ImageNet checkpoint listed in
            ``model_urls`` (strict loading).
        **kwargs: forwarded to ``ResNet``.
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    checkpoint = model_zoo.load_url(model_urls['resnet50'])
    net.load_state_dict(checkpoint)
    return net


def resnet101(pretrained=False, **kwargs):
    """Build a ResNet-101 (``Bottleneck`` with stage sizes [3, 4, 23, 3]).

    Args:
        pretrained (bool): If True, load the ImageNet checkpoint listed in
            ``model_urls`` (strict loading).
        **kwargs: forwarded to ``ResNet``.
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    checkpoint = model_zoo.load_url(model_urls['resnet101'])
    net.load_state_dict(checkpoint)
    return net


def resnet152(pretrained=False, **kwargs):
    """Build a ResNet-152 (``Bottleneck`` with stage sizes [3, 8, 36, 3]).

    Args:
        pretrained (bool): If True, load the ImageNet checkpoint listed in
            ``model_urls`` (strict loading).
        **kwargs: forwarded to ``ResNet``.
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    checkpoint = model_zoo.load_url(model_urls['resnet152'])
    net.load_state_dict(checkpoint)
    return net

#our model implementation

In [63]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable



# LSTM
class MyLSTMCell(nn.Module):
    """Single-step LSTM cell built from one pair of linear maps per gate.

    Each gate combines a biased projection of the input (``lin_*_xx``) with an
    unbiased projection of the previous hidden state (``lin_*_hh``).

    Args:
        input_size (int): number of features of the input ``x``.
        hidden_size (int): number of features of the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super(MyLSTMCell, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # input gate
        self.lin_i_xx = nn.Linear(input_size, hidden_size)
        self.lin_i_hh = nn.Linear(hidden_size, hidden_size, bias=False)

        # forget gate
        self.lin_f_xx = nn.Linear(input_size, hidden_size)
        self.lin_f_hh = nn.Linear(hidden_size, hidden_size, bias=False)

        # candidate cell state
        self.lin_c_xx = nn.Linear(input_size, hidden_size)
        self.lin_c_hh = nn.Linear(hidden_size, hidden_size, bias=False)

        # output gate
        self.lin_o_xx = nn.Linear(input_size, hidden_size)
        self.lin_o_hh = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, x, state):
        """Advance the cell by one time step.

        Args:
            x: input of shape ``(batch, input_size)``.
            state: pair ``(h, c)`` of tensors of shape ``(batch, hidden_size)``,
                or ``None`` to start from a randomly drawn state.

        Returns:
            The pair ``(h_t, c_t)`` of the new hidden and cell states.
        """
        if state is None:
            # The state must have hidden_size features (not the input width) and live
            # on the same device as x, so the cell also works on CPU and when
            # input_size != hidden_size.
            shape = (x.size(0), self.hidden_size)
            state = (torch.randn(shape, device=x.device, dtype=x.dtype),
                     torch.randn(shape, device=x.device, dtype=x.dtype))

        ht_1, ct_1 = state
        it = torch.sigmoid(self.lin_i_xx(x) + self.lin_i_hh(ht_1))
        ft = torch.sigmoid(self.lin_f_xx(x) + self.lin_f_hh(ht_1))
        ct_tilde = torch.tanh(self.lin_c_xx(x) + self.lin_c_hh(ht_1))
        # new cell state: gated candidate plus gated previous cell state
        ct = (ct_tilde * it) + (ct_1 * ft)
        ot = torch.sigmoid(self.lin_o_xx(x) + self.lin_o_hh(ht_1))
        ht = ot * torch.tanh(ct)
        return ht, ct


#ConvLSTM
class MyConvLSTMCell(nn.Module):
    """Single-step convolutional LSTM cell.

    Each gate combines a biased convolution of the input (``conv_*_xx``) with
    an unbiased convolution of the previous hidden state (``conv_*_hh``).

    Args:
        input_size (int): number of channels of the input ``x``.
        hidden_size (int): number of channels of the hidden and cell states.
        kernel_size, stride, padding: passed to every convolution.
    """

    def __init__(self, input_size, hidden_size, kernel_size=3, stride=1, padding=1):
        super(MyConvLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # input gate
        self.conv_i_xx = nn.Conv2d(input_size, hidden_size, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv_i_hh = nn.Conv2d(hidden_size, hidden_size, kernel_size=kernel_size, stride=stride, padding=padding,
                                   bias=False)

        # forget gate
        self.conv_f_xx = nn.Conv2d(input_size, hidden_size, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv_f_hh = nn.Conv2d(hidden_size, hidden_size, kernel_size=kernel_size, stride=stride, padding=padding,
                                   bias=False)

        # candidate cell state
        self.conv_c_xx = nn.Conv2d(input_size, hidden_size, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv_c_hh = nn.Conv2d(hidden_size, hidden_size, kernel_size=kernel_size, stride=stride, padding=padding,
                                   bias=False)

        # output gate
        self.conv_o_xx = nn.Conv2d(input_size, hidden_size, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv_o_hh = nn.Conv2d(hidden_size, hidden_size, kernel_size=kernel_size, stride=stride, padding=padding,
                                   bias=False)

        # Xavier-normal weights and zero biases, applied after all layers exist and in
        # gate order i, f, c, o so the sequence of random draws is unchanged
        for gate in ('i', 'f', 'c', 'o'):
            conv_xx = getattr(self, 'conv_%s_xx' % gate)
            conv_hh = getattr(self, 'conv_%s_hh' % gate)
            torch.nn.init.xavier_normal_(conv_xx.weight)
            torch.nn.init.constant_(conv_xx.bias, 0)
            torch.nn.init.xavier_normal_(conv_hh.weight)

    def forward(self, x, state):
        """Advance the cell by one time step.

        Args:
            x: input feature map of shape ``(batch, input_size, H, W)``.
            state: pair ``(h, c)`` of feature maps with ``hidden_size`` channels,
                or ``None`` to start from a randomly drawn state.

        Returns:
            The pair ``(h_t, c_t)`` of the new hidden and cell states.
        """
        input_gate_x = self.conv_i_xx(x)
        if state is None:
            # Shape the default state like a gate activation: hidden_size channels
            # (not the input channels), matching spatial size, same device/dtype as x.
            state = (torch.randn_like(input_gate_x), torch.randn_like(input_gate_x))

        ht_1, ct_1 = state
        it = torch.sigmoid(input_gate_x + self.conv_i_hh(ht_1))
        ft = torch.sigmoid(self.conv_f_xx(x) + self.conv_f_hh(ht_1))
        ct_tilde = torch.tanh(self.conv_c_xx(x) + self.conv_c_hh(ht_1))
        # new cell state: gated candidate plus gated previous cell state
        ct = (ct_tilde * it) + (ct_1 * ft)
        ot = torch.sigmoid(self.conv_o_xx(x) + self.conv_o_hh(ht_1))
        ht = ot * torch.tanh(ct)
        return ht, ct




#Network
class ourModel(nn.Module):
    """Video classifier on top of a pretrained ResNet-34 backbone.

    ``homework_step`` selects how per-frame features are aggregated:
        0 - average of the pooled frame features,
        1 - ``MyLSTMCell`` over the pooled frame features,
        2 - ``MyConvLSTMCell`` over the spatial frame features.

    Args:
        num_classes (int): size of the classifier output.
        mem_size (int): feature size fed to the classifier (state size of the
            recurrent cell). For step 0 the averaged frame features are added
            to a ``(batch, mem_size)`` accumulator, so ``mem_size`` has to
            match the backbone feature width (512 per the comments below).
        homework_step (int): aggregation mode, see above.
        DEVICE: stored as ``self.DEVICE`` for callers; the forward pass itself
            follows the device of its input.
    """

    def __init__(self, num_classes=61, mem_size=512, homework_step = 0 , DEVICE=""):
        super(ourModel, self).__init__()
        self.DEVICE = DEVICE
        self.num_classes = num_classes
        # pretrained backbone built with noBN=True, so it returns three values
        self.resNet = resnet34(True, True)
        self.mem_size = mem_size
        # alias of the backbone's fc weight (this registers the parameter under this name too)
        self.weight_softmax = self.resNet.fc.weight
        self.homework_step = homework_step
        if self.homework_step == 1:
            self.lstm_cell = MyLSTMCell(512, mem_size)
        elif self.homework_step == 2:
            self.lstm_cell = MyConvLSTMCell(512, mem_size)

        self.avgpool = nn.AvgPool2d(7)
        self.dropout = nn.Dropout(0.7)
        self.fc = nn.Linear(mem_size, self.num_classes)
        self.classifier = nn.Sequential(self.dropout, self.fc)

    def forward(self, inputVariable):
        """Classify a batch of frame sequences.

        Args:
            inputVariable: tensor indexed as ``(frames, batch, ...)``; each
                ``inputVariable[t]`` is passed to the backbone.

        Returns:
            ``(logits, video_level_features)``.

        Raises:
            ValueError: if ``homework_step`` is not 0, 1 or 2.
        """
        if self.homework_step not in (0, 1, 2):
            # previously this fell through and returned None
            raise ValueError("unsupported homework_step: {!r}".format(self.homework_step))

        n_frames = inputVariable.size(0)
        batch_size = inputVariable.size(1)
        # Allocate accumulators/states where the input lives; this also works when the
        # model was built with the default DEVICE="" (which is not a valid device).
        device = inputVariable.device

        #Learning without Temporal information (mean)
        if self.homework_step == 0:
            video_level_features = torch.zeros((batch_size, self.mem_size), device=device)
            for t in range(n_frames):
                #spatial_frame_feat: (bs, 512, 7, 7)
                _, spatial_frame_feat, _ = self.resNet(inputVariable[t])
                #frame_feat: (bs, 512)
                frame_feat = self.avgpool(spatial_frame_feat).view(spatial_frame_feat.size(0), -1)
                video_level_features = video_level_features + frame_feat

            video_level_features = video_level_features / n_frames

        #Learning with Temporal information (LSTM)
        elif self.homework_step == 1:
            state = (torch.zeros((batch_size, self.mem_size), device=device),
                     torch.zeros((batch_size, self.mem_size), device=device))
            for t in range(n_frames):
                #spatial_frame_feat: (bs, 512, 7, 7)
                _, spatial_frame_feat, _ = self.resNet(inputVariable[t])
                #frame_feat: (bs, 512)
                frame_feat = self.avgpool(spatial_frame_feat).view(state[1].size(0), -1)
                state = self.lstm_cell(frame_feat, state)

            # the second element of the state (the cell state) is what gets classified
            video_level_features = state[1]

        #Learning with Temporal information (ConvLSTM)
        else:
            state = (torch.zeros((batch_size, self.mem_size, 7, 7), device=device),
                     torch.zeros((batch_size, self.mem_size, 7, 7), device=device))
            for t in range(n_frames):
                #spatial_frame_feat: (bs, 512, 7, 7)
                _, spatial_frame_feat, _ = self.resNet(inputVariable[t])
                state = self.lstm_cell(spatial_frame_feat, state)
            # pool the final cell state down to one vector per video
            video_level_features = self.avgpool(state[1]).view(state[1].size(0), -1)

        logits = self.classifier(video_level_features)
        return logits, video_level_features

#Build Model - Loss - Opt

In [65]:
#CUDA_LAUNCH_BLOCKING=1
validate = True

model = ourModel(num_classes=NUM_CLASSES, mem_size=MEM_SIZE, homework_step=homework_step, DEVICE=DEVICE) #model

# Train only the lstm cell (when there is one) and the classifier:
# start with everything frozen and in eval mode ...
model.train(False)
for frozen_param in model.parameters():
    frozen_param.requires_grad = False

# ... then re-enable gradients and train mode on the trainable sub-modules
trainable_modules = [model.classifier]
if homework_step > 0:
    trainable_modules.insert(0, model.lstm_cell)
for trainable_module in trainable_modules:
    for trainable_param in trainable_module.parameters():
        trainable_param.requires_grad = True
    trainable_module.train(True)


model = model.to(DEVICE)

# the checkpoint is read onto the CPU and copied into the already-moved model
checkpoint_state = torch.load("best_model_state_dict_rgb_split2.pth", map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state, strict=True)


#Loss
loss_fn = nn.CrossEntropyLoss()
#Opt: only parameters that still require gradients are optimised
trainable_params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer_fn = optim.Adam(trainable_params, lr=LR, weight_decay=WEIGHT_DECAY, eps=1e-4)
#Scheduler: multiply the learning rate by GAMMA at each milestone in STEP_SIZE
optim_scheduler = optim.lr_scheduler.MultiStepLR(optimizer_fn, milestones=STEP_SIZE, gamma=GAMMA)


#Training


In [66]:
train_iter = 0
val_iter = 0
min_accuracy = 0   # despite its name: the best validation accuracy seen so far
val_accuracy = 0   # defined up front so the final report works even if validation never runs

# the train loader drops the last incomplete batch, so only full batches are counted
trainSamples = len(train_dataset) - (len(train_dataset) % BATCH_SIZE)
val_samples = len(test_dataset)
iterPerEpoch = len(train_loader)
val_steps = len(val_loader)
# the original line was the bare expression `cudnn.benchmark`, which has no effect;
# assigning True actually enables cuDNN's algorithm auto-tuning
cudnn.benchmark = True
model_checkpoint = "model" #name


for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    numCorrTrain = 0

    # only the trainable blocks go back to train mode; validation below
    # switches the whole model to eval mode
    if homework_step > 0:
        model.lstm_cell.train(True)
    model.classifier.train(True)


    for i, (inputs, targets) in enumerate(train_loader):
        train_iter += 1
        optimizer_fn.zero_grad()

        # (BS, Frames, C, W, H) --> (Frames, BS, C, W, H)
        inputVariable = inputs.permute(1, 0, 2, 3, 4).to(DEVICE)
        labelVariable = targets.to(DEVICE)

        # feeds in model
        output_label, _ = model(inputVariable)

        # compute loss
        loss = loss_fn(output_label, labelVariable)

        # backward loss and optimizer step
        loss.backward()
        optimizer_fn.step()

        #compute the training accuracy
        _, predicted = torch.max(output_label.data, 1)
        numCorrTrain += (predicted == labelVariable).sum().item()
        step_loss = loss.item()
        epoch_loss += step_loss

    avg_loss = epoch_loss/iterPerEpoch
    trainAccuracy = (numCorrTrain / trainSamples) * 100
    #train_logger.add_epoch_data(epoch+1, trainAccuracy, avg_loss)
    print(Fore.BLACK + 'Train: Epoch = {} | Loss = {:.3f} | Accuracy = {:.3f}'.format(epoch+1, avg_loss, trainAccuracy))
    if validate:
        # validation frequency in epochs (1 = every epoch)
        if (epoch+1) % 1 == 0:
            model.train(False)
            val_loss_epoch = 0
            numCorr = 0
            # no gradients are needed for evaluation; skipping the autograd graph
            # saves memory and matches the test cell
            with torch.no_grad():
                for j, (inputs, targets) in enumerate(val_loader):
                    val_iter += 1
                    inputVariable = inputs.permute(1, 0, 2, 3, 4).to(DEVICE)
                    labelVariable = targets.to(DEVICE)

                    output_label, _ = model(inputVariable)
                    val_loss = loss_fn(output_label, labelVariable)
                    val_loss_step = val_loss.item()
                    val_loss_epoch += val_loss_step
                    _, predicted = torch.max(output_label.data, 1)
                    numCorr += (predicted == labelVariable).sum().item()
                    #val_logger.add_step_data(val_iter, numCorr, val_loss_step)

            val_accuracy = (numCorr / val_samples) * 100
            avg_val_loss = val_loss_epoch / val_steps

            print(Fore.GREEN + 'Val: Epoch = {} | Loss {:.3f} | Accuracy = {:.3f}'.format(epoch + 1, avg_val_loss, val_accuracy))
            # keep the checkpoint with the best validation accuracy
            if val_accuracy > min_accuracy:
                print("[||| NEW BEST on val||||]")
                save_path_model = os.path.join(model_folder, model_checkpoint)
                torch.save(model.state_dict(), save_path_model)
                min_accuracy = val_accuracy

    optim_scheduler.step()

print(Fore.CYAN + "Best Acc --> ", min_accuracy)
print(Fore.CYAN + "Last Acc --> ", val_accuracy)


[30mTrain: Epoch = 1 | Loss = 4.551 | Accuracy = 12.500
[32mVal: Epoch = 1 | Loss 2.741 | Accuracy = 26.724
[||| NEW BEST on val||||]
[30mTrain: Epoch = 2 | Loss = 3.350 | Accuracy = 16.875
[32mVal: Epoch = 2 | Loss 2.647 | Accuracy = 25.000
[30mTrain: Epoch = 3 | Loss = 3.084 | Accuracy = 17.500
[32mVal: Epoch = 3 | Loss 2.466 | Accuracy = 28.448
[||| NEW BEST on val||||]
[30mTrain: Epoch = 4 | Loss = 2.902 | Accuracy = 21.562
[32mVal: Epoch = 4 | Loss 2.557 | Accuracy = 30.172
[||| NEW BEST on val||||]
[30mTrain: Epoch = 5 | Loss = 2.809 | Accuracy = 22.188
[32mVal: Epoch = 5 | Loss 2.302 | Accuracy = 36.207
[||| NEW BEST on val||||]
[30mTrain: Epoch = 6 | Loss = 2.734 | Accuracy = 25.000
[32mVal: Epoch = 6 | Loss 2.409 | Accuracy = 33.621
[30mTrain: Epoch = 7 | Loss = 2.560 | Accuracy = 30.000
[32mVal: Epoch = 7 | Loss 2.273 | Accuracy = 37.931
[||| NEW BEST on val||||]
[30mTrain: Epoch = 8 | Loss = 2.473 | Accuracy = 30.938
[32mVal: Epoch = 8 | Loss 2.244 | Accuracy

#Test

In [67]:
# Evaluate the current model on the test loader (whole model in eval mode)
model.eval()
val_samples = len(test_dataset)
val_steps = len(val_loader)
val_loss_epoch = 0
numCorr = 0
val_iter = 0

with torch.no_grad():
    for inputs, targets in val_loader:
        val_iter += 1
        # (BS, Frames, C, W, H) --> (Frames, BS, C, W, H)
        inputVariable = inputs.permute(1, 0, 2, 3, 4).to(DEVICE)
        labelVariable = targets.to(DEVICE)

        output_label, _ = model(inputVariable)

        # accumulate the batch loss
        val_loss = loss_fn(output_label, labelVariable)
        val_loss_step = val_loss.item()
        val_loss_epoch += val_loss_step

        # accumulate the number of correct top-1 predictions
        predicted = output_label.data.max(1)[1]
        numCorr += (predicted == labelVariable.data).sum().item()

# accuracy over all samples, loss averaged over batches
val_accuracy = (numCorr / val_samples) * 100
avg_val_loss = val_loss_epoch / val_steps

print('Loss {:.3f} | Accuracy = {:.3f}'.format(avg_val_loss, val_accuracy))

Loss 1.771 | Accuracy = 47.414


#**Learning with Temporal information** (LSTM)

#**Learning with Spatio-Temporal information** (ConvLSTM)



