In [75]:
import os
import json
import pickle
import random
import requests
import numpy as np
import pandas as pd


import torch
import torch.nn as nn
from PIL import Image
from typing import Dict
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import pytorchvideo.transforms as video_transform
import torchvision.transforms._transforms_video as transform_video
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (CenterCropVideo,
                                                      NormalizeVideo)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (ApplyTransformToKey,
                                     ShortSideScale,
                                     UniformTemporalSubsample,
                                     UniformCropVideo)

## Data transformation according to PytorchVideo tutorial

In [3]:
class PackPathway(torch.nn.Module):
    """
    Transform that splits a clip into the two inputs of a SlowFast network.

    The fast pathway is the clip itself; the slow pathway keeps
    ``num_frames // alpha`` evenly spaced frames taken along dimension 1.

    :param alpha: ratio between the number of frames of the fast and of the
        slow pathway
    """
    def __init__(self, alpha):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """
        Build the slow and the fast pathway from a clip.

        :param frames: clip tensor whose dimension 1 indexes the frames
        :return: the list ``[slow_pathway, fast_pathway]``
        """
        fast_pathway = frames
        num_frames = frames.shape[1]
        # Perform temporal sampling from the fast pathway. The indices are
        # computed exactly as before and then moved to the device of the clip,
        # because index_select needs the index on the same device as its input.
        slow_indices = torch.linspace(
            0, num_frames - 1, num_frames // self.alpha
        ).long().to(frames.device)
        slow_pathway = torch.index_select(frames, 1, slow_indices)
        return [slow_pathway, fast_pathway]
    
def transform_image(train=True, video=False):
    """
    Get the transform to be used on an image.

    :param train: when ``video`` is False, selects the augmented training
        transform (True) or the deterministic evaluation transform (False);
        ignored when ``video`` is True
    :param video: if True, return the per-frame transform used for clips
    :return: A transform
    """
    if video:
        # Frames of a clip are only converted to tensors here; scaling and
        # normalisation are done by transformer_video, which divides by 255.
        # PILToTensor keeps the raw 0-255 pixel values. ToTensor would already
        # rescale them to [0, 1], so the clip would be rescaled twice.
        return transforms.Compose([transforms.PILToTensor()])

    data_mean = [0.485, 0.456, 0.406]
    data_std = [0.229, 0.224, 0.225]

    if train:
        return transforms.Compose([
            transforms.RandomResizedCrop((224, 224), scale=(0.8, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=data_mean, std=data_std)
        ])
    else:
        return transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=data_mean, std=data_std)
        ])

def transformer_video():
    """
    Build the clip-level transformation used for SlowFast.

    The returned transform is applied to the ``"video"`` entry of a sample
    dictionary and runs, in order: uniform temporal subsampling to 32 frames,
    division by 255, normalisation, short-side scaling to 256, a center crop
    of size 256 and the split into slow/fast pathways (alpha = 4).

    :return: the composed transform, wrapped in ``ApplyTransformToKey``
    """
    frames_per_clip = 32
    pathway_alpha = 4
    short_side = 256
    crop = 256
    channel_mean = [0.45, 0.45, 0.45]
    channel_std = [0.225, 0.225, 0.225]

    steps = [
        video_transform.UniformTemporalSubsample(frames_per_clip),
        transforms.Lambda(lambda clip: clip/255.0),
        transform_video.NormalizeVideo(channel_mean, channel_std),
        video_transform.ShortSideScale(size=short_side),
        transform_video.CenterCropVideo(crop),
        PackPathway(pathway_alpha),
    ]
    return video_transform.ApplyTransformToKey(key="video",
                                               transform=transforms.Compose(steps))

In [4]:
class CustomVideoDataset(Dataset):
    """
    A custom dataset used to create dataloaders over clips stored as frames.

    Each item is built by loading every frame listed for a clip, stacking the
    frames into one tensor and applying the clip-level transform.
    """

    def __init__(self, filepaths, labels=None, label_map=None,
                 transform_img=None, transform_vid=None, clips_dictionary=None):
        """
        Create a new CustomVideoDataset.

        :param filepaths: A list of clip paths; the basename of each path must
            be the integer id of the clip
        :param labels: A list of labels, aligned with ``filepaths``
        :param label_map: A dictionary to map string labels to integers
        :param transform_img: A transform to perform on each frame
        :param transform_vid: A transform to perform on the ``{'video': clip}``
            dictionary
        :param clips_dictionary: dictionary (clip id as string -> list of frame
            paths) used to get the frames of a clip
        """
        self.filepaths = filepaths
        self.labels = labels
        self.label_map = label_map
        self.transform_img = transform_img
        self.transform_vid = transform_vid
        self.clips_dictionary = clips_dictionary

    def __getitem__(self, index):
        """
        Load and transform the clip at position ``index``.

        :param index: position of the clip in ``filepaths``
        :return: the transformed sample, or ``(sample, label)`` when labels
            were given
        """
        # The clip id is the last path component; it is looked up as a string.
        clip_id = int(os.path.basename(self.filepaths[index]))
        frames_paths = self.clips_dictionary[str(clip_id)]

        frames = []
        for frame_path in frames_paths:
            # convert() returns an independent image, so the file handle can be
            # released right away instead of staying open until collected.
            with Image.open(frame_path) as handle:
                frame = handle.convert('RGB')
            if self.transform_img is not None:
                frame = self.transform_img(frame)
            frames.append(frame)

        # Stack along a new leading frame dimension (all frames need the same
        # size), then swap the first two dimensions.
        clip = torch.stack(frames)
        clip = torch.transpose(clip, 0, 1)
        sample = {'video': clip}
        if self.transform_vid is not None:
            sample = self.transform_vid(sample)

        if self.labels is None:
            return sample

        if self.label_map is not None:
            label = torch.tensor(self.label_map[(self.labels[index])])
        else:
            label = torch.tensor(int(self.labels[index]))
        return sample, label

    def __len__(self):
        """Return the number of clips."""
        return len(self.filepaths)


## Use the API to get the same data as JPL for training

In [22]:
def create_session(task_name):
    """
    Create a new session and return its token.

    Posts to ``auth/create_session`` using the module-level ``team_secret``
    and ``data_type``.

    :param task_name: The name of the task (problem), sent as ``task_id``
    :return: The ``session_token`` field of the response
    """
    auth_headers = {'user_secret': team_secret,
                    'govteam_secret': ''}
    payload = {'session_name': 'testing', 'data_type': data_type, 'task_id': task_name}
    return post_only_once("auth/create_session", auth_headers, payload)['session_token']

def post_only_once(command, headers, posting_json):
    """
    Send one POST request to the API and return the decoded JSON reply.

    :param command: endpoint path appended to the module-level ``url``
    :param headers: HTTP headers of the request
    :param posting_json: request body, sent as JSON
    :return: the parsed JSON body of the response
    """
    endpoint = url + "/" + command
    reply = requests.post(endpoint, json=posting_json, headers=headers)
    return reply.json()

def get_only_once(command, headers):
    """
    Send one GET request to the API and return the decoded JSON reply.

    :param command: endpoint path appended to the module-level ``url``
    :param headers: HTTP headers of the request
    :return: the parsed JSON body of the response
    """
    endpoint = url + "/" + command
    reply = requests.get(endpoint, headers=headers)
    return reply.json()
    
def get_session_status():
    """
    Get the session status.

    Uses the module-level ``team_secret`` and ``session_token`` to query the
    ``session_status`` endpoint.

    :return: The ``Session_Status`` entry of the response, or an empty
        dictionary when the response has no such entry
    """
    headers = {'user_secret': team_secret,
               'govteam_secret': '',
               'session_token': session_token}
    # Reuse the shared GET helper and decode the body only once.
    status_payload = get_only_once("session_status", headers)
    if 'Session_Status' in status_payload:
        return status_payload['Session_Status']
    return {}

In [11]:
# API endpoint, credentials and task used to open the session.
url = 'https://api-dev.lollllz.com/'
# FIXME(security): credentials should not be stored in the notebook. The secret
# is taken from the LWLL_TEAM_SECRET environment variable when it is set; the
# literal is kept only as a fallback so that existing runs keep working, and
# should be rotated and removed.
team_secret = os.environ.get('LWLL_TEAM_SECRET', 'a5aed2a8-db80-4b22-bf72-11f2d0765572')
data_type = 'full'
task_name = '9c103cc4-e2e1-4070-9877-c3b64a6f327f'
# Opens a session on the server (network call) and keeps its token.
session_token = create_session(task_name)


# I ran the code on the VM, that's why the path has mnt/
dataset_dir = '/mnt/lwll/datasets/external'
dataset_name = 'hmdb'
data_type = 'full'

# Metadata and frames of the evaluation (test) split.
evaluation_meta_path = os.path.join(
    dataset_dir, dataset_name, f"labels_{data_type}", "meta_test.feather")
evaluation_image_path = os.path.join(
    dataset_dir, dataset_name, f"{dataset_name}_{data_type}", "test")

# Frames of the training split.
unlabeled_image_path = os.path.join(
    dataset_dir, dataset_name, f"{dataset_name}_{data_type}", "train")

# Label files of the test and of the training split.
test_labels_path = os.path.join(
    dataset_dir, '..', 'external', dataset_name,
    f"labels_{data_type}", "labels_test.feather")
all_train_labels_path = os.path.join(
    dataset_dir, '..', 'external', dataset_name,
    f"labels_{data_type}", "labels_train.feather")

In [13]:
# Ask the API for the seed labels of the current session.
headers = dict(user_secret=team_secret,
               govteam_secret='',
               session_token=session_token)
response = get_only_once("seed_labels", headers)
labels = response['Labels']

In [27]:
# Create the class mapping: class name -> position in the list of classes
# reported by the session status.
session_status = get_session_status()
current_dataset = session_status['current_dataset']

classes = current_dataset['classes']
class_names = classes

label_map = {class_name: position for position, class_name in enumerate(class_names)}

In [29]:
# Display the class-name -> integer-index mapping.
label_map

{'turn': 0,
 'swing_baseball': 1,
 'eat': 2,
 'somersault': 3,
 'situp': 4,
 'shoot_bow': 5,
 'kiss': 6,
 'draw_sword': 7,
 'sit': 8,
 'golf': 9,
 'pushup': 10,
 'shoot_ball': 11,
 'pullup': 12,
 'stand': 13,
 'jump': 14,
 'fencing': 15,
 'smoke': 16,
 'push': 17,
 'cartwheel': 18,
 'flic_flac': 19,
 'hit': 20,
 'run': 21,
 'ride_horse': 22,
 'catch': 23,
 'sword_exercise': 24,
 'walk': 25,
 'talk': 26,
 'fall_floor': 27,
 'climb_stairs': 28,
 'climb': 29,
 'brush_hair': 30,
 'pick': 31,
 'punch': 32,
 'chew': 33,
 'throw': 34,
 'clap': 35,
 'dribble': 36,
 'drink': 37,
 'sword': 38,
 'kick_ball': 39,
 'pour': 40,
 'dive': 41,
 'shake_hands': 42,
 'ride_bike': 43,
 'smile': 44,
 'shoot_gun': 45,
 'laugh': 46,
 'wave': 47,
 'handstand': 48,
 'kick': 49,
 'hug': 50}

In [28]:
# Training data: for every seed-labelled clip, list its frames (relative paths
# "<clip id>/<frame number>.jpg", end frame included) and record its class.
dictionary_clips = {}
labeled_images = []
for clip in labels:
    frame_numbers = range(clip['start_frame'], clip['end_frame'] + 1)
    action_frames = [f"{clip['id']}/{number}.jpg" for number in frame_numbers]
    dictionary_clips[clip["id"]] = action_frames
    labeled_images.append([clip["class"], clip["id"]])

In [20]:
# Display the first ten frame paths of clip '881'.
dictionary_clips['881'][:10]

['881/80326.jpg',
 '881/80327.jpg',
 '881/80328.jpg',
 '881/80329.jpg',
 '881/80330.jpg',
 '881/80331.jpg',
 '881/80332.jpg',
 '881/80333.jpg',
 '881/80334.jpg',
 '881/80335.jpg']

In [26]:
# Display the first ten [class, clip id] pairs.
labeled_images[:10]

[['brush_hair', '881'],
 ['cartwheel', '1366'],
 ['catch', '93'],
 ['chew', '561'],
 ['clap', '294'],
 ['climb', '679'],
 ['climb_stairs', '1064'],
 ['dive', '1428'],
 ['draw_sword', '360'],
 ['dribble', '145']]

In [30]:
# Split the [class, clip id] pairs into two parallel tuples and turn every
# clip id into the path of its folder in the training split.
image_labels, image_names = zip(*labeled_images)
image_paths = [os.path.join(unlabeled_image_path, str(clip_name)) for clip_name in image_names]

In [35]:
# Prefix every frame path with the folder of the training split.
paths_dictionary_clips = {
    clip_key: [os.path.join(unlabeled_image_path, str(frame)) for frame in clip_frames]
    for clip_key, clip_frames in dictionary_clips.items()
}
dictionary_clips = paths_dictionary_clips

In [38]:
# Display the first ten frame paths of clip '881' after the prefix was added.
dictionary_clips['881'][:10]

['/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80326.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80327.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80328.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80329.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80330.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80331.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80332.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80333.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80334.jpg',
 '/mnt/lwll/datasets/external/hmdb/hmdb_full/train/881/80335.jpg']

In [41]:
# Arrays allow selecting clips with a list of indices.
image_paths = np.asarray(image_paths)
image_labels = np.asarray(image_labels)

# Every labelled clip is used for training; no validation split.
val_idx = []
train_idx = list(range(len(image_paths)))

In [44]:
# Dataset over the labelled training clips.
labeled_dataset = CustomVideoDataset(
    image_paths[train_idx],
    labels=image_labels[train_idx],
    label_map=label_map,
    transform_img=transform_image(video=True),
    transform_vid=transformer_video(),
    clips_dictionary=dictionary_clips,
)

# The validation dataset is only built when validation indices exist.
val_dataset = None
if len(val_idx) != 0:
    val_dataset = CustomVideoDataset(
        image_paths[val_idx],
        labels=image_labels[val_idx],
        label_map=label_map,
        transform_img=transform_image(video=True),
        transform_vid=transformer_video(),
        clips_dictionary=dictionary_clips,
    )

In [45]:
# Display the names of the attributes stored on the training dataset.
labeled_dataset.__dict__.keys()

dict_keys(['filepaths', 'labels', 'label_map', 'transform_img', 'transform_vid', 'clips_dictionary'])

In [48]:
# Evaluation data: list the frames of every clip described in the test
# metadata and build the (unlabelled) evaluation dataset.
# This rebinds image_paths and dictionary_clips, which held the training data.
image_paths = []
dictionary_clips = {}
test_meta = pd.read_feather(evaluation_meta_path)
for row_position, row in test_meta.iterrows():
    clip_folder = os.path.join(evaluation_image_path, str(row['id']))
    # NOTE(review): this range stops before end_frame, whereas the training
    # cell uses end_frame + 1 -- confirm which convention the metadata follows.
    action_frames = [f"{clip_folder}/{number}.jpg"
                     for number in range(row['start_frame'], row['end_frame'])]
    dictionary_clips[row["id"]] = action_frames
    image_paths.append(os.path.join(evaluation_image_path, str(row["id"])))

evaluation_dataset = CustomVideoDataset(
    image_paths,
    transform_img=transform_image(train=False, video=True),
    transform_vid=transformer_video(),
    clips_dictionary=dictionary_clips,
)

## Extract features

In [49]:
# Load model
device = "cpu"
model_name = "slowfast_r50"

# Fetch the pretrained network through torch.hub and move it to the desired
# device. Eval mode is not set here; it is enabled in a later cell.
model = torch.hub.load(
    "facebookresearch/pytorchvideo", model=model_name, pretrained=True
).to(device)

Using cache found in /home/cmenghini/.cache/torch/hub/facebookresearch_pytorchvideo_master


Here I replace the last layer (the projection) with the identity so that the
model outputs features. Looking at the JPL code (which relies on the paper's code,
https://gitlab.lollllz.com/bjohnson/lwll_baselines/-/blob/master/video_classification/tools/emb_net.py,
lines 83-88), you can see that besides the projection they also change `act`. That should be an activation
function, but I can't find it in the model I downloaded.

In [50]:
# Replace the projection of block 6 with an empty nn.Sequential, which returns
# its input unchanged.
model.blocks[6].proj = nn.Sequential()

In [51]:
# Switch the model to evaluation mode before extracting features.
model = model.eval()

In [52]:
# Extract training features: one clip per batch, in dataset order.
inference_loader = torch.utils.data.DataLoader(
    labeled_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

In [None]:
# Extract one 2304-dimensional feature vector and one label per training clip.
# To speed it up we can use the GPUs: the inputs follow `device`.
num_train_clips = len(labeled_dataset.filepaths)
X = np.zeros((num_train_clips, 2304))
# One entry per clip. The previous fixed size of 51 only worked when the
# dataset had at most 51 clips.
Y = np.zeros(num_train_clips)

# No gradients are needed for feature extraction.
with torch.no_grad():
    for i, batch in enumerate(inference_loader):
        # batch[0]['video'] holds the slow and the fast pathway.
        inputs = [pathway.to(device) for pathway in batch[0]['video']]
        # Named batch_labels so the global `labels` is not overwritten.
        batch_labels = batch[1]

        output = model(inputs)
        print(batch_labels.item())
        X[i, :] = output.cpu().numpy()[0]
        Y[i] = batch_labels.item()
        print(i + 1)

In [None]:
# Save the training features and labels; the context managers make sure the
# files are flushed and closed.
with open("X.p", "wb") as features_file:
    pickle.dump(X, features_file)
with open("Y.p", "wb") as labels_file:
    pickle.dump(Y, labels_file)

In [54]:
# Loader over the evaluation clips: one clip per batch, in dataset order.
evaluation_loader = torch.utils.data.DataLoader(
    evaluation_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)

In [None]:
# Extract one 2304-dimensional feature vector per evaluation clip.
# This operation takes long if executed on cpu; the inputs follow `device`.
eval_X = np.zeros((len(evaluation_dataset.filepaths), 2304))

# No gradients are needed for feature extraction.
with torch.no_grad():
    for i, batch in enumerate(evaluation_loader):
        # batch['video'] holds the slow and the fast pathway.
        inputs = [pathway.to(device) for pathway in batch['video']]

        output = model(inputs)
        eval_X[i, :] = output.cpu().numpy()[0]
        print(i + 1)

In [None]:
# Save the evaluation features; the context manager makes sure the file is
# flushed and closed.
with open("eval_X.p", "wb") as features_file:
    pickle.dump(eval_X, features_file)

In [None]:
def get_true_labels(split, mode, dict_clips=None, video=False):
    """
    Get the ground-truth integer labels of a split.

    Reads the labels file of the split, maps the ``class`` column through the
    module-level ``label_map`` and returns the labels in the order of the
    image names of the split.

    NOTE(review): ``get_unlabeled_image_names`` and
    ``get_evaluation_image_names`` are not defined in the visible part of the
    notebook -- confirm where they come from before calling this function.

    :param split: ``'train'`` for the training labels, anything else for the
        test labels
    :param mode: when ``'prod'``, no labels are read and None is returned
    :param dict_clips: forwarded to ``get_unlabeled_image_names``
    :param video: forwarded to the image-name helpers
    :return: a list of labels, or None when ``mode`` is ``'prod'``
    """
    if mode == 'prod':
        return None

    is_train = split == 'train'
    df = pd.read_feather(all_train_labels_path if is_train else test_labels_path)

    # Convert string labels to int labels, then index them by video id.
    df['class'] = df['class'].map(label_map)
    labels_dict = df.set_index('video_id')['class'].to_dict()

    # Get the names whose labels are requested.
    if is_train:
        image_names = get_unlabeled_image_names(dictionary_clips=dict_clips, video=video)
    else:
        image_names = get_evaluation_image_names(video=video)

    return [labels_dict[name] for name in image_names]

In [56]:
# Ground truth of the test split: video id -> integer class index.
df = pd.read_feather(test_labels_path)
mapped_label_col = df['class'].map(label_map)
df = df.assign(**{'class': mapped_label_col}).set_index('video_id')
labels_dict = df['class'].to_dict()

In [71]:
# Evaluation set ground truth, listed in the order of the labels file index.
true_names = list(df.index)
labels = [labels_dict[video_id] for video_id in true_names]

{'5412': 37,
 '5413': 21,
 '5414': 25,
 '5415': 28,
 '5416': 35,
 '5417': 29,
 '5418': 16,
 '5419': 27,
 '5420': 46,
 '5421': 23,
 '5422': 29,
 '5423': 44,
 '5424': 1,
 '5425': 33,
 '5426': 37,
 '5427': 11,
 '5428': 25,
 '5429': 36,
 '5430': 37,
 '5431': 44,
 '5432': 48,
 '5433': 1,
 '5434': 45,
 '5435': 37,
 '5436': 25,
 '5437': 14,
 '5438': 0,
 '5439': 13,
 '5440': 17,
 '5441': 8,
 '5442': 43,
 '5443': 13,
 '5444': 8,
 '5445': 46,
 '5446': 21,
 '5447': 5,
 '5448': 42,
 '5449': 13,
 '5450': 34,
 '5451': 48,
 '5452': 18,
 '5453': 7,
 '5454': 23,
 '5455': 28,
 '5456': 37,
 '5457': 20,
 '5458': 16,
 '5459': 25,
 '5460': 20,
 '5461': 25,
 '5462': 3,
 '5463': 35,
 '5464': 42,
 '5465': 12,
 '5466': 25,
 '5467': 25,
 '5468': 21,
 '5469': 29,
 '5470': 34,
 '5471': 3,
 '5472': 15,
 '5473': 38,
 '5474': 5,
 '5475': 30,
 '5476': 9,
 '5477': 3,
 '5478': 38,
 '5479': 0,
 '5480': 9,
 '5481': 48,
 '5482': 8,
 '5483': 37,
 '5484': 33,
 '5485': 25,
 '5486': 21,
 '5487': 11,
 '5488': 0,
 '5489': 25,
 '

# Train SVC

In [55]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import normalize

In [76]:
# Random predictions: one class index per evaluation label, drawn with equal
# weight for every class.
num_classes = len(classes)
rnd_predictions = random.choices(range(num_classes),
                                 [1/num_classes]*num_classes,
                                 k=len(labels))

preds_valid = rnd_predictions
# NOTE(review): eval_Y is not defined in the visible part of the notebook --
# presumably the evaluation ground truth as an array; confirm.
labs_valid = eval_Y

# Accuracy of the random baseline.
print((labs_valid == preds_valid).mean())

### Model training and prediction (this is the same model JPL uses: https://gitlab.lollllz.com/bjohnson/lwll_baselines/-/blob/master/video_classification/tools/shallow_model.py, lines 67-72)

In [None]:
# Normalise every feature vector (row) before fitting the linear classifier.
nembs_train = normalize(X, axis=1)
labs_train = Y
nembs_valid = normalize(eval_X, axis=1)
# NOTE(review): eval_Y is not defined in the visible part of the notebook --
# presumably the evaluation ground truth aligned with eval_X; confirm.
labs_valid = eval_Y

# The classifier gets its own name: rebinding `model` would overwrite the
# PyTorch feature extractor and break re-running the extraction cells.
svc_classifier = LinearSVC().fit(nembs_train, labs_train)
preds_valid = svc_classifier.predict(nembs_valid)
acc_valid = (labs_valid == preds_valid).mean()
print(acc_valid)