# Action Recognition over UCF101  

We use a pre-trained VGG16 model to extract a **d**-dimensional feature from each of the first 25 frames of every video (here **d** = 4096, so each video yields a 4096x25 feature matrix). This **dx25** sample is fed to an LSTM network, which outputs the action label of the video.


The performance is compared with that of an SVM trained on the stacked (flattened) **dx25** feature matrix.


Each raw 256x340 frame is reduced to five **nxn** crops, one at the image center and four at the corners. A **d**-dim feature is computed for each crop, and the five **d**-dim features are averaged to give the final feature representation of the raw frame.

The first 25 classes of the whole dataset are initially considered.

## Dataset
Download the dataset from [UCF101](http://vision.cs.stonybrook.edu/~yangwang/public/UCF101_images.tar) (image data for each video).

The **annos** folder contains the video labels and the label-to-class-name mapping.

---
Feature extraction:

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import time
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from skimage import io
from sklearn import svm
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader

In [None]:
# Configuration: inclusive range of class labels to keep, and annotation file paths
low_class_count_bound = 1
class_count = 25
video_labels_file = "annos/videos_labels_subsets.txt"
actions_file = "annos/actions.txt"

# Label -> action-name table (the two columns are separated by two spaces)
actions_map = pd.read_csv(actions_file, sep="  ", header=None, engine='python')
actions_map.columns = ["label", "label_name"]

# Per-video table: image folder name, class label and split flag (tab separated)
label_map = pd.read_csv(video_labels_file, sep="\t", header=None)
label_map.columns = ["image_name", "label", "train_flag"]

# Keep only the videos whose label lies in [low_class_count_bound, class_count]
label_in_range = (label_map.label >= low_class_count_bound) & (label_map.label <= class_count)
filtered_label_map = label_map.loc[label_in_range]

# Attach the action name to every video. The set_index/join/reset_index
# round-trip puts `label` in column 0 and `image_name` in column 1, which is
# the column order VideoImageDataset reads positionally.
actions_by_label = actions_map.set_index('label')
filtered_label_map = (
    filtered_label_map.set_index('label')
    .join(actions_by_label, lsuffix='_caller', rsuffix='_other')
    .reset_index()
)

# Split on the flag column: 1 selects the training videos, 2 the test videos
is_train = filtered_label_map.train_flag == 1
is_test = filtered_label_map.train_flag == 2
train_label_map = filtered_label_map.loc[is_train]
test_label_map = filtered_label_map.loc[is_test]

In [None]:
# Dataset class
class VideoImageDataset(Dataset):
    """Dataset that yields all frames of one video together with its label.

    Each row of ``init_df`` describes one video: column 0 holds the class
    label and column 1 the name of the folder (below ``root_dir``) that
    contains the frame images of that video.

    Args:
        init_df: DataFrame with one row per video (label in column 0, folder
            name in column 1).
        root_dir: Directory that contains one sub-folder per video.
        transform: Optional callable applied to every frame image.
        crop_transform: Optional callable applied to the
            ``{'img_list': ..., 'labels': ...}`` dict after ``transform``.
            It is only used when ``transform`` is given.
    """

    def __init__(self, init_df, root_dir, transform=None, crop_transform=None):
        self.labels_frame = init_df
        self.root_dir = root_dir
        self.transform = transform
        self.crop_transform = crop_transform

    def __len__(self):
        """Return the number of videos."""
        return len(self.labels_frame)

    @staticmethod
    def _frame_sort_key(file_name):
        """Return a natural-sort key so that ``frame2`` sorts before ``frame10``."""
        import re

        # re.split with a capturing group alternates text / digit runs, so
        # every odd position is a digit run and can be compared numerically.
        parts = re.split(r'(\d+)', file_name)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]

    def __getitem__(self, idx):
        """Load the frames of video ``idx``.

        Returns:
            A dict with keys ``'img_list'`` (the frames, transformed when a
            transform is configured) and ``'labels'`` (the class label).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        label = self.labels_frame.iloc[idx, 0]
        img_folder_path = os.path.join(self.root_dir, self.labels_frame.iloc[idx, 1])

        # os.listdir returns entries in arbitrary order; the frames feed a
        # sequence model, so they must be put into a deterministic order.
        frame_names = sorted(
            (name for name in os.listdir(img_folder_path)
             if os.path.isfile(os.path.join(img_folder_path, name))),
            key=self._frame_sort_key,
        )
        img_list = [io.imread(os.path.join(img_folder_path, name)) for name in frame_names]

        if not self.transform:
            return {'img_list': img_list, 'labels': label}

        # Per-frame transform (e.g. tensor conversion and normalisation)
        img_tuple = {'img_list': [self.transform(img) for img in img_list], 'labels': label}
        # Sample-level transform (e.g. 5-crop feature extraction); optional
        if self.crop_transform is not None:
            img_tuple = self.crop_transform(img_tuple)
        return img_tuple

In [None]:
######### VGG #########

import torchvision.models as models

# Use the first GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Pretrained VGG16 (batch-norm variant) that serves as the frame feature extractor
model_vgg16 = models.vgg16_bn(pretrained=True)

# Truncate the classifier to its first two modules so the network returns the
# activations of the first fully connected layer instead of class scores
model_vgg16.classifier = nn.Sequential(*list(model_vgg16.classifier.children())[:2])

# Inference mode, placed on the selected device
model_vgg16 = model_vgg16.eval().to(device)

# Height and width of the crops that are fed to the network
output_size = (224, 224)

In [None]:
# 5-Crop transformation class
class Crop(object):
    """Five-crop every frame and replace it by its averaged feature vector.

    For each frame (channels-first, ``(C, H, W)``) the four corner crops and
    the centre crop are taken, all crops of all frames are passed through
    ``vgg_model`` in a single batch, and the five feature vectors belonging to
    one frame are averaged.

    Args:
        output_size: Crop size, either an int or a ``(height, width)`` pair.
        vgg_model: Callable mapping a ``(N, C, h, w)`` float tensor to a
            ``(N, d)`` feature tensor.
    """

    # Crops taken per frame: four corners plus the centre
    CROPS_PER_IMAGE = 5

    def __init__(self, output_size, vgg_model):
        self.output_size = output_size
        self.vgg_model = vgg_model

    def _model_device(self):
        """Return the device of the model's parameters (or a sensible default)."""
        parameters = getattr(self.vgg_model, 'parameters', None)
        if callable(parameters):
            first_param = next(iter(parameters()), None)
            if first_param is not None:
                return first_param.device
        # Parameter-free callable: use the GPU when one is available
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Function to extract features using vgg16
    def get_vgg_features(self, img):
        """Run a batch of crops through the model without tracking gradients.

        Args:
            img: Batch of crops as a numpy array or tensor.

        Returns:
            The model output for the batch, on the device the model ran on.
        """
        with torch.no_grad():
            batch = torch.as_tensor(img).float().to(self._model_device())
            outputs = self.vgg_model(batch)
            # Drop the input batch early to free memory
            del batch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return outputs

    def __call__(self, img_tuple):
        """Turn ``{'img_list': frames, 'labels': y}`` into per-frame features.

        Returns:
            A dict with ``'img_list'`` (one averaged feature tensor per frame)
            and the unchanged ``'labels'``.

        Raises:
            ValueError: If a frame is smaller than the crop size.
        """
        size = self.output_size
        crop_h, crop_w = (size, size) if isinstance(size, int) else size
        img_list, labels = img_tuple['img_list'], img_tuple['labels']

        crops = []
        for img in img_list:
            img = torch.as_tensor(img)
            _, height, width = img.shape
            if height < crop_h or width < crop_w:
                raise ValueError(
                    "image of size %dx%d is smaller than the crop size %dx%d"
                    % (height, width, crop_h, crop_w))
            top = height // 2 - crop_h // 2
            left = width // 2 - crop_w // 2
            # 4 corners and one at the centre
            crops.extend([
                img[:, :crop_h, :crop_w],
                img[:, -crop_h:, :crop_w],
                img[:, :crop_h, -crop_w:],
                img[:, -crop_h:, -crop_w:],
                img[:, top:top + crop_h, left:left + crop_w],
            ])

        if not crops:
            return {'img_list': [], 'labels': labels}

        # One forward pass for all crops of all frames
        features = self.get_vgg_features(torch.stack(crops))

        # Crops were appended frame by frame, so rows 5*i .. 5*i+4 belong to
        # frame i; average within each group of five.
        grouped = features.reshape(len(img_list), self.CROPS_PER_IMAGE, -1)
        img_feat_list = list(grouped.mean(dim=1).unbind(dim=0))

        return {'img_list': img_feat_list, 'labels': labels}

In [None]:
# generate train dataset

# per-channel mean / std normalisation applied to every frame tensor
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# frame-level pipeline (tensor conversion + normalisation) and the sample-level
# 5-crop step that turns the frames into VGG features
frame_transform = transforms.Compose([transforms.ToTensor(), normalize])
feature_transform = transforms.Compose([Crop(output_size, model_vgg16)])

# training dataset
train_video_dataset = VideoImageDataset(train_label_map, root_dir=r"data/images",
                                        transform=frame_transform,
                                        crop_transform=feature_transform)
print("Total count: %d"%len(train_video_dataset))

# Compute the features of every video once and keep them in a list for saving
dataset_list = []
num_train_videos = len(train_video_dataset)
start = time.time()
for i in range(num_train_videos):
    # progress indicator: index of the video being processed
    print(i)
    dataset_list.append(train_video_dataset[i])
print("Time taken for feature generation: %f"%(time.time()-start))

# Persist the extracted features so later runs can skip the extraction
train_features_file = "train_dataset_"+str(low_class_count_bound)+"_to_"+str(class_count)+".pt"
start = time.time()
torch.save(dataset_list, train_features_file)
print("Time taken for saving features: %f"%(time.time()-start))

In [None]:
# Load the train dataset

# Name of the cached feature file written by the feature-generation step
train_features_file = "train_dataset_"+str(low_class_count_bound)+"_to_"+str(class_count)+".pt"

start = time.time()
# map_location="cpu": the cached tensors may have been saved from the GPU;
# mapping them to the CPU lets the file load on machines without CUDA, and the
# reshaping step converts the features to numpy on the CPU anyway.
# NOTE: torch.load unpickles the file - only load feature files you created.
train_video_dataset = torch.load(train_features_file, map_location="cpu")
print("Time taken for loading features: %f"%(time.time()-start))
print("Total count: %d"%len(train_video_dataset))

Time taken for loading features: 5.327873
Total count: 2409


In [None]:
# generate test dataset

# same pipelines as for the training split; `normalize` is the transform
# defined in the train-dataset cell
frame_transform = transforms.Compose([transforms.ToTensor(), normalize])
feature_transform = transforms.Compose([Crop(output_size, model_vgg16)])
test_video_dataset = VideoImageDataset(test_label_map, root_dir=r"data/images",
                                       transform=frame_transform,
                                       crop_transform=feature_transform)
print("Total count: %d"%len(test_video_dataset))

# Compute the features of every video once and keep them in a list for saving
dataset_list = []
num_test_videos = len(test_video_dataset)
start = time.time()
for i in range(num_test_videos):
    # progress indicator: index of the video being processed
    print(i)
    dataset_list.append(test_video_dataset[i])
print("Time taken for feature generation: %f"%(time.time()-start))

# Persist the extracted features so later runs can skip the extraction
test_features_file = "test_dataset_"+str(low_class_count_bound)+"_to_"+str(class_count)+".pt"
start = time.time()
torch.save(dataset_list, test_features_file)
print("Time taken for saving features: %f"%(time.time()-start))

In [None]:
# Load the test dataset

# Name of the cached feature file written by the feature-generation step
test_features_file = "test_dataset_"+str(low_class_count_bound)+"_to_"+str(class_count)+".pt"

start = time.time()
# map_location="cpu": the cached tensors may have been saved from the GPU;
# mapping them to the CPU lets the file load on machines without CUDA, and the
# reshaping step converts the features to numpy on the CPU anyway.
# NOTE: torch.load unpickles the file - only load feature files you created.
test_video_dataset = torch.load(test_features_file, map_location="cpu")
print("Time taken for loading features: %f"%(time.time()-start))
print("Total count: %d"%len(test_video_dataset))

Time taken for loading features: 2.217070
Total count: 951


In [None]:
# reshaping data for feeding to the network

# For both splits: flatten every per-frame feature tensor to a 1-D numpy
# vector, then stack the vectors of one video into a single 2-D array with
# one row per frame.
for video_dataset in (train_video_dataset, test_video_dataset):
    for sample in video_dataset:
        frame_features = sample['img_list']
        for frame_idx, feature in enumerate(frame_features):
            frame_features[frame_idx] = feature.reshape(-1).cpu().numpy()
        sample['img_list'] = np.array(frame_features)

***
Modelling:

In [None]:
# Report the feature shape of the first video and the size of each split
train_sample_shape = train_video_dataset[0]['img_list'].shape
print('Shape of training data is :', train_sample_shape)
print('Number of training records is :', len(train_video_dataset))
test_sample_shape = test_video_dataset[0]['img_list'].shape
print('Shape of test/validation data is :', test_sample_shape)
print('Number of testing records is :', len(test_video_dataset))

Shape of training data is : (25, 4096)
Number of training records is : 2409
Shape of test/validation data is : (25, 4096)
Number of testing records is : 951


In [None]:
# LSTM classifier model class
class LSTM(nn.Module):
    """LSTM encoder followed by fully connected layers producing log-probabilities.

    The sequence is encoded by ``nn.LSTM`` (``batch_first=True``); the output
    at the last time step is passed through the linear head and ``log_softmax``.

    Args:
        input_size: Feature dimension of every time step.
        num_lstm_layers: Number of stacked LSTM layers.
        hidden_layer_size: Sequence of layer widths. The first entry is the
            LSTM hidden size, the remaining entries are the widths of the
            hidden linear layers. Defaults to ``[200]``.
        output_size: Number of classes. Defaults to the module-level
            ``class_count``, looked up when the model is created.
        dropout: Dropout between stacked LSTM layers (passed to ``nn.LSTM``).
        bidirectional: Whether the LSTM is bidirectional.

    Attributes:
        hidden_cell: ``(h, c)`` state passed to the LSTM on the next forward
            call. Callers may overwrite it to reset or seed the state; when
            it does not fit the incoming batch a zero state is used instead.
    """

    def __init__(self, input_size=4096, num_lstm_layers=2, hidden_layer_size=None,
                 output_size=None, dropout=0, bidirectional=False):

        super().__init__()
        # Resolve defaults at call time (no shared mutable default, and no
        # class count frozen at class-definition time)
        if hidden_layer_size is None:
            hidden_layer_size = [200]
        hidden_layer_size = list(hidden_layer_size)
        if not hidden_layer_size:
            raise ValueError("hidden_layer_size must contain at least one entry")
        if output_size is None:
            output_size = class_count

        self.hidden_layer_size = hidden_layer_size
        self.num_lstm_layers = num_lstm_layers
        self.directions = 2 if bidirectional else 1

        # Placeholder hidden / cell state; created on the CPU so the model can
        # be built without CUDA. forward() rebuilds it when batch size, device
        # or dtype do not match the input.
        self.hidden_cell = self.init_hidden(batch_size=10)

        # LSTM model
        self.lstm = nn.LSTM(input_size, hidden_layer_size[0],
                            num_layers=num_lstm_layers, batch_first=True,
                            dropout=dropout, bidirectional=bidirectional)

        # Linear head, generated from the list of hidden widths:
        #   Linear -> [ReLU -> Dropout -> Linear]* -> ReLU -> Linear(output)
        # or a single Linear when there is only the LSTM width.
        layers = []
        in_features = hidden_layer_size[0] * self.directions
        for idx, width in enumerate(hidden_layer_size[1:]):
            if idx > 0:
                layers.append(nn.ReLU(inplace=True))
                layers.append(nn.Dropout())
            layers.append(nn.Linear(in_features=in_features, out_features=width))
            in_features = width
        if len(hidden_layer_size) > 1:
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Linear(in_features=in_features, out_features=output_size))
        self.linear_layers = nn.Sequential(*layers)

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return a zero ``(h, c)`` state for ``batch_size`` sequences."""
        shape = (self.num_lstm_layers * self.directions, batch_size,
                 self.hidden_layer_size[0])
        return (torch.zeros(shape, device=device, dtype=dtype),
                torch.zeros(shape, device=device, dtype=dtype))

    # forward pass
    def forward(self, input_seq):
        """Return log-probabilities of shape ``(batch, output_size)``.

        Args:
            input_seq: Tensor of shape ``(batch, seq_len, input_size)``.
        """
        batch_size = input_seq.size(0)
        state = self.hidden_cell
        state_fits = (
            state is not None
            and state[0].size(1) == batch_size
            and state[0].device == input_seq.device
            and state[0].dtype == input_seq.dtype
        )
        if not state_fits:
            state = self.init_hidden(batch_size, input_seq.device, input_seq.dtype)

        # pass through LSTM
        lstm_out, new_state = self.lstm(input_seq, state)
        # Keep the state without its autograd graph so a later call does not
        # try to backpropagate through an already freed graph
        self.hidden_cell = tuple(part.detach() for part in new_state)

        # pass the last time step through the linear head
        predictions = self.linear_layers(lstm_out[:, -1, :])
        # log-softmax to get label scores
        return F.log_softmax(predictions, dim=1)

In [None]:
# train / predict function

def start_train(model, dataloader, optimizer, num_epochs=30, train=True, debug=False):
    """Train ``model`` for ``num_epochs`` epochs, or evaluate it once.

    Args:
        model: Network to run. It must expose ``num_lstm_layers``,
            ``directions`` and ``hidden_layer_size``; its ``hidden_cell``
            attribute is reset to zeros before every batch.
        dataloader: Iterable of batches, each a dict with ``'img_list'``
            (inputs) and ``'labels'`` (1-based class labels).
        optimizer: Optimizer over the model parameters (used only when
            ``train`` is true).
        num_epochs: Number of training epochs; ignored for evaluation, which
            always makes a single pass.
        train: ``True`` to train, ``False`` to evaluate without weight updates.
        debug: When true, print loss and accuracy after every epoch.

    Returns:
        Tuple ``(loss, accuracy, seconds)`` for the last epoch: mean batch
        loss and accuracy in percent as Python floats, and the elapsed time.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 or the dataloader
            yields no batches.
    """
    # For evaluation, we don't need many epochs, it just runs once
    if not train:
        num_epochs = 1
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    # Use the GPU when available, otherwise stay on the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # NOTE(review): the LSTM in this file already returns log_softmax scores;
    # CrossEntropyLoss applies log_softmax once more, which leaves normalised
    # log-probabilities unchanged, so the value is still the NLL.
    loss_func = nn.CrossEntropyLoss()

    training_loss = []
    training_accuracy = []
    start = time.time()

    for epoch in range(num_epochs):
        # Per-epoch accumulators
        correct_pred = 0
        iterations = 0
        iteration_loss = 0.0
        sample_count = 0

        # train / eval mode according to the type of execution
        model.train(train)

        # Gradients are only tracked while training
        with torch.set_grad_enabled(train):
            for data_batch in dataloader:
                inputs = torch.as_tensor(data_batch['img_list']).float().to(device)
                # Labels are stored 1-based; the loss expects 0-based classes
                labels = (torch.as_tensor(data_batch['labels']) - 1).long().to(device)
                batch_size = len(inputs)

                # Fresh zero hidden and cell state for every batch
                state_shape = (model.num_lstm_layers * model.directions, batch_size,
                               model.hidden_layer_size[0])
                model.hidden_cell = (torch.zeros(state_shape, device=device),
                                     torch.zeros(state_shape, device=device))

                if train:
                    optimizer.zero_grad()
                outputs = model(inputs)
                loss_val = loss_func(outputs, labels)
                # .item() keeps plain floats instead of tensors
                iteration_loss += loss_val.item()
                if train:
                    loss_val.backward()
                    optimizer.step()

                predicted = outputs.argmax(dim=1)
                correct_pred += (predicted == labels).sum().item()
                sample_count += labels.size(0)
                iterations += 1

        if iterations == 0:
            raise ValueError("dataloader yielded no batches")

        training_loss.append(iteration_loss / iterations)
        training_accuracy.append(100.0 * correct_pred / sample_count)
        if debug:
            if train:
                print ('Epoch {}/{}, Training Loss: {:.3f}, Training Accuracy: {:.3f}'
                  .format(epoch+1, num_epochs, training_loss[-1], training_accuracy[-1]))
            else:
                print ('Epoch {}/{}, Testing Loss: {:.3f}, Testing Accuracy: {:.3f}'
                  .format(epoch+1, num_epochs, training_loss[-1], training_accuracy[-1]))
    stop = time.time()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return training_loss[-1], training_accuracy[-1], (stop-start)

In [None]:
# create the model

# One bidirectional LSTM layer with 1000 hidden units per direction, followed
# by a 100-unit hidden linear layer; no dropout between LSTM layers
lstm_settings = dict(num_lstm_layers=1, hidden_layer_size=[1000,100],
                     dropout=0, bidirectional=True)
lstm_model = LSTM(**lstm_settings)

# optimizer to be used: plain SGD over all model parameters
learning_rate = 0.05
optimizer = torch.optim.SGD(lstm_model.parameters(), lr=learning_rate)

In [None]:
# train the model

batch_size = 10
num_epochs = 30
print(lstm_model)

# Shuffled mini-batches over the cached training features, loaded in the
# main process (num_workers=0)
dataloader = DataLoader(train_video_dataset, shuffle=True,
                        batch_size=batch_size, num_workers=0)
train_results = start_train(lstm_model, dataloader, optimizer,
                            num_epochs=num_epochs, train=True)
train_loss, train_accuracy, train_time = train_results

LSTM(
  (lstm): LSTM(4096, 1000, batch_first=True, bidirectional=True)
  (linear_layers): Sequential(
    (0): Linear(in_features=2000, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=100, out_features=25, bias=True)
  )
)


---
Evaluation:

In [None]:
# evaluate the model by predicting over the test dataset

# Mini-batches over the cached test features, using the batch size of the
# training cell
dataloader = DataLoader(test_video_dataset, shuffle=True,
                        batch_size=batch_size, num_workers=0)
# train=False makes start_train run a single evaluation pass
test_results = start_train(lstm_model, dataloader, optimizer, train=False)
test_loss, test_accuracy, test_time = test_results

In [None]:
# Summarise the LSTM results: accuracy (in percent) on both splits
train_summary = 'Training accuracy is %2.3f' % train_accuracy
test_summary = 'Test accuracy is %2.3f' % test_accuracy
print(train_summary)
print(test_summary)

Training accuracy is 100.000 :
Test accuracy is 84.122 :


Train and test accuracy of the SVM:

In [None]:
# SVM classification

def _stack_video_features(video_dataset):
    """Flatten each video's feature matrix into one row and collect the labels.

    Returns a ``(num_videos, num_features)`` array and a ``(num_videos,)``
    label array, in dataset order.
    """
    features = np.stack([np.asarray(sample['img_list']).reshape(-1)
                         for sample in video_dataset])
    labels = np.asarray([int(sample['labels']) for sample in video_dataset])
    return features, labels


# Build the design matrices directly as numpy arrays. Going through a
# DataLoader and concatenating Python lists copies every feature value into a
# Python float object, which is slow and needs far more memory.
trainX, trainY = _stack_video_features(train_video_dataset)
testX, testY = _stack_video_features(test_video_dataset)

# train the SVM classifier

start = time.time()
# Instantiate the SVM classifier
model_svc = svm.LinearSVC(C=0.00103, max_iter=1000)
# Fit on the training split and score on it (mean accuracy, in percent)
train_accuracy = model_svc.fit(trainX, trainY).score(trainX, trainY) * 100
time_taken = time.time()-start
print ('Train Accuracy: {:.3f}, Time taken: {:.5f}'.format(train_accuracy, time_taken))

# Make predictions on the test split and compute the accuracy in percent

start = time.time()
test_accuracy = model_svc.score(testX, testY) * 100
time_taken = time.time()-start
print ('Test Accuracy: {:.3f}, Time taken: {:.5f}'.format(test_accuracy, time_taken))

print('Training accuracy is %2.3f :' %(train_accuracy) )
print('Test accuracy is %2.3f :' %(test_accuracy) )

Train Accuracy: 100.000, Time taken: 135.35001
Test Accuracy: 85.699, Time taken: 3.32707
Training accuracy is 100.000 :
Test accuracy is 85.699 :
