In [None]:
import random
from esper.prelude import *
from rekall.video_interval_collection import VideoIntervalCollection
from rekall.temporal_predicates import *
from esper.rekall import *
import matplotlib.pyplot as plt
import cv2
import pickle

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim import lr_scheduler

from collections import OrderedDict
import scannertools as st

import esper.shot_detection_torch.models.deepsbd_resnet as deepsbd_resnet
import esper.shot_detection_torch.models.deepsbd_alexnet as deepsbd_alexnet
import esper.shot_detection_torch.dataloaders.movies_deepsbd as movies_deepsbd_data

In [None]:
# Initialise scannertools storage with the bucket named by the BUCKET environment variable.
# `os` is not imported explicitly in this notebook; presumably it is provided by the
# `esper.prelude` star import — confirm.
st.init_storage(os.environ['BUCKET'])

In [None]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Construct five folds

In [None]:
# Load up all manually annotated shots
# (Shot rows whose labeler name contains the substring 'manual').
shots_qs = Shot.objects.filter(labeler__name__contains='manual')

In [None]:
# Convert the queryset into a rekall VideoIntervalCollection so the shots can be
# looked up per video (see get_allintervals / get_intervallist below).
shots = VideoIntervalCollection.from_django_qs(shots_qs)

In [None]:
# Sort the collection's keys (the video IDs) so that the seeded shuffle below
# always starts from the same order.
video_ids = sorted(list(shots.get_allintervals().keys()))

In [None]:
# Fix the seed of Python's `random` module so the shuffle of video IDs is reproducible.
random.seed(0)

In [None]:
# randomly shuffle video IDs
# The shuffle is in place, so re-running this cell without re-running the two
# cells above it produces a different order.
random.shuffle(video_ids)

In [None]:
# construct five folds
# Videos are assigned greedily, in shuffled order, to consecutive folds. A new fold
# is started whenever adding the next video would push the running shot count past
# the next fifth of the total, so each fold holds roughly total_shots / 5 shots and
# every video belongs to exactly one fold.
NUM_FOLDS = 5
total_shots = shots_qs.count()
folds = []
num_shots_in_folds = 0
cur_fold = []
for video_id in video_ids:
    # Look the per-video shot count up once instead of twice.
    num_shots_in_video = shots.get_intervallist(video_id).size()
    fold_budget = (len(folds) + 1) * total_shots / NUM_FOLDS
    # Only close a fold that already holds a video: if the very first video has
    # more shots than the first budget, closing here would emit an empty fold.
    if cur_fold and num_shots_in_folds + num_shots_in_video > fold_budget:
        folds.append(cur_fold)
        cur_fold = []
    num_shots_in_folds += num_shots_in_video
    cur_fold.append(video_id)
# The last, still-open fold.
folds.append(cur_fold)

In [None]:
# store folds
# Persist the fold assignment (a list of lists of video IDs) so that later
# sessions can reuse exactly the same split.
with open('/app/data/shot_detection_folds.pkl', 'wb') as f:
    pickle.dump(folds, f)

In [None]:
# or load folds from disk
# Alternative to recomputing: read back the fold assignment written by the cell above.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open('/app/data/shot_detection_folds.pkl', 'rb') as f:
    folds = pickle.load(f)

# DeepSBD Evaluation

In [None]:
# helper functions for deepsbd testing
def calculate_accuracy(outputs, targets):
    """Return the top-1 accuracy of a batch of class scores.

    Args:
        outputs: 2-D tensor of per-class scores, one row per sample.
        targets: tensor holding one target class index per row of `outputs`.

    Returns:
        The number of rows whose highest-scoring class equals the target,
        divided by `targets.size(0)`, as a Python float.
    """
    n_samples = targets.size(0)

    # Indices of the single largest score along the class dimension.
    top1 = outputs.topk(1, dim=1, largest=True)[1]
    hits = top1.t().eq(targets.view(1, -1))

    return hits.float().sum().item() / n_samples

def prf1_array(pos_label, neg_label, gt, preds):
    """Compute precision, recall, F1 and confusion counts for one positive class.

    Every (truth, prediction) pair is classified one-vs-rest against
    `pos_label`: a label equal to `pos_label` is positive, anything else is
    negative. A pair of two different non-positive labels (e.g. truth 1,
    prediction 0) is therefore a true negative; it is not counted as a missed
    positive.

    Args:
        pos_label: label value treated as the positive class.
        neg_label: unused; kept so existing call sites keep working.
        gt: iterable of ground-truth labels.
        preds: iterable of predicted labels, aligned with `gt` (extra items of
            the longer iterable are ignored, as with `zip`).

    Returns:
        Tuple `(precision, recall, f1, tp, tn, fp, fn)`. The counts are
        floats; precision, recall and F1 are 0 when their denominator is 0.
    """
    tp = 0.
    fp = 0.
    tn = 0.
    fn = 0.

    for truth, pred in zip(gt, preds):
        truth_is_pos = truth == pos_label
        pred_is_pos = pred == pos_label
        if pred_is_pos:
            if truth_is_pos:
                tp += 1.
            else:
                fp += 1.
        else:
            if truth_is_pos:
                fn += 1.
            else:
                tn += 1.

    precision = tp / (tp + fp) if tp + fp != 0 else 0
    recall = tp / (tp + fn) if tp + fn != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0

    return (precision, recall, f1, tp, tn, fp, fn)

def get_label(res_tensor):
    """Return the arg-max index of every row of a score tensor.

    Args:
        res_tensor: tensor of scores, iterated row by row after being copied
            to the CPU and converted to a NumPy array.

    Returns:
        A list with one `np.argmax` result per row.
    """
    scores = res_tensor.data.cpu().numpy()
    return [np.argmax(row_scores) for row_scores in scores]

def test_deepsbd(model, dataloader):
    """Run `model` over `dataloader` and report precision/recall/F1 for class 2.

    The model is not switched to eval mode here; callers are expected to do
    that before calling.

    Args:
        model: callable mapping a batch of clips (moved to `device`) to a
            tensor of per-class scores.
        dataloader: iterable yielding 3-tuples `(clip_tensor, labels, _)`; the
            third element is ignored.

    Returns:
        Tuple `(preds, labels, outputs)`:
        `preds` are the arg-max predictions with every class other than 2
        collapsed to 0, `labels` are the ground-truth labels as yielded by the
        dataloader, and `outputs` are the raw model scores as nested lists.
    """
    preds = []
    labels = []
    outputs = []
    # Inference only: without no_grad every forward pass would build an
    # autograd graph that is never used and only costs memory.
    with torch.no_grad():
        for clip_tensor, l, _ in tqdm(dataloader):
            o = model(clip_tensor.to(device))

            preds += get_label(o)
            labels += l.data.numpy().tolist()
            outputs += o.cpu().data.numpy().tolist()

    # Score as a binary problem: class 2 versus everything else.
    preds = [2 if p == 2 else 0 for p in preds]

    precision, recall, f1, tp, tn, fp, fn = prf1_array(2, 0, labels, preds)
    print("Precision: {}, Recall: {}, F1: {}".format(precision, recall, f1))
    print("TP: {}, TN: {}, FP: {}, FN: {}".format(tp, tn, fp, fn))

    return preds, labels, outputs

In [None]:
# Load DeepSBD datasets for each fold
# One DeepSBDDataset is built per fold, in fold order, so deepsbd_datasets[i]
# is built from the videos in folds[i].
deepsbd_datasets = []
for fold in folds:
    # Manually labeled shots restricted to the videos assigned to this fold.
    shots_in_fold_qs = Shot.objects.filter(
        labeler__name__contains='manual',
        video_id__in = fold
    )
    shots_in_fold = VideoIntervalCollection.from_django_qs(shots_in_fold_qs)
    
    data = movies_deepsbd_data.DeepSBDDataset(shots_in_fold, verbose=True)
    deepsbd_datasets.append(data)

In [None]:
# dataset to hold multiple folds
class DeepSBDTrainDataset(Dataset):
    """Several datasets exposed as one, concatenated in the order given.

    Index 0 is the first item of the first wrapped dataset; indices continue
    through each following dataset in turn.

    Args:
        datasets: sequence of datasets supporting `len()` and integer
            indexing. `weights_for_balanced_classes` additionally requires
            each of them to have an `items` attribute.
    """
    def __init__(self, datasets):
        self.datasets = datasets
    
    def __len__(self):
        """Total number of items across all wrapped datasets."""
        return sum(len(d) for d in self.datasets)
    
    def __getitem__(self, idx):
        """Return the item at global position `idx`.

        Negative indices count from the end of the concatenation.

        Raises:
            IndexError: if `idx` is outside `[-len(self), len(self))`.
        """
        total = len(self)
        offset = idx + total if idx < 0 else idx
        if offset < 0 or offset >= total:
            raise IndexError(
                'index {} out of range for dataset of length {}'.format(idx, total))
        
        # Walk the wrapped datasets, turning the global offset into a local one.
        for d in self.datasets:
            if offset < len(d):
                return d[offset]
            offset -= len(d)
        
        # Only reachable if a wrapped dataset changed length during the lookup.
        raise IndexError(
            'index {} out of range for dataset of length {}'.format(idx, total))
    
    def weights_for_balanced_classes(self):
        """Return one sampling weight per item, inversely proportional to class frequency.

        The label of an item is taken from position 3 of each entry of the
        wrapped datasets' `items`. Each weight is
        `total item count / item count of that label`, listed in the same
        order as the items are indexed by `__getitem__` (assuming `d.items`
        is ordered like `d[i]` — TODO confirm against DeepSBDDataset).
        """
        labels = [
            item[3]
            for d in self.datasets
            for item in d.items
        ]
        
        class_counts = {}
        for l in labels:
            class_counts[l] = class_counts.get(l, 0) + 1
        
        weights_per_class = {
            l: len(labels) / class_counts[l]
            for l in class_counts
        }
        
        return [
            weights_per_class[l]
            for l in labels
        ]

In [None]:
# models
# Fresh instances of both architectures; checkpoints are loaded into them in the
# cells below.
# AlexNet-based DeepSBD with the constructor's default arguments.
deepsbd_alexnet_model = deepsbd_alexnet.deepSBD()
# ResNet-18-based DeepSBD with 3 output classes, sample_size=128 and
# sample_duration=16.
deepsbd_resnet_model = deepsbd_resnet.resnet18(num_classes=3,
    sample_size=128,
    sample_duration=16)

In [None]:
# alexnet deepSBD pre-trained on ClipShots
# The checkpoint keeps the weights under its 'state_dict' key; the path is
# relative to the notebook's working directory.
alexnet_state_dict = torch.load('models/ClipShots-DeepSBD-Alexnet-final.pth')['state_dict']
new_state_dict = OrderedDict()
for k, v in alexnet_state_dict.items():
    # Drop the first 7 characters of every key — presumably the 'module.' prefix
    # that nn.DataParallel adds when a wrapped model is saved (TODO confirm).
    name = k[7:]
    new_state_dict[name] = v
deepsbd_alexnet_model.load_state_dict(new_state_dict)
# deepsbd_alexnet_model = deepsbd_alexnet_model.to(device)
# deepsbd_alexnet_model = deepsbd_alexnet_model.eval()

In [None]:
# resnet deepSBD pre-trained on ClipShots
# The checkpoint keeps the weights under its 'state_dict' key; the path is
# relative to the notebook's working directory.
resnet_state_dict = torch.load('models/ClipShots-DeepSBD-Resnet-18-final.pth')['state_dict']
new_state_dict = OrderedDict()
for k, v in resnet_state_dict.items():
    # Drop the first 7 characters of every key — presumably the 'module.' prefix
    # that nn.DataParallel adds when a wrapped model is saved (TODO confirm).
    name = k[7:]
    new_state_dict[name] = v
deepsbd_resnet_model.load_state_dict(new_state_dict)
# deepsbd_resnet_model = deepsbd_resnet_model.to(device)
# deepsbd_resnet_model = deepsbd_resnet_model.eval()

In [None]:
# resnet deepSBD pre-trained on Kinetics
# NOTE(review): sample_size is 64 here but 128 for the ClipShots ResNet built
# earlier in this notebook — confirm the difference is intended.
deepsbd_resnet_model_no_clipshots = deepsbd_resnet.resnet18(
    num_classes=3,
    sample_size=64,
    sample_duration=16
)
# load_weights is provided by the project's model class; how it maps the
# Kinetics checkpoint onto this 3-class model is not visible here.
deepsbd_resnet_model_no_clipshots.load_weights('models/resnet-18-kinetics.pth')

In [None]:
# alexnet deepSBD
# Default-constructed AlexNet DeepSBD; no checkpoint is loaded into it in this cell.
deepsbd_alexnet_model_no_clipshots = deepsbd_alexnet.deepSBD()

In [None]:
# Move the Kinetics-initialised ResNet to the selected device and put it in training mode.
deepsbd_resnet_model_no_clipshots = deepsbd_resnet_model_no_clipshots.to(device).train()

In [None]:
# Training set: every fold except the last one in deepsbd_datasets.
# NOTE(review): the variable is named "fold1" although the fold left out is the
# last one — confirm which fold is meant to be held out.
training_dataset_fold1 = DeepSBDTrainDataset(deepsbd_datasets[:-1])

In [None]:
# One sampling weight per training item (total item count / count of the item's class).
fold1_weights = torch.DoubleTensor(training_dataset_fold1.weights_for_balanced_classes())

In [None]:
# Draw as many samples per epoch as there are training items, with probability
# proportional to the weights, so rarer classes are sampled more often.
fold1_sampler = torch.utils.data.sampler.WeightedRandomSampler(fold1_weights, len(fold1_weights))

In [None]:
# Batches of 64 loaded in the main process (num_workers=0). The order comes
# from the weighted sampler, so shuffle stays False (DataLoader does not allow
# shuffle=True together with a sampler).
training_dataloader_fold1 = DataLoader(
    training_dataset_fold1,
    num_workers=0,
    shuffle=False,
    batch_size=64,
    sampler=fold1_sampler
)

In [None]:
# Cross-entropy loss over the model's per-class scores.
criterion = nn.CrossEntropyLoss()

In [None]:
# SGD with momentum and L2 weight decay over all parameters of the Kinetics-initialised ResNet.
optimizer = optim.SGD(deepsbd_resnet_model_no_clipshots.parameters(), 
                      lr=.001, momentum=.9, weight_decay=1e-3)

In [None]:
# Reduce the learning rate once the monitored value has stopped decreasing
# ('min' mode) for more than 60000 scheduler steps.
# NOTE(review): train_epoch in this notebook receives the scheduler but never
# calls scheduler.step(), so as written the learning rate is never reduced.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',patience=60000)

In [None]:
def train_epoch(epoch, training_dataloader, model, criterion, optimizer, scheduler,
                save_dir='/app/notebooks/learning/models/deepsbd_resnet_train_on_folds',
                checkpoint_prefix='fold5'):
    """Train `model` for one pass over `training_dataloader` and save a checkpoint.

    For every batch the loss is back-propagated and one optimizer step is
    taken; loss, accuracy and class-2-vs-rest precision/recall/F1 of that
    batch are printed. After the last batch a checkpoint is written to
    `<save_dir>/<checkpoint_prefix>_<epoch>_epoch.pth`.

    Args:
        epoch: epoch number, used in the log line and the checkpoint name.
        training_dataloader: iterable with a length, yielding 3-tuples
            `(clip_tensor, targets, _)`; the third element is ignored.
        model: network to train. It is not switched to training mode here.
        criterion: loss called as `criterion(outputs, targets)`.
        optimizer: optimizer stepped once per batch.
        scheduler: accepted for interface compatibility; it is not stepped
            by this function.
        save_dir: directory the checkpoint is written to. It must already
            exist.
        checkpoint_prefix: leading part of the checkpoint file name. The
            default reproduces the previously hard-coded 'fold5'.
    """
    iter_len = len(training_dataloader)

    for i, (clip_tensor, targets, _) in enumerate(training_dataloader):
        outputs = model(clip_tensor.to(device))
        targets = targets.to(device)

        loss = criterion(outputs, targets)
        acc = calculate_accuracy(outputs, targets)
        # Report metrics as a binary problem: class 2 versus everything else.
        preds = [2 if p == 2 else 0 for p in get_label(outputs)]
        precision, recall, f1, tp, tn, fp, fn = prf1_array(
            2, 0, targets.cpu().data.numpy().tolist(), preds)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print('Epoch: [{0}][{1}/{2}]\t'
              'Loss_conf {loss_c:.4f}\t'
              'acc {acc:.4f}\t'
              'pre {pre:.4f}\t'
              'rec {rec:.4f}\t'
              'f1 {f1: .4f}\t'
              'TP {tp} '
              'TN {tn} '
              'FP {fp} '
              'FN {fn} '
              .format(
                  epoch, i + 1, iter_len, loss_c=loss.item(), acc=acc,
                  pre=precision, rec=recall, f1=f1,
                  tp=tp, tn=tn, fp=fp, fn=fn))

    save_file_path = os.path.join(
        save_dir,
        '{}_{}_epoch.pth'.format(checkpoint_prefix, epoch)
    )
    # 'epoch' stores the number of the next epoch to run, not the one just finished.
    states = {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(states, save_file_path)

In [None]:
# Load a checkpoint in the format written by train_epoch ('epoch', 'state_dict', 'optimizer').
# NOTE(review): this path is relative, while train_epoch saves under the
# absolute /app/notebooks/learning/models/... directory — confirm they resolve
# to the same place.
state = torch.load('models/deepsbd_resnet_train_on_folds/fold1_2_epoch.pth')

In [None]:
# Restore only the model weights; the optimizer state stored in the checkpoint is not restored.
deepsbd_resnet_model_no_clipshots.load_state_dict(state['state_dict'])

In [None]:
# Train for five epochs, numbered 0-4; train_epoch writes one checkpoint per epoch.
# NOTE(review): numbering restarts at 0 even when the weights were resumed from
# an earlier checkpoint, so checkpoint files with the same epoch number are overwritten.
for i in range(5):
    train_epoch(i, training_dataloader_fold1, deepsbd_resnet_model_no_clipshots, criterion, optimizer, scheduler)

In [None]:
# specialize pre-trained model

In [None]:
# test models on splits
# Evaluate the ResNet-18 DeepSBD model loaded from the ClipShots checkpoint on
# every fold; one (preds, labels, outputs) tuple is stored per fold, in fold order.
model = deepsbd_resnet_model.to(device).eval()
per_fold_preds_labels_outputs = []
for fold_dataset in deepsbd_datasets:
    # Keep dataset order (shuffle=False) so predictions line up with the dataset's items.
    dataloader = DataLoader(fold_dataset, batch_size=8, shuffle=False, num_workers=0)
    preds, labels, outputs = test_deepsbd(model, dataloader)
    
    per_fold_preds_labels_outputs.append((preds, labels, outputs))

In [None]:
# test models on splits
# Evaluate the AlexNet DeepSBD model loaded from the ClipShots checkpoint on
# every fold; one (preds, labels, outputs) tuple is stored per fold, in fold order.
model = deepsbd_alexnet_model.to(device).eval()
per_fold_preds_labels_outputs_alexnet = []
for fold_dataset in deepsbd_datasets:
    # Keep dataset order (shuffle=False) so predictions line up with the dataset's items.
    dataloader = DataLoader(fold_dataset, batch_size=8, shuffle=False, num_workers=0)
    preds, labels, outputs = test_deepsbd(model, dataloader)
    
    # Store in the AlexNet list. Appending to per_fold_preds_labels_outputs
    # would mix these results into the ResNet results and leave this list empty.
    per_fold_preds_labels_outputs_alexnet.append((preds, labels, outputs))

In [None]:
# Evaluate the fold-trained ResNet (switched to eval mode) on a single fold:
# the slice [3:4] selects only deepsbd_datasets[3].
model = deepsbd_resnet_model_no_clipshots.eval()
per_fold_preds_labels_outputs_fold_training_only = []
for fold_dataset in deepsbd_datasets[3:4]:
    dataloader = DataLoader(fold_dataset, batch_size=8, shuffle=False, num_workers=0)
    preds, labels, outputs = test_deepsbd(model, dataloader)
    
    per_fold_preds_labels_outputs_fold_training_only.append((preds, labels, outputs))

In [None]:
# Load the Kinetics checkpoint into whatever `model` currently refers to, then
# evaluate on the first fold only (slice [:1]).
# NOTE(review): `model` comes from whichever cell last assigned it, and
# load_weights presumably overwrites that model's weights in place — confirm.
model.load_weights('models/resnet-18-kinetics.pth')
# This rebinds the results list, discarding results stored under the same name
# by the cell above.
per_fold_preds_labels_outputs_fold_training_only = []
for fold_dataset in deepsbd_datasets[:1]:
    dataloader = DataLoader(fold_dataset, batch_size=8, shuffle=False, num_workers=0)
    preds, labels, outputs = test_deepsbd(model, dataloader)
    
    per_fold_preds_labels_outputs_fold_training_only.append((preds, labels, outputs))

DeepSBD, ResNet18 backbone trained on ClipShots:
* Fold 1
  * Precision: 0.8636363636363636, Recall: 0.9620253164556962, F1: 0.9101796407185629
  * TP: 228.0, TN: 1322.0, FP: 36.0, FN: 9.0
* Fold 2
  * Precision: 0.8934010152284264, Recall: 0.9617486338797814, F1: 0.9263157894736842
  * TP: 176.0, TN: 314.0, FP: 21.0, FN: 7.0
* Fold 3
  * Precision: 0.7666666666666667, Recall: 0.8263473053892215, F1: 0.7953890489913544
  * TP: 276.0, TN: 2246.0, FP: 84.0, FN: 58.0
* Fold 4
  * Precision: 0.8960396039603961, Recall: 1.0, F1: 0.9451697127937337
  * TP: 181.0, TN: 901.0, FP: 21.0, FN: 0.0
* Fold 5
  * Precision: 0.8571428571428571, Recall: 0.9831932773109243, F1: 0.9158512720156555
  * TP: 234.0, TN: 1141.0, FP: 39.0, FN: 4.0

DeepSBD, AlexNet backbone trained on ClipShots:
* Fold 1
  * Precision: 0.8507462686567164, Recall: 0.9620253164556962, F1: 0.902970297029703
  * TP: 228.0, TN: 1318.0, FP: 40.0, FN: 9.0
* Fold 2
  * Precision: 0.912568306010929, Recall: 0.912568306010929, F1: 0.912568306010929
  * TP: 167.0, TN: 319.0, FP: 16.0, FN: 16.0
* Fold 3
  * Precision: 0.7818696883852692, Recall: 0.8263473053892215, F1: 0.8034934497816594
  * TP: 276.0, TN: 2253.0, FP: 77.0, FN: 58.0
* Fold 4
  * Precision: 0.9782608695652174, Recall: 0.994475138121547, F1: 0.9863013698630136
  * TP: 180.0, TN: 918.0, FP: 4.0, FN: 1.0
* Fold 5
  * Precision: 0.8669201520912547, Recall: 0.957983193277311, F1: 0.9101796407185628
  * TP: 228.0, TN: 1145.0, FP: 35.0, FN: 10.0
  
DeepSBD, ResNet18 backbone trained on folds only:
* Fold 1
  * Precision: 0.7737226277372263, Recall: 0.8945147679324894, F1: 0.8297455968688846
  * TP: 212.0, TN: 1296.0, FP: 62.0, FN: 25.0
* Fold 2
  * Precision: 0.8165680473372781, Recall: 0.7540983606557377, F1: 0.7840909090909091
  * TP: 138.0, TN: 304.0, FP: 31.0, FN: 45.0
* Fold 3
  * Precision: 0.7407407407407407, Recall: 0.718562874251497, F1: 0.7294832826747719
  * TP: 240.0, TN: 2246.0, FP: 84.0, FN: 94.0
* Fold 4
  * Precision: 0.7990196078431373, Recall: 0.9005524861878453, F1: 0.8467532467532468
  * TP: 163.0, TN: 881.0, FP: 41.0, FN: 18.0

# DSM Evaluation

In [None]:
# adaptive filtering

In [None]:
# dataloaders

In [None]:
# model

In [None]:
# load pre-trained model

In [None]:
# train from scratch

In [None]:
# specialize pre-trained model