<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Load-up-Ground-Truth" data-toc-modified-id="Load-up-Ground-Truth-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load up Ground Truth</a></span></li><li><span><a href="#Evaluate-Baselines" data-toc-modified-id="Evaluate-Baselines-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Evaluate Baselines</a></span><ul class="toc-item"><li><span><a href="#Load-up-Shots-from-Heuristics" data-toc-modified-id="Load-up-Shots-from-Heuristics-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Load up Shots from Heuristics</a></span></li><li><span><a href="#Machine-Learning" data-toc-modified-id="Machine-Learning-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Machine Learning</a></span></li><li><span><a href="#Different-Window-Sizes" data-toc-modified-id="Different-Window-Sizes-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Different Window Sizes</a></span><ul class="toc-item"><li><span><a href="#Window-Size-1" data-toc-modified-id="Window-Size-1-2.3.1"><span class="toc-item-num">2.3.1&nbsp;&nbsp;</span>Window Size 1</a></span></li><li><span><a href="#Window-Size-2" data-toc-modified-id="Window-Size-2-2.3.2"><span class="toc-item-num">2.3.2&nbsp;&nbsp;</span>Window Size 2</a></span></li><li><span><a href="#Window-Size-3" data-toc-modified-id="Window-Size-3-2.3.3"><span class="toc-item-num">2.3.3&nbsp;&nbsp;</span>Window Size 3</a></span></li><li><span><a href="#Window-Size-4" data-toc-modified-id="Window-Size-4-2.3.4"><span class="toc-item-num">2.3.4&nbsp;&nbsp;</span>Window Size 4</a></span></li><li><span><a href="#Window-Size-5" data-toc-modified-id="Window-Size-5-2.3.5"><span class="toc-item-num">2.3.5&nbsp;&nbsp;</span>Window Size 5</a></span></li><li><span><a href="#Window-Size-6" data-toc-modified-id="Window-Size-6-2.3.6"><span class="toc-item-num">2.3.6&nbsp;&nbsp;</span>Window Size 6</a></span></li><li><span><a href="#Window-Size-7" data-toc-modified-id="Window-Size-7-2.3.7"><span class="toc-item-num">2.3.7&nbsp;&nbsp;</span>Window Size 7</a></span></li></ul></li><li><span><a href="#DeepSBD" data-toc-modified-id="DeepSBD-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>DeepSBD</a></span></li></ul></li><li><span><a href="#ClipShots" data-toc-modified-id="ClipShots-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>ClipShots</a></span></li><li><span><a href="#Notes" data-toc-modified-id="Notes-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Notes</a></span><ul class="toc-item"><li><span><a href="#Model/loss:-raw-output-of-last-FC-layer-to-BCEWithLogitsLoss" data-toc-modified-id="Model/loss:-raw-output-of-last-FC-layer-to-BCEWithLogitsLoss-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Model/loss: raw output of last FC layer to BCEWithLogitsLoss</a></span><ul class="toc-item"><li><span><a href="#Scratchpad" data-toc-modified-id="Scratchpad-4.1.1"><span class="toc-item-num">4.1.1&nbsp;&nbsp;</span>Scratchpad</a></span></li></ul></li></ul></li></ul></div>

In [1]:
from esper.prelude import *
from rekall.video_interval_collection import VideoIntervalCollection
from rekall.temporal_predicates import *
from esper.rekall import *
import matplotlib.pyplot as plt
import cv2

# Load up Ground Truth

In [2]:
# Load up small ground truth set for training
shots_gt_training_qs = Shot.objects.filter(
    Q(video_id=123, labeler__name__contains='manual', max_frame__lte=16560) | # easy
    Q(video_id=172, labeler__name__contains='manual') | # hard
    Q(video_id=179, labeler__name__contains='manual') | # easy
    Q(video_id=104, labeler__name__contains='manual') |
    Q(video_id=148, labeler__name__contains='manual')
)

In [3]:
shots_gt_test_qs = Shot.objects.filter(labeler__name__contains='manual')

In [4]:
shots_gt_training = VideoIntervalCollection.from_django_qs(shots_gt_training_qs)

In [5]:
shots_gt_test = VideoIntervalCollection.from_django_qs(shots_gt_test_qs).minus(shots_gt_training)

In [7]:
# Visualize the ground truth.
esper_widget(intrvllists_to_result(shots_gt_training), jupyter_keybindings=True, disable_captions=True)

VGridWidget(jsglobals={'bucket': 'esper', 'queries': [['All faces', 'def all_faces():\n    from query.models i…

# Evaluate Baselines

## Load up Shots from Heuristics

In [220]:
# Figure out temporal extents of the clips that were labeled
clips_training = shots_gt_training.dilate(1).coalesce().dilate(-1)
clips_test = shots_gt_test.dilate(1).coalesce().dilate(-1)

In [221]:
cinematic_shots_qs = Shot.objects.filter(cinematic=True).all()
cinematic_shots = VideoIntervalCollection.from_django_qs(
    cinematic_shots_qs,
    progress = True
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 875669/875669 [00:01<00:00, 458662.38it/s]


In [222]:
cinematic_shots_training = cinematic_shots.filter_against(
    clips_training,
    predicate=overlaps()
)
cinematic_shots_test = cinematic_shots.filter_against(
    clips_test,
    predicate=overlaps()
)

In [223]:
cinematic_shot_boundaries_training = cinematic_shots_training.map(lambda i: (i.start, i.start, i.payload))
cinematic_shot_boundaries_test = cinematic_shots_test.map(lambda i: (i.start, i.start, i.payload))
gt_shot_boundaries_training = shots_gt_training.map(lambda i: (i.start, i.start, i.payload))
gt_shot_boundaries_test = shots_gt_test.map(lambda i: (i.start, i.start, i.payload))

In [225]:
def size(interval_collection):
    count = 0
    for video_id in interval_collection.get_allintervals():
        count += interval_collection.get_intervallist(video_id).size()
        
    return count

In [13]:
def print_per_video_precision_recall(gt_shot_boundaries, eval_shot_boundaries):
    for video_id in gt_shot_boundaries.get_allintervals():
        print("Video {}: ".format(video_id))
        cine_sb = VideoIntervalCollection({
            video_id: eval_shot_boundaries.get_intervallist(video_id)
        })
        gt_sb = VideoIntervalCollection({
            video_id: gt_shot_boundaries.get_intervallist(video_id)
        })
        accurate_sb = cine_sb.filter_against(gt_sb, predicate=overlaps())
        inaccurate_sb = cine_sb.minus(accurate_sb)

        found_human_sb = gt_sb.filter_against(cine_sb, predicate=overlaps())
        missed_human_sb = gt_sb.minus(found_human_sb)
        
        print("Precision: {}, {} out of {}".format(
            size(accurate_sb) / size(cine_sb), 
            size(accurate_sb), 
            size(cine_sb)))
        print("Recall: {}, {} out of {}".format(
            size(accurate_sb) / size(gt_sb), 
            size(accurate_sb), 
            size(gt_sb)))

In [227]:
def prf1_boundary_intrvllist(gt_shot_boundaries, eval_shot_boundaries):
    tp = 0.
    fp = 0.
    fn = 0.
    for video_id in gt_shot_boundaries.get_allintervals():
        cine_sb = VideoIntervalCollection({
            video_id: eval_shot_boundaries.get_intervallist(video_id)
        })
        gt_sb = VideoIntervalCollection({
            video_id: gt_shot_boundaries.get_intervallist(video_id)
        })
        accurate_sb = cine_sb.filter_against(gt_sb, predicate=overlaps())
        inaccurate_sb = cine_sb.minus(accurate_sb)

        found_human_sb = gt_sb.filter_against(cine_sb, predicate=overlaps())
        missed_human_sb = gt_sb.minus(found_human_sb)
        
        tp += size(accurate_sb)
        fp += size(inaccurate_sb)
        fn += size(missed_human_sb)
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    
    return precision, recall, f1, tp, fp, fn

In [14]:
print_per_video_precision_recall(gt_shot_boundaries_training, cinematic_shot_boundaries_training)

Video 104: 
Precision: 0.8695652173913043, 20 out of 23
Recall: 0.6451612903225806, 20 out of 31
Video 172: 
Precision: 0.7222222222222222, 13 out of 18
Recall: 0.52, 13 out of 25
Video 179: 
Precision: 1.0, 14 out of 14
Recall: 0.875, 14 out of 16
Video 148: 
Precision: 1.0, 8 out of 8
Recall: 1.0, 8 out of 8
Video 123: 
Precision: 1.0, 17 out of 17
Recall: 1.0, 17 out of 17


In [15]:
print_per_video_precision_recall(gt_shot_boundaries_test, cinematic_shot_boundaries_test)

Video 65: 
Precision: 1.0, 20 out of 20
Recall: 1.0, 20 out of 20
Video 515: 
Precision: 1.0, 2 out of 2
Recall: 1.0, 2 out of 2
Video 577: 
Precision: 1.0, 21 out of 21
Recall: 1.0, 21 out of 21
Video 585: 
Precision: 1.0, 12 out of 12
Recall: 0.9230769230769231, 12 out of 13
Video 34: 
Precision: 0.7777777777777778, 7 out of 9
Recall: 0.5, 7 out of 14
Video 144: 
Precision: 0.3333333333333333, 1 out of 3
Recall: 1.0, 1 out of 1
Video 504: 
Precision: 0.96875, 31 out of 32
Recall: 0.7380952380952381, 31 out of 42
Video 339: 
Precision: 0.9, 9 out of 10
Recall: 0.9, 9 out of 10
Video 23: 
Precision: 0.7894736842105263, 15 out of 19
Recall: 0.5769230769230769, 15 out of 26
Video 411: 
Precision: 1.0, 3 out of 3
Recall: 1.0, 3 out of 3
Video 226: 
Precision: 0.5, 3 out of 6
Recall: 0.75, 3 out of 4
Video 123: 
Precision: 0.8713450292397661, 149 out of 171
Recall: 0.9371069182389937, 149 out of 159
Video 359: 
Precision: 0.9333333333333333, 14 out of 15
Recall: 0.9333333333333333, 14 out 

In [231]:
prf1_boundary_intrvllist(
    gt_shot_boundaries_test.set_union(gt_shot_boundaries_training),
    cinematic_shot_boundaries_test.set_union(cinematic_shot_boundaries_training)
)

(0.8862876254180602, 0.8452950558213717, 0.8653061224489795, 530.0, 68.0, 97.0)

In [None]:
# Visualize the discrepancies. Ground truth is in red, heuristic results are in blue.
result = intrvllists_to_result(shots_gt_training, color='red')
add_intrvllists_to_result(result, cinematic_shots_training, color='blue')
esper_widget(result, jupyter_keybindings=True, disable_captions=True)

## Machine Learning

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import time
import datetime
from tqdm import tqdm
import copy
import scannertools as st
import random
from collections import OrderedDict

In [7]:
st.init_storage(os.environ['BUCKET'])

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
class ShotDetectionDataset(Dataset):
    def __init__(self, shots, window_size=1, height=224):
        """Constrcutor for ShotDetectionDataset.
        
        Args:
            shots: VideoIntervalCollection of all the intervals to get frames from. If the payload is -1,
            then the interval is not an actual shot and just needs to be included in the dataset.
        """
        self.window_size = window_size
        items = set()
        frame_nums = {}
        
        for video_id in shots.get_allintervals():
            frame_nums[video_id] = set()
            for intrvl in shots.get_intervallist(video_id).get_intervals():
                for f in range(intrvl.start, intrvl.end + 1):
                    items.add((
                        video_id,
                        f,
                        1 if f == intrvl.start and intrvl.payload != -1 else 0
                    ))
                    for i in range(intrvl.start - window_size, intrvl.end + window_size + 1):
                        frame_nums[video_id].add(i)
        self.items = sorted(list(items))
        
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((100, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        
        # Load frames into memory
        self.frames = {
            video_id: {
                'frame_nums': sorted(list(frame_nums[video_id])),
                'frames': [
                    self.transform(f)
                    for f in Video.objects.get(id=video_id).for_scannertools().frames(
                        sorted(list(frame_nums[video_id]))
                    )
                ]
            }
            for video_id in tqdm(frame_nums)
        }
    
    def __len__(self):
        return len(self.items)
    
    def __getitem__(self, idx):
        """
        Indexed by video ID, then frame number
        Returns self.window_size frames before the indexed frame to self.window_size
            frames after the indexed frame
        """
        video_id, frame_num, label = self.items[idx]
        
        start_index = self.frames[video_id]['frame_nums'].index(frame_num - self.window_size)
        img_tensors = self.frames[video_id]['frames'][start_index:start_index + 2*self.window_size + 1]
        
#         img_tensors = [
#             self.transform(f)
#             for f in Video.objects.get(id=video_id).for_scannertools().frames(
#                 list(range(frame_num - self.window_size, frame_num + self.window_size + 1))
#             )
#         ]
        
        return img_tensors, label

In [11]:
# construct a training set with good class balance
shot_boundaries = shots_gt_training.map(
    lambda intrvl: (intrvl.start, intrvl.start, intrvl.payload)
)
shots_without_boundaries = shots_gt_training.map(
    lambda intrvl: (intrvl.start + 1, intrvl.end, intrvl.payload)
).get_allintervals()
non_boundary_frames = [
    (video_id, f)
    for video_id in shots_without_boundaries
    for intrvl in shots_without_boundaries[video_id].get_intervals()
    for f in range(intrvl.start, intrvl.end + 1)
]
random.seed(0)
random.shuffle(non_boundary_frames) # seed of 0 for reproducibility
chosen_frames = collect(non_boundary_frames[:size(shot_boundaries)], lambda tup: tup[0])

training_frames = shot_boundaries.set_union(
    VideoIntervalCollection({
        video_id: [
            (frame, frame, -1)
            for vid, frame in chosen_frames[video_id]
        ]
        for video_id in chosen_frames
    })
).set_union(
    shots_gt_training.map(
        lambda intrvl: (intrvl.end, intrvl.end, -1)
    )
).set_union(
    shots_gt_training.map(
        lambda intrvl: (intrvl.start+1, intrvl.start+1, -1)
    )
)

In [12]:
dataset_training = ShotDetectionDataset(training_frames)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:46<00:00,  9.88s/it]


In [13]:
dataloader_training = DataLoader(dataset_training, batch_size=8, shuffle=True, num_workers=0)

In [14]:
dataset_training_test = ShotDetectionDataset(shots_gt_training)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:37<00:00, 20.76s/it]


In [15]:
dataloader_training_test = DataLoader(dataset_training_test, batch_size=8, shuffle=False, num_workers=0)

In [9]:
class VideoNet(nn.Module):
    def __init__(self, window_size=1):
        super(VideoNet, self).__init__()
#         self.resnet1 = models.ResNet(models.resnet.BasicBlock, [1, 1, 1, 1], num_classes=128)
#         self.resnet2 = models.ResNet(models.resnet.BasicBlock, [1, 1, 1, 1], num_classes=128)
#         self.resnet3 = models.ResNet(models.resnet.BasicBlock, [1, 1, 1, 1], num_classes=128)
        self.resnets = [
            models.resnet18(pretrained=True)
            for i in range(0, 2 * window_size + 1)
        ]
        self.resnet_fcs = [
            nn.Linear(rn.fc.in_features, 128)
            for rn in self.resnets
        ]
        
        for idx, resnet in enumerate(self.resnets):
            resnet.fc = self.resnet_fcs[idx]
            resnet.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.add_module('resnet{}'.format(idx), resnet)
    
        # Replace pooling layer with Adaptive Pooling
#         self.resnet1.avgpool = nn.AdaptiveAvgPool2d((1, 1))
#         self.resnet2.avgpool = nn.AdaptiveAvgPool2d((1, 1))
#         self.resnet3.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        
#         self.embeddingpool = nn.MaxPool1d(5, stride=3)
        
#         self.rnfc1 = nn.Linear(1000, 128)
#         self.rnfc2 = nn.Linear(1000, 128)
#         self.rnfc3 = nn.Linear(1000, 128)
        
        self.embeddingconv = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()
        
    def init_weights(self):
        nn.init.kaiming_normal_(self.embeddingconv.weight, mode='fan_out', nonlinearity='relu')
        nn.init.xavier_uniform_(self.fc.weight)
        for fc in self.resnet_fcs:
            nn.init.xavier_uniform_(fc.weight)
        
    def forward(self, images):
#         image1embedding = self.resnet1(image1).unsqueeze(1)
#         image2embedding = self.resnet2(image2).unsqueeze(1)
#         image3embedding = self.resnet3(image3).unsqueeze(1)
        
#         print(image1embedding.size())
        
#         embedding_image = torch.cat(
#             (self.embeddingpool(image1embedding),
#              self.embeddingpool(image2embedding),
#              self.embeddingpool(image3embedding)),
#             dim=1
#         )
        
#         embedding_image = torch.cat(
#             (image1embedding,
#              image2embedding,
#              image3embedding),
#             dim=1
#         )
        
#         print(embedding_image.size())

        embeddings = [
            resnet(image).unsqueeze(1)
            for image, resnet in zip(images, self.resnets)
        ]
        
        embedding_image = torch.cat(embeddings, dim=1)
        
        embedding_image = embedding_image.unsqueeze(1)
        
#         print(embedding_image.size())
        out = self.embeddingconv(embedding_image)
#         print(out.size())
        out = self.relu(out)
#         print(out.size())
        out = self.avgpool(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
#         out = nn.LogSoftmax(1)(out)
#         out = self.sigmoid(out)
#         out = F.softmax(out, dim=1)
        
        return out
    
#     def parameters(self):
#         return [self.embeddingconv.parameters(), self.fc.parameters()]

In [50]:
vnet = VideoNet()

In [51]:
vnet

VideoNet(
  (resnet0): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stat

In [52]:
vnet.init_weights()

In [54]:
vnet = vnet.to(device)

In [10]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25, dataloader=None):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            total_inputs = 0.0
            
            true_positive = 0.
            false_positive = 0.
            true_negative = 0.
            false_negative = 0.

            # Iterate over data.
            for idx, (inputs, labels) in enumerate(dataloader):
#                 if idx > 100:
#                     break
#                 print("Start loop {}".format(datetime.datetime.now()))
#                 crit = nn.BCELoss(
#                     weight = torch.tensor([
#                         1.0 if l.item() == 1 else .25
#                         for l in labels
#                     ]).to(device)
#                 )
                inputs = [i.to(device) for i in inputs]
                labels = labels.to(device)
#                 print("Moved inputs {}".format(datetime.datetime.now()))

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    batch_size = labels.size(0)
#                     _, preds = torch.max(outputs, 1)
#                     loss = criterion(outputs.view(1, 4), labels.view(1, 4))
#                     loss=torch.tensor([[0]]).to(device)
#                     print(outputs.view(1, batch_size), labels.view(1, batch_size))
#                     loss=criterion(outputs, labels)
                    loss=criterion(outputs.view(1, batch_size), labels.float().view(1, batch_size))
#                     print(outputs.view(1, 4), labels.float().view(1, 4), loss)
#                     if False:
                    if idx == 0:
                        print(outputs, labels, loss)
#                     print(labels)
#                     print(loss)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                
                
#                     for p, l in zip(preds, labels):
#                         if p.item() == l.item():
#                             if l.item() == 1:
#                                 true_positive += 1.
#                             else:
#                                 true_negative += 1.
#                         else:
#                             if p.item() == 1:
#                                 false_positive += 1.
#                             else:
#                                 false_negative += 1.
#                         total_inputs += 1

                    for o, l in zip(outputs, labels):
                        if o.item() > 0.:
                            if l.item() == 1:
                                true_positive += 1.
                            else:
                                false_positive += 1.
                        else:
                            if l.item() == 1:
                                false_negative += 1.
                            else:
                                true_negative += 1.
                        total_inputs += 1
                            
                    # statistics
                    running_loss += loss.item() * inputs[0].size(0)
                    running_corrects = true_positive + true_negative
#                     print(running_corrects, true_positive, true_negative, total_inputs)

#                 print("End loop {}".format(datetime.datetime.now()))

            epoch_loss = running_loss / total_inputs #/ len(dataset)
            epoch_acc = running_corrects / total_inputs #/ len(dataset)
            if true_positive + false_positive != 0:
                precision = true_positive / (true_positive + false_positive)
            else:
                precision = 0.
            if true_positive + false_negative != 0:
                recall = true_positive / (true_positive + false_negative)
            else:
                recall = 0.

            print('{} Loss: {:.4f} Acc: {:.4f} Precision: {:.4f} Recall: {:.4f}'.format(
                phase, epoch_loss, epoch_acc, precision, recall))
            print('TP: {} TN: {} FP: {} FN: {}'.format(
                true_positive, true_negative, false_positive, false_negative
            ))

            # deep copy the model
            if epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [56]:
# criterion = nn.CrossEntropyLoss(weight=torch.tensor([.1, 1.]).to(device))
# criterion = nn.CrossEntropyLoss()
# criterion = nn.NLLLoss(weight=torch.tensor([.01, .99]).to(device))
# criterion = nn.NLLLoss()
# criterion = nn.BCEWithLogitsLoss()
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([3.]).to(device))

In [57]:
optimizer = optim.SGD(vnet.parameters(), lr=0.01, momentum=0.9)

In [58]:
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

In [60]:
model = train_model(vnet, criterion, optimizer, exp_lr_scheduler, num_epochs=100, dataloader=dataloader_training)

Epoch 0/99
----------
tensor([[0.0976],
        [0.0828],
        [0.0795],
        [0.0984],
        [0.0855],
        [0.0816],
        [0.0980],
        [0.0927]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 0, 0, 0, 1, 0], device='cuda:0') tensor(0.8880, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 1.0504 Acc: 0.4508 Precision: 0.2275 Recall: 0.4948
TP: 48.0 TN: 126.0 FP: 163.0 FN: 49.0

Epoch 1/99
----------
tensor([[0.1455],
        [0.1558],
        [0.1576],
        [0.1429],
        [0.1172],
        [0.1337],
        [0.1338],
        [0.1565]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 0, 1, 0, 0, 0], device='cuda:0') tensor(0.9116, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 1.0495 Acc: 0.3834 Precision: 0.2340 Recall: 0.6392
TP: 62.0 TN: 86.0 FP: 203.0 FN: 35.0

Epoch 2/99
----------
tensor([[-0.0159],
        [-0.0226],
        [ 0.0100],
        [ 0.0003],
        [-0.0087],
        [-0.0130],
        [-0.0251],
    

train Loss: 0.6128 Acc: 0.8446 Precision: 0.6667 Recall: 0.7629
TP: 74.0 TN: 252.0 FP: 37.0 FN: 23.0

Epoch 20/99
----------
tensor([[-4.0902],
        [ 3.7784],
        [ 0.0263],
        [ 2.0861],
        [-3.5531],
        [-1.2741],
        [ 0.7312],
        [-2.2908]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 1, 0, 1, 0, 0, 1, 0], device='cuda:0') tensor(0.3365, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.8039 Acc: 0.7539 Precision: 0.5078 Recall: 0.6701
TP: 65.0 TN: 226.0 FP: 63.0 FN: 32.0

Epoch 21/99
----------
tensor([[-0.3801],
        [-0.4450],
        [-1.3742],
        [-0.2364],
        [ 0.0432],
        [-1.2608],
        [-0.7173],
        [-0.6770]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.7402, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.5901 Acc: 0.8368 Precision: 0.6328 Recall: 0.8351
TP: 81.0 TN: 242.0 FP: 47.0 FN: 16.0

Epoch 22/99
----------
tensor([[ 0.3803]

train Loss: 0.0900 Acc: 0.9845 Precision: 0.9505 Recall: 0.9897
TP: 96.0 TN: 284.0 FP: 5.0 FN: 1.0

Epoch 40/99
----------
tensor([[-3.4418],
        [-3.4868],
        [-7.9876],
        [ 6.3520],
        [-5.7267],
        [-4.6465],
        [-5.7950],
        [ 6.8538]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 1, 0, 0, 0, 1], device='cuda:0') tensor(0.0108, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0788 Acc: 0.9922 Precision: 0.9796 Recall: 0.9897
TP: 96.0 TN: 287.0 FP: 2.0 FN: 1.0

Epoch 41/99
----------
tensor([[-4.8861],
        [ 7.2075],
        [-6.7970],
        [-2.8533],
        [-3.5006],
        [-3.8369],
        [-5.3711],
        [-1.8163]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.0342, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.1131 Acc: 0.9845 Precision: 0.9789 Recall: 0.9588
TP: 93.0 TN: 287.0 FP: 2.0 FN: 4.0

Epoch 42/99
----------
tensor([[-6.2185],
    

train Loss: 0.0296 Acc: 0.9974 Precision: 0.9898 Recall: 1.0000
TP: 97.0 TN: 288.0 FP: 1.0 FN: 0.0

Epoch 60/99
----------
tensor([[-8.7828],
        [-8.4686],
        [ 0.4045],
        [-5.0997],
        [ 4.7755],
        [-6.8863],
        [10.3230],
        [-5.2302]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 1, 0, 1, 0, 1, 0], device='cuda:0') tensor(0.1965, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0471 Acc: 0.9870 Precision: 0.9510 Recall: 1.0000
TP: 97.0 TN: 284.0 FP: 5.0 FN: 0.0

Epoch 61/99
----------
tensor([[10.2449],
        [-5.3270],
        [-6.2652],
        [-5.6571],
        [-4.8000],
        [-5.3058],
        [-4.7592],
        [-2.5784]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.0131, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0286 Acc: 0.9974 Precision: 0.9898 Recall: 1.0000
TP: 97.0 TN: 288.0 FP: 1.0 FN: 0.0

Epoch 62/99
----------
tensor([[-2.4397],
    

train Loss: 0.0335 Acc: 0.9974 Precision: 0.9898 Recall: 1.0000
TP: 97.0 TN: 288.0 FP: 1.0 FN: 0.0

Epoch 80/99
----------
tensor([[-8.6783],
        [16.5687],
        [-6.5223],
        [-2.0024],
        [-8.3689],
        [-3.8813],
        [-5.7733],
        [-3.4049]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.0231, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0336 Acc: 0.9922 Precision: 0.9796 Recall: 0.9897
TP: 96.0 TN: 287.0 FP: 2.0 FN: 1.0

Epoch 81/99
----------
tensor([[-11.1943],
        [ -3.9515],
        [ -4.7845],
        [  9.5398],
        [ -5.8847],
        [ -7.4821],
        [ 10.2217],
        [ -7.2848]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 1, 0, 0, 1, 0], device='cuda:0') tensor(0.0040, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0364 Acc: 0.9922 Precision: 0.9896 Recall: 0.9794
TP: 95.0 TN: 288.0 FP: 1.0 FN: 2.0

Epoch 82/99
----------
tensor([[-9.094

train Loss: 0.0266 Acc: 1.0000 Precision: 1.0000 Recall: 1.0000
TP: 97.0 TN: 289.0 FP: 0.0 FN: 0.0

Training complete in 5m 18s
Best val Acc: 1.000000


In [11]:
def test_model(model, criterion, dataloader):
    since = time.time()

    model.eval()   # Set model to evaluate mode

    running_loss = 0.0
    running_corrects = 0
    total_inputs = 0.0

    true_positive = 0.
    false_positive = 0.
    true_negative = 0.
    false_negative = 0.
    
    results = []

    # Iterate over data.
    for idx, (inputs, labels) in tqdm(enumerate(dataloader)):
        inputs = [i.to(device) for i in inputs]
        labels = labels.to(device)
                
        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            batch_size = labels.size(0)
#             _, preds = torch.max(outputs, 1)
#             loss=criterion(outputs, labels)
            loss=criterion(outputs.view(1, batch_size), labels.float().view(1, batch_size))
#             if False:
            if idx == 0:
                print(outputs, labels, loss)
#             print(labels)
#             print(loss)
                
                
#             for p, l in zip(preds, labels):
#                 if p.item() == l.item():
#                     if l.item() == 1:
#                         true_positive += 1.
#                     else:
#                         true_negative += 1.
#                 else:
#                     if p.item() == 1:
#                         false_positive += 1.
#                     else:
#                         false_negative += 1.
#                 total_inputs += 1

            for o, l in zip(outputs, labels):
                if o.item() > 0.:
                    if l.item() == 1:
                        true_positive += 1.
                    else:
                        false_positive += 1.
                else:
                    if l.item() == 1:
                        false_negative += 1.
                    else:
                        true_negative += 1.
                total_inputs += 1
                results.append((o.item(), l.item()))

        # statistics
        running_loss += loss.item() * inputs[0].size(0)
        running_corrects = true_positive + true_negative
#     print(running_corrects, true_positive, true_negative, total_inputs)

    epoch_loss = running_loss / total_inputs #/ len(dataset)
    epoch_acc = running_corrects / total_inputs #/ len(dataset)
    if true_positive + false_positive != 0:
        precision = true_positive / (true_positive + false_positive)
    else:
        precision = 0.
    if true_positive + false_negative != 0:
        recall = true_positive / (true_positive + false_negative)
    else:
        recall = 0.

    print('Loss: {:.4f} Acc: {:.4f} Precision: {:.4f} Recall: {:.4f}'.format(
        epoch_loss, epoch_acc, precision, recall))
    print('TP: {} TN: {} FP: {} FN: {}'.format(
        true_positive, true_negative, false_positive, false_negative
    ))
    
    return results

In [23]:
dataset_test = ShotDetectionDataset(shots_gt_test)



  0%|                                                                                                                                                                                                                 | 0/24 [00:00<?, ?it/s][A[A

  4%|████████▍                                                                                                                                                                                                | 1/24 [00:17<06:43, 17.54s/it][A[A

  8%|████████████████▊                                                                                                                                                                                        | 2/24 [00:35<06:29, 17.70s/it][A[A

 12%|█████████████████████████▏                                                                                                                                                                               | 3/24 [01:13<08:18, 23.73s/it][A[A

 17%|█████████████

In [24]:
dataloader_test = DataLoader(dataset_test, batch_size=8, shuffle=False, num_workers=0)

In [67]:
test_results = test_model(model, criterion, dataloader_test)



0it [00:00, ?it/s][A[A

6it [00:00, 55.80it/s][A[A

13it [00:00, 57.51it/s][A[A

tensor([[-2.7504],
        [ 4.5072],
        [ 1.5568],
        [ 1.9037],
        [ 0.7195],
        [ 2.6576],
        [ 0.4081],
        [ 0.1716]], device='cuda:0') tensor([1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(2.7860, device='cuda:0')




20it [00:00, 59.49it/s][A[A

27it [00:00, 60.10it/s][A[A

34it [00:00, 61.27it/s][A[A

41it [00:00, 62.36it/s][A[A

48it [00:00, 63.31it/s][A[A

55it [00:00, 64.07it/s][A[A

62it [00:00, 64.79it/s][A[A

69it [00:01, 64.99it/s][A[A

76it [00:01, 64.64it/s][A[A

83it [00:01, 60.88it/s][A[A

90it [00:01, 61.78it/s][A[A

97it [00:01, 62.96it/s][A[A

104it [00:01, 64.15it/s][A[A

111it [00:01, 64.57it/s][A[A

118it [00:01, 65.02it/s][A[A

125it [00:01, 65.62it/s][A[A

132it [00:02, 65.85it/s][A[A

139it [00:02, 65.65it/s][A[A

146it [00:02, 66.19it/s][A[A

153it [00:02, 66.22it/s][A[A

160it [00:02, 66.39it/s][A[A

167it [00:02, 66.32it/s][A[A

174it [00:02, 66.50it/s][A[A

181it [00:02, 66.18it/s][A[A

188it [00:02, 65.71it/s][A[A

195it [00:03, 65.07it/s][A[A

202it [00:03, 65.07it/s][A[A

209it [00:03, 65.61it/s][A[A

216it [00:03, 65.86it/s][A[A

223it [00:03, 66.31it/s][A[A

230it [00:03, 66.57it/s][A[A

237it [00:03, 66.6

3444it [00:54, 66.28it/s][A[A

3451it [00:55, 66.54it/s][A[A

3458it [00:55, 66.85it/s][A[A

3465it [00:55, 66.96it/s][A[A

3472it [00:55, 67.12it/s][A[A

3479it [00:55, 67.17it/s][A[A

3486it [00:55, 66.81it/s][A[A

3493it [00:55, 63.45it/s][A[A

3500it [00:55, 63.56it/s][A[A

3507it [00:55, 64.00it/s][A[A

3514it [00:56, 64.56it/s][A[A

3521it [00:56, 64.93it/s][A[A

3528it [00:56, 65.71it/s][A[A

3535it [00:56, 66.21it/s][A[A

3542it [00:56, 66.52it/s][A[A

3549it [00:56, 66.71it/s][A[A

3556it [00:56, 62.67it/s][A[A

3563it [00:56, 63.48it/s][A[A

3570it [00:56, 64.21it/s][A[A

3577it [00:57, 65.15it/s][A[A

3584it [00:57, 65.96it/s][A[A

3591it [00:57, 66.20it/s][A[A

3598it [00:57, 66.98it/s][A[A

3605it [00:57, 66.87it/s][A[A

3612it [00:57, 66.31it/s][A[A

3619it [00:57, 65.58it/s][A[A

3626it [00:57, 66.15it/s][A[A

3633it [00:57, 66.24it/s][A[A

3640it [00:57, 66.59it/s][A[A

3647it [00:58, 66.78it/s][A[A

3654it [00

Loss: 0.3313 Acc: 0.9048 Precision: 0.0183 Recall: 0.1547
TP: 82.0 TN: 45903.0 FP: 4389.0 FN: 448.0


In [61]:
training_test_results = test_model(model, criterion, dataloader_training_test)



0it [00:00, ?it/s][A[A

6it [00:00, 53.15it/s][A[A

12it [00:00, 54.91it/s][A[A

tensor([[ 9.3980],
        [-6.7511],
        [-7.4657],
        [-6.1058],
        [-6.3008],
        [-5.4602],
        [-4.9597],
        [-4.6259]], device='cuda:0') tensor([1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.0034, device='cuda:0')




18it [00:00, 54.70it/s][A[A

24it [00:00, 54.68it/s][A[A

31it [00:00, 55.77it/s][A[A

37it [00:00, 55.37it/s][A[A

44it [00:00, 57.51it/s][A[A

51it [00:00, 59.50it/s][A[A

58it [00:00, 60.83it/s][A[A

65it [00:01, 61.95it/s][A[A

72it [00:01, 60.12it/s][A[A

79it [00:01, 60.91it/s][A[A

86it [00:01, 62.10it/s][A[A

93it [00:01, 62.96it/s][A[A

100it [00:01, 63.54it/s][A[A

107it [00:01, 63.85it/s][A[A

114it [00:01, 64.07it/s][A[A

121it [00:01, 63.92it/s][A[A

128it [00:02, 64.29it/s][A[A

135it [00:02, 64.47it/s][A[A

142it [00:02, 65.03it/s][A[A

149it [00:02, 65.47it/s][A[A

156it [00:02, 65.77it/s][A[A

163it [00:02, 65.95it/s][A[A

170it [00:02, 66.10it/s][A[A

177it [00:02, 66.35it/s][A[A

184it [00:02, 66.42it/s][A[A

191it [00:03, 66.31it/s][A[A

198it [00:03, 66.38it/s][A[A

205it [00:03, 66.37it/s][A[A

212it [00:03, 66.36it/s][A[A

219it [00:03, 66.69it/s][A[A

226it [00:03, 67.11it/s][A[A

233it [00:03, 66.9

Loss: 0.1344 Acc: 0.9536 Precision: 0.2225 Recall: 1.0000
TP: 97.0 TN: 6865.0 FP: 339.0 FN: 0.0


In [62]:
true_positives = []
false_positives = []
for (output, label), item in zip(training_test_results, dataset_training_test.items):
    if output >= 0 and label == 1:
        true_positives.append((output, label, item))        
    if output > 0 and label == 0:
        false_positives.append((output, label, item))

In [63]:
tp_collected = collect(true_positives, lambda tup: tup[2][0])
true_positive_intrvls = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in tp_collected[video_id]
    ]
    for video_id in tp_collected
})

In [64]:
fp_collected = collect(false_positives, lambda tup: tup[2][0])
false_positive_intrvls = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in fp_collected[video_id]
    ]
    for video_id in fp_collected
})

In [None]:
esper_widget(
    intrvllists_to_result_with_objects(true_positive_intrvls, lambda a, b: []),
    jupyter_keybindings=True
)

In [65]:
esper_widget(
    intrvllists_to_result_with_objects(false_positive_intrvls, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

VGridWidget(jsglobals={'queries': [['All faces', 'def all_faces():\n    from query.models import Face\n    fro…

In [None]:
torch.save(model, '2-5-19_529pm_videonet_1to1classbalance_bcewithlogitsloss.pth')

In [None]:
torch.save(model, '2-6-19_948am_videonet_10to1classbalance_bcewithlogitsloss.pth')

In [None]:
torch.save(model, '2-6-19_1016am_videonet_2to1classbalance_bcewithlogitsloss.pth')

In [None]:
torch.save(model, '2-6-19_5pm_videonet_3to1classbalance_bcewithlogitsloss.pth')

## Different Window Sizes

In [69]:
max_window_size = 11

In [70]:
training_set_full = ShotDetectionDataset(training_frames, window_size=max_window_size)



  0%|                                                                                                                                                                                                                  | 0/5 [00:00<?, ?it/s][A[A

 20%|████████████████████████████████████████▍                                                                                                                                                                 | 1/5 [00:24<01:38, 24.64s/it][A[A

 40%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                                         | 2/5 [00:34<01:00, 20.06s/it][A[A

 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3/5 [00:42<00:33, 16.71s/it][A[A

 80%|█████████████

In [71]:
training_set_test_full = ShotDetectionDataset(shots_gt_training, window_size=max_window_size)



  0%|                                                                                                                                                                                                                  | 0/5 [00:00<?, ?it/s][A[A

 20%|████████████████████████████████████████▍                                                                                                                                                                 | 1/5 [00:28<01:53, 28.26s/it][A[A

 40%|████████████████████████████████████████████████████████████████████████████████▊                                                                                                                         | 2/5 [00:41<01:11, 23.84s/it][A[A

 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 3/5 [00:54<00:40, 20.43s/it][A[A

 80%|█████████████

In [72]:
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([3.]).to(device))

### Window Size 1

In [68]:
window_size = 1
training_set_full.window_size = 1
training_set_loader = DataLoader(training_set_full, batch_size=8, shuffle=True, num_workers=0)
training_set_test_full.window_size = 1
training_test_loader = DataLoader(training_set_test_full, batch_size=8, shuffle=False, num_workers=0)

NameError: name 'training_set_full' is not defined

In [None]:
vnet1 = VideoNet(window_size=1)
vnet1.init_weights()
vnet1 = vnet1.to(device)
optimizer1 = optim.SGD(vnet1.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler1 = lr_scheduler.StepLR(optimizer1, step_size=30, gamma=0.1)

In [None]:
model1 = train_model(vnet1, criterion, optimizer1, exp_lr_scheduler1, num_epochs=100, dataloader=training_set_loader)

In [None]:
training_test_results1 = test_model(model1, criterion, training_test_loader)

In [None]:
torch.save(model1, '2-7-19_10am_videonet_windowsize1.pth')

### Window Size 2

In [None]:
window_size = 2
training_set_full.window_size = window_size
training_set_loader = DataLoader(training_set_full, batch_size=8, shuffle=True, num_workers=0)
training_set_test_full.window_size = window_size
training_test_loader = DataLoader(training_set_test_full, batch_size=8, shuffle=False, num_workers=0)

In [None]:
vnet2 = VideoNet(window_size=window_size)
vnet2.init_weights()
vnet2 = vnet2.to(device)
optimizer2 = optim.SGD(vnet2.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler2 = lr_scheduler.StepLR(optimizer2, step_size=30, gamma=0.1)

In [None]:
model2 = train_model(vnet2, criterion, optimizer2, exp_lr_scheduler2, num_epochs=100, dataloader=training_set_loader)

In [None]:
training_test_results2 = test_model(model2, criterion, training_test_loader)

In [None]:
true_positives2 = []
false_positives2 = []
for (output, label), item in zip(training_test_results2, dataset_training_test.items):
    if output >= 0 and label == 1:
        true_positives2.append((output, label, item))        
    if output > 0 and label == 0:
        false_positives2.append((output, label, item))

In [None]:
fp_collected2 = collect(false_positives2, lambda tup: tup[2][0])
false_positive_intrvls2 = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in fp_collected2[video_id]
    ]
    for video_id in fp_collected2
})

In [None]:
esper_widget(
    intrvllists_to_result_with_objects(false_positive_intrvls2, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

In [None]:
torch.save(model2, '2-7-19_10am_videonet_windowsize2.pth')

### Window Size 3

In [None]:
window_size = 3
training_set_full.window_size = window_size
training_set_loader = DataLoader(training_set_full, batch_size=8, shuffle=True, num_workers=0)
training_set_test_full.window_size = window_size
training_test_loader = DataLoader(training_set_test_full, batch_size=8, shuffle=False, num_workers=0)

In [None]:
vnet3 = VideoNet(window_size=window_size)
vnet3.init_weights()
vnet3 = vnet3.to(device)
optimizer3 = optim.SGD(vnet3.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler3 = lr_scheduler.StepLR(optimizer3, step_size=30, gamma=0.1)

In [None]:
model3 = train_model(vnet3, criterion, optimizer3, exp_lr_scheduler3, num_epochs=100, dataloader=training_set_loader)

In [None]:
training_test_results3 = test_model(model3, criterion, training_test_loader)

In [None]:
true_positives3 = []
false_positives3 = []
for (output, label), item in zip(training_test_results3, dataset_training_test.items):
    if output >= 0 and label == 1:
        true_positives3.append((output, label, item))        
    if output > 0 and label == 0:
        false_positives3.append((output, label, item))
fp_collected3 = collect(false_positives3, lambda tup: tup[2][0])
false_positive_intrvls3 = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in fp_collected3[video_id]
    ]
    for video_id in fp_collected3
})

In [None]:
esper_widget(
    intrvllists_to_result_with_objects(false_positive_intrvls3, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

In [None]:
torch.save(model3, '2-7-19_10am_videonet_windowsize3.pth')

### Window Size 4

In [None]:
window_size = 4
training_set_full.window_size = window_size
training_set_loader = DataLoader(training_set_full, batch_size=8, shuffle=True, num_workers=0)
training_set_test_full.window_size = window_size
training_test_loader = DataLoader(training_set_test_full, batch_size=8, shuffle=False, num_workers=0)

In [None]:
vnet4 = VideoNet(window_size=window_size)
vnet4.init_weights()
vnet4 = vnet4.to(device)
optimizer4 = optim.SGD(vnet4.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler4 = lr_scheduler.StepLR(optimizer4, step_size=30, gamma=0.1)

In [None]:
model4 = train_model(vnet4, criterion, optimizer4, exp_lr_scheduler4, num_epochs=100, dataloader=training_set_loader)

In [None]:
training_test_results4 = test_model(model4, criterion, training_test_loader)

In [None]:
true_positives4 = []
false_positives4 = []
for (output, label), item in zip(training_test_results4, dataset_training_test.items):
    if output >= 0 and label == 1:
        true_positives4.append((output, label, item))        
    if output > 0 and label == 0:
        false_positives4.append((output, label, item))
fp_collected4 = collect(false_positives4, lambda tup: tup[2][0])
false_positive_intrvls4 = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in fp_collected4[video_id]
    ]
    for video_id in fp_collected4
})

In [None]:
esper_widget(
    intrvllists_to_result_with_objects(false_positive_intrvls4, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

In [None]:
torch.save(model4, '2-7-19_10am_videonet_windowsize4.pth')

### Window Size 5

In [None]:
window_size = 5
training_set_full.window_size = window_size
training_set_loader = DataLoader(training_set_full, batch_size=8, shuffle=True, num_workers=0)
training_set_test_full.window_size = window_size
training_test_loader = DataLoader(training_set_test_full, batch_size=8, shuffle=False, num_workers=0)

In [None]:
vnet5 = VideoNet(window_size=window_size)
vnet5.init_weights()
vnet5 = vnet5.to(device)
optimizer5 = optim.SGD(vnet5.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler5 = lr_scheduler.StepLR(optimizer5, step_size=30, gamma=0.1)

In [None]:
model5 = train_model(vnet5, criterion, optimizer5, exp_lr_scheduler5, num_epochs=100, dataloader=training_set_loader)

In [None]:
training_test_results5 = test_model(model5, criterion, training_test_loader)

In [None]:
true_positives5 = []
false_positives5 = []
for (output, label), item in zip(training_test_results5, dataset_training_test.items):
    if output >= 0 and label == 1:
        true_positives5.append((output, label, item))        
    if output > 0 and label == 0:
        false_positives5.append((output, label, item))
fp_collected5 = collect(false_positives5, lambda tup: tup[2][0])
false_positive_intrvls5 = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in fp_collected5[video_id]
    ]
    for video_id in fp_collected5
})

In [None]:
esper_widget(
    intrvllists_to_result_with_objects(false_positive_intrvls5, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

In [None]:
torch.save(model5, '2-7-19_10am_videonet_windowsize5.pth')

### Window Size 6

In [None]:
window_size = 6
training_set_full.window_size = window_size
training_set_loader = DataLoader(training_set_full, batch_size=8, shuffle=True, num_workers=0)
training_set_test_full.window_size = window_size
training_test_loader = DataLoader(training_set_test_full, batch_size=8, shuffle=False, num_workers=0)

In [None]:
vnet6 = VideoNet(window_size=window_size)
vnet6.init_weights()
vnet6 = vnet6.to(device)
optimizer6 = optim.SGD(vnet6.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler6 = lr_scheduler.StepLR(optimizer6, step_size=30, gamma=0.1)

In [None]:
model6 = train_model(vnet6, criterion, optimizer6, exp_lr_scheduler6, num_epochs=100, dataloader=training_set_loader)

In [None]:
training_test_results6 = test_model(model6, criterion, training_test_loader)

In [None]:
true_positives6 = []
false_positives6 = []
for (output, label), item in zip(training_test_results6, dataset_training_test.items):
    if output >= 0 and label == 1:
        true_positives6.append((output, label, item))        
    if output > 0 and label == 0:
        false_positives6.append((output, label, item))
fp_collected6 = collect(false_positives6, lambda tup: tup[2][0])
false_positive_intrvls6 = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in fp_collected6[video_id]
    ]
    for video_id in fp_collected6
})

In [None]:
esper_widget(
    intrvllists_to_result_with_objects(false_positive_intrvls6, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

In [None]:
torch.save(model6, '2-7-19_10am_videonet_windowsize6.pth')

### Window Size 7

In [27]:
window_size = 7
training_set_full.window_size = window_size
training_set_loader = DataLoader(training_set_full, batch_size=8, shuffle=True, num_workers=0)
training_set_test_full.window_size = window_size
training_test_loader = DataLoader(training_set_test_full, batch_size=8, shuffle=False, num_workers=0)

In [28]:
vnet7 = VideoNet(window_size=window_size)
vnet7.init_weights()
vnet7 = vnet7.to(device)
optimizer7 = optim.SGD(vnet7.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler7 = lr_scheduler.StepLR(optimizer7, step_size=30, gamma=0.1)

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.torch/models/resnet18-5c106cde.pth
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46827520/46827520 [00:00<00:00, 85481229.99it/s]


In [35]:
model7 = train_model(vnet7, criterion, optimizer7, exp_lr_scheduler7, num_epochs=100, dataloader=training_set_loader)

Epoch 0/99
----------
tensor([[-0.3573],
        [-0.2509],
        [-0.2530],
        [-0.4443],
        [-0.2554],
        [-0.4207],
        [-0.3908],
        [-0.4511]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 0, 1, 0, 0, 0], device='cuda:0') tensor(0.7722, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 1.0703 Acc: 0.4663 Precision: 0.1697 Recall: 0.2887
TP: 28.0 TN: 152.0 FP: 137.0 FN: 69.0

Epoch 1/99
----------
tensor([[-0.0580],
        [-0.0485],
        [-0.0383],
        [-0.0416],
        [-0.0491],
        [-0.0465],
        [-0.0559],
        [-0.0414]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 0, 0, 1, 0, 0], device='cuda:0') tensor(0.8547, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 1.0518 Acc: 0.5337 Precision: 0.2215 Recall: 0.3402
TP: 33.0 TN: 173.0 FP: 116.0 FN: 64.0

Epoch 2/99
----------
tensor([[-0.0384],
        [-0.0549],
        [-0.0263],
        [-0.0354],
        [-0.0423],
        [-0.0210],
      

train Loss: 0.9596 Acc: 0.5725 Precision: 0.3396 Recall: 0.7423
TP: 72.0 TN: 149.0 FP: 140.0 FN: 25.0

Epoch 20/99
----------
tensor([[ 0.1309],
        [ 0.6287],
        [-0.2682],
        [-0.4781],
        [ 0.4797],
        [-0.5462],
        [-0.1551],
        [-0.7161]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.6628, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.9023 Acc: 0.6451 Precision: 0.3901 Recall: 0.7320
TP: 71.0 TN: 178.0 FP: 111.0 FN: 26.0

Epoch 21/99
----------
tensor([[ 1.5394],
        [-0.0145],
        [-0.0954],
        [-0.3253],
        [-1.4357],
        [-0.3003],
        [-0.1818],
        [ 1.4641]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([1, 0, 0, 0, 0, 0, 0, 1], device='cuda:0') tensor(0.5572, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.8307 Acc: 0.7176 Precision: 0.4620 Recall: 0.7526
TP: 73.0 TN: 204.0 FP: 85.0 FN: 24.0

Epoch 22/99
----------
tensor([[-0.364

train Loss: 0.1186 Acc: 0.9948 Precision: 0.9798 Recall: 1.0000
TP: 97.0 TN: 287.0 FP: 2.0 FN: 0.0

Epoch 40/99
----------
tensor([[ 9.0117],
        [ 2.1470],
        [ 3.6825],
        [-1.7150],
        [ 3.3906],
        [-0.0788],
        [-2.5183],
        [ 3.9242]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([1, 1, 1, 0, 1, 0, 0, 1], device='cuda:0') tensor(0.1828, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.1295 Acc: 0.9896 Precision: 0.9604 Recall: 1.0000
TP: 97.0 TN: 285.0 FP: 4.0 FN: 0.0

Epoch 41/99
----------
tensor([[-2.6057],
        [-5.8777],
        [-2.0634],
        [ 4.0418],
        [-3.6091],
        [ 6.0218],
        [ 2.8495],
        [-0.7944]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 1, 0, 1, 1, 0], device='cuda:0') tensor(0.1027, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.1188 Acc: 0.9896 Precision: 0.9604 Recall: 1.0000
TP: 97.0 TN: 285.0 FP: 4.0 FN: 0.0

Epoch 42/99
----------
tensor([[ 3.5455],
    

train Loss: 0.0538 Acc: 0.9974 Precision: 0.9898 Recall: 1.0000
TP: 97.0 TN: 288.0 FP: 1.0 FN: 0.0

Epoch 60/99
----------
tensor([[ 5.0814],
        [ 3.7181],
        [-6.5989],
        [-4.4709],
        [-6.1740],
        [ 4.3924],
        [-3.7858],
        [ 4.7132]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([1, 1, 0, 0, 0, 1, 0, 1], device='cuda:0') tensor(0.0239, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0545 Acc: 0.9948 Precision: 0.9897 Recall: 0.9897
TP: 96.0 TN: 288.0 FP: 1.0 FN: 1.0

Epoch 61/99
----------
tensor([[-5.2671],
        [-5.3012],
        [-4.3632],
        [-6.4412],
        [ 3.6811],
        [ 4.7425],
        [-6.2291],
        [-6.0011]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0') tensor(0.0162, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0459 Acc: 0.9974 Precision: 0.9898 Recall: 1.0000
TP: 97.0 TN: 288.0 FP: 1.0 FN: 0.0

Epoch 62/99
----------
tensor([[-7.6122],
    

train Loss: 0.0519 Acc: 0.9974 Precision: 0.9898 Recall: 1.0000
TP: 97.0 TN: 288.0 FP: 1.0 FN: 0.0

Epoch 80/99
----------
tensor([[-5.9441],
        [-3.2590],
        [-6.4122],
        [-5.9280],
        [-2.9076],
        [-4.4543],
        [ 2.2821],
        [ 3.0335]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 0, 0, 0, 0, 1, 1], device='cuda:0') tensor(0.0677, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0395 Acc: 1.0000 Precision: 1.0000 Recall: 1.0000
TP: 97.0 TN: 289.0 FP: 0.0 FN: 0.0

Epoch 81/99
----------
tensor([[-3.2764],
        [-5.8824],
        [ 4.0548],
        [-6.2092],
        [-4.4902],
        [-7.0577],
        [-3.9041],
        [-2.6391]], device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([0, 0, 1, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.0243, device='cuda:0', grad_fn=<MeanBackward1>)
train Loss: 0.0472 Acc: 0.9948 Precision: 0.9897 Recall: 0.9897
TP: 96.0 TN: 288.0 FP: 1.0 FN: 1.0

Epoch 82/99
----------
tensor([[-4.0189],
    

train Loss: 0.0369 Acc: 1.0000 Precision: 1.0000 Recall: 1.0000
TP: 97.0 TN: 289.0 FP: 0.0 FN: 0.0

Training complete in 23m 52s
Best val Acc: 1.000000


In [36]:
training_test_results7 = test_model(model7, criterion, training_test_loader)

3it [00:00,  9.44it/s]

tensor([[ 9.2456],
        [-5.7520],
        [-4.4700],
        [-3.4864],
        [-3.4894],
        [-3.7337],
        [-4.6350],
        [-4.4590]], device='cuda:0') tensor([1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.0150, device='cuda:0')


913it [01:09, 12.56it/s]

Loss: 0.0314 Acc: 1.0000 Precision: 1.0000 Recall: 1.0000
TP: 97.0 TN: 7204.0 FP: 0.0 FN: 0.0





In [38]:
true_positives7 = []
false_positives7 = []
for (output, label), item in zip(training_test_results7, training_set_test_full.items):
    if output >= 0 and label == 1:
        true_positives7.append((output, label, item))        
    if output > 0 and label == 0:
        false_positives7.append((output, label, item))
fp_collected7 = collect(false_positives7, lambda tup: tup[2][0])
false_positive_intrvls7 = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in fp_collected7[video_id]
    ]
    for video_id in fp_collected7
})

In [41]:
esper_widget(
    intrvllists_to_result_with_objects(false_positive_intrvls7, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

InterfaceError: connection already closed

In [43]:
torch.save(model7, '2-7-19_5pm_videonet_windowsize7.pth')

  "type " + obj.__name__ + ". It won't be checked "


In [44]:
modeltest = torch.load('models/2-7-19_5pm_videonet_windowsize7.pth')

In [45]:
test_results = test_model(modeltest, criterion, dataloader_test)



0it [00:00, ?it/s][A[A

5it [00:00, 45.06it/s][A[A

11it [00:00, 48.58it/s][A[A

tensor([[-2.1145],
        [-2.7111],
        [-2.8488],
        [-2.8055],
        [-2.2808],
        [-2.4658],
        [-2.7493],
        [-2.0564]], device='cuda:0') tensor([1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0') tensor(0.9032, device='cuda:0')




18it [00:00, 51.97it/s][A[A

25it [00:00, 54.70it/s][A[A

32it [00:00, 56.76it/s][A[A

39it [00:00, 58.33it/s][A[A

46it [00:00, 59.52it/s][A[A

52it [00:00, 55.32it/s][A[A

58it [00:01, 56.57it/s][A[A

65it [00:01, 57.71it/s][A[A

72it [00:01, 58.65it/s][A[A

79it [00:01, 59.65it/s][A[A

86it [00:01, 60.52it/s][A[A

93it [00:01, 61.08it/s][A[A

100it [00:01, 61.74it/s][A[A

107it [00:01, 62.32it/s][A[A

114it [00:01, 62.78it/s][A[A

121it [00:02, 63.12it/s][A[A

128it [00:02, 63.48it/s][A[A

135it [00:02, 63.50it/s][A[A

142it [00:02, 63.67it/s][A[A

149it [00:02, 63.90it/s][A[A

156it [00:02, 63.92it/s][A[A

163it [00:02, 63.86it/s][A[A

170it [00:02, 63.97it/s][A[A

177it [00:02, 63.56it/s][A[A

184it [00:02, 63.72it/s][A[A

191it [00:03, 63.95it/s][A[A

198it [00:03, 64.11it/s][A[A

205it [00:03, 64.02it/s][A[A

212it [00:03, 64.00it/s][A[A

219it [00:03, 64.19it/s][A[A

226it [00:03, 64.31it/s][A[A

233it [00:03, 63.8

3365it [00:55, 64.19it/s][A[A

3372it [00:55, 64.11it/s][A[A

3379it [00:55, 64.14it/s][A[A

3386it [00:55, 64.13it/s][A[A

3393it [00:55, 64.08it/s][A[A

3400it [00:55, 64.07it/s][A[A

3407it [00:55, 63.88it/s][A[A

3414it [00:55, 63.97it/s][A[A

3421it [00:55, 63.85it/s][A[A

3428it [00:56, 63.94it/s][A[A

3435it [00:56, 63.56it/s][A[A

3442it [00:56, 63.83it/s][A[A

3449it [00:56, 64.04it/s][A[A

3456it [00:56, 64.35it/s][A[A

3463it [00:56, 64.55it/s][A[A

3470it [00:56, 64.67it/s][A[A

3477it [00:56, 64.75it/s][A[A

3484it [00:56, 64.70it/s][A[A

3491it [00:57, 64.68it/s][A[A

3498it [00:57, 64.70it/s][A[A

3505it [00:57, 64.74it/s][A[A

3512it [00:57, 64.68it/s][A[A

3519it [00:57, 64.67it/s][A[A

3526it [00:57, 64.66it/s][A[A

3533it [00:57, 64.64it/s][A[A

3540it [00:57, 64.72it/s][A[A

3547it [00:57, 64.75it/s][A[A

3554it [00:58, 64.53it/s][A[A

3561it [00:58, 64.44it/s][A[A

3568it [00:58, 63.31it/s][A[A

3575it [00

Loss: 0.1728 Acc: 0.9895 Precision: 0.2000 Recall: 0.0038
TP: 2.0 TN: 50284.0 FP: 8.0 FN: 528.0


In [46]:
true_positivestest = []
false_positivestest = []
for (output, label), item in zip(test_results, dataset_test.items):
    if output >= 0 and label == 1:
        true_positivestest.append((output, label, item))        
    if output > 0 and label == 0:
        false_positivestest.append((output, label, item))
tp_collectedtest = collect(true_positivestest, lambda tup: tup[2][0])
true_positive_intrvlstest = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in tp_collectedtest[video_id]
    ]
    for video_id in tp_collectedtest
})
fp_collectedtest = collect(false_positivestest, lambda tup: tup[2][0])
false_positive_intrvlstest = VideoIntervalCollection({
    video_id: [
        (item[1], item[1], 0)
        for output, label, item in fp_collectedtest[video_id]
    ]
    for video_id in fp_collectedtest
})

In [47]:
esper_widget(
    intrvllists_to_result_with_objects(true_positive_intrvlstest, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

VGridWidget(jsglobals={'queries': [['All faces', 'def all_faces():\n    from query.models import Face\n    fro…

In [48]:
esper_widget(
    intrvllists_to_result_with_objects(false_positive_intrvlstest, lambda a, b: []),
    jupyter_keybindings=True,
    display_captions=False
)

VGridWidget(jsglobals={'queries': [['All faces', 'def all_faces():\n    from query.models import Face\n    fro…

## DeepSBD

In [195]:
class DeepSBDDataset(Dataset):
    def __init__(self, shots, window_size=16, stride=8, size=128):
        """Constrcutor for ShotDetectionDataset.
        
        Args:
            shots: VideoIntervalCollection of all the intervals to get frames from. If the payload is -1,
            then the interval is not an actual shot and just needs to be included in the dataset.
        """
        self.window_size = window_size
        items = set()
        frame_nums = {}
        
        shot_boundaries = shots.map(
            lambda intrvl: (intrvl.start, intrvl.start, intrvl.payload)
        ).filter(lambda intrvl: intrvl.payload != -1)
        
        clips = shots.dilate(1).coalesce().dilate(-1).map(
            lambda intrvl: (
                intrvl.start - stride - ((intrvl.start - stride) % stride),
                intrvl.end + stride - ((intrvl.end + stride) % stride),
                intrvl.payload
            )
        ).dilate(1).coalesce().dilate(-1)
        
        items_intrvls = {}
        for video_id in clips.get_allintervals():
            items_intrvls[video_id] = []
            for intrvl in clips.get_intervallist(video_id).get_intervals():
                items_intrvls[video_id] += [
                    (f, f + window_size, 0)
                    for f in range(intrvl.start, intrvl.end - stride, stride)
                ]
        items_col = VideoIntervalCollection(items_intrvls)
        
        items_w_boundaries = items_col.filter_against(
            shot_boundaries,
            predicate=during_inv()
        ).map(
            lambda intrvl: (intrvl.start, intrvl.end, 2)
        )
        
        items_w_labels = items_col.minus(
            items_w_boundaries, predicate=equal()
        ).set_union(items_w_boundaries)

        for video_id in items_w_labels.get_allintervals():
            frame_nums[video_id] = set()
            for intrvl in items_w_labels.get_intervallist(video_id).get_intervals():
                items.add((
                    video_id,
                    intrvl.start,
                    intrvl.end,
                    intrvl.payload
                ))
                for f in range(intrvl.start, intrvl.end):
                    frame_nums[video_id].add(f)

        self.items = sorted(list(items))
        
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            Scale((128, 128)),
            ToTensor(1),
            Normalize(get_mean(1), (1, 1, 1))
        ])
        
        # Load frames into memory
        self.frames = {
            video_id: {
                'frame_nums': sorted(list(frame_nums[video_id])),
                'frames': [
                    self.transform(f)
                    for f in Video.objects.get(id=video_id).for_scannertools().frames(
                        sorted(list(frame_nums[video_id]))
                    )
                ]
            }
            for video_id in tqdm(frame_nums)
        }
    
    def __len__(self):
        return len(self.items)
    
    def __getitem__(self, idx):
        """
        Indexed by video ID, then frame number
        Returns self.window_size frames before the indexed frame to self.window_size
            frames after the indexed frame
        """
        video_id, start_frame, end_frame, label = self.items[idx]
        
        start_index = self.frames[video_id]['frame_nums'].index(start_frame)
        img_tensors = self.frames[video_id]['frames'][start_index:start_index + self.window_size]
        
#         img_tensors = [
#             self.transform(f)
#             for f in Video.objects.get(id=video_id).for_scannertools().frames(
#                 list(range(frame_num - self.window_size, frame_num + self.window_size + 1))
#             )
#         ]
        
        return torch.stack(img_tensors).permute(1, 0, 2, 3), label, (video_id, start_frame, end_frame)
#         return label, (video_id, start_frame, end_frame)

In [196]:
deepsbddata = DeepSBDDataset(shots_gt_training)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:38<00:00, 21.20s/it]


In [197]:
deepsbddataloader = DataLoader(deepsbddata, batch_size=8, shuffle=False, num_workers=0)

In [198]:
class deepSBD(nn.Module):
    def __init__(self):
        super(deepSBD, self).__init__()
        self.conv1=nn.Conv3d(3, 96, kernel_size=3, stride=(1, 2, 2),
                               padding=(0,0,0), bias=True)
        self.relu1=nn.ReLU(inplace=True)
        self.pool1=nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(1,2,2), padding=0)
        self.conv2=nn.Conv3d(96, 256, kernel_size=3, stride=(1, 2, 2),
                               padding=(0,0,0), bias=True)
        self.relu2=nn.ReLU(inplace=True)
        self.pool2=nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(1,2,2), padding=0)
        self.conv3=nn.Conv3d(256, 384, kernel_size=3, stride=1,
                               padding=1, bias=True)
        self.relu3=nn.ReLU(inplace=True)
        self.conv4=nn.Conv3d(384, 384, kernel_size=3, stride=1,
                               padding=1, bias=True)
        self.relu4=nn.ReLU(inplace=True)
        self.conv5=nn.Conv3d(384, 256, kernel_size=3, stride=1,
                               padding=1, bias=True)
        self.relu5=nn.ReLU(inplace=True)
        self.pool1=nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(1,2,2), padding=0)
        self.fc6=nn.Linear(100352, 2048)
        self.relu6=nn.ReLU(inplace=True)
        self.fc7=nn.Linear(2048, 2048)
        self.relu7=nn.ReLU(inplace=True)
        self.fc8=nn.Linear(2048, 3)
    
    def forward(self,x):
        x=self.conv1(x)
        x=self.relu1(x)
        x=self.pool1(x)
        x=self.conv2(x)
        x=self.relu2(x)
        x=self.pool2(x)
        x=self.conv3(x)
        x=self.relu3(x)
        x=self.conv4(x)
        x=self.relu4(x)
        x=self.conv5(x)
        x=self.relu5(x)
        x=x.view(x.size(0),-1)
        x=self.fc6(x)
        x=self.relu6(x)
        x=self.fc7(x)
        x=self.relu7(x)
        x=self.fc8(x)
        return x

In [199]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
from functools import partial
import os

__all__ = ['ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200']


def conv3x3x3(in_planes, out_planes, stride=1):
    # 3x3x3 convolution with padding
    return nn.Conv3d(in_planes, out_planes, kernel_size=3,
                     stride=stride, padding=1, bias=False)


def downsample_basic_block(x, planes, stride):
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
                             out.size(2), out.size(3),
                             out.size(4)).zero_()
    if isinstance(out.data, torch.cuda.FloatTensor):
        zero_pads = zero_pads.cuda()

    out = Variable(torch.cat([out.data, zero_pads], dim=1))

    return out


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm3d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3x3(planes, planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm3d(planes)
        self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm3d(planes)
        self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm3d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    
    def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2),
                               padding=(3, 3, 3), bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
        self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2)
        last_duration = math.ceil(sample_duration / 16)
        last_size = math.ceil(sample_size / 32)
        self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
                m.eval()

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            if shortcut_type == 'A':
                downsample = partial(downsample_basic_block,
                                     planes=planes * block.expansion,
                                     stride=stride)
            else:
                bn=nn.BatchNorm3d(planes * block.expansion)
                bn.eval()
                downsample = nn.Sequential(
                    nn.Conv3d(self.inplanes, planes * block.expansion,
                              kernel_size=1, stride=stride, bias=False),
                    bn
                )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)

        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)

        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

    def load_weights(self, base_file):
        other, ext = os.path.splitext(base_file)
        if ext == '.pkl' or '.pth':
            print('Loading weights into state dict...')
            pretrained=torch.load(base_file, map_location=lambda storage, loc: storage)['state_dict']
            pretrained={"{}".format(s[7:]):v for s,v in pretrained.items()}
            current_param=self.state_dict()
            pretrained={k:v for k,v in pretrained.items() if k in current_param and k[:2]!='fc'}
            current_param.update(pretrained)
            print(pretrained.keys())
            #print(self.state_dict().keys())
            self.load_state_dict(current_param)
            print('Finished!')
        else:
            print('Sorry only .pth and .pkl files supported.')


def get_fine_tuning_parameters(model, ft_begin_index):
    if ft_begin_index == 0:
        return model.parameters()

    ft_module_names = []
    for i in range(ft_begin_index, 5):
        ft_module_names.append('layer{}'.format(ft_begin_index))
    ft_module_names.append('fc')

    parameters = []
    for k, v in model.named_parameters():
        for ft_module in ft_module_names:
            if ft_module in k:
                parameters.append({'params': v})
                break
        else:
            parameters.append({'params': v, 'lr': 0.0})

    return parameters


def resnet10(**kwargs):
    """Constructs a ResNet-18 model.
    """
    model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs)
    return model

def resnet18(**kwargs):
    """Constructs a ResNet-18 model.
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    return model

def resnet34(**kwargs):
    """Constructs a ResNet-34 model.
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    return model

def resnet50(**kwargs):
    """Constructs a ResNet-50 model.
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    return model

def resnet101(**kwargs):
    """Constructs a ResNet-101 model.
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    return model

def resnet152(**kwargs):
    """Constructs a ResNet-101 model.
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    return model

def resnet200(**kwargs):
    """Constructs a ResNet-101 model.
    """
    model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs)
    return model

In [200]:
deepsbd_alexnet_model = deepSBD()

In [201]:
alexnet_state_dict = torch.load('models/ClipShots-DeepSBD-Alexnet-final.pth')['state_dict']

In [202]:
new_state_dict = OrderedDict()
for k, v in alexnet_state_dict.items():
    name = k[7:]
    new_state_dict[name] = v

In [203]:
deepsbd_alexnet_model.load_state_dict(new_state_dict)

In [204]:
deepsbd_alexnet_model = deepsbd_alexnet_model.to(device)

In [205]:
deepsbd_alexnet_model = deepsbd_alexnet_model.eval()

In [206]:
deepsbd_resnet_model = resnet18(num_classes=3,
                                sample_size=128,
                                sample_duration=16)

In [207]:
resnet_state_dict = torch.load('models/ClipShots-DeepSBD-Resnet-18-final.pth')['state_dict']

In [208]:
new_resnet_state_dict = OrderedDict()
for k, v in resnet_state_dict.items():
    name = k[7:]
    new_resnet_state_dict[name] = v

In [209]:
deepsbd_resnet_model.load_state_dict(new_resnet_state_dict)

In [210]:
deepsbd_resnet_model = deepsbd_resnet_model.to(device)

In [211]:
deepsbd_resnet_model = deepsbd_resnet_model.eval()

In [212]:
def get_label(res_tensor):
    res_numpy=res_tensor.data.cpu().numpy()
    labels=[]
    for row in res_numpy:
        labels.append(np.argmax(row))
    return labels

In [213]:
def prf1_array(pos_label, neg_label, gt, preds):
    tp = 0.
    fp = 0.
    tn = 0.
    fn = 0.
    
    for truth, pred in zip(gt, preds):
        if truth == pred:
            if pred == pos_label:
                tp += 1.
            else:
                tn += 1.
        else:
            if pred == pos_label:
                fp += 1.
            else:
                fn += 1.
    
    precision = tp / (tp + fp) if tp + fp != 0 else 0
    recall = tp / (tp + fn) if tp + fn != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0
    
    return (precision, recall, f1, tp, tn, fp, fn)

In [214]:
def test_deepsbd(model, dataloader):
    preds = []
    labels = []
    outputs = []
    for clip_tensor, l, _ in tqdm(dataloader):
        o = model(clip_tensor.to(device))

        preds += get_label(o)
        labels += l.data.numpy().tolist()
        outputs += o.cpu().data.numpy().tolist()
    
    preds = [2 if p == 2 else 0 for p in preds]
        
    precision, recall, f1, tp, tn, fp, fn = prf1_array(2, 0, labels, preds)
    print("Precision: {}, Recall: {}, F1: {}".format(precision, recall, f1))
    print("TP: {}, TN: {}, FP: {}, FN: {}".format(tp, tn, fp, fn))
    
    return preds, labels, outputs

In [215]:
training_preds, training_labels, training_outputs = test_deepsbd(deepsbd_alexnet_model, deepsbddataloader)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [00:04<00:00, 24.31it/s]

Precision: 0.8601036269430051, Recall: 0.9120879120879121, F1: 0.8853333333333334
TP: 166.0, TN: 709.0, FP: 27.0, FN: 16.0





In [216]:
training_preds_rn, training_labels_rn, training_outputs_rn = test_deepsbd(deepsbd_resnet_model, deepsbddataloader)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [00:07<00:00, 14.66it/s]

Precision: 0.8461538461538461, Recall: 0.967032967032967, F1: 0.9025641025641026
TP: 176.0, TN: 704.0, FP: 32.0, FN: 6.0





# ClipShots

In [96]:
import json
from PIL import Image

In [32]:
# load up ground truth from the clipshots data
with open('/app/data/ClipShots/annotations/train.json', 'r') as f:
    train_gt = json.load(f)
with open('/app/data/ClipShots/annotations/test.json', 'r') as f:
    test_gt = json.load(f)
with open('/app/data/ClipShots/annotations/only_gradual.json', 'r') as f:
    only_gradual_gt = json.load(f)

In [47]:
# load up video lists
with open('/app/data/ClipShots/video_lists/train.txt', 'r') as f:
    train_videos = [
        l.strip() for l in f.readlines()
    ]
with open('/app/data/ClipShots/video_lists/test.txt', 'r') as f:
    test_videos = [
        l.strip() for l in f.readlines()
    ]
with open('/app/data/ClipShots/video_lists/only_gradual.txt', 'r') as f:
    only_gradual_videos = [
        l.strip() for l in f.readlines()
    ]

In [97]:
def pil_loader(path):
    if not os.path.exists(path):
        return None
    try:
        with open(path, 'rb') as f:
            with Image.open(f) as img:
                return img.convert('RGB')
    except:
        return None

In [168]:
import random
import math
import numbers
import collections
import numpy as np
import torch
from PIL import Image, ImageOps
try:
    import accimage
except ImportError:
    accimage = None


class Compose(object):
    """Composes several transforms together.
    Args:
        transforms (list of ``Transform`` objects): list of transforms to compose.
    Example:
        >>> transforms.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.ToTensor(),
        >>> ])
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img):
        for t in self.transforms:
            img = t(img)
        return img

    def randomize_parameters(self):
        for t in self.transforms:
            t.randomize_parameters()


class ToTensor(object):
    """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor.
    Converts a PIL.Image or numpy.ndarray (H x W x C) in the range
    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
    """

    def __init__(self, norm_value=255):
        self.norm_value = norm_value

    def __call__(self, pic):
        """
        Args:
            pic (PIL.Image or numpy.ndarray): Image to be converted to tensor.
        Returns:
            Tensor: Converted image.
        """
        if isinstance(pic, np.ndarray):
            # handle numpy array
            img = torch.from_numpy(pic.transpose((2, 0, 1)))
            # backward compatibility
            return img.float().div(self.norm_value)

        if accimage is not None and isinstance(pic, accimage.Image):
            nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32)
            pic.copyto(nppic)
            return torch.from_numpy(nppic)

        # handle PIL Image
        if pic.mode == 'I':
            img = torch.from_numpy(np.array(pic, np.int32, copy=False))
        elif pic.mode == 'I;16':
            img = torch.from_numpy(np.array(pic, np.int16, copy=False))
        else:
            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
        if pic.mode == 'YCbCr':
            nchannel = 3
        elif pic.mode == 'I;16':
            nchannel = 1
        else:
            nchannel = len(pic.mode)
        img = img.view(pic.size[1], pic.size[0], nchannel)
        # put it from HWC to CHW format
        # yikes, this transpose takes 80% of the loading time/CPU
        img = img.transpose(0, 1).transpose(0, 2).contiguous()
        if isinstance(img, torch.ByteTensor):
            return img.float().div(self.norm_value)
        else:
            return img

    def randomize_parameters(self):
        pass


class Normalize(object):
    """Normalize an tensor image with mean and standard deviation.
    Given mean: (R, G, B) and std: (R, G, B),
    will normalize each channel of the torch.*Tensor, i.e.
    channel = (channel - mean) / std
    Args:
        mean (sequence): Sequence of means for R, G, B channels respecitvely.
        std (sequence): Sequence of standard deviations for R, G, B channels
            respecitvely.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
        Returns:
            Tensor: Normalized image.
        """
        # TODO: make efficient
        for t, m, s in zip(tensor, self.mean, self.std):
            t.sub_(m).div_(s)
        return tensor

    def randomize_parameters(self):
        pass


class Scale(object):
    """Rescale the input PIL.Image to the given size.
    Args:
        size (sequence or int): Desired output size. If size is a sequence like
            (w, h), output size will be matched to this. If size is an int,
            smaller edge of the image will be matched to this number.
            i.e, if height > width, then image will be rescaled to
            (size * height / width, size)
        interpolation (int, optional): Desired interpolation. Default is
            ``PIL.Image.BILINEAR``
    """

    def __init__(self, size, interpolation=Image.BILINEAR):
        assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2)
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img):
        """
        Args:
            img (PIL.Image): Image to be scaled.
        Returns:
            PIL.Image: Rescaled image.
        """
        if isinstance(self.size, int):
            w, h = img.size
            if (w <= h and w == self.size) or (h <= w and h == self.size):
                return img
            if w < h:
                ow = self.size
                oh = int(self.size * h / w)
                return img.resize((ow, oh), self.interpolation)
            else:
                oh = self.size
                ow = int(self.size * w / h)
                return img.resize((ow, oh), self.interpolation)
        else:
            return img.resize(self.size, self.interpolation)

    def randomize_parameters(self):
        pass


class CenterCrop(object):
    """Crops the given PIL.Image at the center.
    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made.
    """

    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, img):
        """
        Args:
            img (PIL.Image): Image to be cropped.
        Returns:
            PIL.Image: Cropped image.
        """
        w, h = img.size
        th, tw = self.size
        x1 = int(round((w - tw) / 2.))
        y1 = int(round((h - th) / 2.))
        return img.crop((x1, y1, x1 + tw, y1 + th))

    def randomize_parameters(self):
        pass


class CornerCrop(object):
    def __init__(self, size, crop_position=None):
        self.size = size
        if crop_position is None:
            self.randomize = True
        else:
            self.randomize = False
        self.crop_position = crop_position
        self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br']

    def __call__(self, img):
        image_width = img.size[0]
        image_height = img.size[1]

        if self.crop_position == 'c':
            th, tw = (self.size, self.size)
            x1 = int(round((image_width - tw) / 2.))
            y1 = int(round((image_height - th) / 2.))
            x2 = x1 + tw
            y2 = y1 + th
        elif self.crop_position == 'tl':
            x1 = 0
            y1 = 0
            x2 = self.size
            y2 = self.size
        elif self.crop_position == 'tr':
            x1 = image_width - self.size
            y1 = 0
            x2 = image_width
            y2 = self.size
        elif self.crop_position == 'bl':
            x1 = 0
            y1 = image_height - self.size
            x2 = self.size
            y2 = image_height
        elif self.crop_position == 'br':
            x1 = image_width - self.size
            y1 = image_height - self.size
            x2 = image_width
            y2 = image_height

        img = img.crop((x1, y1, x2, y2))

        return img

    def randomize_parameters(self):
        if self.randomize:
            self.crop_position = self.crop_positions[
                random.randint(0, len(self.crop_positions) - 1)]


class RandomHorizontalFlip(object):
    """Horizontally flip the given PIL.Image randomly with a probability of 0.5."""

    def __call__(self, img):
        """
        Args:
            img (PIL.Image): Image to be flipped.
        Returns:
            PIL.Image: Randomly flipped image.
        """
        if self.p < 0.5:
            return img.transpose(Image.FLIP_LEFT_RIGHT)
        return img

    def randomize_parameters(self):
        self.p = random.random()


class MultiScaleCornerCrop(object):
    """Crop the given PIL.Image to randomly selected size.
    A crop of size is selected from scales of the original size.
    A position of cropping is randomly selected from 4 corners and 1 center.
    This crop is finally resized to given size.
    Args:
        scales: cropping scales of the original size
        size: size of the smaller edge
        interpolation: Default: PIL.Image.BILINEAR
    """

    def __init__(self, scales, size, interpolation=Image.BILINEAR):
        self.scales = scales
        self.size = size
        self.interpolation = interpolation

        self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br']

    def __call__(self, img):
        min_length = min(img.size[0], img.size[1])
        crop_size = int(min_length * self.scale)

        image_width = img.size[0]
        image_height = img.size[1]

        if self.crop_position == 'c':
            center_x = image_width // 2
            center_y = image_height // 2
            box_half = crop_size // 2
            x1 = center_x - box_half
            y1 = center_y - box_half
            x2 = center_x + box_half
            y2 = center_y + box_half
        elif self.crop_position == 'tl':
            x1 = 0
            y1 = 0
            x2 = crop_size
            y2 = crop_size
        elif self.crop_position == 'tr':
            x1 = image_width - crop_size
            y1 = 0
            x2 = image_width
            y2 = crop_size
        elif self.crop_position == 'bl':
            x1 = 0
            y1 = image_height - crop_size
            x2 = crop_size
            y2 = image_height
        elif self.crop_position == 'br':
            x1 = image_width - crop_size
            y1 = image_height - crop_size
            x2 = image_width
            y2 = image_height

        img = img.crop((x1, y1, x2, y2))

        return img.resize((self.size, self.size), self.interpolation)

    def randomize_parameters(self):
        self.scale = self.scales[random.randint(0, len(self.scales) - 1)]
        self.crop_position = self.crop_positions[random.randint(0, len(self.scales) - 1)]

def get_mean(norm_value=255):
    return [114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value]

def get_train_spatial_transform(opt):
    return Compose([MultiScaleCornerCrop(opt.scales, opt.sample_size),
                                     RandomHorizontalFlip(),
                                     ToTensor(opt.norm_value),
                                     Normalize(get_mean(opt.norm_value), [1, 1, 1])])

def get_test_spatial_transform(opt):
    return Compose([Scale((opt.spatial_size,opt.spatial_size)),
                    ToTensor(opt.norm_value),
                    Normalize(get_mean(opt.norm_value), [1, 1, 1])])

In [190]:
class DeepSBDClipShotsDataset(Dataset):
    def __init__(self, root_path, video_list, annotations, window_size=16, stride=8, size=128):
        """Constrcutor for ShotDetectionDataset.
        
        Args:
            root_path: path to the folder that holds the videos
            shots: list of video names
            annotations: dict mapping video names to dicts of 'frame_num' and 'transitions' -
                frame_num is the number of frames in the video, transitions is a list of arrays of
                length two with start/end
        """
        self.window_size = window_size
        self.root_path = root_path
        items = set()
        frame_nums = {}
        
        for video in video_list:
            frame_nums[video] = set()
            items_intrvllist = IntervalList([
                (f, f+16, 0)
                for f in range(0, int(annotations[video]['frame_num']), stride)
            ])
            transitions = IntervalList([
                (t[0], t[1], 0)
                for t in annotations[video]['transitions']
            ])
            items_w_cuts = items_intrvllist.filter_against(
                transitions.filter_length(max_length=1),
                predicate=during_inv(),
                working_window=1
            ).map(
                lambda intrvl: (
                    intrvl.start,
                    intrvl.end,
                    2
                )
            )
            items_w_transitions = items_intrvllist.filter_against(
                transitions.filter_length(min_length=2),
                predicate=during_inv(),
                working_window=1
            ).map(
                lambda intrvl: (
                    intrvl.start,
                    intrvl.end,
                    1
                )
            )
            final_items = items_intrvllist.minus(
                items_w_cuts, predicate = equal()
            ).set_union(items_w_cuts).minus(
                items_w_transitions, predicate = equal()
            ).set_union(items_w_transitions)
            for intrvl in final_items.get_intervals():
                items.add((
                    video,
                    intrvl.start,
                    intrvl.end,
                    intrvl.payload
                ))
            
            for i in range(0, int(annotations[video]['frame_num'])):
                frame_nums[video].add(i)

        self.items = sorted(list(items))
        
        self.transform = get_test_spatial_transform(Opt(128, 1))
        
        # Load frames into memory. NEED TO REWRITE THIS!
#         self.frames = {
#             video_id: {
#                 'frame_nums': sorted(list(frame_nums[video_id])),
#                 'frames': [
#                     self.transform(f)
#                     for f in Video.objects.get(id=video_id).for_scannertools().frames(
#                         sorted(list(frame_nums[video_id]))
#                     )
#                 ]
#             }
#             for video_id in tqdm(frame_nums)
#         }
    
    def __len__(self):
        return len(self.items)
    
    def __getitem__(self, idx):
        """
        Indexed by video ID, then frame number
        Returns self.window_size frames before the indexed frame to self.window_size
            frames after the indexed frame
        """
        video, start_frame, end_frame, label = self.items[idx]
        
#         start_index = self.items[video_id]['frame_nums'].index(start_frame)
#         img_tensors = self.frames[video_id]['frames'][start_index:start_index + self.window_size]
        
#         img_tensors = [
#             self.transform(f)
#             for f in Video.objects.get(id=video_id).for_scannertools().frames(
#                 list(range(frame_num - self.window_size, frame_num + self.window_size + 1))
#             )
#         ]

        imgs = []
        for i in range(start_frame, end_frame):
            img = pil_loader(os.path.join(self.root_path, video, 'image_{}.jpg'.format(i)))
            if img is not None:
                imgs.append(self.transform(img))
        if len(imgs) == 0:
            return None
                
        while len(imgs) < self.window_size:
            imgs.append(imgs[-1])
            
        return torch.stack(imgs).permute(1, 0, 2, 3), label, (video, start_frame, end_frame)
#         return label, (video_id, start_frame, end_frame)

In [191]:
deepsbdclipshot_dataset_train = DeepSBDClipShotsDataset(
    '/app/data/ClipShots/frames/train',
    train_videos[:1],
    train_gt
)

In [175]:
train_videos[:1]

['4001498009.mp4']

In [161]:
im = pil_loader('/app/data/ClipShots/frames/train/4001498009.mp4/image_1.jpg')

In [163]:
print(im)

<PIL.Image.Image image mode=RGB size=360x360 at 0x7F5B450BF630>


In [169]:
class Opt:
    def __init__(self, spatial_size, norm_value):
        self.spatial_size = spatial_size
        self.norm_value = norm_value

In [171]:
get_test_spatial_transform(Opt(128, 1))(im)

tensor([[[-114.7748, -114.7748, -114.7748,  ..., -114.7748, -114.7748,
          -114.7748],
         [-114.7748, -114.7748, -114.7748,  ..., -114.7748, -114.7748,
          -114.7748],
         [-114.7748, -114.7748, -114.7748,  ..., -114.7748, -114.7748,
          -114.7748],
         ...,
         [-113.7748, -113.7748, -113.7748,  ..., -107.7748, -107.7748,
          -107.7748],
         [-113.7748, -113.7748, -113.7748,  ..., -107.7748, -107.7748,
          -107.7748],
         [-113.7748, -113.7748, -113.7748,  ..., -107.7748, -107.7748,
          -107.7748]],

        [[-107.7354, -107.7354, -105.7354,  ..., -106.7354, -107.7354,
          -107.7354],
         [-107.7354, -107.7354, -107.7354,  ..., -107.7354, -107.7354,
          -107.7354],
         [-107.7354, -107.7354, -107.7354,  ..., -107.7354, -107.7354,
          -107.7354],
         ...,
         [-107.7354, -107.7354, -107.7354,  ..., -107.7354, -107.7354,
          -107.7354],
         [-107.7354, -107.7354, -107.735

In [172]:
transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.ToTensor(),
            transforms.Normalize((114.7748, 107.7354, 99.475), (1, 1, 1))
        ])(im)

tensor([[[-114.7748, -114.7748, -114.7748,  ..., -114.7748, -114.7748,
          -114.7748],
         [-114.7748, -114.7748, -114.7748,  ..., -114.7748, -114.7748,
          -114.7748],
         [-114.7748, -114.7748, -114.7748,  ..., -114.7748, -114.7748,
          -114.7748],
         ...,
         [-114.7709, -114.7709, -114.7709,  ..., -114.7474, -114.7474,
          -114.7474],
         [-114.7709, -114.7709, -114.7709,  ..., -114.7474, -114.7474,
          -114.7474],
         [-114.7709, -114.7709, -114.7709,  ..., -114.7474, -114.7474,
          -114.7474]],

        [[-107.7354, -107.7354, -107.7276,  ..., -107.7315, -107.7354,
          -107.7354],
         [-107.7354, -107.7354, -107.7354,  ..., -107.7354, -107.7354,
          -107.7354],
         [-107.7354, -107.7354, -107.7354,  ..., -107.7354, -107.7354,
          -107.7354],
         ...,
         [-107.7354, -107.7354, -107.7354,  ..., -107.7354, -107.7354,
          -107.7354],
         [-107.7354, -107.7354, -107.735

In [148]:
train_gt['4001498009.mp4']['frame_num']

4833.0

In [192]:
deepsbdclipshot_train_loader = DataLoader(deepsbdclipshot_dataset_train, batch_size=8, shuffle=False, num_workers=0)

In [193]:
a, b, c = test_deepsbd(deepsbd_resnet_model, deepsbdclipshot_train_loader)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 76/76 [00:50<00:00,  1.76it/s]

Precision: 0.896551724137931, Recall: 0.9122807017543859, F1: 0.9043478260869565
TP: 104.0, TN: 479.0, FP: 12.0, FN: 10.0





In [194]:
aa, bb, cc = test_deepsbd(deepsbd_alexnet_model, deepsbdclipshot_train_loader)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 76/76 [00:50<00:00,  1.81it/s]

Precision: 0.896551724137931, Recall: 0.9122807017543859, F1: 0.9043478260869565
TP: 104.0, TN: 479.0, FP: 12.0, FN: 10.0





# Notes

## Model/loss: raw output of last FC layer to BCEWithLogitsLoss

Training with perfectly balanced classes - selected 58 positive examples from training dataset and randomly selected 58 negative examples:
* Achieved 100% accuracy on train.
* On test, precision/recall at 26.7%/24.4%. Confusion matrix `TP: 139.0 TN: 52399.0 FP: 381.0 FN: 430.0`. Output of model had absolute value < 0.5.
* Saved in `2-5-19_529pm_videonet_1to1classbalance_bcewithlogitsloss.pth`.

Training with 10:1 class imbalance - 58 positive examples, 580 randomly selected negative examples:
* Achieved 100% accuracy on train.
* On test, precision/recall at 57.9%/1.9%. Confusion matrix `TP: 11.0 TN: 52772.0 FP: 8.0 FN: 558.0`. Output of model had absolute value around 5-10.
* Saved in `2-6-19_948am_videonet_10to1classbalance_bcewithlogitsloss.pth`.

Training with 2:1 class imbalance - 58 positive examples, 58 randomly selected negative examples, 58 examples from the end of shots:
* Achieved 100% accuracy on train.
* On test, precision/recall at 18.2%/3.9%. Confusion matrix `TP: 22.0 TN: 52681.0 FP: 99.0 FN: 547.0`. Output of model had absolute value < 2.
* Saved in `2-6-19_1016am_videonet_2to1classbalance_bcewithlogitsloss.pth`.

Issue: if you train on a subset of frames from the training clips, you won't do great on the full range of frames from the training clips. I.e. if you train on all the shot transitions, along with some random selected non-transition frames, you'll be able to identify all the shot transitions in your training clips, but you'll also get a bunch of false positives.

Training with a 3:1 class imbalance and 97 positive examples - plus 97 randomly selected negative examples. 97 examples from the end of shots, and 97 examples from the frame right after each shot transition.
* 100% accuracy on the training set.
* On the full set of training clips, 100% recall with 66% precision.
* On the set of training clips, hallucinating that many frames in a row are shot boundaries. Confusion matrix `TP: 97.0 TN: 7155.0 FP: 49.0 FN: 0.0`.
* Saved in `2-6-19_5pm_videonet_3to1classbalance_bcewithlogitsloss.pth`.

### Scratchpad

In [None]:
for inputs, labels in dataloader:
    inputs = [i.to(device) for i in inputs]
    labels = labels.to(device)
    outputs = vnet(inputs[0], inputs[1], inputs[2])
    print(outputs, labels)
    break

In [None]:
criterion(
    torch.tensor([
        [0.3, 0.7],
        [0.7, 0.3],
        [0.7, 0.3],
        [0.7, 0.3]
    ]),
    torch.tensor([
        1, 0, 0, 0
    ])
)

In [None]:
criterion = nn.CrossEntropyLoss(weight=torch.tensor([.01, 1.]))

In [None]:
nn.CrossEntropyLoss(weight=torch.tensor([1., .5]).to(device))(
    torch.tensor(
        [[-0.9855, 1.1573]]
    ).to(device),
    torch.tensor([1]).to(device)
)

In [None]:
criterion(
    torch.tensor([
        [0.8, 0.2],
        [0.8, 0.2],
        [0.8, 0.2],
        [0.8, 0.2]
    ]),
    torch.tensor([
        1, 0, 0, 0
    ])
)

In [None]:
#tenlayer_resnet = models.ResNet(models.resnet.BasicBlock, [1, 1, 1, 1], num_classes=128)

In [None]:
# Replace the avgpool layer with an AdaptiveAvgPool so we don't have to worry about input size
#tenlayer_resnet.avgpool = nn.AdaptiveAvgPool2d((1, 1))

In [None]:
#print(tenlayer_resnet)

In [None]:
#params = list(tenlayer_resnet.parameters())

In [None]:
#len(params)

In [None]:
#params[-1].size()

In [None]:
# Load up an image and run it through the network
vid_id = list(shots_gt.get_allintervals().keys())[0]
frame = shots_gt.get_intervallist(vid_id).get_intervals()[0].start
img = cv2.cvtColor(load_frame(Video.objects.get(id=vid_id), frame, []), cv2.COLOR_BGR2RGB)
plt.imshow(img)

In [None]:
#img_tensor = transforms.ToTensor()(img)

In [None]:
#tenlayer_resnet(img_tensor.unsqueeze(0))

In [None]:
imgs = [
   cv2.cvtColor(load_frame(Video.objects.get(id=123), f, []), cv2.COLOR_BGR2RGB)
   for f in range(14455-1, 14455+2)
]

In [None]:
plt.imshow(imgs[0])

In [None]:
plt.imshow(imgs[1])

In [None]:
plt.imshow(imgs[2])

In [None]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [None]:
img_tensors = [
    transform(npimg).unsqueeze(0).to(device)
    for npimg in imgs
]

In [None]:
img_tensors[0]

In [None]:
o = vnet(img_tensors[0], img_tensors[1], img_tensors[2])

In [None]:
o

In [None]:
model(img_tensors[0], img_tensors[1], img_tensors[2])

In [None]:
torch.max(o, 1)

In [None]:
class ShotDetectionDataset(Dataset):
    def __init__(self, shots, window_size=1, height=224):
        """Constrcutor for ShotDetectionDataset.
        
        Args:
            shots: VideoIntervalCollection of all the intervals to get frames from.
        """
        self.window_size = window_size
        frames = set()
        
        for video_id in shots.get_allintervals():
            for intrvl in shots.get_intervallist(video_id).get_intervals():
                for f in range(intrvl.start, intrvl.end + 1):
                    frames.add((video_id, f, 1 if f == intrvl.start else 0))
        self.frames = sorted(list(frames))
        
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(224),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
    
    def __len__(self):
        return len(self.frames)
    
    def __getitem__(self, idx):
        """
        Indexed by video ID, then frame number
        Returns self.window_size frames before the indexed frame to self.window_size
            frames after the indexed frame
        """
        video_id, frame_num, label = self.frames[idx]
        npimgs = [
            cv2.cvtColor(load_frame(Video.objects.get(id=video_id), f, []), cv2.COLOR_BGR2RGB)
            for f in range(frame_num-self.window_size, frame_num+self.window_size + 1)
        ]
        img_tensors = [
            self.transform(npimg)
            for npimg in imgs
        ]
        
        return img_tensors, label

In [None]:
dataset = ShotDetectionDataset(shots_gt)

In [None]:
len(dataset)

In [None]:
for i in range(len(dataset)):
    sample = dataset[i]
    
    print(i, sample)
    
    if i == 3:
        break

In [None]:
dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0)

In [None]:
for i_batch, sample_batched in enumerate(dataloader):
    sample, label = sample_batched
    print(i_batch, len(sample))
    print(sample[0].size())
    print(label)
    if i_batch == 3:
        break

In [None]:
vnet(sample_batched[0], sample_batched[1], sample_batched[2])

In [None]:
vnet.train()

In [None]:
outs = vnet(sample_batched[0], sample_batched[1], sample_batched[2])

In [None]:
torch.max(outs, 1)

In [None]:
len(list(vnet.modules()))