<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Supercuts" data-toc-modified-id="Supercuts-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Supercuts</a></span></li></ul></div>

# Supercuts

Our goal is to get intervals for a short supercut video of a certain person (e.g., Anderson Cooper) saying a funny sentence, like:

```
P = a person in the dataset
sentence = "Intel is great because they fund us."
```

We'll use `rekall` to get the candidate intervals and the caption index to get caption intervals. Make sure the caption index and `rekall` are installed in your Esper instance before running this notebook. If they aren't, the imports will fail.

Strategy:
1. Get all intervals where person P is on screen
2. For each word W in sentence, create list of intervals for W and intersect with person P intervals
3. Get all intervals where there is exactly one face on screen
4. For each word W in sentence, intersect P + W intervals with one face intervals to get P + W alone intervals
5. Pick one element from each P + W alone interval list

In [None]:
# import rekall
from esper.rekall import *
from rekall.interval_list import Interval, IntervalList
from rekall.temporal_predicates import *
from rekall.spatial_predicates import *

# import caption search
from esper.captions import *

# import face identities for person search
from query.models import Face, FaceIdentity

# import esper widget for debugging
from esper.prelude import esper_widget

import random
import os
from multiprocessing import Pool

In [None]:
# Set these parameters for the notebook.
# person_name is lower-cased before the FaceIdentity name lookup in step 1,
# so the capitalization used here does not matter.
person_name = "Anderson Cooper"
# sentence is split on whitespace and upper-cased in step 2; punctuation is
# not stripped, so leave it out (e.g. no trailing period).
sentence = "Intel is great because they fund us"

In [None]:
# 1. Get all intervals of person P
# High-confidence (probability > 0.99) identity labels for the person, each
# annotated with the video, shot id and frame bounds of the shot its face is in.
person_face_identities = (
    FaceIdentity.objects
    .filter(identity__name=person_name.lower())
    .filter(probability__gt=0.99)
    .annotate(video_id=F("face__shot__video_id"))
    .annotate(shot_id=F("face__shot_id"))
    .annotate(min_frame=F("face__shot__min_frame"))
    .annotate(max_frame=F("face__shot__max_frame"))
)

# Each interval spans the whole shot and carries the shot id as its payload;
# step 3 reads that payload back to find the relevant shots.
# NOTE(review): qs_to_intrvllists presumably groups rows by the annotated
# video_id -- confirm against esper.rekall.
person_intrvllists = qs_to_intrvllists(
    person_face_identities,
    schema={
        'start': 'min_frame',
        'end': 'max_frame',
        'payload': 'shot_id',
    },
)
print("Got all occurrences of {}".format(person_name))

In [None]:
# helper function for 2. to convert caption search to dict mapping from video ID to IntervalList
def caption_to_intrvllists(search_term, dilation=0, video_ids=None):
    """Run a caption-index search for one term and group the hits by video.

    Args:
        search_term: the term to look up; passed to `topic_search` as a
            one-element list.
        dilation: forwarded unchanged as the second argument of
            `topic_search`.
        video_ids: optional collection of video IDs to restrict the results
            to. When None, every `Video` row is eligible.

    Returns:
        dict mapping video ID -> IntervalList of (start_frame, end_frame, 0)
        tuples, one per search location. Location start/end values are
        multiplied by the video's fps and truncated to int.
    """
    results = topic_search([search_term], dilation)

    # Identity test: `== None` would invoke the argument's __eq__, which is
    # wrong for array-like inputs.
    if video_ids is None:
        videos = {v.id: v for v in Video.objects.all()}
    else:
        videos = {v.id: v for v in Video.objects.filter(id__in=video_ids).all()}

    def to_frame(video_id, t):
        return int(t * videos[video_id].fps)

    # Collect raw (start, end, payload) tuples per video, skipping documents
    # whose video is not in the eligible set.
    tuples_by_video = {}
    for doc in results.documents:
        if doc.id not in videos:
            continue
        for loc in doc.locations:
            tuples_by_video.setdefault(doc.id, []).append(
                (to_frame(doc.id, loc.start), to_frame(doc.id, loc.end), 0))

    segments_by_video = {
        video_id: IntervalList(tuples)
        for video_id, tuples in tuples_by_video.items()
    }

    print("Got all occurrences of the word {} by searching".format(search_term))

    return segments_by_video

def search_terms_lists(video_id):
    """Scan one video's caption document for the current search terms.

    Reads the module-level SEARCH_TERM_IDS (set by
    scan_for_search_terms_intrvllist) and walks every token interval of the
    document mapped to `video_id`.

    Args:
        video_id: ID looked up in VIDEO_ID_TO_DOCUMENT_ID.

    Returns:
        A list with one entry per ID in SEARCH_TERM_IDS, in the same order.
        Each entry is a list of (interval.start, interval.end, 0) tuples for
        the intervals containing that token. All entries are empty when the
        video has no caption document.
    """
    valid_intervals = [[] for _ in SEARCH_TERM_IDS]

    doc_id = VIDEO_ID_TO_DOCUMENT_ID.get(video_id, None)
    if doc_id is None:
        return valid_intervals

    # Build the token -> slot mapping once instead of doing `in` + `.index()`
    # (two linear scans) for every token in the document. setdefault keeps the
    # first slot for a repeated ID, matching list.index().
    slot_by_token = {}
    for slot, term_id in enumerate(SEARCH_TERM_IDS):
        slot_by_token.setdefault(term_id, slot)

    for interval in DOCUMENT_DATA.token_intervals(doc_id, 0, DOCUMENTS[doc_id].duration):
        for token in interval.tokens:
            slot = slot_by_token.get(token)
            if slot is not None:
                valid_intervals[slot].append((interval.start, interval.end, 0))
    return valid_intervals

# scans for search terms across videos in parallel
def scan_for_search_terms_intrvllist(search_terms, video_ids, dilation=0):
    """Find every occurrence of `search_terms` by scanning caption documents.

    Args:
        search_terms: list of tokens; each must be a key of LEXICON.
        video_ids: list of video IDs whose documents are scanned.
        dilation: amount subtracted from each interval start and added to
            each interval end before conversion to frames (same unit as the
            caption interval times). The dilated start is clamped at 0.

    Returns:
        A list parallel to `search_terms`. Entry i is a dict mapping
        video ID -> IntervalList (in frames) of the occurrences of
        search_terms[i]; videos with no occurrence are omitted.

    Side effects:
        Sets the module-level SEARCH_TERM_IDS read by search_terms_lists.
        NOTE(review): the pool workers see that global only if they inherit
        the parent's state (fork-style start) -- confirm for your platform.
    """
    global SEARCH_TERM_IDS
    SEARCH_TERM_IDS = [LEXICON[term].id for term in search_terms]

    # Nothing to look for: skip spinning up a process pool and scanning every
    # document just to build an empty result.
    if not SEARCH_TERM_IDS:
        print("Got all occurrences of the words {} by scanning".format(search_terms))
        return []

    with Pool(os.cpu_count()) as pool:
        results = pool.map(search_terms_lists, video_ids)

    search_terms_intrvllists = [{} for _ in search_terms]
    videos = {v.id: v for v in Video.objects.filter(id__in=video_ids).all()}

    def convert_time(k, t):
        return int(t * videos[k].fps)

    for video_id, result in zip(video_ids, results):
        for i, term_result in enumerate(result):
            interval_list = IntervalList([
                # Clamp so a dilation larger than the start time cannot
                # produce a negative frame number.
                (convert_time(video_id, max(0, start - dilation)),
                 convert_time(video_id, end + dilation),
                 payload)
                for start, end, payload in term_result
            ])
            if interval_list.size() > 0:
                search_terms_intrvllists[i][video_id] = interval_list

    print("Got all occurrences of the words {} by scanning".format(search_terms))

    return search_terms_intrvllists

In [None]:
# 2. for each word W in sentence, intersect list of intervals W with list of intervals of P alone

# Tokens that caption_util.frequent_words reports for LEXICON at the 99.997
# setting. These are handled by scanning documents instead of by index search.
EXTREMELY_FREQUENT_WORDS = {
    w.token for w in caption_util.frequent_words(LEXICON, 99.997)
}

# Upper-case every word, then split the distinct words into those to scan for
# and those to look up through the caption index.
words = [word.upper() for word in sentence.split()]
words_to_scan = list({w for w in words if w in EXTREMELY_FREQUENT_WORDS})
words_to_search_by_index = list({w for w in words if w not in EXTREMELY_FREQUENT_WORDS})

# Only videos in which the person appears are worth searching.
video_ids = list(person_intrvllists.keys())

scanned_words = scan_for_search_terms_intrvllist(words_to_scan, video_ids)
searched_words = [
    caption_to_intrvllists(w, video_ids=video_ids)
    for w in words_to_search_by_index
]

# Map each distinct word to its {video id: IntervalList} result, then lay the
# results out in sentence order (a repeated word reuses the same result).
intrvllists_by_word = dict(zip(words_to_scan, scanned_words))
intrvllists_by_word.update(zip(words_to_search_by_index, searched_words))
sentence_intrvllists = [intrvllists_by_word[w] for w in words]

# One entry per word of the sentence, in order. Each entry is a dict mapping
# video id -> IntervalList of the overlaps between the person's shots and the
# word's occurrences in that video (videos with no overlap are left out).
person_with_sentence_intrvllists = []
for spoken_word, word_intrvllists in zip(words, sentence_intrvllists):
    person_with_word_intrvllists = {}
    for video, person_list in person_intrvllists.items():
        if video not in word_intrvllists:
            continue
        intersection_list = person_list.overlaps(word_intrvllists[video])
        if intersection_list.size() > 0:
            person_with_word_intrvllists[video] = intersection_list
    if not person_with_word_intrvllists:
        print("Could not find instance of person {} with word {}".format(person_name, spoken_word))
    person_with_sentence_intrvllists.append(person_with_word_intrvllists)

In [None]:
# 3. Get all intervals where there is exactly one face on screen

# The payload of every person+word interval is read back as a shot id; only
# those shots need their faces inspected.
relevant_shots = set()
for word_hits in person_with_sentence_intrvllists:
    for hit_list in word_hits.values():
        relevant_shots.update(hit.get_payload() for hit in hit_list.get_intervals())

faces = (
    Face.objects.filter(shot__in=list(relevant_shots))
    .annotate(video_id=F('shot__video_id'))
    .annotate(min_frame=F('shot__min_frame'))
    .annotate(max_frame=F('shot__max_frame'))
)

# face_vids[video id][shot id] holds the shot's frame bounds and the bounding
# box of every face found in that shot.
face_vids = {}
for face in faces:
    shots_in_video = face_vids.setdefault(face.video_id, {})
    if face.shot_id not in shots_in_video:
        shots_in_video[face.shot_id] = {
            'min_frame': face.min_frame,
            'max_frame': face.max_frame,
            'objects': [],
        }
    shots_in_video[face.shot_id]['objects'].append({
        'x1': face.bbox_x1,
        'y1': face.bbox_y1,
        'x2': face.bbox_x2,
        'y2': face.bbox_y2,
    })

# One interval per shot with its faces as a bbox_list payload, filtered with
# exactly(1).
# NOTE(review): exactly(1) presumably keeps intervals whose bbox list has
# exactly one object -- confirm against the rekall predicate.
oneface_intrvllists = {}
for video, shots_in_video in face_vids.items():
    shot_tuples = [
        (shot['min_frame'],
         shot['max_frame'],
         {'type': 'bbox_list', 'objects': shot['objects']})
        for shot in shots_in_video.values()
    ]
    oneface_intrvllist = IntervalList(shot_tuples).filter(exactly(1))
    if oneface_intrvllist.size() > 0:
        oneface_intrvllists[video] = oneface_intrvllist

print("Got all intervals where there is exactly one face on screen")

In [None]:
# 4. For each word W in sentence, intersect P with word intervals with one face intervals
# Result: one dict per word, in sentence order, mapping video id -> IntervalList
# of person+word intervals that overlap a one-face interval in that video.
person_with_sentence_alone_intrvllists = []
for word_hits in person_with_sentence_intrvllists:
    alone_by_video = {}
    for video, hit_list in word_hits.items():
        if video not in oneface_intrvllists:
            continue
        alone_list = hit_list.overlaps(oneface_intrvllists[video])
        if alone_list.size() > 0:
            alone_by_video[video] = alone_list
    person_with_sentence_alone_intrvllists.append(alone_by_video)

In [None]:
# 5. Pick one element from each P + W alone interval list
# supercut_intervals_all has one entry per word of the sentence, in order.
# Each entry lists every candidate (video id, start frame, end frame) for it.
supercut_intervals_all = []
for spoken_word, alone_by_video in zip(words, person_with_sentence_alone_intrvllists):
    candidates = [
        (video, interval.get_start(), interval.get_end())
        for video, intrvllist in alone_by_video.items()
        for interval in intrvllist.get_intervals()
    ]
    supercut_intervals_all.append(candidates)
    if not candidates:
        print("Could not find interval of person {} alone saying {}".format(person_name, spoken_word))

# Select one candidate at random for each word. Words with no candidate were
# reported above and are skipped here: random.choice raises IndexError on an
# empty sequence, which would otherwise abort the cell right after the warning.
supercut_intervals = [
    random.choice(candidates)
    for candidates in supercut_intervals_all
    if candidates
]
print("Supercut intervals: ", supercut_intervals)

In [None]:
# Display the supercut intervals in Esper widget for debugging
# Group the chosen intervals by video. Assigning a fresh one-element
# IntervalList per tuple would overwrite earlier picks whenever two words were
# chosen from the same video.
supercut_tuples_by_video = {}
supercut_video_order = []  # each video once, in order of first appearance
for video, start, end in supercut_intervals:
    if video not in supercut_tuples_by_video:
        supercut_tuples_by_video[video] = []
        supercut_video_order.append(video)
    supercut_tuples_by_video[video].append((start, end, 0))

supercut_intrvllists = {
    vid: IntervalList(tuples)
    for vid, tuples in supercut_tuples_by_video.items()
}
esper_widget(intrvllists_to_result(supercut_intrvllists,
                                   video_order=supercut_video_order))