# Introduction

We would like to explore the grammar of film as a sequence of shots. More specifically, we will represent each shot as a set of categorical features, thus breaking the film down to a sequence of "words" where each word is a certain combination of these features. We will then explore the frequencies of ngrams formed by these words in our corpus and see if there are discriminative ngrams for each genre/director/etc.

In [None]:
from esper.prelude import *
from query.models import Video, Shot, Labeler, Face, PoseMeta
import matplotlib.pyplot as plt
import numpy as np
from django.db.models import Avg
from tqdm import tqdm
import esper.pose_wrapper as pw
from esper.shot_scale import ShotScale as ShotScaleEnum
import rekall
from rekall.video_interval_collection import VideoIntervalCollection
from rekall.interval_list import IntervalList
from rekall.merge_ops import payload_plus, payload_second
from rekall.temporal_predicates import overlaps
from esper.rekall import intrvllists_to_result_with_objects
import pickle
from collections import Counter

# Features of a shot

We will use the following features to represent each shot:

1. ShotScale: One of ExtremeLong, Long, MediumLong, Medium, MediumCloseUp, CloseUp, ExtremeCloseUp and Unknown (when the shot contains no people).
    - From the scales of the sampled frames, we ignore the unknown scales and take the mode as the scale of the shot. If all sampled frames have unknown scale, the shot has unknown scale. Note that this can be problematic for shots where the camera moves a lot and the framing changes.
2. Number of people: We will use the number of poses detected in the shot. 
    - From the number of poses detected in sampled frames, we take the max to be the number of people in the shot.
    - 5 or more people will be truncated at 5 to be consistent with James Cutting's analysis.
3. Position of people: We will use the number of poses that overlap the left, mid and right third of the screen.
    - We use the sampled frame with the max number of poses. Note that this can be problematic for a shot with a moving subject.
    - The counts will similarly be truncated at 5.

In [None]:
# `poses` is a list of poses with 18 keypoints
def count_in_region(poses, region):
    """Count the poses that have a detected keypoint inside a horizontal band.

    Args:
        poses: iterable of pose objects exposing ``pose_keypoints()``. The
            returned array is indexed ``[keypoint, column]``; column 0 is read
            as the x coordinate and a value > 0 in column 2 marks the keypoint
            as detected.
        region: ``(x_min, x_max)`` pair; both bounds are inclusive.

    Returns:
        The number of poses with at least one detected keypoint whose x
        coordinate lies inside ``region``.
    """
    def touches_region(keypoints):
        # Undetected keypoints carry no usable position, so drop them first.
        detected_xs = keypoints[keypoints[:, 2] > 0, 0]
        inside = (detected_xs >= region[0]) & (detected_xs <= region[1])
        return np.any(inside)

    return sum(1 for pose in poses if touches_region(pose.pose_keypoints()))

def truncate(val, maxval):
    """Cap ``val`` at ``maxval``.

    Returns ``val`` when it is strictly smaller than ``maxval``, otherwise
    ``maxval``.
    """
    if val < maxval:
        return val
    return maxval

# Find the scale for shot from scales of sampled frames
def scale_for_shot(scales):
    """Return the most frequent known scale among the sampled frames of a shot.

    ``ShotScaleEnum.UNKNOWN`` entries are ignored. If nothing else remains the
    shot is ``UNKNOWN``. When several scales share the highest count, the one
    that comes last when iterating ``ShotScaleEnum`` wins.
    """
    known = [s for s in scales if s != ShotScaleEnum.UNKNOWN]
    if not known:
        return ShotScaleEnum.UNKNOWN

    # One zero-initialised slot per enum member, in enum iteration order.
    tally = dict.fromkeys(ShotScaleEnum, 0)
    for s in known:
        tally[s] += 1

    winner, winner_count = ShotScaleEnum.UNKNOWN, 0
    for candidate, count in tally.items():
        # `>=` lets a later member take over on a tie.
        if count >= winner_count:
            winner, winner_count = candidate, count
    return winner

# Find the poses for shot from pose_metas in sampled frames
def poses_for_shot(pose_metas_for_frames):
    """Load the poses of the sampled frame that has the most pose metas.

    ``pose_metas_for_frames`` holds one list of pose metas per sampled frame.
    The longest list is handed to ``pw.get``; on equal lengths ``max`` keeps
    the first one.
    """
    return pw.get(max(pose_metas_for_frames, key=len))

class ShotFeatures():
    """Categorical description of a shot: scale, people count, screen thirds.

    Attributes set by ``__init__``:
        scale: the shot scale passed in.
        n_people: number of poses, capped at ``MAX_COUNT``.
        counts: 3-tuple with the number of poses touching the left, middle
            and right third of the screen, each capped at ``MAX_COUNT``.
        pose_ids: ``id`` of every pose, in input order.
    """

    # Counts at or above this value are reported as this value.
    MAX_COUNT = 5
    # Inclusive x ranges of the left, middle and right third of the screen.
    REGIONS = [(0,1/3),(1/3,2/3),(2/3,1)]

    def __init__(self, scale, poses):
        self.scale = scale
        self.n_people = truncate(len(poses), ShotFeatures.MAX_COUNT)
        per_region = (count_in_region(poses, region) for region in ShotFeatures.REGIONS)
        self.counts = tuple(truncate(hits, ShotFeatures.MAX_COUNT) for hits in per_region)
        self.pose_ids = [pose.id for pose in poses]

    def __str__(self):
        # Class followed by a dump of all instance attributes.
        return "{}: {}".format(self.__class__, self.__dict__)

# Get ShotFeatures with Rekall

In [None]:
def get_shots_with_features_for_vid(vid):
    """Build the per-shot ``ShotFeatures`` intervals for one video.

    Steps:
      1. Load the shots produced by the ``shot-hsvhist-face`` labeler.
      2. Load the shot scale of every 12th frame and attach to each shot the
         scale chosen by ``scale_for_shot``.
      3. Load all ``PoseMeta`` rows, group them per frame, and attach to each
         shot the poses chosen by ``poses_for_shot``.
      4. Merge both results and wrap each payload in ``ShotFeatures``.

    Args:
        vid: id of the video to process.

    Returns:
        The collection produced by step 4; every interval payload is a
        ``ShotFeatures`` instance.
    """
    shots_qs = Shot.objects.filter(
        video__id=vid,
        labeler=Labeler.objects.get(name='shot-hsvhist-face')
    ).all()
    total = shots_qs.count()
    print("Total shots:", total)
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs,
        with_payload=lambda row:[],
        progress=True,
        total=total
    )

    # Take the scale of every 12th frame. Frames with unknown scale are still
    # included here; scale_for_shot drops them later.
    frames_qs = Frame.objects.filter(video__id=vid).annotate(
        numbermod=F('number')%12).filter(numbermod=0).annotate(scale=F("shot_scale__name"))
    total = frames_qs.count()
    print("Total frames with scale:", total)
    shot_scales = VideoIntervalCollection.from_django_qs(
        frames_qs,
        schema={
            "start": "number",
            "end": "number",
        },
        with_payload=lambda f: [ShotScaleEnum[f.scale.upper()]],
        progress=True, total=total)

    # Take all poses; each pose becomes a single-frame interval, and coalesce
    # concatenates the payloads of intervals that fall on the same frame.
    poses_qs = PoseMeta.objects.filter(frame__video__id=vid).annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id')
    )
    total = poses_qs.count()
    print("Total Poses:", total)
    poses = VideoIntervalCollection.from_django_qs(
        poses_qs,
        with_payload=lambda row: [row],
        progress=True,
        total=total
    ).coalesce(payload_merge_op=payload_plus)

    print("Merging scales into shots")
    # Merge scales into shots
    shots_with_scale = shots.merge(
        shot_scales,
        payload_merge_op = payload_second,
        predicate=overlaps(),
        working_window=1
    ).coalesce(
        payload_merge_op=payload_plus
    ).map(
        lambda shot_interval: (shot_interval.get_start(), shot_interval.get_end(),
                              {"scale": scale_for_shot(shot_interval.get_payload())})
    )

    print("Merging poses into shots")
    # Merge poses into shots. Each per-frame pose list is wrapped in another
    # list so that payload_plus yields a list of lists (one per sampled frame).
    shots_with_poses = shots.merge(
        poses.map(lambda pose_interval: (pose_interval.get_start(), pose_interval.get_end(), [pose_interval.get_payload()])),
        payload_merge_op = payload_second,
        predicate=overlaps(),
        working_window=1
    ).coalesce(
        # Get a list of list of poses for each shot
        payload_merge_op = payload_plus
    ).map(lambda shot_interval: (shot_interval.get_start(), shot_interval.get_end(),
                               {"poses": poses_for_shot(shot_interval.get_payload())}))

    print("Computing shot features")
    # Get shots with shot features
    # NOTE(review): a shot presumably needs at least one overlapping scale
    # interval and one overlapping pose interval to survive these merges;
    # confirm against rekall's merge semantics.
    shots_with_features = shots_with_scale.merge(
        shots_with_poses,
        payload_merge_op = lambda d1, d2: {**d1,**d2},
        predicate=overlaps(),
        working_window=1
    ).coalesce().map(
        lambda intv: (intv.get_start(), intv.get_end(), ShotFeatures(intv.get_payload()["scale"], intv.get_payload()["poses"])))
    # Return the ShotFeatures collection. The original returned
    # shots_with_poses, which threw the computed features away.
    return shots_with_features

We run `get_shots_with_features_for_vid` on all videos and save the interval lists to `../data/shot_features/{vid:03d}_intervallist.p` (the video id is zero-padded to three digits). See `/app/esper/shot_features.py` for the script.

# Exploring ShotFeatures

In [None]:
def shot_features_to_string(f):
    """Encode a shot's features as one "word".

    The word is ``<scale name>-<n_people>-<left>-<middle>-<right>``, where the
    last three fields are ``f.counts[0]``, ``f.counts[1]`` and ``f.counts[2]``.
    """
    fields = (f.scale.name, f.n_people, f.counts[0], f.counts[1], f.counts[2])
    return "-".join(format(field) for field in fields)

# Returns a dict of document name to list of words
def get_documents(vids):
    """Load the pickled shot features of each video as a list of words.

    Args:
        vids: iterable of integer video ids. For each id the file
            ``../data/shot_features/<id zero-padded to 3 digits>_intervallist.p``
            is read.

    Returns:
        Dict mapping video id to the list of words (see
        ``shot_features_to_string``), one word per interval, in the order
        returned by ``get_intervals()``.
    """
    docs = {}
    for vid in vids:
        path = "../data/shot_features/{0:03d}_intervallist.p".format(vid)
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by our own feature-extraction script.
        # The context manager closes the handle even if unpickling fails.
        with open(path, "rb") as fh:
            shots = pickle.load(fh)
        docs[vid] = [shot_features_to_string(intvl.get_payload()) for intvl in shots.get_intervals()]
    return docs

# Returns a list of ngrams in document
def expand_ngram(doc, ngram):
    """Return an iterator over the consecutive ``ngram``-tuples of ``doc``.

    The document is zipped with copies of itself shifted by 0..ngram-1, so
    each item is a tuple of ``ngram`` neighbouring words. A document shorter
    than ``ngram`` yields nothing.
    """
    shifted = (doc[offset:] for offset in range(ngram))
    return zip(*shifted)

# Returns a set of words
def get_vocabulary(docs, ngram=1):
    """Return the set of distinct ``ngram``-tuples occurring in any document.

    ``docs`` maps a document name to its list of words.
    """
    return {gram for doc in docs.values() for gram in expand_ngram(doc, ngram)}

# Compute the frequencies of all ngrams, or just the ngrams in vocabulary.
def get_all_frequencies(docs, vocab=None, ngram=1):
    """Count how often each ``ngram``-tuple occurs across all documents.

    Args:
        docs: dict mapping a document name to its list of words.
        vocab: optional container of ngrams; when given, only ngrams found in
            it are counted. ``None`` counts everything.
        ngram: length of the ngrams.

    Returns:
        A ``Counter`` keyed by ngram tuple.
    """
    frequencies = Counter()
    for doc in docs.values():
        for gram in expand_ngram(doc, ngram):
            if vocab is None or gram in vocab:
                frequencies[gram] += 1
    return frequencies

In [None]:
# Corpus: the first 304 videos (by id) that decoded cleanly.
# Movie 344 has bad pose data, so it is excluded.
all_videos = (
    Video.objects
    .filter(decode_errors=False)
    .exclude(id=344)
    .order_by('id')
    .all()[:304]
)
vids = [video.id for video in all_videos]
docs = get_documents(vids)

## Over all movies

We plot the most frequent unigrams, bigrams, trigrams and four-grams in our overall dataset.

In [None]:
def plot_histograms(docs, ngram, n):
    """Plot the ``n`` most frequent ``ngram``-grams as a horizontal bar chart.

    Bars are raw counts; the x axis is labelled with each count's share of all
    ngrams in ``docs``.

    Args:
        docs: dict mapping a document name to its list of words.
        ngram: length of the ngrams to count.
        n: maximum number of bars. Fewer are drawn when the corpus has fewer
            than ``n`` distinct ngrams.

    Returns:
        ``(vocabulary, counts)``: the set of all ngrams and the ``Counter`` of
        their frequencies.
    """
    # Local import: only this function needs the formatter.
    from matplotlib.ticker import FuncFormatter

    # A document shorter than `ngram` contributes zero ngrams, not a negative
    # number of them.
    total = sum(max(len(doc) - ngram + 1, 0) for doc in docs.values())
    v = get_vocabulary(docs, ngram)
    c = get_all_frequencies(docs, v, ngram)

    data = c.most_common(n)
    labels = [str(gram) for gram, _ in data]
    counts = np.array([count for _, count in data])
    # most_common may return fewer than n entries; size the axis to the data
    # so barh does not get mismatched shapes.
    positions = np.arange(len(data))

    ax = plt.gca()
    ax.set_title("Top {0}-grams by frequency".format(ngram))
    ax.barh(positions, counts)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    if total > 0:
        # A formatter keeps labels attached to whatever ticks matplotlib
        # finally picks, unlike set_xticklabels on non-fixed ticks.
        ax.xaxis.set_major_formatter(
            FuncFormatter(lambda x, _pos: '{:,.2%}'.format(x / total)))
    ax.invert_yaxis()
    plt.show()
    return v, c
    
# Plot the top 15 unigrams and the top 10 bi-, tri- and four-grams of the
# whole corpus, keeping each vocabulary and frequency counter for later cells.
unigrams_v, unigrams_count = plot_histograms(docs, 1, 15)
bigrams_v, bigrams_count = plot_histograms(docs, 2, 10)
trigrams_v, trigrams_count = plot_histograms(docs, 3, 10)
fourgrams_v, fourgrams_count = plot_histograms(docs, 4, 10)


## By Genre

We now look at ngrams with top frequencies within each genre.

In [None]:
# Collect the names of all genres
genres = [g.name for g in Genre.objects.all()]
# Get a map from genre to a list of videos in that genre.
# `all_videos` was sliced ([:304]) and Django does not allow filtering a
# sliced queryset, so restrict a fresh query to the selected ids instead.
genre_to_vids = {}
for g in genres:
    genre_to_vids[g] = list(
        Video.objects.filter(id__in=vids, genres__name=g)
        .order_by('id')
        .values_list('id', flat=True)
    )
# Display a selection
import ipywidgets as widgets
from IPython.display import display
w=widgets.Dropdown(options=genres, value=genres[0], description="Select Genre:", disabled=False)
display(w)

In [None]:
# Restrict the corpus to the genre picked in the dropdown and redo the plots.
vids_in_genre = genre_to_vids[w.value]
print("Number of movies: ", len(vids_in_genre))
docs_in_genre = {vid: docs[vid] for vid in vids_in_genre}
unigrams_v, unigrams_count = plot_histograms(docs_in_genre, 1, 15)
bigrams_v, bigrams_count = plot_histograms(docs_in_genre, 2, 10)
trigrams_v, trigrams_count = plot_histograms(docs_in_genre, 3, 10)
fourgrams_v, fourgrams_count = plot_histograms(docs_in_genre, 4, 10)

In [None]:
# Find the shot indices for the starting shot of the matching n-grams.
def find_indices(ngram_to_find, doc):
    """Return the start positions in ``doc`` where ``ngram_to_find`` occurs.

    ``ngram_to_find`` is compared with ``==`` against the tuples yielded by
    ``expand_ngram``, so it must itself be a tuple of words.
    """
    starts = []
    for position, gram in enumerate(expand_ngram(doc, len(ngram_to_find))):
        if gram == ngram_to_find:
            starts.append(position)
    return starts

def load_shots_for_ngram(ngram_to_find, docs):
    """Collect, per video, the shots that start an occurrence of an ngram.

    Args:
        ngram_to_find: tuple of words to search for.
        docs: dict mapping video id to that video's list of words.

    Returns:
        Dict mapping video id to an ``IntervalList`` of the matching starting
        shots. Videos without a match are omitted.
    """
    results = {}
    for vid, doc in docs.items():
        wanted = set(find_indices(ngram_to_find, doc))
        if not wanted:
            continue
        path = "../data/shot_features/{0:03d}_intervallist.p".format(vid)
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by our own feature-extraction script.
        # The context manager closes the handle even if unpickling fails.
        with open(path, "rb") as fh:
            shots = pickle.load(fh)
        results[vid] = IntervalList(
            [intvl for i, intvl in enumerate(shots.get_intervals()) if i in wanted])
    return results

def display_ngrams_in_widget(ngram_to_find, docs):
    """Return an esper widget showing the shots that start ``ngram_to_find``.

    No objects are drawn on the shots (``payload_to_objs`` always returns an
    empty list).
    """
    matching_shots = load_shots_for_ngram(ngram_to_find, docs)
    result = intrvllists_to_result_with_objects(
        matching_shots, payload_to_objs=lambda p,v:[])
    return esper_widget(result, crop_bboxes=False, show_middle_frame=False)

In [None]:
# Show every shot of the selected genre whose word is "UNKNOWN-0-0-0-0".
# The trailing comma makes this a 1-tuple, i.e. a unigram.
ngram_to_find = ("UNKNOWN-0-0-0-0",)
display_ngrams_in_widget(ngram_to_find, docs_in_genre)

# Scratchpad

In [None]:
# Load the poses for every PoseMeta row of video 216.
poses=pw.get(PoseMeta.objects.filter(frame__video__id=216))

In [None]:
# Smoke-test ShotFeatures on the first two poses of video 216.
print(poses[1].pose_keypoints())
# ShotFeatures expects pose objects (it calls .pose_keypoints() and reads .id
# on each one), so pass the poses themselves rather than their keypoint arrays.
f=ShotFeatures(ShotScaleEnum.CLOSE_UP, [poses[0], poses[1]])
print(f)

In [None]:
# Sanity check that distinct scale members compare as unequal.
ShotScaleEnum.UNKNOWN != ShotScaleEnum.CLOSE_UP

In [None]:
# Check how Counter.most_common(1) behaves when two values tie for first place.
from collections import Counter
c=Counter([1,1,1,2,2,2,3])
c.most_common(1)

In [None]:
# After dropping UNKNOWN this input is a three-way tie (two each of CLOSE_UP,
# MEDIUM and EXTREME_CLOSE_UP), so it exercises scale_for_shot's tie-breaking.
scale_for_shot([ShotScaleEnum.UNKNOWN, ShotScaleEnum.UNKNOWN, ShotScaleEnum.CLOSE_UP, ShotScaleEnum.MEDIUM, ShotScaleEnum.MEDIUM, ShotScaleEnum.CLOSE_UP, ShotScaleEnum.EXTREME_CLOSE_UP, ShotScaleEnum.EXTREME_CLOSE_UP])

In [None]:
# Number of PoseMeta rows for video 216.
metas=PoseMeta.objects.filter(frame__video__id=216)
metas.count()

In [None]:
# Inspect the second interval of the first video's interval list.
# NOTE(review): `shots` and `VIDS` are not defined in the visible cells; this
# presumably relies on state from an earlier session. Confirm before re-running.
shots.get_intervallist(VIDS[0]).get_intervals()[1]

In [None]:
# Show all shots with the poses stored under each payload's 'poses' key drawn on top.
# NOTE(review): `shots_with_poses` is not defined at notebook level in the
# visible cells; it is only a local of get_shots_with_features_for_vid.
esper_widget(intrvllists_to_result_with_objects(shots_with_poses.get_allintervals(), payload_to_objs=lambda p,v:[esper.stdlib.pose_to_dict(pose) for pose in p['poses']]),
             crop_bboxes=False, show_middle_frame=False)

In [None]:
import pickle
# Save `shots` to "<first video id>.p". The context manager guarantees the
# file is flushed and closed; a bare open() inside dump() leaves that to the GC.
# NOTE(review): `shots` and `VIDS` are not defined in the visible cells.
with open("{0}.p".format(VIDS[0]), "wb") as fh:
    pickle.dump(shots, fh)

In [None]:
# Reload the pickled interval list of video 216 and browse it in the widget.
# The context manager closes the file handle once loading is done.
with open('shot_features/216.p', "rb") as fh:
    test = pickle.load(fh)
esper_widget(intrvllists_to_result_with_objects({216:test}, payload_to_objs=lambda p,v:[]),
             crop_bboxes=False, show_middle_frame=False)

In [None]:
# Presumably imported so pickle can resolve the ShotFeatures class stored in
# the file; verify against the script that wrote it.
from esper.shot_features import ShotFeatures
# Load the saved interval list of video 216 (handle closed by the context
# manager) and browse it in the widget.
with open("../data/shot_features/216_intervallist.p", "rb") as fh:
    test = pickle.load(fh)
esper_widget(intrvllists_to_result_with_objects({216:test}, payload_to_objs=lambda p,v:[]),
             crop_bboxes=False, show_middle_frame=False)

In [None]:
# Keep only the intervals that start at frame 6849.
test.filter(lambda i:i.get_start()==6849)

In [None]:
# Rebuild the corpus from every cleanly decoded video (no 304-video cap) and
# print summary statistics. Note that this rebinds the notebook-level
# `all_videos`, `vids` and `docs`.
all_videos = Video.objects.filter(decode_errors=False).order_by('id').all()
vids = [v.id for v in all_videos]
# Drop video 344; list.remove raises ValueError if the id is not present.
vids.remove(344)
docs = get_documents(vids)
# Number of distinct bigrams.
v=get_vocabulary(docs, ngram=2)
print(len(v))
# 20 most frequent bigrams, then 5 most frequent unigrams.
c=get_all_frequencies(docs, ngram=2)
print(c.most_common(20))
c=get_all_frequencies(docs, ngram=1)
print(c.most_common(5))

In [None]:
# Look up video 344, the one excluded from the corpus.
Video.objects.filter(pk=344)

In [None]:
# First PoseMeta row on frame 1164 of video 344.
pm=PoseMeta.objects.filter(frame__video_id=344, frame__number=1164)[0]

In [None]:
# Fetch two specific PoseMeta rows by id and show the frame id of the first.
pm1=PoseMeta.objects.filter(id=9562306)[0]
pm2=PoseMeta.objects.filter(id=9590175)[0]
pm1.frame.id

In [None]:
# Frame id of the second PoseMeta row, for comparison with pm1's.
pm2.frame.id

In [None]:
# Query the pose wrapper's private _POSE_DATA store for the same two PoseMeta
# ids that were fetched as pm1 and pm2.
pw._POSE_DATA.get([9562306, 9590175])

In [None]:
# Total number of FaceGender rows.
FaceGender.objects.all().count()

In [None]:
# Total number of Face rows.
Face.objects.all().count()

In [None]:
# List every Genre row.
list(Genre.objects.all())

In [None]:
# Total number of frames, then the frame number modulo 12 of one arbitrary
# frame (index 123) whose scale is not "unknown".
# NOTE(review): presumably a spot check that scales are only labelled on
# every 12th frame; confirm the intent.
print(Frame.objects.count())
Frame.objects.exclude(shot_scale__name="unknown")[123].number % 12

In [None]:
# Largest share of all counted unigrams held by a single unigram whose scale
# is UNKNOWN. Vocabulary entries are 1-tuples, hence the [0].
unks = [v for v in unigrams_v if v[0].startswith("UNKNOWN")]
# Sum the counts directly instead of materialising every element in a list.
cc = sum(unigrams_count.values())
# default=0.0 keeps the cell from raising when no UNKNOWN unigram exists.
max((unigrams_count[unk]/cc for unk in unks), default=0.0)