# Internet Archive TV news analysis <a class="tocSkip">
This document contains the code and corresponding visualizations/statistics for answering various questions about the TV news dataset.

All times shown are H*:MM:SS.

In [2]:
%matplotlib inline
from query.datasets.prelude import *
import query.datasets.tvnews.queries as queries
from pyspark.sql import SparkSession, Row
import IPython
import shutil

# Labeler rows looked up once by name and reused by the queries below:
# 'rudecarnie' is used as the FaceGender labeler and 'mtcnn' as the Face labeler.
rudecarnie = Labeler.objects.get(name='rudecarnie')
mtcnn = Labeler.objects.get(name='mtcnn')

def format_time(seconds):
    """Format a duration in seconds as 'H:MM:SS'.

    The hour field is neither zero-padded nor capped (90000 -> '25:00:00').

    Args:
        seconds: Duration in seconds. Floats are truncated to whole seconds.

    Returns:
        The formatted string, e.g. '1:01:01' for 3661.
    """
    # divmod keeps every field an int; plain `/` produces floats under true
    # division, which the `:02d` format spec refuses to render.
    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return '{}:{:02d}:{:02d}'.format(hours, minutes, secs)

def show_df(table, ordering, clear=True):
    """Return `table` as a pandas DataFrame restricted to the `ordering` columns.

    Args:
        table: Anything `pd.DataFrame` accepts (here: a list of row dicts).
        ordering: Column names to keep, in display order.
        clear: When true, call IPython.display.clear_output() first.
    """
    if clear:
        IPython.display.clear_output()
    frame = pd.DataFrame(table)
    return frame[ordering]

# Get (or create) a Spark session bound to the master at spark://spark:7077 and
# keep a handle on its SparkContext for RDD-level helpers.
spark = SparkSession.builder.master("spark://spark:7077").getOrCreate()
sc = spark.sparkContext

def qs_to_df(qs):
    """Turn a queryset into a Spark DataFrame by way of a CSV dump.

    Calls `qs.save_to_csv('tmp')`, then reads "/app/pg/tmp.csv" with a header
    row and schema inference.
    NOTE(review): presumably save_to_csv('tmp') writes exactly that file — confirm.
    """
    qs.save_to_csv('tmp')
    reader = spark.read.format("csv")
    reader = reader.option("header", "true")
    reader = reader.option("inferSchema", "true")
    return reader.load("/app/pg/tmp.csv")
    
def dicts_to_df(ds):
    """Build a Spark DataFrame from an iterable of dicts, one Row per dict.

    Args:
        ds: Iterable of dicts; each dict's keys become the Row's field names.
    """
    # Use the `ds` argument: the previous body read an undefined name `data`.
    return spark.createDataFrame(sc.parallelize(ds).map(lambda d: Row(**d)))
    
PREFIX = 'data'
def spark_load(key, fn, force=False):
    """Return the DataFrame stored under `key`, computing it with `fn` on a miss.

    Args:
        key: Name of the dataset; stored in the directory '<PREFIX>/<key>'.
        fn: Zero-argument callable that produces the DataFrame.
        force: When true, recompute even if a stored copy exists (the stored
            copy is deleted first).

    Returns:
        The stored DataFrame on a hit, otherwise the freshly computed one
        (which is also written to disk).
    """
    key = '{}/{}'.format(PREFIX, key)
    cached = os.path.isdir(key)

    if cached and not force:
        with Timer('Reading data'):
            return spark.read.load(key)

    if cached:
        # Only reachable with force=True: drop the old copy before rewriting.
        shutil.rmtree(key)
    df = fn()
    df.write.save(key)
    return df

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Dataset" data-toc-modified-id="Dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dataset</a></span><ul class="toc-item"><li><span><a href="#All-videos" data-toc-modified-id="All-videos-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>All videos</a></span></li><li><span><a href="#Videos-by-channel" data-toc-modified-id="Videos-by-channel-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Videos by channel</a></span></li><li><span><a href="#Videos-by-show" data-toc-modified-id="Videos-by-show-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Videos by show</a></span></li><li><span><a href="#Videos-by-time-of-day" data-toc-modified-id="Videos-by-time-of-day-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Videos by time of day</a></span></li></ul></li><li><span><a href="#Gender" data-toc-modified-id="Gender-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Gender</a></span><ul class="toc-item"><li><span><a href="#Detector-accuracy" data-toc-modified-id="Detector-accuracy-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Detector accuracy</a></span></li><li><span><a href="#Male-vs.-female-faces" data-toc-modified-id="Male-vs.-female-faces-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Male vs. female faces</a></span></li><li><span><a href="#Male-vs.-female-faces-across-channels" data-toc-modified-id="Male-vs.-female-faces-across-channels-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Male vs. female faces across channels</a></span></li><li><span><a href="#Male-vs.-female-faces-across-shows" data-toc-modified-id="Male-vs.-female-faces-across-shows-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Male vs. female faces across shows</a></span></li><li><span><a href="#Male-vs.-female-faces-across-time-of-day" data-toc-modified-id="Male-vs.-female-faces-across-time-of-day-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Male vs. 
female faces across time of day</a></span></li><li><span><a href="#Male-vs.-female-faces-across-topics" data-toc-modified-id="Male-vs.-female-faces-across-topics-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>Male vs. female faces across topics</a></span></li><li><span><a href="#Male-vs.-female-faces-in-panels" data-toc-modified-id="Male-vs.-female-faces-in-panels-2.7"><span class="toc-item-num">2.7&nbsp;&nbsp;</span>Male vs. female faces in panels</a></span></li></ul></li><li><span><a href="#Pose" data-toc-modified-id="Pose-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Pose</a></span></li><li><span><a href="#Misc" data-toc-modified-id="Misc-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Misc</a></span><ul class="toc-item"><li><span><a href="#Number-of-people-in-frame" data-toc-modified-id="Number-of-people-in-frame-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Number of people in frame</a></span></li></ul></li></ul></div>

<hr />
# Dataset

In [3]:
def load_videos():
    """Export one row per video (frame count, fps, show, channel, broadcast hour) to Spark."""
    video_rows = (
        Video.objects.all()
        .annotate(hour=Extract('time', 'hour'))
        .values('num_frames', 'fps', 'show_id', 'channel_id', 'hour'))
    return qs_to_df(video_rows)

# Served from the on-disk copy when one exists (see spark_load).
videos = spark_load('videos', load_videos)

def video_stats(key, labels):
    """Summarize video count and running time for each label.

    Args:
        key: Column of `videos` to group by, or None to put every video into a
            single group with id 0.
        labels: List of dicts with an 'id' (the group value) and a 'name'
            (the text shown in the table).

    Returns:
        One dict per label with 'label', 'count', 'duration' and
        'avg_duration'; both durations are rendered by format_time.
    """
    # (group id, video length in seconds) for every video.
    times = videos.select("*").rdd.map(lambda v: (v[key] if key is not None else 0, v['num_frames'] / v['fps']))
    counts = times.countByKey()
    durations = dict(times.aggregateByKey(0, (lambda a, b: a + b), (lambda a, b: a + b), 8).collect())

    rows = []
    for label in labels:
        # A label without any video is absent from `durations`; report zeros
        # instead of raising KeyError / dividing by zero.
        count = counts.get(label['id'], 0)
        total = durations.get(label['id'], 0)
        average = total / count if count else 0
        rows.append({
            'label': label['name'],
            'count': count,
            'duration': format_time(int(total)),
            'avg_duration': format_time(int(average))
        })
    return rows

ds_ordering = ['label', 'count', 'duration', 'avg_duration']

<hr />
## All videos

In [None]:
# key=None groups every video under id 0, so a single label covers the dataset.
whole_dataset = [{'id': 0, 'name': 'whole dataset'}]
show_df(video_stats(None, whole_dataset), ds_ordering)

<hr />
## Videos by channel

In [None]:
# One summary row per channel.
channel_labels = list(Channel.objects.all().values('id', 'name'))
show_df(video_stats('channel_id', channel_labels), ds_ordering)

<hr />
## Videos by show
"Situation Room with Wolf Blitzer" and "Special Report with Bret Baier" were ingested as 60 10-minute segments each, whereas the other shows have 10 ≥1 hour segments.

In [None]:
# One summary row per show.
show_labels = list(Show.objects.all().values('id', 'name'))
show_df(video_stats('show_id', show_labels), ds_ordering)

<hr />
## Videos by time of day
The initial selection of videos was prime-time only, i.e. between 4pm and 11pm.

In [None]:
# Distinct broadcast hours present in the dataset, ascending.
hours_qs = (
    Video.objects
    .annotate(hour=Extract('time', 'hour'))
    .distinct('hour')
    .order_by('hour')
    .values('hour'))
hours = [r['hour'] for r in hours_qs]

def format_hour(h):
    """Render a 24-hour clock hour (0-23) as a 12-hour label such as '4 PM'.

    Noon is '12 PM' and midnight is '12 AM'.
    """
    suffix = 'AM' if h < 12 else 'PM'
    # Hours 0 and 12 both display as 12 on a 12-hour clock.
    return '{} {}'.format(h % 12 or 12, suffix)
    
# One summary row per broadcast hour, labelled with a 12-hour clock time.
hour_labels = [{'id': hour, 'name': format_hour(hour)} for hour in hours]
show_df(video_stats('hour', hour_labels), ds_ordering)

<hr />
# Gender
These queries analyze the distribution of men vs. women across a number of axes. We use faces detected by [MTCNN](https://github.com/kpzhang93/MTCNN_face_detection_alignment/) and gender detected by [rude-carnie](https://github.com/dpressel/rude-carnie). We only consider faces with a height > 20% of the frame to eliminate people in the background. Face detection was run at 2 frames per second on all videos. If a person's face is detected, we count that as 0.5 seconds of screen time. Total screen times reported double-count frames depending on the number of people in them, e.g. 2 women in one frame is 1 second of screen time.

Remaining questions:
* No. of unique women vs. unique men (identity-major)
* No. of frames w/ men vs. women (frame-major)

Add data:
* Sunday morning news
* General morning news shows (e.g. Today Show)


In [None]:
def load_genders():
    """Export gender labels for large faces, with the length of each face's shot, to Spark.

    Keeps rude-carnie gender labels on MTCNN faces whose bounding-box height is
    at least 0.2, and attaches the shot length in seconds as `duration`.
    """
    face_height = F('face__bbox_y2') - F('face__bbox_y1')
    shot_seconds = Cast(
        (F('face__shot__max_frame') - F('face__shot__min_frame')) / F('face__shot__video__fps'),
        models.FloatField())
    gender_rows = (
        FaceGender.objects
        .annotate(height=face_height)
        .filter(labeler=rudecarnie, face__labeler=mtcnn, height__gte=0.2)
        .annotate(duration=shot_seconds)
        .values('gender', 'duration', 'face__person__frame__video__channel', 'face__person__frame__video__show'))
    return qs_to_df(gender_rows)

# force=True recomputes and overwrites any stored copy on every run.
genders = spark_load('genders', load_genders, force=True)

def calc_derived_gender_fields(row):
    """Add percentage and screen-time fields to a gender-count row, in place.

    Args:
        row: Dict with numeric 'M' and 'F' counts. Gains 'M_percent',
            'F_percent', 'M_screentime' and 'F_screentime'.

    The derived keys are always written, so a row with no faces still has every
    column a results table selects; its percentages read 'N/A'.
    """
    total = float(row['M'] + row['F'])
    for gender in ('M', 'F'):
        count = row[gender]
        if total > 0:
            row[gender + '_percent'] = '{:.0f}%'.format(count / total * 100)
        else:
            row[gender + '_percent'] = 'N/A'
        # Each counted face stands for half a second; floor-divide so
        # format_time receives whole seconds.
        row[gender + '_screentime'] = format_time(int(count // 2))
    
def filter_gender(key, labels):
    """Print per-(group, gender) face counts and summed durations from `genders`.

    Args:
        key: Column of `genders` to group by, or None to use a single group 0.
        labels: Currently unused.

    Returns:
        None: the results are only printed.

    NOTE(review): work in progress. The cells further down still call this with
    two queryset-filter lambdas and index the result like the row dict built by
    the commented-out implementation below; that does not match this
    signature or return value.
    """
    # ((group, gender), duration) for every exported face-gender row.
    times = genders.select("*").rdd.map(lambda g: ((g[key] if key is not None else 0, g['gender']), g['duration']))
    counts = times.countByKey()
    # Sum of durations per (group, gender) key.
    durations = {k: v for k, v in times.aggregateByKey(0, (lambda a, b: a + b), (lambda a, b: a + b), 8).collect()}
        
    print(counts)
    print(durations)
    
    # Previous Django-only implementation, kept for reference: it returned a row
    # dict with 'M', 'F', the derived gender fields and 'length'.
#     genders = list(Gender.objects \
#         .annotate(count=Subquery(
#             face_filter(FaceGender.objects \
#             .filter(gender=OuterRef('pk'), labeler=rudecarnie, face__labeler=mtcnn) \
#             .annotate(height=F('face__bbox_y2') - F('face__bbox_y1')) \
#             .filter(height__gte=0.2)) \
#             .values('gender').annotate(count=Count('gender')) \
#             .values('count'), models.IntegerField())).values('name', 'count'))
#     def zero_bad_val(n):
#         if n is None or math.isnan(float(n)):
#             return 0
#         else:
#             return n
#     df = pd.DataFrame(genders)
#     row = {
#         'M': int(zero_bad_val(df.loc[df['name'] == 'M']['count'].values[0])),
#         'F': int(zero_bad_val(df.loc[df['name'] == 'F']['count'].values[0] or 0)),
#     }
#     calc_derived_gender_fields(row)

#     total_length = int(sum([v['length'] for v in video_filter(Video.objects) \
#         .annotate(length=Sum(Cast(F('num_frames'), models.FloatField()) / F('fps'))) \
#         .values('length')]))
#     row['length'] = format_time(total_length)
    
#     return row

# Column order shared by the gender tables below.
ordering = ['length', 'M', 'M_percent', 'M_screentime', 'F', 'F_percent', 'F_screentime']

## Detector accuracy
* Handlabels are just for the "main person in the frame" (instruction to labelers), so precision is expected to be low.
* Recall for face and gender detectors is high.

In [None]:
# TODO: add data table for handlabeled subset
# TODO: deal with negative case
# TODO: how many errors on women vs. men?

# Scores the automatic face and gender labels against the hand labels, frame by
# frame, over every video that has at least one hand-labeled face.

# Two boxes count as the same face when bbox_iou exceeds this value.
IOU_MATCH_THRESHOLD = 0.5

face_labeler = Labeler.objects.get(name='mtcnn')
hand_labeler = Labeler.objects.get(name='handlabeled')
# NOTE(review): gender_labeler is never used below; the FaceGender lookups are
# not filtered by labeler — confirm each face has exactly one FaceGender row.
gender_labeler = Labeler.objects.get(name='rudecarnie')


def _format_ratio(numerator, denominator):
    """Return numerator/denominator with two decimals, or 'n/a' for a zero denominator."""
    if denominator == 0:
        return 'n/a'
    return '{:.2f}'.format(numerator / float(denominator))


face_tp = 0
face_fp = 0
face_fn = 0

gender_t = 0
gender_f = 0

# Ids of the videos that contain hand-labeled faces.
handlabeled = [t['person__frame__video__id'] for t in Face.objects \
    .filter(labeler=hand_labeler) \
    .values('person__frame__video__id') \
    .distinct('person__frame__video__id') \
    .values('person__frame__video__id')]

for i, video in enumerate(Video.objects.filter(id__in=handlabeled)):
    # Frames of this video that have at least one face (from any labeler).
    frames_with_faces = Frame.objects \
        .filter(video=video) \
        .annotate(c=Subquery(
            Face.objects.filter(person__frame=OuterRef('pk')) \
            .values('person__frame') \
            .annotate(c=Count('*')).values('c'))) \
        .filter(c__gt=0)
    #print(i, video.path)
    for frame in frames_with_faces:
        handlabeled_faces = list(Face.objects.filter(person__frame=frame, labeler=hand_labeler))
        autolabeled_faces = list(Face.objects.filter(person__frame=frame, labeler=face_labeler))

        for autoface in autolabeled_faces:
            # Index of the first hand-labeled box overlapping this detection, if any.
            index = next(
                (j for j, handface in enumerate(handlabeled_faces)
                 if bbox_iou(autoface, handface) > IOU_MATCH_THRESHOLD),
                None)
            if index is not None:
                face_tp += 1
                auto_gender = FaceGender.objects.get(face=autoface)
                hand_gender = FaceGender.objects.get(face=handlabeled_faces[index])
                if auto_gender.gender == hand_gender.gender:
                    gender_t += 1
                else:
                    gender_f += 1
            else:
                face_fp += 1

        for handface in handlabeled_faces:
            # A hand-labeled face no detection overlaps is a miss.
            good = any(bbox_iou(autoface, handface) > IOU_MATCH_THRESHOLD for autoface in autolabeled_faces)
            if not good:
                face_fn += 1

# Denominators are zero when nothing was hand-labeled or nothing matched;
# print 'n/a' rather than raising ZeroDivisionError.
print('Face precision: {}'.format(_format_ratio(face_tp, face_tp + face_fp)))
print('Face recall: {}'.format(_format_ratio(face_tp, face_tp + face_fn)))
print('Gender accuracy: {}'.format(_format_ratio(gender_t, gender_t + gender_f)))

<hr />
## Male vs. female faces
* Male:female ratio is 2:1

In [None]:
# TODO: have frame-major instead of face-major, e.g. split into male-only, female-only, mixed

# NOTE(review): filter_gender as currently defined takes (key, labels) and
# returns None; this call still passes two queryset filters and expects a row
# dict — confirm before running.
show_df([filter_gender(lambda qs: qs, lambda qs: qs)], ordering)

<hr />
## Male vs. female faces across channels
* No meaningful difference in gender balance between CNN and FOX.

In [None]:
# TODO: compute t-test (nothing in this cell does so yet)
# Builds one gender row per channel and tags it with the channel name.
# NOTE(review): filter_gender as currently defined takes (key, labels) and
# returns None, so `c['channel'] = ...` would fail — this call matches the
# older two-lambda interface; confirm before running.
counts = []
for channel in tqdm(Channel.objects.all()):
    c = filter_gender(
        lambda qs: qs.filter(face__person__frame__video__channel=channel), 
        lambda qs: qs.filter(channel=channel))
    c['channel'] = channel.name
    counts.append(c)
    
show_df(counts, ['channel'] + ordering)

<hr />
## Male vs. female faces across shows
* Female-hosted shows (Poppy Harlow, Gretchen Carlson) have higher female percentages than any other show.
* Fareed Zakaria and Bret Baier are most male-imbalanced.

In [None]:
# Builds one gender row per show and tags it with the show name.
# NOTE(review): filter_gender as currently defined takes (key, labels) and
# returns None, so `c['show'] = ...` would fail — this call matches the older
# two-lambda interface; confirm before running.
counts = []
for show in tqdm(Show.objects.all()):
    c = filter_gender(
        lambda qs: qs.filter(face__person__frame__video__show=show),
        lambda qs: qs.filter(show=show))
    c['show'] = show.name
    counts.append(c)
    
show_df(counts, ['show'] + ordering)

<hr />
## Male vs. female faces across time of day
* No meaningful trend in gender balance across time of day.

In [None]:
# Distinct broadcast hours, ascending; each element is a {'hour': <int>} dict.
hours = (
    Video.objects
    .annotate(hour=Extract('time', 'hour'))
    .distinct('hour')
    .order_by('hour')
    .values('hour'))

# One gender row per hour, labelled with a 12-hour clock time.
# NOTE(review): this calls filter_gender with two queryset filters and treats
# the result as a dict; confirm against the current filter_gender definition.
counts = []
for hour_row in hours:
    hour = hour_row['hour']
    c = filter_gender(
        lambda qs: qs.filter(face__person__frame__video__time__hour=hour),
        lambda qs: qs.filter(time__hour=hour))
    c['hour'] = datetime.time(hour, 0).strftime('%I %p')
    counts.append(c)

show_df(counts, ['hour'] + ordering)

<hr />
## Male vs. female faces across topics
* Topic labeling is not robust enough to draw conclusions yet.

In [None]:
# Accumulates gender counts and covered time per topic over that topic's
# word2vec-labeled tracks.
track_labeler = Labeler.objects.get(name='word2vec')
all_counts = []
for topic in Topic.objects.all():
    # Counts are kept under 'M'/'F': calc_derived_gender_fields and `ordering`
    # both read those keys ('male'/'female' would raise KeyError).
    topic_counts = {'topic': topic.name, 'M': 0, 'F': 0, 'length': 0}
    for track in TopicTrack.objects.filter(topic=topic, labeler=track_labeler).select_related('video'):
        # NOTE(review): assumes filter_gender returns a dict with 'M' and 'F'
        # counts, as its earlier dict-returning version did — confirm.
        track_counts = filter_gender(
            lambda qs: qs.filter(
                face__person__frame__video=track.video,
                face__person__frame__number__gte=track.min_frame,
                face__person__frame__number__lte=track.max_frame),
            lambda qs: qs.filter(id=track.video.id)
        )
        topic_counts['M'] += track_counts['M']
        topic_counts['F'] += track_counts['F']
        # Track length in whole seconds.
        topic_counts['length'] += int((track.max_frame - track.min_frame) / track.video.fps)

    calc_derived_gender_fields(topic_counts)
    topic_counts['length'] = format_time(topic_counts['length'])
    all_counts.append(topic_counts)

show_df(all_counts, ['topic'] + ordering)

<hr />
## Male vs. female faces in panels
* Smaller percentage of women in panels relative to overall dataset.

In [None]:
# TODO: female-dominated situations?
# TODO: slice this on # of people in the panel
# TODO: small visualization that shows sample of segments
# TODO: panels w/ majority male vs. majority female

print('Computing panels')
panels = queries.panels()
print('Computing gender stats')
# Each panel entry is unpacked as (frame, _); only the frame ids are used.
frame_ids = [frame.id for (frame, _) in panels]
# NOTE(review): filter_gender as currently defined takes (key, labels) and
# returns None; this call matches the older two-lambda interface — confirm
# before running.
counts = filter_gender(lambda qs: qs.filter(face__person__frame__id__in=frame_ids), lambda qs: qs)
show_df([counts], ordering)

<hr />
# Pose
* Animatedness of people (specifically hosts)
    * e.g. Rachel Maddow vs. others
    * Pick 3-4 hours of a few specific hosts, compute dense poses and tracks
    * Devise acceleration metric
* More gesturing on heated exchanges?
* Sitting vs. standing
* Repeated gestures (debates vs. state of the union)
* Head/eye orientation (are people looking at each other?)
* Camera orientation (looking at someone from above/below)
* How much are the hosts facing each other
* Quantify aggressive body language

<hr />
# Misc

<hr />
## Number of people in frame
* Surprisingly, a plurality of frames have no detected faces in them.

In [None]:
from django.db.models import Func
# Number of MTCNN faces attached to each frame.
faces_per_frame = Subquery(
    Face.objects.filter(person__frame=OuterRef('pk'), labeler=mtcnn) \
    .values('person__frame') \
    .annotate(c=Count('*')) \
    .values('c'), models.IntegerField())
# Keep only frames whose number is a multiple of the video's rounded fps.
rounded_fps = Cast(Func(F('video__fps'), function='ROUND'), models.IntegerField())
frames = (
    Frame.objects
    .annotate(c=faces_per_frame)
    .annotate(mod=F('number') % rounded_fps)
    .filter(mod=0)
    .values('c'))

# A missing count means no faces, so map it to 0.
all_counts = [f['c'] or 0 for f in frames]
ax = plt.figure(figsize=(16, 6)).add_subplot(111)
bins = range(10)
arr = ax.hist(all_counts, bins=bins, align='left', rwidth=0.75)
bar_heights, bin_edges = arr[0], arr[1]
# Write each bar's count above it; zip stops at the last bar, skipping the
# final bin edge.
for edge, height in zip(bin_edges, bar_heights):
    ax.text(edge - 0.15, height, str(int(height)))
ax.set_xticks(bins)
ax.set_xticklabels(bins)
ax.set_xlabel("Number of people in frame")
_ = ax.set_ylabel("Number of frames")