In [1]:
import os
import json

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

import utils as ut

In [2]:
# Work from the repository checkout so the relative paths used by the cells
# below (constants/, data_summaries-video/, src/) resolve.
repo_root = os.path.expanduser('~/notebooks/Data-Provenance-Collection')
os.chdir(repo_root)

# Prepare data

## Dimension data

In [3]:
# Read the license -> classes mapping from the constants file.
with open('constants/license_classes.json', 'rt') as f:
    raw_license_classes = json.load(f)

# Keep only the first element of each entry's value as that license's class.
license_classes = {
    license_name: classes[0]
    for license_name, classes in raw_license_classes.items()
}
# 'Custom' is always mapped to the 'Unspecified' class, overriding the file.
license_classes['Custom'] = 'Unspecified'

## Summary files

In [4]:
# Load every summary file in the directory into `summaries`, keyed by the
# file name with its last extension removed.
summary_dir = 'data_summaries-video'
summaries = {}
for file in os.listdir(summary_dir):
    # Template files are not real summaries.
    if not file.startswith('_template'):
        with open(os.path.join(summary_dir, file), 'rt') as f:
            # Everything before the final '.' (empty if there is no '.').
            key = file.rpartition('.')[0].strip()
            summaries[key] = json.load(f)

## List of datasets

In [5]:
# Load the paper-level table and index it by collection name.
papers = pd.read_csv('src/summary-tables/papers.csv')
papers = papers.rename(columns={'collection': 'Collection'}).set_index('Collection')

# Restrict to video collections; 'modality' and 'cites' are not needed afterwards.
is_video = papers['modality'] == 'video'
dat = papers.loc[is_video].drop(columns=['modality', 'cites'])

In [6]:
# Flatten the pipe-delimited `summary_keys` column into a single list of keys.
files = []
for keys in dat['summary_keys'].str.split('|'):
    files.extend(keys)

# Every key referenced by papers.csv must correspond to a loaded summary.
assert set(files).issubset(summaries)

### Datasets without entries in papers.csv

In [7]:
# Summary keys that no row of papers.csv refers to.
set(summaries).difference(files)

{'YT-Temporal-180m', 'YT-Temporal-1B', 'cinepile', 'egopet', 'egoschema'}

# Make tables

In [8]:
# Build one record per sub-dataset (collection x summary file x entry), after
# validating that each entry carries the fields the table needs.

# Fields that only need to be present, and fields that must also be lists/tuples.
required_fields = ('Video Hours', 'Taken Down', 'Year Released')
list_fields = ('Text Sources', 'Video Sources', 'Creators', 'Countries', 'Licenses')

dataset_records = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        for k, entry in summaries[file].items():
            where = f'{file} / {k}'
            for field in required_fields + list_fields:
                assert field in entry, f'{where}: missing field {field!r}'
            for field in list_fields:
                assert isinstance(entry[field], (list, tuple)), \
                    f'{where}: field {field!r} must be a list or tuple'

            # Text and video sources are pooled into one de-duplicated list.
            sources = list(set(entry['Text Sources']) | set(entry['Video Sources']))

            # Normalise before comparing so that a JSON boolean `true`, or a
            # differently-cased / padded string, is also recognised as taken
            # down. Comparing against the literal strings 'True'/'true' only
            # would silently mark such entries as available.
            taken_down = str(entry['Taken Down']).strip().lower() == 'true'
            avail = not taken_down

            licenses = [lic['License'] for lic in entry['Licenses']]

            dataset_records.append({
                'collection': collection,
                'summary_key': file,
                'sub': k,

                # Constant 1 so that a groupby-sum yields the dataset count.
                'datasets': 1,
                'hours': entry['Video Hours'],
                'creators': entry['Creators'],
                'countries': entry['Countries'],
                'year': entry['Year Released'],

                'sources': sources,
                'avail': avail,
                'licenses': licenses,
            })
dataset_level = pd.DataFrame(dataset_records)

# Roll the per-dataset records up to one value per collection and attach the
# results to `dat`; column assignment aligns on the collection index.
by_collection = dataset_level.groupby('collection')

# Numeric counts. The aggregate is reindexed onto `dat.index` *before*
# `fillna(0)`: filling the groupby result directly cannot reach collections
# that have no records at all, which would then come back as NaN on
# assignment and turn the column into floats.
dat['Num Datasets'] = by_collection['datasets'].sum() \
    .reindex(dat.index).fillna(0).astype(int)
dat['Num Hours'] = by_collection['hours'].sum() \
    .reindex(dat.index).fillna(0).round().astype(int)
# NOTE(review): `ut.count_unique_with_none` is defined outside this notebook;
# presumably it counts distinct values across the per-dataset lists - confirm.
dat['Num Creators'] = by_collection['creators'].apply(ut.count_unique_with_none) \
    .reindex(dat.index).fillna(0).astype(int)
dat['Num Countries'] = by_collection['countries'].apply(ut.count_unique_with_none) \
    .reindex(dat.index).fillna(0).astype(int)
dat['Num Sources'] = by_collection['sources'].apply(ut.count_unique_with_none) \
    .reindex(dat.index).fillna(0).astype(int)

# A single release year is shown as-is; differing years collapse to 'Mult.'.
# The column is stored as strings.
dat['Year'] = by_collection['year'] \
    .apply(lambda s: s.iloc[0] if len(set(s)) == 1 else 'Mult.') \
    .astype(str)

# Availability marker per collection, produced by the external helper.
dat['Avail'] = by_collection['avail'].apply(ut.video_avail_status)

# Distinct licenses per collection -> distinct license classes -> rendered
# marker (via the external helper). An unknown license raises KeyError.
dat['Use'] = by_collection['licenses'] \
    .apply(lambda s: list(set([y for x in s for y in x]))) \
    .apply(lambda s: list(set([license_classes[v] for v in s]))) \
    .apply(ut.color_license_classes)

## Format for LaTeX output

In [9]:
# Columns of the final table, in display order.
table_columns = [
    'Year',
    'Num Hours',
    'Num Datasets',
    'Num Countries',
    'Num Creators',
    'Num Sources',
    'Use',
    'Avail',
]

# Keep only those columns and order the collections by year.
dat = dat.loc[:, table_columns].sort_values(by='Year')

In [10]:
# Flat column name -> (group header, column header) for the two-level header.
column_mapping = {'Year': ('', 'Year')}
column_mapping.update({
    f'Num {label}': ('Property Counts', label)
    for label in ('Hours', 'Datasets', 'Countries', 'Creators', 'Sources')
})
column_mapping.update({
    label: ('Permissions', label)
    for label in ('Use', 'Avail')
})

# Replace the flat columns with a two-level header already wrapped in the
# LaTeX markup: \textsc{...} for the group, \textsc{\thead{...}} for the column.
header_pairs = [column_mapping[col] for col in dat.columns]
dat.columns = pd.MultiIndex.from_tuples([
    (r'\textsc{' + group + r'}', r'\textsc{\thead{' + label + r'}}')
    for group, label in header_pairs
])

# The index name becomes the header of the first column; wrap it the same way.
dat.index.name = ''.join([r'\textsc{', dat.index.name, r'}'])

def color_map(value, cmap='BrBG', vmin=None, vmax=None):
    """Return the colormap colour for `value` as an 'R,G,B' string.

    `value` is normalised with `mcolors.Normalize(vmin, vmax)` and looked up
    in the matplotlib colormap named `cmap`. The first three channels are
    scaled by 255 and truncated to integers; any alpha channel is dropped.
    """
    scaled = mcolors.Normalize(vmin=vmin, vmax=vmax)(value)
    channels = mp.colormaps[cmap](scaled)[:3]
    return ','.join(str(int(255 * channel)) for channel in channels)

# Build, for every 'Property Counts' column:
#   * `color_def`     - the \definecolor lines, one per (row, column) cell,
#   * `tmp_val_color` - per column, cell value -> name of its colour,
#   * `formatters`    - per column, the function that renders a cell value.
color_def = ''
formatters = {}
tmp_val_color = {}
num_cols = [c for c in dat.columns if 'Property Counts' in c[0]]

for col in num_cols:
    tmp_val_color[col] = {}

    # Colours follow the log of the count, centred on the midpoint of the
    # column's log-range; the 1e-6 offset keeps the log finite for zeros.
    vmin = np.log(dat[col].min() + 1e-6)
    vmax = np.log(dat[col].max() + 1e-6)
    midpt = (vmax + vmin) / 2
    vmin, vmax = vmin - midpt, vmax - midpt

    # Column part of the colour name: the header tuple with spaces and the
    # LaTeX wrappers stripped (does not depend on the row).
    col_color_name = '_'.join(col).replace(' ', '') \
        .replace(r'\textsc{\thead{', '').replace('}}', '') \
        .replace(r'\textsc{', '').replace('}', '')

    for row in dat.index:
        value = np.log(dat.loc[row, col] + 1e-6) - midpt

        # Cells without a usable value get no colour definition.
        if pd.isnull(value):
            continue

        row_color_name = row.replace(' ', '') \
            .replace(r'\textsc{', '').replace('}', '')
        color_name = f"color{row_color_name}{col_color_name}"

        # `value / 4` keeps the lookup within the middle quarter of the
        # colormap, since `value` itself spans [vmin, vmax].
        color_def += f"\\definecolor{{{color_name}}}{{RGB}}{{{color_map(value / 4, vmin=vmin, vmax=vmax)}}}\n"

        # Keyed by cell value: equal values in a column share one colour name
        # (their colours are identical, as colour depends only on the value).
        tmp_val_color[col][dat.loc[row, col]] = color_name

    def func(v, col=col):
        """Render one cell of `col` as a coloured LaTeX cell.

        Missing and zero values render as '-'. Values of 1000 or more are
        shown in thousands with a 'k' suffix. `col` is bound as a default
        argument so each formatter keeps its own column.
        """
        # Decide on '-' before the colour lookup: a missing value has no
        # entry in `tmp_val_color`, so looking it up first raises KeyError.
        if pd.isnull(v) or v == 0:
            return '-'

        color_name = tmp_val_color[col][v]
        if v >= 1000:
            return f'\\cellcolor{{{color_name}}}{{{v / 1000:,.0f}k}}'
        return f'\\cellcolor{{{color_name}}}{{{v:,.0f}}}'

    formatters[col] = func

In [11]:
# Caption passed to `to_latex` as a (full caption, short caption) pair.
caption_full = r'''
\textbf{Video collections and properties}. Collection properties include numbers of hours of video, datasets, creator institutions, countries of creator institutions, and data sources. The \textsc{Use} column indicates whether a collection includes data freely usable even for commercial purposes (\protect\CommercialDataCircle), data usable only for noncommercial purposes or academic research (\protect\NCDataCircle) and data whose license status is not specified precisely enough to allow us to determine commercial use permissions (\protect\UnspecifiedDataCircle). Note that each collection may have different datasets with one, two, or all three of these statuses. Finally, the \textsc{Avail} column indicates whether a dataset is available online (\greencheck) or has been taken down, usually for legal reasons (\redcross). Datasets are sorted chronologically to highlight trends over time.
'''.strip()
caption_short = r'\textbf{Video collections and properties}'

# Options for Styler.to_latex.
kwargs = dict(
    environment='longtable',

    label='tab:collections-video',
    column_format='lc|ccccc|cc',
    multicol_align='c',

    caption=(caption_full, caption_short),

    hrules=True,
    convert_css=True,
)

# Turn the index into an ordinary first column, hide the new integer index,
# and apply the per-column cell formatters before exporting.
styled = dat.reset_index().style.hide().format(formatter=formatters)
latex = styled.to_latex(**kwargs)

# Emit the column-spacing setting, the colour definitions, then the table.
tabcolsep_setting = r'\setlength{\tabcolsep}{1.9pt}'
print('\n'.join((tabcolsep_setting, color_def, latex)))

\setlength{\tabcolsep}{1.9pt}
\definecolor{colorHOLLYWOOD2PropertyCounts_Hours}{RGB}{229,241,239}
\definecolor{colorCollectivePropertyCounts_Hours}{RGB}{240,223,178}
\definecolor{colorHMDBPropertyCounts_Hours}{RGB}{204,235,230}
\definecolor{colorUCF101PropertyCounts_Hours}{RGB}{227,240,239}
\definecolor{colorYouCookPropertyCounts_Hours}{RGB}{213,237,234}
\definecolor{color50SaladsPropertyCounts_Hours}{RGB}{226,240,238}
\definecolor{colorStoryGraphsPropertyCounts_Hours}{RGB}{235,242,241}
\definecolor{colorHollywoodExt.PropertyCounts_Hours}{RGB}{233,242,240}
\definecolor{colorBreakfastPropertyCounts_Hours}{RGB}{224,240,237}
\definecolor{colorSports-1MPropertyCounts_Hours}{RGB}{187,229,223}
\definecolor{colorTHUMOSPropertyCounts_Hours}{RGB}{218,238,235}
\definecolor{colorVideoStoryPropertyCounts_Hours}{RGB}{213,237,234}
\definecolor{colorSumMePropertyCounts_Hours}{RGB}{242,244,244}
\definecolor{colorTVSumPropertyCounts_Hours}{RGB}{236,243,242}
\definecolor{colorVolleyballPropertyCounts_Ho