In [1]:
import os
import json

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

import utils as ut

In [2]:
# Every relative path used below is resolved against this repository checkout.
# NOTE(review): the location is specific to the author's machine layout.
repo_root = os.path.expanduser('~/github/Data-Provenance-Collection')
os.chdir(repo_root)

# Prepare data

## Dimension data

In [3]:
# Read the raw license table, then keep only the first element of each entry.
with open('constants/license_classes.json', 'rt') as fh:
    raw_license_classes = json.load(fh)

license_classes = {name: classes[0] for name, classes in raw_license_classes.items()}

# Manual overrides and aliases applied on top of the constants file.
license_classes.update({'Custom': 'Unspecified'})

# FIXME - as in impending PR
license_classes['CDLA-Sharing-1.0'] = license_classes['CDLA Sharing 1.0']
license_classes['CC BY-NC-ND 3.0'] = 'NC'

## Summary files

In [4]:
# Load every summary JSON, keyed by its file name without the final extension.
summaries = {}
summary_dir = 'data_summaries-speech'
for file in os.listdir(summary_dir):
    # Files whose name starts with '_template' are not loaded.
    if not file.startswith('_template'):
        key = file.rpartition('.')[0].strip()
        with open(os.path.join(summary_dir, file), 'rt') as f:
            summaries[key] = json.load(f)

## List of datasets

In [5]:
# One row per collection, indexed by collection name.
papers = (
    pd.read_csv('notebooks/papers.csv')
    .rename(columns={'collection': 'Collection'})
    .set_index('Collection')
)

# Keep the audio collections only and drop the columns not used in this notebook.
is_audio = papers['modality'] == 'audio'
dat = papers.loc[is_audio].drop(columns=['modality', 'cites'])

In [6]:
# Collect every summary key referenced by the collections ('|' separates several keys).
files = []
for keys in dat['summary_keys'].str.split('|').tolist():
    files.extend(keys)

# Every referenced summary must have been loaded.
assert set(files).issubset(set(summaries.keys()))

# Make tables

In [7]:
# Keys that every sub-dataset entry of a summary file must provide.
required_keys = ['Hours', 'Year Released', 'Speakers', 'Source']
# Keys that must be present and must hold a list (or tuple) of values.
required_list_keys = [
    'Languages (ISO)',
    'Creators',
    'Licenses',
    'Tasks',
    'Location',
    'Creator Categories',
]

# Build one record per (collection, summary file, sub-dataset).
records = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        for k in summaries[file].keys():
            entry = summaries[file][k]

            # Validate the schema up front so a malformed summary fails with a clear message.
            for field in required_keys + required_list_keys:
                assert field in entry, f"{file} / {k}: missing key '{field}'"
            for field in required_list_keys:
                assert isinstance(entry[field], (list, tuple)), \
                    f"{file} / {k}: '{field}' must be a list"

            # 'Source' may be a bare string or a list; the two optional keys default to [].
            source = [entry['Source']] if isinstance(entry['Source'], str) else entry['Source']
            source_categories = entry.get('Source Categories', [])
            topics = entry.get('Topics', [])
            licenses = [lic['License'] for lic in entry['Licenses']]

            records.append({
                'collection': collection,
                'summary_key': file,
                'sub': k,

                'datasets': 1,
                'hours': entry['Hours'],
                'speakers': entry['Speakers'],
                'languages': entry['Languages (ISO)'],
                'creators': entry['Creators'],
                # Must read 'Tasks': reading 'Creators' here would make the task
                # count a copy of the creator count.
                'tasks': entry['Tasks'],
                'locations': entry['Location'],
                'creator_categories': entry['Creator Categories'],

                'year': entry['Year Released'],

                'source': source,
                'source_categories': source_categories,
                'topics': topics,
                'licenses': licenses,
            })
dataset_level = pd.DataFrame(records)

# 'Unclear' marks an unknown quantity; everything else is parsed as a number.
for numeric_col in ['hours', 'speakers']:
    dataset_level[numeric_col] = dataset_level[numeric_col] \
        .apply(lambda s: np.nan if s == 'Unclear' else float(s))

by_collection = dataset_level.groupby('collection')

# Plain sums per collection.
dat['Num Datasets'] = by_collection['datasets'].sum().fillna(0).astype(int)
dat['Num Hours'] = by_collection['hours'].sum().fillna(0).round().astype(int)
dat['Num Speakers'] = by_collection['speakers'].sum().fillna(0).round().astype(int)

# Per-collection counts computed by ut.count_unique_with_none on each list-valued column.
unique_count_columns = {
    'Num Languages': 'languages',
    'Num Creators': 'creators',
    'Num Tasks': 'tasks',
    'Num Locations': 'locations',
    'Num Sources': 'source',
    'Num Topics': 'topics',
}
for target, column in unique_count_columns.items():
    dat[target] = by_collection[column].apply(ut.count_unique_with_none).fillna(0).astype(int)

# A single release year is shown as-is; collections spanning several years get 'Mult.'.
dat['Year'] = by_collection['year'] \
    .apply(lambda s: s.iloc[0] if len(set(s)) == 1 else 'Mult.') \
    .astype(str)


def membership_flags(column, label):
    """Return, per collection, a LaTeX check mark if `label` occurs in any of the
    collection's `column` lists, and a blank placeholder macro otherwise."""
    return by_collection[column] \
        .apply(lambda s: list(set([y for x in s for y in x]))) \
        .apply(lambda s: label in s) \
        .map({True: r'\greencheck', False: r'\emojiblank'})


dat['US'] = membership_flags('locations', 'United States')
dat['Academic'] = membership_flags('creator_categories', 'Academia')
dat['Industry'] = membership_flags('creator_categories', 'Industry')

# Distinct license names per collection -> distinct license classes -> rendered cell content.
dat['Use'] = by_collection['licenses'] \
    .apply(lambda s: list(set([y for x in s for y in x]))) \
    .apply(lambda s: list(set([license_classes[v] for v in s]))) \
    .apply(ut.color_license_classes)

## Format for LaTeX output

In [8]:
# Columns shown in the LaTeX table, in display order.
# 'Num Datasets' and 'Num Locations' are not selected.
table_columns = [
    'Year',

    'Num Hours',
    'Num Speakers',
    'Num Languages',
    'Num Creators',
    'Num Tasks',
    'Num Sources',
    'Num Topics',

    'US',
    'Academic',
    'Industry',

    'Use',
]

dat = dat[table_columns].sort_values('Year')

In [9]:
# Header abbreviations grouped by the top-level header they appear under.
header_groups = {
    '': {
        'Year': 'Year',
    },
    'Property Counts': {
        # 'Num Datasets': 'Dset',
        'Num Hours': 'Hr',
        'Num Speakers': 'Spkr',
        'Num Languages': 'Lang',
        'Num Creators': 'Creat',
        'Num Tasks': 'Tasks',
        # 'Num Locations': 'Loc',
        'Num Sources': 'Src',
        'Num Topics': 'Top',
    },
    'Category': {
        'US': 'US',
        'Academic': 'Ac',
        'Industry': 'Ind',
    },
    'Perm': {
        'Use': 'Use',
    },
}

# Flat lookup: column name -> (top-level header, abbreviation).
column_mapping = {
    name: (group, abbrev)
    for group, members in header_groups.items()
    for name, abbrev in members.items()
}

# Two-level header; both levels are wrapped in \textsc, the lower one also in \thead.
dat.columns = pd.MultiIndex.from_tuples([
    (
        r'\textsc{%s}' % column_mapping[name][0],
        r'\textsc{\thead{%s}}' % column_mapping[name][1],
    )
    for name in dat.columns
])

dat.index.name = r'\textsc{' + dat.index.name + r'}'

def color_map(value, cmap='BrBG', vmin=None, vmax=None):
    """Return the colormap color for `value` as an 'R,G,B' string of 0-255 integers.

    `value` is normalized with `vmin`/`vmax` before the lookup in the
    matplotlib colormap named `cmap`; the alpha channel is discarded.
    """
    normalizer = mcolors.Normalize(vmin=vmin, vmax=vmax)
    palette = mp.colormaps[cmap]
    rgb = palette(normalizer(value))[:3]
    # Scale each channel from [0, 1] to an integer and emit it as CSV.
    return ','.join(str(int(255 * channel)) for channel in rgb)

def latex_color_name(row, col):
    """Build the color name for one cell from its row label and (two-level) column header."""
    row_color_name = row.replace(' ', '') \
        .replace(r'\textsc{', '').replace('}', '')
    col_color_name = '_'.join(col).replace(' ', '') \
        .replace(r'\textsc{\thead{', '').replace('}}', '') \
        .replace(r'\textsc{', '').replace('}', '')
    return f"color{row_color_name}{col_color_name}"


color_defs = []       # one \definecolor line per colored cell
formatters = {}       # column -> cell formatter passed to the Styler
tmp_val_color = {}    # column -> {cell value -> color name}
num_cols = [c for c in dat.columns if 'Property Counts' in c[0]]

for col in num_cols:
    tmp_val_color[col] = {}

    # Work on a log scale (the 1e-6 offset keeps log finite for zero counts) and
    # shift it so the midpoint of the column's log-range sits at 0.
    vmin = np.log(dat[col].min() + 1e-6)
    vmax = np.log(dat[col].max() + 1e-6)
    midpt = (vmax + vmin) / 2
    vmin, vmax = vmin - midpt, vmax - midpt

    for row in dat.index:
        value = np.log(dat.loc[row, col] + 1e-6)
        value -= midpt

        if pd.notnull(value):
            color_name = latex_color_name(row, col)

            # Dividing by 4 keeps the normalized value close to the middle of the colormap.
            rgb = color_map(value / 4, vmin=vmin, vmax=vmax)
            color_defs.append(f"\\definecolor{{{color_name}}}{{RGB}}{{{rgb}}}\n")

            # Colors are looked up by cell value; equal values in a column share
            # the most recently defined name.
            tmp_val_color[col][dat.loc[row, col]] = color_name

    def func(v, col=col):
        """Format one cell: '-' for missing or zero, otherwise a colored count
        (values of 1000 or more are shown in thousands with a 'k' suffix)."""
        # Missing values never receive a color entry above, so they must be
        # handled before the lookup instead of raising KeyError.
        if pd.isnull(v) or v == 0:
            return '-'

        color_name = tmp_val_color[col][v]
        if v >= 1000:
            return f'\\cellcolor{{{color_name}}}{{{v / 1000:,.0f}k}}'
        return f'\\cellcolor{{{color_name}}}{{{v:,.0f}}}'

    formatters[col] = func

color_def = ''.join(color_defs)

In [10]:
# Full caption (printed with the table) and short caption.
# NOTE(review): the caption refers to \textsc{Topics}, while the column header
# abbreviation used for topics is 'Top' - confirm which one is intended.
full_caption = r'''
    \textbf{Audio collections and properties}. Collection properties include numbers of audio hours (\textsc{HR}), speakers (\textsc{SPKR}), languages (\textsc{Lang}), creator institutions (\textsc{Creat}), tasks (\textsc{Tasks}), data sources (\textsc{Src}), and topics (\textsc{Topics}). The number of datasets is not listed because all collections include only one dataset, except for M2ASR which has four. The \textsc{US} column indicates datasets from or partly from the United States, the \textsc{Ac} column datasets created by academic institutions, and the \textsc{Ind} column datasets created by industry. Note that a dataset can have all of these, none of them, or any combination of them. The \textsc{Use} column indicates whether a collection includes data freely usable even for commercial purposes (\protect\CommercialDataCircle), data usable only for noncommercial purposes or academic research (\protect\NCDataCircle) and data whose license status is not specified precisely enough to allow us to determine commercial use permissions (\protect\UnspecifiedDataCircle). Note that each collection may have different datasets with one, two, or all three of these statuses. Datasets are sorted chronologically to highlight trends over time.
    '''.strip()
short_caption = r'\textbf{Audio collections and properties}'

# Keyword arguments forwarded to Styler.to_latex.
kwargs = dict(
    environment='longtable',

    label='tab:collections-audio',
    column_format='lc|ccccccc|ccc|c',
    multicol_align='c',

    caption=(full_caption, short_caption),

    hrules=True,
    convert_css=True,
)

# The collection name becomes a regular column; the positional index is hidden.
styled = dat.reset_index().style.hide().format(formatter=formatters)
latex = styled.to_latex(**kwargs)

# Emit the column-spacing tweak, the color definitions and the table, one after another.
print(r'\setlength{\tabcolsep}{1.9pt}', color_def, latex, sep='\n')

\setlength{\tabcolsep}{1.9pt}
\definecolor{colorTIMITPropertyCounts_Hr}{RGB}{244,229,189}
\definecolor{colorSwitchboardPropertyCounts_Hr}{RGB}{245,241,232}
\definecolor{colorAfricanAcc.FrenchPropertyCounts_Hr}{RGB}{245,235,206}
\definecolor{colorCSJPropertyCounts_Hr}{RGB}{245,244,242}
\definecolor{colorFisherPropertyCounts_Hr}{RGB}{236,243,242}
\definecolor{colorCSLU22Langs.PropertyCounts_Hr}{RGB}{245,238,220}
\definecolor{colorAMIPropertyCounts_Hr}{RGB}{245,239,222}
\definecolor{colorCSLU1.2PropertyCounts_Hr}{RGB}{245,235,208}
\definecolor{colorALLSSTARPropertyCounts_Hr}{RGB}{245,238,220}
\definecolor{colorTED-LIUM3PropertyCounts_Hr}{RGB}{245,243,238}
\definecolor{colorNSTNorwegianPropertyCounts_Hr}{RGB}{245,243,240}
\definecolor{colorNSTDanishPropertyCounts_Hr}{RGB}{245,243,240}
\definecolor{colorNSTSwedishPropertyCounts_Hr}{RGB}{245,242,234}
\definecolor{colorVystadialPropertyCounts_Hr}{RGB}{245,237,216}
\definecolor{colorTHCHS-30PropertyCounts_Hr}{RGB}{245,236,212}
\definecolor{col