In [None]:
import os
import json

import numpy as np
import pandas as pd

import utils as ut

In [None]:
# Work from the repository checkout under the user's home directory; the
# relative paths used by the following cells are resolved against it.
repo_root = os.path.expanduser('~/github/Data-Provenance-Collection')
os.chdir(repo_root)

# Prepare data

## List of datasets

In [None]:
# Per-collection paper metadata, indexed by collection name.
# The 'modality' column is dropped; missing citations become empty strings.
dat = (
    pd.read_csv('notebooks/papers.csv')
    .rename(columns={'collection': 'Collection', 'cites': 'Cite'})
    .drop(columns='modality')
    .set_index('Collection')
    .assign(Cite=lambda frame: frame['Cite'].fillna(''))
)

In [None]:
# Flatten the '|'-separated `summary_keys` column into one list of keys.
files = [key for keys in dat['summary_keys'].str.split('|').tolist() for key in keys]

# Summary files present on disk (extension stripped, template files ignored).
available = {
    os.path.splitext(f)[0]
    for f in os.listdir('data_summaries')
    if not f.startswith('_template')
}

# Every key referenced by a collection must have a matching summary file;
# report the offending keys so a failure is actionable.
missing = sorted(set(files) - available)
assert not missing, f'summary_keys without a file in data_summaries/: {missing}'

In [None]:
# Lookup table from each summary key to the collection (index of `dat`) that
# lists it: one row per (collection, key) pair, indexed by 'summary_key'.
key_pairs = [
    (collection, key)
    for collection, keys in dat['summary_keys'].str.split('|').items()
    for key in keys
]
short_names = pd.DataFrame(key_pairs, columns=['short_name', 'summary_key']).set_index('summary_key')

## Dimension data

In [None]:
# Load every summary file in data_summaries/ (template files excluded) into a
# dict keyed by the file name without its extension.
summaries = {}
for file in sorted(os.listdir('data_summaries')):  # sorted: os.listdir order is arbitrary
    if file.startswith('_template'):
        continue

    # splitext (rather than split('.')) keeps the full stem of names that
    # contain extra dots, so keys agree with the ones validated against
    # `summary_keys` earlier in the notebook.
    key = os.path.splitext(file)[0]

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(os.path.join('data_summaries', file), 'rt', encoding='utf-8') as f:
        summaries[key] = json.load(f)

# Make tables

In [None]:
# Distinct license names per summary file. Every summary gets an entry -- an
# empty list when none of its datasets lists a license -- so the per-collection
# aggregation below never meets a missing (NaN) value. Names are sorted (by
# their string form) so the result does not depend on set iteration order.
licenses = pd.Series({
    key: sorted(
        {lic['License'] for ds in summary.values() for lic in ds['Licenses']},
        key=str,
    )
    for key, summary in summaries.items()
})

# Attach the license lists to the summary-key -> collection lookup, then merge
# them into one de-duplicated, sorted list per collection.
tmp = short_names.copy()
tmp['licenses'] = licenses
dat['License'] = (
    tmp.groupby('short_name')['licenses']
    .apply(lambda per_key: sorted({name for names in per_key for name in names}, key=str))
)

# 'OpenAI' and 'OANC' are left out of the reported licenses.
# NOTE(review): the reason for excluding them is not visible in this notebook.
dat['License'] = dat['License'].apply(lambda s: [v for v in s if v not in ('OpenAI', 'OANC')])


def _table_licenses(names):
    """Reduce one collection's license names to the entries shown in the table.

    Parameters
    ----------
    names : list
        License names of a single collection.

    Returns
    -------
    list
        ``['Various']`` if the names include 'Various' or there are more than
        three of them, ``['Unspecified']`` if there are none, otherwise the
        names with 'Academic Research Purposes Only' shortened to
        'Academic Only'.
    """
    shown = ['Academic Only' if v == 'Academic Research Purposes Only' else v for v in names]
    if 'Various' in shown or len(shown) > 3:
        return ['Various']
    if not shown:
        return ['Unspecified']
    return shown


# Table with one row per collection: comma-separated licenses and citation.
license_table = dat[['License', 'Cite']].copy()
license_table['License'] = license_table['License'].apply(_table_licenses).str.join(', ')
license_table = license_table.rename(columns={'License': 'Licenses'})

# '|'-separated citation keys become a single \cite{a,b,...}; collections
# without any citation show '--' instead of an empty \cite{}.
license_table['Cite'] = license_table['Cite'].apply(
    lambda s: r'\cite{' + s.replace('|', ',') + '}' if s else '--'
)

In [None]:
# Keyword arguments passed to Styler.to_latex for the license table.
kwargs = {
    'environment': 'longtable',

    'label': 'tab:licenses',
    'column_format': 'l|p{5cm}|p{5cm}',

    # (full caption, short caption). LaTeX quotations open with `` and close
    # with ''; the reverse order typesets the wrong glyphs.
    'caption': (r'''
    \textbf{References and licenses} for the dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details.
    '''.strip(), r'\textbf{References and licenses}'),

    'hrules': True,
    'convert_css': True,
}

# Sort by collection name, turn the index into a regular column so it is
# rendered as the first table column, and hide the new positional index.
latex = (
    license_table
    .sort_index()
    .reset_index()
    .style
    .hide()
    .to_latex(**kwargs)
)

# print (not rich display) so the LaTeX source can be copied verbatim.
print(latex)