In [1]:
import os
import json

import numpy as np
import pandas as pd

import utils as ut

In [2]:
# Every relative path used below ('notebooks/', 'data_summaries*/') is resolved
# from the working directory, so switch to the repository checkout first.
repo_root = os.path.expanduser('~/github/Data-Provenance-Collection')
os.chdir(repo_root)

# Prepare data

## List of datasets

In [3]:
# Read the collection list, give the columns used in the tables their display
# names, and index the frame by collection name.
dat = (
    pd.read_csv('notebooks/papers.csv')
    .rename(columns={'collection': 'Collection', 'cites': 'Cite', 'modality': 'Modality'})
    .set_index('Collection')
)

# Both a missing value and the '--' placeholder end up as an empty string.
cite_column = dat['Cite'].fillna('--')
dat['Cite'] = cite_column.replace('--', '')

In [4]:
def _summary_stems(directory, skip=()):
    """Return the names of the summary files in `directory`, without extension.

    Entries whose name starts with '_template' or appears in `skip` are ignored.
    Stems are whitespace-stripped for every directory, which matches how the
    keys of `summaries` are derived when the files are loaded.
    """
    return {
        os.path.splitext(name)[0].strip()
        for name in os.listdir(directory)
        if not name.startswith('_template') and name not in skip
    }


# Every summary key referenced by any collection ('|'-separated in the CSV).
keys = {key for key_list in dat['summary_keys'].str.split('|') for key in key_list}

ft_files = _summary_stems('data_summaries', skip=('video', 'audio'))
vd_files = _summary_stems('data_summaries-video')
ad_files = _summary_stems('data_summaries-speech')

# Fail early, and say which keys are the problem, if a collection references
# a summary that does not exist in any of the three directories.
missing_keys = keys - (ft_files | vd_files | ad_files)
assert not missing_keys, f'summary_keys without a matching summary file: {sorted(missing_keys)}'

In [5]:
# Distinct citation keys across all collections; each 'Cite' cell holds a
# '|'-separated list, and empty strings (no citation) are dropped.
unique_cites = {
    cite
    for cite_list in dat['Cite'].str.split('|')
    for cite in cite_list
    if cite != ''
}
allcites = list(unique_cites)

len(allcites)

284

## Dimension data

In [6]:
# Load every dataset summary into one dict keyed by the file name without its
# extension. The directories are read in this order, so if two of them contain
# the same key the later directory's content replaces the earlier one.
summaries = {}
for summary_dir in ('data_summaries', 'data_summaries-video', 'data_summaries-speech'):
    for file in os.listdir(summary_dir):
        if file.startswith('_template'):
            continue

        # Everything before the last dot, i.e. '.'.join(file.split('.')[:-1]).
        key = file.rpartition('.')[0].strip()

        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default text encoding, which is not UTF-8 everywhere.
        with open(os.path.join(summary_dir, file), 'rt', encoding='utf-8') as f:
            summaries[key] = json.load(f)

# Make tables

In [7]:
def _table_licenses(names):
    """Reduce one collection's license names to the short list printed in the table.

    'OpenAI' is dropped and 'Academic Research Purposes Only' is shortened to
    'Academic Only'. The result collapses to ['Various'] when the collection is
    already marked 'Various' or has more than three distinct names, and to
    ['Unspecified'] when nothing is left.
    """
    shown = [
        'Academic Only' if name == 'Academic Research Purposes Only' else name
        for name in names
        if name != 'OpenAI'
    ]
    # Shortening a name can create a duplicate; keep the first occurrence only
    # so the "more than three" rule counts distinct names.
    shown = list(dict.fromkeys(shown))

    if 'Various' in shown:
        return ['Various']
    if not shown:
        return ['Unspecified']
    if len(shown) > 3:
        return ['Various']
    return shown


# Gather the distinct license names of every dataset behind each collection.
# Every collection gets an entry, even when no license is recorded, so that it
# reaches the 'Unspecified' branch instead of turning into NaN further down.
license_names = {collection: set() for collection in dat.index}
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        for dataset in summaries[file].values():
            license_names[collection].update(lic['License'] for lic in dataset['Licenses'])

# Sort the names: set iteration order depends on the interpreter's hash seed,
# which would otherwise make the generated tables differ from run to run.
licenses = pd.Series(
    {collection: sorted(names, key=str) for collection, names in license_names.items()},
    dtype=object,
)

dat['License'] = licenses.apply(lambda s: [v for v in s if v != 'OpenAI' and v != 'OANC'])

license_table = dat[['License', 'Cite', 'Modality']].copy()
license_table['License'] = license_table['License'].apply(_table_licenses).str.join(', ')
license_table = license_table.rename(columns={'License': 'Licenses'})

# '|'-separated keys become a single \cite{a,b,...}; collections without any
# citation show '--' instead of an empty \cite{}.
license_table['Cite'] = license_table['Cite'].apply(lambda s: r'\cite{' + s.replace('|', ',') + '}')
license_table.loc[license_table['Cite'] == r'\cite{}', 'Cite'] = '--'

# One table per modality; the modality column itself is not printed.
audio_license_table = license_table.loc[license_table['Modality'] == 'audio'].drop('Modality', axis=1)
text_license_table = license_table.loc[license_table['Modality'] == 'finetune'].drop('Modality', axis=1)
video_license_table = license_table.loc[license_table['Modality'] == 'video'].drop('Modality', axis=1)

In [8]:
# Styler.to_latex options for the text (alignment-tuning) table.
kwargs = {
    'environment': 'longtable',
    
    'label': 'tab:refs-licenses-text',
    'column_format': 'p{5cm}|p{5cm}|p{5cm}',
    
    # (full caption, short caption); the short one is emitted as \caption[...].
    # The short caption previously read '(text}' with the closing parenthesis missing.
    'caption': (r'''
    \textbf{References and licenses for alignment-tuning (text)} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.
    '''.strip(), r'\textbf{References and licenses: alignment tuning (text)}'),
    
    'hrules': True,
    'convert_css': True,
}

# Sort by collection name, turn the index back into a regular column and hide
# the new positional index so only Collection / Licenses / Cite are printed.
print(text_license_table
    .sort_index()
    .reset_index()
    .style
    .hide()
    .to_latex(**kwargs))

\begin{longtable}{p{5cm}|p{5cm}|p{5cm}}
\caption[\textbf{References and licenses: alignment tuning (text}]{\textbf{References and licenses for alignment-tuning (text)} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.} \label{tab:refs-licenses-text} \\
\toprule
Collection & Licenses & Cite \\
\midrule
\endfirsthead
\caption[]{\textbf{References and licenses for alignment-tuning (text)} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.} \\
\toprule
Collection & Licenses & Cite \\
\midrule
\endhead
\midrule
\multicolumn{3}{r}{Contin

In [9]:
# Full caption printed with the table, and the short form emitted as \caption[...].
audio_caption = r'''
    \textbf{References and licenses for audio} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.
    '''.strip()
audio_short_caption = r'\textbf{References and licenses: audio}'

# Styler.to_latex options for the audio table.
kwargs = dict(
    environment='longtable',
    label='tab:refs-licenses-audio',
    column_format='p{5cm}|p{5cm}|p{5cm}',
    caption=(audio_caption, audio_short_caption),
    hrules=True,
    convert_css=True,
)

# Alphabetical by collection, with the collection name as a regular column and
# the positional index hidden.
audio_styler = audio_license_table.sort_index().reset_index().style.hide()
print(audio_styler.to_latex(**kwargs))

\begin{longtable}{p{5cm}|p{5cm}|p{5cm}}
\caption[\textbf{References and licenses: audio}]{\textbf{References and licenses for audio} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.} \label{tab:refs-licenses-audio} \\
\toprule
Collection & Licenses & Cite \\
\midrule
\endfirsthead
\caption[]{\textbf{References and licenses for audio} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.} \\
\toprule
Collection & Licenses & Cite \\
\midrule
\endhead
\midrule
\multicolumn{3}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\

In [10]:
# Full caption printed with the table, and the short form emitted as \caption[...].
video_caption = r'''
    \textbf{References and licenses for video} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.
    '''.strip()
video_short_caption = r'\textbf{References and licenses: video}'

# Styler.to_latex options for the video table.
kwargs = dict(
    environment='longtable',
    label='tab:refs-licenses-video',
    column_format='p{5cm}|p{5cm}|p{5cm}',
    caption=(video_caption, video_short_caption),
    hrules=True,
    convert_css=True,
)

# Alphabetical by collection, with the collection name as a regular column and
# the positional index hidden.
video_styler = video_license_table.sort_index().reset_index().style.hide()
print(video_styler.to_latex(**kwargs))

\begin{longtable}{p{5cm}|p{5cm}|p{5cm}}
\caption[\textbf{References and licenses: video}]{\textbf{References and licenses for video} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.} \label{tab:refs-licenses-video} \\
\toprule
Collection & Licenses & Cite \\
\midrule
\endfirsthead
\caption[]{\textbf{References and licenses for video} dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. Datasets are sorted alphabetically for ease of dataset lookup.} \\
\toprule
Collection & Licenses & Cite \\
\midrule
\endhead
\midrule
\multicolumn{3}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\