In [1]:
import os
import json

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

In [2]:
os.chdir(os.path.expanduser('~/github/Data-Provenance-Collection'))

# Prepare data

## Utils

In [3]:
def count_unique_with_none(s):
    all_vals = []
    for lst in s:
        if lst is None:
            pass
        else:
            all_vals += lst
    
    return len(set(all_vals))

## Datasets and some metadata

In [4]:
dat = pd.DataFrame([
    ('Anthropic HH', 'Anthropic HH-RLHF', 'bai2022training, gangulired'),
    ('Dolly 15k', 'Dolly 15k', 'dolly15k_2023'),
    ('OpenAssistant', 'Open Assistant', 'kopf2023openassistant'),
    ('Flan Collection', 'Flan Collection (Chain-of-Thought)|Flan Collection (Dialog)|Flan Collection (Flan 2021)|Flan Collection (P3)|Flan Collection (Super-NaturalInstructions)', 'longpre2023flan'),
    ('xP3x', 'xP3x', 'muennighoff2022crosslingual'),
    ('Tasksource Ins.', 'Tasksource Instruct', 'sileo2023tasksource'),
    ('LAION OIG', 'OIG', 'oig2023'),
    ('SHP', 'Stanford Human Preferences', 'SHP'),
    ('ShareGPT', 'ShareGPT Vicuna', r'sharegpt'),
    ('Self-Instruct', 'Self-Instruct', 'selfinstruct2022'),
    ('WebGPT', 'OpenAI (WebGPT)', 'nakano2021webgpt'),
    ('OpenAI Summ.', 'OpenAI (Summarize from Feedback)', 'stienon2020learning'),
    ('Airoboros', 'Airoboros', 'Durbin2023Airoboros'),
    ('Alpaca', 'Alpaca', 'alpaca'),
    ('BaizeChat', 'Baize Chat Data', 'xu2023baize'),
    ('BookSum', 'Book Summaries', 'kryscinski2022booksum'),
    ('CamelAI Sci.', 'Camel-AI Science', 'li2023camel'),
    ('CoT Coll.', 'CoT Collection', 'kim2023cot'),
    ('Code Alpaca', 'Code Alpaca', ''),
    ('GPT-4-Alpaca', 'GPT-4-Alpaca', 'peng2023instruction'),
    ('GPTeacher', 'GPTeacher', ''),
    ('Gorilla', 'Gorilla', 'patil2023gorilla'),
    ('HC3', 'HC3 (Chinese)|HC3 (English)', 'guo2023close'),
    ('Joke Expl.', 'Joke Explanation',''),
    ('LIMA', 'LIMA', 'zhou2023lima'),
    ('Longform', 'Longform', 'koksal2023longform'),
    ('GPT4AllJ', 'NomicAI GPT4AllJ', 'gpt4all'),
    ('OpenOrca', 'Open Orca', 'mukherjee2023orca'),
    ('Tool-Llama', 'Tool-Llama', 'qin2023toolllm'),
    ('UltraChat', 'UltraChat', 'ding2023enhancing'),
    ('Unnatural Instr.', 'Unnatural Instructions', 'honovich2022unnatural'),
    ('Evol-Instr.', 'WizardLM Evol-Instruct|WizardLM Evol-Instruct V2', 'xu2023wizardlm'),
    ('StarCoder', 'StarCoder Self-Instruct', 'li2023starcoder'),
    ('TinyStories', 'Tiny Stories', 'eldan2023tinystories'),
    ('StackExchange', 'Stack Exchange Instruction', ''),
    ('Tasksource ST', 'Tasksource Symbol-Tuning', 'weston2015aicomplete'),
    ('CommitPackFT', 'CommitPackFT', 'muennighoff2023octopack'),
    ('OpAsst OctoPack', 'Open Assistant OctoPack', 'muennighoff2023octopack'),
], columns=['Collection', 'summary_keys', 'Cite']).set_index('Collection')

In [5]:
sk = [y for x in dat['summary_keys'].str.split('|').tolist() for y in x]

files = [
    os.path.splitext(f)[0]
    for f in os.listdir('data_summaries')
    if not f.startswith('_template')
]

assert len(set(sk)) == len(sk)
assert set(sk) <= set(files)

## Other supporting data

In [6]:
tmp = []
short_names = pd.DataFrame(dat['summary_keys'].str.split('|')).to_records().tolist()
for short, keys in short_names:
    for key in keys:
        tmp += [(short, key)]
short_names = pd.DataFrame(tmp, columns=['short_name', 'summary_key']).set_index('summary_key')

In [7]:
summaries = {}
for file in os.listdir('data_summaries'):
    if file.startswith('_template'):
        continue
    
    with open(os.path.join('data_summaries', file), 'rt') as f:
        summaries[file.split('.')[0]] = json.load(f)

In [8]:
with open('constants/domain_groups.json', 'rt') as f:
    domain_groups = json.load(f)
    domain_groups = {
        v: k
        for k, vs in domain_groups.items()
        for v in vs
    }

In [9]:
with open('constants/task_groups.json', 'rt') as f:
    task_groups = json.load(f)
    task_groups = {
        v: k
        for k, vs in task_groups.items()
        for v in vs
    }

In [10]:
with open('constants/license_classes.json', 'rt') as f:
    license_classes = json.load(f)
    license_classes = {k : v[-1] for k, v in license_classes.items()}
    license_classes['Custom'] = 'Unspecified'

In [11]:
hf_downloads_new = pd.read_csv('src/summary-tables/hf_downloads.csv', sep='\t')

## Licenses

In [12]:
licenses = {}
for k in summaries.keys():
    for ds in summaries[k].keys():
        for lic in summaries[k][ds]['Licenses']:
            licenses[k] = licenses.get(k, []) + [lic['License']]
licenses = pd.Series({k : list(set(v)) for k, v in licenses.items()})

tmp = short_names.copy()
tmp['licenses'] = licenses
dat['License'] = tmp.groupby('short_name')['licenses'].apply(lambda s: list(set([y for x in s for y in x])))

license_table = dat[['License', 'Cite']].copy()

In [13]:
def color_license_classes(s):
    assert len(s) > 0
    
    ret = []

    if 'All' in s:
        ret += [r'\CommercialDataCircle']
    else:
        ret += [r'\TransparentCircle']
    
    if 'Unspecified' in s:
        ret += [r'\UnspecifiedDataCircle']
    else:
        ret += [r'\TransparentCircle']
    
    if 'Acad' in s or 'NC' in s:
        ret += [r'\NCDataCircle']
    else:
        ret += [r'\TransparentCircle']
    
    return ' '.join(ret)

dat['Use'] = dat['License'].apply(lambda s: list(set([license_classes[v] for v in s])))
dat['Use'] = dat['Use'].apply(color_license_classes)

In [14]:
dat.drop(['License', 'Cite'], axis=1, inplace=True)

## Property counts and text lens

In [15]:
raw = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        for k in summaries[file].keys():
            if 'Languages' in summaries[file][k].keys():
                langs = summaries[file][k]['Languages']
            else:
                langs = None
            
            metrics = summaries[file][k].get('Text Metrics', None)
            if metrics is None or metrics == '' or metrics == {}:
                num_dialogs = np.nan
                mean_inputs_length = np.nan
                mean_targets_length = np.nan
            else:
                num_dialogs = metrics['Num Dialogs']
                mean_inputs_length = metrics['Mean Inputs Length']
                mean_targets_length = metrics['Mean Targets Length']
                
            if 'Text Sources' not in summaries[file][k].keys():
                domains = None
            elif not isinstance(summaries[file][k]['Text Sources'], (list, tuple)):
                domains = None
            else:
                domains = summaries[file][k]['Text Sources']
                domains = [domain_groups[d] for d in domains]

            if 'Task Categories' not in summaries[file][k].keys():
                tasks = None
            elif not isinstance(summaries[file][k]['Task Categories'], (list, tuple)):
                tasks = None
            else:
                tasks = summaries[file][k]['Task Categories']
                tasks = [task_groups[d] for d in tasks]

            inf_metadata = summaries[file][k].get('Inferred Metadata', None)
            if inf_metadata is None or inf_metadata == '' or inf_metadata == {}:
                topics = np.nan
            else:
                if 'Text Topics' not in inf_metadata.keys():
                    topics = None
                elif not isinstance(inf_metadata['Text Topics'], (list, tuple)):
                    topics = None
                else:
                    topics = inf_metadata['Text Topics']

            raw += [{
                'collection': collection,
                'summary_key': file,
                'sub': k,

                'num_dialogs': num_dialogs,
                'mean_inputs_length': mean_inputs_length,
                'mean_targets_length': mean_targets_length,

                'langs': langs,
                'topics': topics,
                'domains': domains,
                'tasks': tasks,
                'datasets': 1,
            }]
raw = pd.DataFrame(raw)

total_input_length = raw['num_dialogs'] * raw['mean_inputs_length']
total_targets_length = raw['num_dialogs'] * raw['mean_targets_length']

num_dialogs = raw.groupby('collection')['num_dialogs'].sum()
mean_inputs_length = total_input_length.groupby(raw['collection']).sum() / num_dialogs
mean_targets_length = total_targets_length.groupby(raw['collection']).sum() / num_dialogs

num_langs = raw.groupby('collection')['langs'].apply(count_unique_with_none)
num_topics = raw.groupby('collection')['topics'].apply(count_unique_with_none)
num_domains = raw.groupby('collection')['domains'].apply(count_unique_with_none)
num_tasks = raw.groupby('collection')['tasks'].apply(count_unique_with_none)
num_datasets = raw.groupby('collection')['datasets'].sum()

dat['Num Langs'] = num_langs.fillna(0).astype(int)
dat['Num Dialogs'] = num_dialogs.fillna(0).astype(int)
dat['Mean Inputs Length'] = mean_inputs_length.fillna(0).astype(int)
dat['Mean Targets Length'] = mean_targets_length.fillna(0).astype(int)
dat['Num Topics'] = num_topics.fillna(0).astype(int)
dat['Num Datasets'] = num_datasets.fillna(0).astype(int)
dat['Num Tasks'] = num_tasks.fillna(0).astype(int)
dat['Num Domains'] = num_domains.fillna(0).astype(int).apply(lambda s: max(s, 1))

In [16]:
num_downs = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        if file in hf_downloads_new['Collection'].tolist():
            downs = hf_downloads_new.loc[hf_downloads_new['Collection'] == file, 'sum HF Downloads (October 2023)'].item()
        else:
            downs = np.nan

        num_downs += [(file, downs)]

downs = short_names.copy().rename({'short_name': 'collection'}, axis=1)
downs['downs'] = pd.Series(dict(num_downs))
num_downs = downs.groupby('collection').sum()
dat['Num Downs'] = num_downs.fillna(0).astype(int)

## Source

In [17]:
mgen = {}
for k in summaries.keys():
    for ds in summaries[k].keys():
        models = summaries[k][ds]['Model Generated']
        models = [m for m in models if m != '']
        mgen[k] = mgen.get(k, []) + [len(models) > 0]
mgen = pd.Series({k : list(set(v)) for k, v in mgen.items()})
tmp = short_names.copy()
tmp['mgen'] = mgen
dat['Source'] = tmp.groupby('short_name')['mgen'] \
    .agg(lambda x: [item for sublist in x for item in sublist]) \
    .apply(lambda s: 1 - sum(s) / len(s))

dat.loc[(dat['Source'] > 0) & (dat['Source'] < 1), 'Source'] = r'\emoji{globe-with-meridians}\emoji{robot}'
dat.loc[dat['Source'] == 1, 'Source'] = r'\emoji{globe-with-meridians}\emojiblank'
dat.loc[dat['Source'] == 0, 'Source'] = r'\emojiblank\emoji{robot}'

  dat.loc[(dat['Source'] > 0) & (dat['Source'] < 1), 'Source'] = r'\emoji{globe-with-meridians}\emoji{robot}'


## Format

In [18]:
formats_map = {
    'Chain-of-Thought': 'CT',
    'Few-shot': 'FS',
    'Multi-turn Dialog': 'MD',
    'Response Ranking': 'RR',
    'Zero-shot': 'ZS',
}

In [19]:
found_formats = [
    summaries[file][k]['Format']
    for file in summaries.keys()
    for k in summaries[file].keys()
]
found_formats = set([y for x in found_formats for y in x])
# assert found_formats <= set(formats_map.keys())

fmts = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        for k in summaries[file].keys():
            tmp_fmts = summaries[file][k].get('Format', [])
            tmp_fmts = [
                formats_map[f]
                for f in tmp_fmts
                if f in formats_map.keys()
            ]
            
            fmts += [{
                'collection': collection,
                'summary_key': file,
                'sub': k,
                
                'formats': tmp_fmts,
            }]
fmts = pd.DataFrame(fmts)
fmts = fmts.groupby('collection')['formats'] \
           .apply(lambda s: list(set([y for x in s for y in x]))) \
           .rename('formats')

for fmt in formats_map.values():
    dat[fmt] = fmts.apply(lambda s: fmt in s) \
        .replace(True, r'\greencheck') \
        .replace(False, r'\emojiblank')

## Format for LaTeX output

In [20]:
dat = dat[[
    'Num Datasets',
    'Num Dialogs',
    'Num Tasks',
    'Num Langs',
    'Num Topics',
    'Num Domains',
    'Num Downs',
    
    'Mean Inputs Length',
    'Mean Targets Length',
    
    'Source',
    
    'ZS',
    'FS',
    'CT',
    'RR',
    'MD',
    
    'Use',
]]

In [21]:
column_mapping = {
    'Num Datasets': ('Property Counts', 'Datasets'),
    'Num Dialogs': ('Property Counts', 'Dialogs'),
    'Num Tasks': ('Property Counts', 'Tasks'),
    'Num Langs': ('Property Counts', 'Langs'),
    'Num Topics': ('Property Counts', 'Topics'),
    'Num Cites': ('Property Counts', 'Cites'),
    'Num Downs': ('Property Counts', 'Downs'),
    'Num Domains': ('Property Counts', 'Domains'),
    'Mean Inputs Length': ('Text Lens', 'Inpt'),
    'Mean Targets Length': ('Text Lens', 'Tgt'),
    'Source': ('Dataset Types', 'Source'),

    'CT': ('Dataset Types', 'C'),
    'ZS': ('Dataset Types', 'Z'),
    'RR': ('Dataset Types', 'R'),
    'MD': ('Dataset Types', 'M'),
    'FS': ('Dataset Types', 'F'),
    
    'Use': ('Dataset Types', 'Use'),
}

dat.columns = pd.MultiIndex.from_arrays([
    [column_mapping[col][0] for col in dat.columns],
    [column_mapping[col][1] for col in dat.columns]
])

dat.columns = pd.MultiIndex.from_tuples([
    (r'\textsc{' + c[0] + r'}', r'\textsc{\thead{' + c[1] + r'}}')
    for c in dat.columns
])

dat.index.name = r'\textsc{' + dat.index.name + r'}'

def color_map(value, cmap='BrBG', vmin=None, vmax=None):
    norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
    colormap = mp.colormaps[cmap]
    color = [int(255*x) for x in colormap(norm(value))[:3]]
    return ','.join(map(str, color))  # Convert the color to a CSV string

color_def = ''
formatters = {}
tmp_val_color = {}
num_cols = [
    c
    for c in dat.columns
    if (
        'Property Counts' in c[0] or
        'Text Lens' in c[0]
    )
]

for col in num_cols:
    tmp_val_color[col] = {}
    
    vmin = np.log(dat[col].min() + 1e-6)
    vmax = np.log(dat[col].max() + 1e-6)
    midpt = (vmax + vmin) / 2
    vmin, vmax = vmin - midpt, vmax - midpt
    
    for row in dat.index:
        value = np.log(dat.loc[row, col] + 1e-6)
        value -= midpt
        
        if pd.notnull(value):
            row_color_name = row.replace(' ', '') \
                .replace(r'\textsc{', '').replace('}', '')
            col_color_name = '_'.join(col).replace(' ', '') \
                .replace(r'\textsc{\thead{', '').replace('}}', '') \
                .replace(r'\textsc{', '').replace('}', '')
            
            color_name = f"color{row_color_name}{col_color_name}"
            
            color_def += f"\\definecolor{{{color_name}}}{{RGB}}{{{color_map(value / 4, vmin=vmin, vmax=vmax)}}}\n"
            
            tmp_val_color[col][dat.loc[row, col]] = color_name

    if 'Dialogs' in col[1] or 'Downs' in col[1]:
        def func(v, col=col):
            color_name = tmp_val_color[col][v]
    
            if col in num_cols and v >= 1000:
                v /= 1000
                return f'\\cellcolor{{{color_name}}}{{{v:,.0f}k}}' if pd.notnull(v) else '-'
            elif col in num_cols and v == 0:
                return '-'
            else:
                return f'\\cellcolor{{{color_name}}}{{<1k}}' if pd.notnull(v) else '-'
    else:
        def func(v, col=col):
            color_name = tmp_val_color[col][v]
    
            if col in num_cols and v >= 1000:
                v /= 1000
                return f'\\cellcolor{{{color_name}}}{{{v:,.0f}k}}' if pd.notnull(v) else '-'
            elif col in num_cols and v == 0:
                return '-'
            else:
                return f'\\cellcolor{{{color_name}}}{{{v:,.0f}}}' if pd.notnull(v) else '-'
    
    formatters[col] = func

# Main table

In [22]:
kwargs = {
    'environment': 'table*',
    'label': 'tab:collections',
    'position_float': 'centering',
    'column_format': 'l|ccccccc|rr|cp{0.3cm}p{0.3cm}p{0.3cm}p{0.3cm}p{0.3cm}c',
    'multicol_align': 'c',
    
    'caption': r'''
    \textbf{Alignment tuning collections and their characteristics.} Properties of the collections include the numbers of datasets, dialogs, unique tasks, languages, topics, text domains, Huggingface monthly downloads (“Downs”), and the average length of input and target text, by characters. The \textsc{Source} column indicates whether a collection includes human web text (\emoji{globe-with-meridians}), or model generated text (\emoji{robot}). The dialog formats of each collection can be: zero-shot (Z), few-shot (F), chain-of-thought (C), response ranking (R), and multi-turn dialog (M). The \textsc{Use} column indicates whether a collection includes data licensed for commercial use (\protect\CommercialDataCircle), data with no license (“unspecified”: \protect\UnspecifiedDataCircle), data only licensed for non-commercial or academic use (\protect\NCDataCircle). \emph{Note that these licenses are self-reported and their applicability is complicated, requiring legal consultation.} The ``O'' column indicates if the collection includes OpenAI model generations, which may or may not affect commercial viability (see \cref{llm-generation}).
    '''.strip(),
    
    'hrules': True,
    'convert_css': True,
}

latex = dat \
    .sort_index() \
    .style \
    .format(formatter=formatters) \
    .to_latex(**kwargs)

print('\n'.join([
    r'\setlength{\tabcolsep}{1.9pt}',
    color_def,
    latex,
]))

\setlength{\tabcolsep}{1.9pt}
\definecolor{colorAnthropicHHPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorDolly15kPropertyCounts_Datasets}{RGB}{245,239,222}
\definecolor{colorOpenAssistantPropertyCounts_Datasets}{RGB}{245,244,242}
\definecolor{colorFlanCollectionPropertyCounts_Datasets}{RGB}{182,227,220}
\definecolor{colorxP3xPropertyCounts_Datasets}{RGB}{179,226,219}
\definecolor{colorTasksourceIns.PropertyCounts_Datasets}{RGB}{196,232,227}
\definecolor{colorLAIONOIGPropertyCounts_Datasets}{RGB}{242,244,244}
\definecolor{colorSHPPropertyCounts_Datasets}{RGB}{245,244,242}
\definecolor{colorShareGPTPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorSelf-InstructPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorWebGPTPropertyCounts_Datasets}{RGB}{245,237,214}
\definecolor{colorOpenAISumm.PropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorAiroborosPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorAlpacaPropertyCounts_Datasets}{RGB}{2

# Appendix license/cite table

In [23]:
license_table['OpenAI NC'] = license_table['License'] \
    .apply(lambda s: 'OpenAI' in s) \
    .replace(True, r'\redcross') \
    .replace(False, r'\emojiblank')

license_table['License'] = license_table['License'].apply(lambda s: [v for v in s if v != 'OpenAI'])
license_table['License'] = license_table['License'].apply(lambda s: [v if v != 'Academic Research Purposes Only' else 'Academic Only' for v in s])
license_table['License'] = license_table['License'].apply(lambda s: s if 'Various' not in s else ['Various'])
license_table['License'] = license_table['License'].apply(lambda s: s if len(s) > 0 else ['Unspecified'])
license_table['License'] = license_table['License'].apply(lambda s: s if len(s) <= 3 else ['Various'])

In [24]:
license_table = license_table[['OpenAI NC', 'Cite', 'License']]

In [25]:
license_table['Cite'] = license_table['Cite'].apply(lambda s: r'\citet{' + s + '}')
license_table.loc[license_table['Cite'] == r'\citet{}', 'Cite'] = '--'

license_table['License'] = license_table['License'].str.join(', ')

In [26]:
license_table.rename({'License': 'Licenses'}, axis=1, inplace=True)

In [27]:
kwargs = {
    'environment': 'table*',
    'label': 'tab:licenses',
    'position_float': 'centering',
    'column_format': 'l|clp{5.5cm}',
    
    'caption': r'''
    \textbf{Licenses and citations} for the dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ''Various`` licenses, and we refer readers to our raw data for the full details. Datasets which were generated with the use of OpenAI APIs and implicate those APIs' noncompetition restrictions are marked with a red X (\redcross) in the ``OpenAI NC'' column.
    '''.strip(),
    
    'hrules': True,
    'convert_css': True,
}

latex = license_table \
    .sort_index() \
    .style \
    .to_latex(**kwargs)

print(latex)

\begin{table*}
\centering
\caption{\textbf{Licenses and citations} for the dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ''Various`` licenses, and we refer readers to our raw data for the full details. Datasets which were generated with the use of OpenAI APIs and implicate those APIs' noncompetition restrictions are marked with a red X (\redcross) in the ``OpenAI NC'' column.}
\label{tab:licenses}
\begin{tabular}{l|clp{5.5cm}}
\toprule
 & OpenAI NC & Cite & Licenses \\
Collection &  &  &  \\
\midrule
Airoboros & \redcross & \citet{Durbin2023Airoboros} & Various \\
Alpaca & \redcross & \citet{alpaca} & CC BY-NC 4.0 \\
Anthropic HH & \emojiblank & \citet{bai2022training, gangulired} & MIT License \\
BaizeChat & \redcross & \citet{xu2023baize} & CC BY-NC 4.0 \\
BookSum & \emojiblank & \citet{kryscinski2022booksum} & Academic Only \\
CamelAI Sci. & \redcross & \citet{li2023camel} & CC BY-NC 4.0 \\
C