In [1]:
import os
import json

from urllib.parse import urlparse

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

import utils as ut

In [2]:
# Work from the Data-Provenance-Collection checkout under the user's home
# directory; the cells below open files through paths relative to it.
os.chdir(os.path.join(os.path.expanduser('~'), 'notebooks', 'Data-Provenance-Collection'))

# Prepare data

## Dimension data

In [3]:
# The JSON file maps each group name to a list of members; flip it into a
# flat member -> group lookup. A member listed under several groups keeps
# the last group encountered.
with open('constants/domain_groups.json', 'rt') as f:
    domain_groups = dict(
        (member, group)
        for group, members in json.load(f).items()
        for member in members
    )

In [4]:
# The JSON file maps each group name to a list of members; flip it into a
# flat member -> group lookup. A member listed under several groups keeps
# the last group encountered.
with open('constants/task_groups.json', 'rt') as f:
    task_groups = dict(
        (member, group)
        for group, members in json.load(f).items()
        for member in members
    )

In [5]:
# Keep only the first element stored for each license name, then force the
# 'Custom' license into the 'Unspecified' class.
with open('constants/license_classes.json', 'rt') as f:
    license_classes = {
        **{name: info[0] for name, info in json.load(f).items()},
        'Custom': 'Unspecified',
    }

## Summary files

In [6]:
# Load every summary file, keyed by the part of its file name before the
# first dot. Template files and the 'audio' / 'video' entries are skipped.
summaries = {}
for filename in os.listdir('data_summaries'):
    is_skipped = filename.startswith('_template') or filename in ('audio', 'video')
    if not is_skipped:
        summary_path = os.path.join('data_summaries', filename)
        with open(summary_path, 'rt') as f:
            summaries[filename.split('.')[0]] = json.load(f)

## List of datasets

In [7]:
# Collection-level table: index by collection name, keep only the rows whose
# modality is 'finetune', and drop the columns not needed afterwards.
dat = (
    pd.read_csv('src/summary-tables/papers.csv')
    .rename(columns={'collection': 'Collection'})
    .set_index('Collection')
    .loc[lambda frame: frame['modality'] == 'finetune']
    .drop(columns=['modality', 'cites'])
)

In [8]:
# Flatten the pipe-separated summary keys of all collections into one list.
files = []
for key_list in dat['summary_keys'].str.split('|'):
    files.extend(key_list)

# Every summary key referenced by the table must have been loaded above.
assert set(files).issubset(summaries.keys())

### Datasets without entries in papers.csv

In [9]:
# Loaded summaries that no row of the collection table refers to.
set(summaries).difference(files)

{'Airoboros',
 'Alpaca',
 'Anthropic HH-RLHF',
 'Baize Chat Data',
 'Book Summaries',
 'Camel-AI Science',
 'CoT Collection',
 'Code Alpaca',
 'CommitPackFT',
 'Dolly 15k',
 'Flan Collection (Chain-of-Thought)',
 'Flan Collection (Dialog)',
 'Flan Collection (Flan 2021)',
 'Flan Collection (P3)',
 'Flan Collection (Super-NaturalInstructions)',
 'GPT-4-Alpaca',
 'GPTeacher',
 'Glaive Code Assistant v2',
 'Glaive Code Assistant v3',
 'Gorilla',
 'HC3 (Chinese)',
 'HC3 (English)',
 'Joke Explanation',
 'LIMA',
 'Longform',
 'Lumos Grounding',
 'Lumos Planning',
 'NomicAI GPT4AllJ',
 'OIG',
 'Open Assistant',
 'Open Assistant OctoPack',
 'Open Orca',
 'OpenAI (Summarize from Feedback)',
 'OpenAI (WebGPT)',
 'PII-Masking-200k',
 'Preference Collection',
 'Seacrowd',
 'Self-Instruct',
 'ShareGPT Vicuna',
 'Stack Exchange Instruction',
 'Stanford Human Preferences',
 'StarCoder Self-Instruct',
 'Tasksource Instruct',
 'Tasksource Symbol-Tuning',
 'Tiny Stories',
 'Tool-Llama',
 'ToxicChat',
 

# Make tables

In [10]:
def _dataset_year(entry):
    """Return the year of one dataset entry as a string, or None.

    The year is read from the arXiv identifier at the end of the entry's
    'ArXiv URL' when that URL points at arxiv.org; otherwise (or when the
    identifier does not start with two digits) it falls back to the year of
    entry['Inferred Metadata']['HF Date'], if that metadata is present.
    """
    arxiv_url = entry['ArXiv URL']
    if 'arxiv.org' in arxiv_url:
        # Strip a trailing slash first: otherwise the last path segment is
        # empty and the "year" would come out as the bare string '20'.
        arxiv_id = urlparse(arxiv_url).path.rstrip('/').split('/')[-1]
        yy = arxiv_id[:2]
        if yy.isdigit():
            # arXiv identifiers start with YYMM; arXiv began in 1991, so
            # 91-99 belong to the 1990s and everything else to the 2000s.
            return ('19' if yy >= '91' else '20') + yy

    if 'Inferred Metadata' in entry:
        return entry['Inferred Metadata']['HF Date'].split('-')[0]

    return None


# Build one record per dataset, i.e. per sub-entry of every summary file
# that a collection lists in its pipe-separated 'summary_keys'.
dataset_level = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        for k, entry in summaries[file].items():
            # These fields are iterated below, so they must be present and
            # list-like.
            for field in ('Licenses', 'Languages', 'Text Sources', 'Task Categories'):
                assert field in entry, f'{file} / {k}: missing {field!r}'
                assert isinstance(entry[field], (list, tuple)), \
                    f'{file} / {k}: {field!r} is not a list'

            langs = entry['Languages']
            # Map raw text sources / task categories onto their groups.
            domains = [domain_groups[d] for d in entry['Text Sources']]
            tasks = [task_groups[d] for d in entry['Task Categories']]
            licenses = [lic['License'] for lic in entry['Licenses']]
            # True when 'Model Generated' holds at least one non-empty entry.
            mgen = bool(set(entry['Model Generated']) - {''})

            dataset_level.append({
                'collection': collection,
                'summary_key': file,
                'sub': k,

                # Constant 1 so that a per-collection sum counts datasets.
                'datasets': 1,
                'year': _dataset_year(entry),
                'langs': langs,
                'domains': domains,
                'tasks': tasks,
                'licenses': licenses,
                'mgen': mgen,
            })
dataset_level = pd.DataFrame(dataset_level)

# One groupby shared by all per-collection aggregates in this block.
by_collection = dataset_level.groupby('collection')


def _as_count_column(per_collection):
    """Align per-collection counts with ``dat`` and return them as integers.

    The reindex has to come before ``fillna``: a collection of ``dat`` that
    is absent from ``per_collection`` only turns into NaN once the values are
    aligned to ``dat.index``, so filling beforehand would leave it NaN (and
    the column float).
    """
    return per_collection.reindex(dat.index).fillna(0).astype(int)


dat['Num Datasets'] = _as_count_column(by_collection['datasets'].sum())
dat['Num Langs'] = _as_count_column(by_collection['langs'].apply(ut.count_unique_with_none))
# The domain count is floored at 1.
dat['Num Domains'] = _as_count_column(by_collection['domains'].apply(ut.count_unique_with_none)).apply(lambda s: max(s, 1))
dat['Num Tasks'] = _as_count_column(by_collection['tasks'].apply(ut.count_unique_with_none))

# A collection gets a single year only when all of its datasets agree;
# otherwise it is labelled 'Mult.'.
dat['Year'] = by_collection['year'].apply(lambda s: s.iloc[0] if len(set(s)) == 1 else 'Mult.').astype(str)
dat.loc['OpenGPT Healthcare', 'Year'] = '2023'  # FIXME - not in json
dat.loc['WildChat', 'Year'] = '2024'  # FIXME - not in json

# Distinct license names per collection, pooled over all of its datasets.
licenses_by_collection = dataset_level.groupby('collection')['licenses'] \
    .apply(lambda s: list(set([y for x in s for y in x])))

# 'Use': the license classes present in the collection. 'OpenAI' and 'OANC'
# entries are left out here because they are reported in the 'OAI' column.
dat['Use'] = licenses_by_collection \
    .apply(lambda s: [v for v in s if v not in ('OpenAI', 'OANC')]) \
    .apply(lambda s: list(set([license_classes[v] for v in s]))) \
    .apply(ut.color_license_classes)

# 'OAI': check mark when any dataset carries an 'OpenAI' or 'OANC' license.
# Both replacement values are raw strings: '\e' is not a valid escape
# sequence and a non-raw '\emojiblank' triggers a SyntaxWarning.
dat['OAI'] = licenses_by_collection \
    .apply(lambda s: 'OpenAI' in s or 'OANC' in s) \
    .replace({True: r'\greencheck', False: r'\emojiblank'})

# Fraction of each collection's datasets that are NOT flagged as model
# generated: 1 means none are flagged, 0 means all are.
sources = dataset_level.groupby('collection')['mgen'] \
    .apply(lambda flags: 1 - sum(flags) / len(flags))

# Pick the icon pair for each of the three (mutually exclusive) cases.
source_labels = [
    ((sources > 0) & (sources < 1), r'\emojiglobe\emojirobot'),
    (sources == 1, r'\emojiglobe\emojiblank'),
    (sources == 0, r'\emojiblank\emojirobot'),
]
for source_mask, source_label in source_labels:
    dat.loc[source_mask, 'Source'] = source_label

## Format for LaTeX output

In [11]:
# Columns of the final table, in display order.
table_columns = [
    'Year',
    'Num Datasets',
    'Num Tasks',
    'Num Langs',
    'Num Domains',
    'Source',
    'Use',
    'OAI',
]

# Keep just those columns and order the rows by year.
dat = dat.loc[:, table_columns].sort_values(by='Year')

In [12]:
# Flat column name -> (header group, header label) of the two-level header.
column_mapping = {
    'Year':         ('', 'Year'),
    'Num Datasets': ('Property Counts', 'Datasets'),
    'Num Tasks':    ('Property Counts', 'Tasks'),
    'Num Langs':    ('Property Counts', 'Langs'),
    'Num Domains':  ('Property Counts', 'Domains'),
    'Source':       ('Types', 'Source'),
    'Use':          ('Permissions', 'Use'),
    'OAI':          ('Permissions', 'OAI'),
}

# Swap the flat columns for the two-level header, wrapping both levels in
# the LaTeX small-caps markup (the label additionally in \thead).
dat.columns = pd.MultiIndex.from_tuples([
    (r'\textsc{%s}' % group, r'\textsc{\thead{%s}}' % label)
    for group, label in (column_mapping[col] for col in dat.columns)
])

# The index name becomes the header of the first table column.
dat.index.name = ''.join([r'\textsc{', dat.index.name, r'}'])

def color_map(value, cmap='BrBG', vmin=None, vmax=None):
    """Return the colormap color of ``value`` as an 'R,G,B' string.

    ``value`` is normalized with ``matplotlib.colors.Normalize(vmin, vmax)``
    and looked up in the colormap named ``cmap``. The red, green and blue
    components are scaled by 255 and truncated to integers; the alpha
    component is dropped.
    """
    scaler = mcolors.Normalize(vmin=vmin, vmax=vmax)
    rgba = mp.colormaps[cmap](scaler(value))

    channels = []
    for component in rgba[:3]:
        channels.append(str(int(255 * component)))
    return ','.join(channels)

# Accumulated \definecolor lines, one per colored cell.
color_def = ''
# Column -> cell formatting function, later handed to Styler.format.
formatters = {}
# Column -> {cell value -> LaTeX color name}.
tmp_val_color = {}
# Only the numeric header groups get colored cells.
num_cols = [
    c
    for c in dat.columns
    if 'Property Counts' in c[0] or 'Text Lens' in c[0]
]

for col in num_cols:
    tmp_val_color[col] = {}

    # Colors follow the log of the value, centered on the midpoint of the
    # column's log range. The 1e-6 offset keeps zero counts away from log(0).
    vmin = np.log(dat[col].min() + 1e-6)
    vmax = np.log(dat[col].max() + 1e-6)
    midpt = (vmax + vmin) / 2
    vmin, vmax = vmin - midpt, vmax - midpt

    # Column part of the color name with spaces and LaTeX markup removed;
    # it is the same for every row, so compute it once per column.
    col_color_name = '_'.join(col).replace(' ', '') \
        .replace(r'\textsc{\thead{', '').replace('}}', '') \
        .replace(r'\textsc{', '').replace('}', '')

    for row in dat.index:
        value = np.log(dat.loc[row, col] + 1e-6)
        value -= midpt

        if pd.notnull(value):
            row_color_name = row.replace(' ', '') \
                .replace(r'\textsc{', '').replace('}', '')

            color_name = f"color{row_color_name}{col_color_name}"

            # Dividing by 4 keeps the colors near the middle of the colormap.
            color_def += f"\\definecolor{{{color_name}}}{{RGB}}{{{color_map(value / 4, vmin=vmin, vmax=vmax)}}}\n"

            # Keyed by cell value: equal values map to the same RGB, so it
            # does not matter which row's color name is kept.
            tmp_val_color[col][dat.loc[row, col]] = color_name

    def func(v, col=col):
        """Format one cell of ``col`` as a colored LaTeX cell.

        Missing values and zeros are rendered as '-'. Values of 1000 or more
        are shown in thousands with a 'k' suffix. ``col`` is bound as a
        default argument so each column's formatter keeps its own column.
        """
        # Check for missing values before the color lookup: no color is
        # registered for them, so looking one up would raise KeyError.
        if pd.isnull(v) or v == 0:
            return '-'

        color_name = tmp_val_color[col][v]
        if v >= 1000:
            return f'\\cellcolor{{{color_name}}}{{{v / 1000:,.0f}k}}'
        return f'\\cellcolor{{{color_name}}}{{{v:,.0f}}}'

    formatters[col] = func

In [13]:
# Full caption printed with the table, and the short form passed alongside it.
caption_full = r'''
    \textbf{Alignment tuning (text) collections and properties}. Collection properties include numbers of datasets, tasks, languages, and text domains. The \textsc{Source} column indicates whether a collection contains human-generated web text (\emojiglobe), language model outputs (\emojirobot) or both (\emojiglobe\emojirobot). The \textsc{Use} column indicates whether a collection includes data freely usable even for commercial purposes (\protect\CommercialDataCircle), data usable only for noncommercial purposes or academic research (\protect\NCDataCircle) and data whose license status is not specified precisely enough to allow us to determine commercial use permissions (\protect\UnspecifiedDataCircle). Note that each collection may have different datasets with one, two, or all three of these statuses. Finally, the \textsc{OAI} column indicates collections which include OpenAI model generations. Datasets are sorted chronologically to highlight trends over time.
    '''.strip()
caption_short = r'\textbf{Alignment tuning (text) collections and properties}'

# Options forwarded to Styler.to_latex.
kwargs = dict(
    environment='longtable',
    label='tab:collections-text',
    column_format='lc|cccc|c|cc',
    multicol_align='c',
    caption=(caption_full, caption_short),
    hrules=True,
    convert_css=True,
)

# Render the table: the collection index becomes a regular column, the
# styler's own index is hidden, and the numeric cells go through the
# coloring formatters.
latex = (
    dat
    .reset_index()
    .style
    .hide()
    .format(formatter=formatters)
    .to_latex(**kwargs)
)

# Emit the column-spacing setting, the color definitions and the table.
print(r'\setlength{\tabcolsep}{1.9pt}', color_def, latex, sep='\n')

\setlength{\tabcolsep}{1.9pt}
\definecolor{colorRiddleSensePropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorMathInstr.PropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorNoRobotsPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorNectarPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorMetaMathQAPropertyCounts_Datasets}{RGB}{245,238,218}
\definecolor{colorMegaWikaPropertyCounts_Datasets}{RGB}{240,243,243}
\definecolor{colorMedInstr.PropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorMathDialPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorPII-Masking-200kPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorPure-DovePropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorLMSYS-Chat-1MPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorPygmalionAI-PIPPAPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorHelpSteerPropertyCounts_Datasets}{RGB}{240,223,178}
\definecolor{colorSeaBenchPropertyCounts_Da