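Add the following to the LaTeX preamble to define the symbols used in the tables. These commands rely on the `emoji` and `tikz` packages; the generated tables also use `\greencheck`, `\thead` (from `makecell`) and `\cellcolor` (from `colortbl`), which must be made available separately: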
```
\newcommand{\emojiblank}{\phantom{\emoji{smile}}}
\newcommand{\NCDataCircle}{\tikz[baseline=-0.85ex]{\definecolor{mycolor}{HTML}{e04c71} \fill[mycolor] (0,0) circle (0.85ex);}}
\newcommand{\UnspecifiedDataCircle}{\tikz[baseline=-0.85ex]{\definecolor{mycolor}{HTML}{e0cd92} \fill[mycolor] (0,0) circle (0.85ex);}}
\newcommand{\CommercialDataCircle}{\tikz[baseline=-0.85ex]{\definecolor{mycolor}{HTML}{82b5cf} \fill[mycolor] (0,0) circle (0.85ex);}}
\newcommand{\TransparentCircle}{\tikz[baseline=-0.85ex]{\fill[fill opacity=0] (0,0) circle (0.85ex);}}
```

Formatting tweaks still needed after pasting these tables into the LaTeX doc:
* Move the caption and label down to the bottom. Pandas doesn't have an option to specify caption position.
* Put the "Collection" column header on the same line as the others.
* Put `\resizebox{\textwidth}{!}{` and `}` around the tabular environment.

If you want [vertical spacing](https://tex.stackexchange.com/questions/509393/table-cellcolor-any-way-to-paint-only-part-of-cell) between the rows of color blocks, add `\addlinespace[.25em]` after every row's `\\`.

In [None]:
import os
import json

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

import utils as ut

In [None]:
# All relative paths used below ('data_summaries', 'constants/...') are resolved
# from this checkout located under the current user's home directory.
repo_root = os.path.expanduser('~/github/Data-Provenance-Collection')
os.chdir(repo_root)

# Prepare data

## List of datasets

In [None]:
# One row per collection shown in the tables:
#   Collection   - display name, used as the index
#   summary_keys - '|'-separated base names of the files in data_summaries/
#   Cite         - '|'-separated BibTeX keys ('' when the collection has no paper)
dat = pd.DataFrame([
    ('Aya', 'Aya Dataset', 'singhAyaDatasetOpenAccess2024'),
    ('ChatbotArena', 'ChatbotArena', 'zhengJudgingLLMasaJudgeMTBench2023'),
    ('Open Assistant v2', 'Open Assistant v2', 'kopfOpenAssistantConversationsDemocratizing2023'),
    ('UltraChat_200k', 'UltraChat_200k', 'dingEnhancingChatLanguage2023'),
    ('AgentInstruct', 'AgentInstruct', 'shridharALFWorldAligningText2021|yaoWebShopScalableRealWorld2023|liuAgentBenchEvaluatingLLMs2023|zengAgentTuningEnablingGeneralized2023|dengMind2WebGeneralistAgent2023'),
    ('Bactrian-X', 'Bactrian-X', 'liBactrianXMultilingualReplicable2023'),
    ('COIG', 'COIG|COIG-CQIA', 'zhangChineseOpenInstruction2023|baiCOIGCQIAQualityAll2024'),
    ('ChatDoctor', 'ChatDoctor', 'liChatDoctorMedicalChat2023'),
    ('Cidar', 'Cidar', 'alyafeaiCIDARCulturallyRelevant2024'),
    ('Cobra Frames', 'Cobra Frames', 'zhouCOBRAFramesContextual2023'),
    ('Conifer', 'Conifer', 'sunConiferImprovingComplex2024'),
    ('Deita 10K', 'Deita 10K', 'liuWhatMakesGood2024'),
    ('ExpertQA', 'ExpertQA', 'malaviyaExpertQAExpertCuratedQuestions2024'),
    ('Feedback Collection', 'Feedback Collection', 'kimPrometheusInducingFinegrained2024'),
    ('HelpSteer', 'HelpSteer', 'wangHelpSteerMultiattributeHelpfulness2023'),
    ('Indic-Instruct', 'Indic-Instruct', 'galaAiravataIntroducingHindi2024'),
    ('KIWI', 'KIWI', 'xuKIWIDatasetKnowledgeIntensive2024'),
    ('Llama2-MedTuned-Instructions', 'Llama2-MedTuned-Instructions', 'rohanianExploringEffectivenessInstruction2023'),
    ('LongAlign-10k', 'LongAlign-10k', 'baiLongAlignRecipeLong2024'),
    ('MathDial', 'MathDial', 'macinaMathDialDialogueTutoring2023'),
    ('MathInstruct', 'MathInstruct', 'yueMAmmoTHBuildingMath2023'),
    ('MedInstruct', 'MedInstruct', 'zhangAlpaCareInstructiontunedLarge2024'),
    ('Medical Meadow', 'Medical Meadow', 'hanMedAlpacaOpenSourceCollection2023|wangCORD19COVID19Open2020|jinWhatDiseaseDoes2020|saveryQuestionDrivenSummarizationAnswers2020'),
    ('MegaWika', 'MegaWika', 'barhamMegaWikaMillionsReports2023'),
    ('MetaMathQA', 'MetaMathQA', 'yuMetaMathBootstrapYour2023'),
    ('OpenMathInstruct-1', 'OpenMathInstruct-1', 'toshniwalOpenMathInstruct1MillionMath2024'),
    ('Orca-Math', 'Orca-Math', 'mitraOrcaMathUnlockingPotential2024'),
    ('PMC-LLaMA Instructions', 'PMC-LLaMA Instructions', 'wuPMCLLaMABuildingOpensource2023|jinPubMedQADatasetBiomedical2019'),
    ('PygmalionAI-PIPPA', 'PygmalionAI-PIPPA', 'goslingPIPPAPartiallySynthetic2023'),
    ('RiddleSense', 'RiddleSense', 'linRiddleSenseReasoningRiddle2021'),
    ('SeaBench', 'SeaBench', 'nguyenSeaLLMsLargeLanguage2023'),
    ('SelFee', 'SelFee', 'yeSelFeeIterativeSelfRevising2023'),
    ('WildChat', 'WildChat', 'zhaoWildChat1MChatGPT2023'),
    ('lmsys_chat_1m', 'lmsys_chat_1m', 'zhengLMSYSChat1MLargeScaleRealWorld2024'),
    ('Open-Platypus', 'Open-Platypus', 'sawadaARBAdvancedReasoning2023|dettmersQLoRAEfficientFinetuning2023|lightmanLetVerifyStep2023a|yuReClorReadingComprehension2020|wangSciBenchEvaluatingCollegeLevel2024|luLearnExplainMultimodal2022|chenTheoremQATheoremdrivenQuestion2023'),
    # The DialogStudio key list is split over two adjacent string literals, which
    # Python concatenates; a raw line break inside one literal is a syntax error.
    ('DialogStudio', 'DialogStudio', 'chenActionBasedConversationsDataset2021|weiAirDialogueEnvironmentGoalOriented2018|linBiToDBilingualMultiDomain2021|chawlaCaSiNoCorpusCampsite2021|heDecouplingStrategyGeneration2018|mrksicNeuralBeliefTracker2017|qianDatabaseSearchResults2022|liuDuRecDialBilingualParallel2021|elasriFramesCorpusAdding2017|quanGECOREndtoEndGenerative2019|chenSemanticallyConditionedDialog2019a|chenKETODKnowledgeEnrichedTaskOriented2022|ericKeyValueRetrievalNetworks2017|zangMultiWOZDialogueDataset2020|shalyminovFewShotDialogueGeneration2019|martinMuDoCoCorpusMultidomain2020|peskovMultiDomainGoalOrientedDialogues2019|ericMultiWOZConsolidatedMultiDomain2019|moonOpenDialKGExplainableConversational2019|rastogiScalableMultidomainConversational2020|mosigSTARSchemaGuidedDialog2020|chiuSalesBotTransitioningChitChat2022|shahBuildingConversationalAgent2018|byrneTaskmaster1RealisticDiverse2019|mrksicFullyStatisticalNeural2018|shangUnsupervisedAbstractiveMeeting2018|rameshkumarStorytellingDialogueCritical2020|fabbriConvoSummConversationSummarization2021|chenDialogSumRealLifeScenario2021|mukherjeeECTSumNewBenchmark2022|shangUnsupervisedAbstractiveMeeting2018|zhuMediaSumLargescaleMedia2021|zhongQMSumNewBenchmark2021|gliwaSAMSumCorpusHumanannotated2019|chenSummScreenDatasetAbstractive2022|feigenblatTWEETSUMMDialogSummarization2021|liEndtoEndTrainableNonCollaborative2019|dinanSecondConversationalIntelligence2019|rashkinEmpatheticOpendomainConversation2019|baiTrainingHelpfulHarmless2022|chenPLACESPromptingLanguage2023|kimProsocialDialogProsocialBackbone2022|myersConversationalScaffoldingAnalogybased2020|reddyCoQAConversationalQuestion2019|yuCoSQLConversationalTexttoSQL2019|talmorWebKnowledgeBaseAnswering2018|nanDARTOpenDomainStructured2021|nanFeTaQAFreeformTable2022|guThreeLevelsGeneralization2021|chenHybridQADatasetMultiHop2020|guptaMMQAMultidomainMultilingual2018|liMTOPComprehensiveMultilingual2021|talmorMultiModalQAComplexQuestion2021|yuSParCCrossDomainSemantic2019'
     '|iyyerSearchbasedNeuralStructured2017|yuSpiderLargeScaleHumanLabeled2019|parikhToTToControlledTableToText2020|yihValueSemanticParse2016|zhongSeq2SQLGeneratingStructured2017|pasupatCompositionalSemanticParsing2015|komeiliInternetAugmentedDialogueGeneration2022|dinanWizardWikipediaKnowledgePowered2019|hemphillATISSpokenLanguage1990|casanuevaEfficientIntentDetection2020|zhangArePretrainedTransformers2022|larsonEvaluationDatasetIntent2019|rastogiScalableMultidomainConversational2020|liuBenchmarkingNaturalLanguage2019|liuAsgardPortableArchitecture2013|coopeSpanConveRTFewshotSpan2020|couckeSnipsVoicePlatform2018|guptaSemanticParsingTask2018'),

    # no paper
    ('10k Prompt Ranked', '10k Prompt Ranked', ''),
    ('Capybara', 'Capybara', ''),
    ('CollectiveCognition', 'CollectiveCognition', ''),
    ('EverythingLM', 'EverythingLM', ''),
    ('Glaive Code Assistant', 'Glaive Code Assistant', ''),
    ('Gretel Text-to-SQL', 'Gretel Text-to-SQL', ''),
    ('Nectar', 'Nectar', ''),
    ('No Robots', 'No Robots', ''),
    ('OpenGPT Healthcare', 'OpenGPT Healthcare', ''),
    ('PII-masking-200k', 'PII-masking-200k', ''),
    ('Pure-Dove', 'Pure-Dove', ''),
    ('Thai Gen AI', 'Thai Gen AI (Alpaca)|Thai Gen AI (Dolly)|Thai Gen AI (GPTeacher)', ''),
    ('UltraFeedback Argilla', 'UltraFeedback Argilla', ''),
# Three fields per tuple need three column names; 'Cite' is read by later cells.
], columns=['Collection', 'summary_keys', 'Cite']).set_index('Collection')

In [None]:
# Flatten the '|'-separated summary keys of all collections into one list.
sk = []
for keys in dat['summary_keys'].str.split('|'):
    sk.extend(keys)

# Base names (extension removed) of the summary files on disk, templates excluded.
files = []
for f in os.listdir('data_summaries'):
    if f.startswith('_template'):
        continue
    files.append(os.path.splitext(f)[0])

# Every summary key referenced by a collection must exist on disk.
assert set(sk).issubset(files)

In [None]:
# Lookup table from each individual summary key to the collection it belongs to:
# one (collection, key) row per entry of the '|'-separated 'summary_keys' string.
tmp = [
    (short, key)
    for short, keys in dat['summary_keys'].str.split('|').items()
    for key in keys
]
short_names = pd.DataFrame(tmp, columns=['short_name', 'summary_key'])
short_names = short_names.set_index('summary_key')

## Dimension data

In [None]:
# The JSON file maps each group name to a list of members; invert it so that
# every member points at its group (a member listed twice keeps the last group).
with open('constants/domain_groups.json', 'rt') as f:
    groups_to_domains = json.load(f)

domain_groups = {}
for group, members in groups_to_domains.items():
    for member in members:
        domain_groups[member] = group

In [None]:
# The JSON file maps each group name to a list of members; invert it so that
# every member points at its group (a member listed twice keeps the last group).
with open('constants/task_groups.json', 'rt') as f:
    groups_to_tasks = json.load(f)

task_groups = {}
for group, members in groups_to_tasks.items():
    for member in members:
        task_groups[member] = group

In [None]:
# Keep only the last element of each license entry (presumably the usage class
# such as 'All' / 'NC' / 'Acad' / 'Unspecified' -- TODO confirm against the JSON).
with open('constants/license_classes.json', 'rt') as f:
    license_entries = json.load(f)

license_classes = {}
for name, entry in license_entries.items():
    license_classes[name] = entry[-1]

# 'Custom' licenses are always counted as 'Unspecified'.
license_classes['Custom'] = 'Unspecified'

## Load summary files

In [None]:
# Load every summary file into a dict keyed by the file's base name.
summaries = {}
for file in os.listdir('data_summaries'):
    if file.startswith('_template'):
        continue

    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(os.path.join('data_summaries', file), 'rt', encoding='utf-8') as f:
        # Use os.path.splitext, like the key validation earlier in the notebook,
        # so that a file name containing extra dots still yields the same key
        # (split('.')[0] would truncate it and could merge distinct files).
        summaries[os.path.splitext(file)[0]] = json.load(f)

## Licenses

In [None]:
# Distinct license names declared by the datasets of each summary file.
licenses = {}
for k, datasets in summaries.items():
    for ds in datasets.values():
        for lic in ds['Licenses']:
            licenses.setdefault(k, set()).add(lic['License'])
licenses = pd.Series({k: sorted(v, key=str) for k, v in licenses.items()})

# Attach the per-file license lists to their collection and merge them.
tmp = short_names.copy()
tmp['licenses'] = licenses
dat['License'] = tmp.groupby('short_name')['licenses'].apply(
    # A summary file without any license entry shows up as NaN after the
    # alignment above; skip it instead of failing while iterating over a float.
    # Sorting keeps the result independent of set iteration order.
    lambda s: sorted({y for x in s if isinstance(x, list) for y in x}, key=str)
)

# 'OpenAI' and 'OANC' are reported through the separate OAI flag column and are
# removed from the license list itself.
dat['OAI'] = dat['License'].apply(lambda s: 'OpenAI' in s or 'OANC' in s)
# Both replacements are raw strings: '\e' is not a valid escape sequence.
dat['OAI'] = dat['OAI'].replace({True: r'\greencheck', False: r'\emojiblank'})
dat['License'] = dat['License'].apply(lambda s: [v for v in s if v not in ('OpenAI', 'OANC')])

# Snapshot used by the appendix license/citation table.
license_table = dat[['License', 'Cite']].copy()

In [None]:
# Translate each collection's license names into their deduplicated usage classes.
dat['Use'] = dat['License'].apply(
    lambda names: list({license_classes[name] for name in names})
)

In [None]:
def color_license_classes(s):
    """Render a collection's usage classes as three space-separated LaTeX circles.

    The slots are, in order: commercial ('All'), unspecified ('Unspecified' or an
    empty input) and non-commercial ('Acad' or 'NC'). A slot whose class is absent
    is filled with the invisible ``\\TransparentCircle`` placeholder.

    Args:
        s: container of usage-class names; only membership tests and ``len`` are used.

    Returns:
        A string holding the three LaTeX commands joined by single spaces.
    """
    blank = r'\TransparentCircle'
    slots = (
        ('All' in s, r'\CommercialDataCircle'),
        ('Unspecified' in s or len(s) == 0, r'\UnspecifiedDataCircle'),
        ('Acad' in s or 'NC' in s, r'\NCDataCircle'),
    )
    return ' '.join(symbol if present else blank for present, symbol in slots)

# Derive the usage classes from the license names, then turn them into the
# three-circle LaTeX marker stored in the 'Use' column.
use_classes = dat['License'].apply(
    lambda names: list({license_classes[name] for name in names})
)
dat['Use'] = use_classes.apply(color_license_classes)

In [None]:
# The raw license and citation columns now live in license_table only.
dat.drop(columns=['License', 'Cite'], inplace=True)

## Property counts and text lens

In [None]:
# Build one record per dataset entry of every summary file of every collection.
raw = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        for sub, entry in summaries[file].items():
            langs = entry.get('Languages', None)

            # Text metrics may be absent or stored as an empty placeholder ('' or {}).
            metrics = entry.get('Text Metrics', None)
            if metrics is None or metrics == '' or metrics == {}:
                dialog_count = inputs_len = targets_len = np.nan
            else:
                dialog_count = metrics['Num Dialogs']
                inputs_len = metrics['Mean Inputs Length']
                targets_len = metrics['Mean Targets Length']

            # Text sources and task categories only count when given as a list/tuple;
            # each value is replaced by its group.
            sources = entry.get('Text Sources', None)
            if isinstance(sources, (list, tuple)):
                domains = [domain_groups[d] for d in sources]
            else:
                domains = None

            categories = entry.get('Task Categories', None)
            if isinstance(categories, (list, tuple)):
                tasks = [task_groups[d] for d in categories]
            else:
                tasks = None

            # Topics live under the optional 'Inferred Metadata' mapping.
            topics = None
            inferred = entry.get('Inferred Metadata', None)
            if not (inferred is None or inferred == '' or inferred == {}):
                candidate = inferred.get('Text Topics', None)
                if isinstance(candidate, (list, tuple)):
                    topics = candidate

            raw.append({
                'collection': collection,
                'summary_key': file,
                'sub': sub,

                'num_dialogs': dialog_count,
                'mean_inputs_length': inputs_len,
                'mean_targets_length': targets_len,

                'langs': langs,
                'topics': topics,
                'domains': domains,
                'tasks': tasks,
                'datasets': 1,
            })
raw = pd.DataFrame(raw)
per_collection = raw.groupby('collection')

# Collection-level mean lengths are dialog-weighted averages of the dataset means.
total_input_length = raw['num_dialogs'] * raw['mean_inputs_length']
total_targets_length = raw['num_dialogs'] * raw['mean_targets_length']

num_dialogs = per_collection['num_dialogs'].sum()
mean_inputs_length = total_input_length.groupby(raw['collection']).sum() / num_dialogs
mean_targets_length = total_targets_length.groupby(raw['collection']).sum() / num_dialogs

num_langs = per_collection['langs'].apply(ut.count_unique_with_none)
num_topics = per_collection['topics'].apply(ut.count_unique_with_none)
num_domains = per_collection['domains'].apply(ut.count_unique_with_none)
num_tasks = per_collection['tasks'].apply(ut.count_unique_with_none)
num_datasets = per_collection['datasets'].sum()

# Missing values become 0 and everything is stored as integers.
integer_columns = {
    'Num Langs': num_langs,
    'Num Dialogs': num_dialogs,
    'Mean Inputs Length': mean_inputs_length,
    'Mean Targets Length': mean_targets_length,
    'Num Topics': num_topics,
    'Num Datasets': num_datasets,
    'Num Tasks': num_tasks,
}
for column_name, values in integer_columns.items():
    dat[column_name] = values.fillna(0).astype(int)

# The domain count is floored at 1.
dat['Num Domains'] = num_domains.fillna(0).astype(int).clip(lower=1)

## Source

In [None]:
# For every summary file, record the distinct answers to "does this dataset list
# at least one (non-empty) generating model?".
mgen = {}
for k, datasets in summaries.items():
    for ds in datasets.values():
        models = [m for m in ds['Model Generated'] if m != '']
        mgen.setdefault(k, set()).add(len(models) > 0)
mgen = pd.Series({k: list(v) for k, v in mgen.items()})

tmp = short_names.copy()
tmp['mgen'] = mgen

# Share of "not model generated" flags among the flags pooled over a collection's files.
human_share = tmp.groupby('short_name')['mgen'] \
    .agg(lambda x: [item for sublist in x for item in sublist]) \
    .apply(lambda s: 1 - sum(s) / len(s))


def _source_symbol(share):
    """Map a human-sourced share in [0, 1] to the emoji pair shown in the table."""
    if pd.isnull(share):
        return share
    if share == 1:
        return r'\emoji{globe-with-meridians}\emojiblank'
    if share == 0:
        return r'\emojiblank\emoji{robot}'
    return r'\emoji{globe-with-meridians}\emoji{robot}'


# Build the symbols on the float series first and assign once. Writing strings
# into an existing float64 column through .loc masks is deprecated in pandas >= 2.1.
dat['Source'] = human_share.map(_source_symbol)

## Format

In [None]:
# Abbreviations used as column names for the dialog formats found in the summaries.
formats_map = dict([
    ('Chain-of-Thought', 'CT'),
    ('Few-shot', 'FS'),
    ('Multi-turn Dialog', 'MD'),
    ('Response Ranking', 'RR'),
    ('Zero-shot', 'ZS'),
])

In [None]:
# Every format label present in any summary must have an abbreviation.
# 'Format' is treated as optional here, exactly as in the aggregation below, so
# a dataset without the key cannot pass one step and crash the other.
found_formats = {
    fmt
    for datasets in summaries.values()
    for ds in datasets.values()
    for fmt in (ds.get('Format') or [])
}
unknown_formats = found_formats - set(formats_map.keys())
assert not unknown_formats, f'formats missing from formats_map: {sorted(unknown_formats)}'

# One record per dataset entry, holding its abbreviated formats.
fmts = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        for k, ds in summaries[file].items():
            fmts.append({
                'collection': collection,
                'summary_key': file,
                'sub': k,

                'formats': [formats_map[f] for f in (ds.get('Format') or [])],
            })
fmts = pd.DataFrame(fmts)

# Union of the formats over all datasets of a collection.
fmts = fmts.groupby('collection')['formats'] \
           .apply(lambda s: list(set([y for x in s for y in x]))) \
           .rename('formats')

# One check-mark column per format abbreviation.
format_symbols = {True: r'\greencheck', False: r'\emojiblank'}
for fmt in formats_map.values():
    dat[fmt] = fmts.apply(lambda s: fmt in s).map(format_symbols)

## Format for LaTeX output

In [None]:
# Columns of the main table, in display order; everything else is dropped.
main_table_columns = [
    # property counts
    'Num Datasets', 'Num Dialogs', 'Num Tasks', 'Num Langs', 'Num Topics', 'Num Domains',
    # text lengths
    'Mean Inputs Length', 'Mean Targets Length',
    # data source
    'Source',
    # dialog formats
    'ZS', 'FS', 'CT', 'RR', 'MD',
    # license information
    'Use', 'OAI',
]
dat = dat[main_table_columns]

In [None]:
# Map every flat column name to its (header group, short label) pair.
column_mapping = {
    **{
        f'Num {label}': ('Property Counts', label)
        for label in ('Datasets', 'Dialogs', 'Tasks', 'Langs', 'Topics', 'Cites', 'Domains')
    },
    'Mean Inputs Length': ('Text Lens', 'Inpt'),
    'Mean Targets Length': ('Text Lens', 'Tgt'),
    'Source': ('Dataset Types', 'Source'),
    # Format columns are labelled with the first letter of their abbreviation.
    **{code: ('Dataset Types', code[0]) for code in ('CT', 'ZS', 'RR', 'MD', 'FS')},
    'Use': ('Dataset Types', 'Use'),
    'OAI': ('Dataset Types', 'O'),
}

# Two-level header: small-caps group on top, small-caps \thead label below.
dat.columns = pd.MultiIndex.from_tuples([
    (r'\textsc{' + group + r'}', r'\textsc{\thead{' + label + r'}}')
    for group, label in (column_mapping[col] for col in dat.columns)
])

# The index header is typeset in small caps as well.
dat.index.name = ''.join([r'\textsc{', dat.index.name, r'}'])

def color_map(value, cmap='BrBG', vmin=None, vmax=None):
    """Return the colormap color for ``value`` as an 'R,G,B' string.

    ``value`` is normalized with ``mcolors.Normalize(vmin, vmax)`` and looked up
    in the matplotlib colormap named ``cmap``. The first three channels are
    scaled by 255 and truncated to integers.
    """
    scale = mcolors.Normalize(vmin=vmin, vmax=vmax)
    rgb = mp.colormaps[cmap](scale(value))[:3]
    channels = [int(255 * channel) for channel in rgb]
    return ','.join(str(channel) for channel in channels)

color_def = ''       # accumulated \definecolor lines, printed ahead of the table
formatters = {}      # column -> cell formatter handed to Styler.format
tmp_val_color = {}   # column -> {cell value -> LaTeX color name}
num_cols = [
    c
    for c in dat.columns
    if 'Property Counts' in c[0] or 'Text Lens' in c[0]
]


def _make_cell_formatter(column, render_small):
    """Build the formatter for one numeric column.

    Values >= 1000 are shown in thousands with a 'k' suffix, zeros as '-', and
    everything else through ``render_small``. Non-zero cells are wrapped in the
    \\cellcolor registered for their value in ``tmp_val_color``.
    """
    def func(v, col=column):
        color_name = tmp_val_color[col][v]
        in_num_cols = col in num_cols

        if in_num_cols and v >= 1000:
            v = v / 1000
            return f'\\cellcolor{{{color_name}}}{{{v:,.0f}k}}' if pd.notnull(v) else '-'
        if in_num_cols and v == 0:
            return '-'
        return f'\\cellcolor{{{color_name}}}{{{render_small(v)}}}' if pd.notnull(v) else '-'

    return func


for col in num_cols:
    tmp_val_color[col] = {}

    # Colors follow the log of the value, centered on the midpoint of the
    # column's log-range.
    log_low = np.log(dat[col].min() + 1e-6)
    log_high = np.log(dat[col].max() + 1e-6)
    midpt = (log_high + log_low) / 2
    vmin, vmax = log_low - midpt, log_high - midpt

    # Column part of the color name: header labels with the LaTeX wrappers removed.
    col_color_name = '_'.join(col).replace(' ', '') \
        .replace(r'\textsc{\thead{', '').replace('}}', '') \
        .replace(r'\textsc{', '').replace('}', '')

    for row in dat.index:
        cell = dat.loc[row, col]
        value = np.log(cell + 1e-6) - midpt
        if pd.isnull(value):
            continue

        row_color_name = row.replace(' ', '') \
            .replace(r'\textsc{', '').replace('}', '')
        color_name = f"color{row_color_name}{col_color_name}"

        # value / 4 keeps the colors near the middle of the colormap.
        rgb = color_map(value / 4, vmin=vmin, vmax=vmax)
        color_def += f"\\definecolor{{{color_name}}}{{RGB}}{{{rgb}}}\n"

        # Colors are looked up by cell value when the table is rendered.
        tmp_val_color[col][cell] = color_name

    if 'Dialogs' in col[1]:
        # Dialog counts below one thousand are not spelled out.
        formatters[col] = _make_cell_formatter(col, lambda v: '<1k')
    else:
        formatters[col] = _make_cell_formatter(col, lambda v: f'{v:,.0f}')

# Main table

In [None]:
# Styler.to_latex options for the main table.
kwargs = {
    'environment': 'table*',
    'label': 'tab:collections',
    'position_float': 'centering',
    # One spec per rendered column:
    #   l        index (collection name)
    #   cccccc   6 property counts (datasets, dialogs, tasks, langs, topics, domains)
    #   rr       2 text lengths
    #   c + 5 x p{0.3cm} + c + p{0.3cm}   source, 5 format flags, use, OpenAI flag
    # A seventh 'c' in the first group would shift every later spec (and both
    # vertical rules) one column to the right.
    'column_format': 'l|cccccc|rr|cp{0.3cm}p{0.3cm}p{0.3cm}p{0.3cm}p{0.3cm}cp{0.3cm}',
    'multicol_align': 'c',

    # The caption lists exactly the columns present in the table.
    'caption': r'''
    \textbf{Alignment tuning collections and their characteristics}. Collection properties include numbers of datasets, dialogs, tasks, languages, topics, text domains, and average length of input and target text. The \textsc{Source} column indicates whether a collection contains human-generated web text (\emoji{globe-with-meridians}), language model outputs (\emoji{robot}) or both (\emoji{globe-with-meridians}\emoji{robot}). Several columns indicate the type of dialogs, with some collections having more than one: zero-shot (Z), few-shot (F), response ranking (R), chain-of-thought (C), and multi-turn dialog (M). Finally, the \textsc{Use} column indicates whether a collection includes data freely usable even for commercial purposes (\protect\CommercialDataCircle), data usable only for noncommercial purposes or academic research (\protect\NCDataCircle) and data whose license status is not specified precisely enough to allow us to determine commercial use permissions (\protect\UnspecifiedDataCircle). Note that each collection may have different datasets with one, two, or all three of these statuses. The O column indicates collections which include OpenAI model generations.
    '''.strip(),

    'hrules': True,
    'convert_css': True,
}

latex = dat \
    .sort_index() \
    .style \
    .format(formatter=formatters) \
    .to_latex(**kwargs)

# Printed (not displayed) so the output can be pasted into the LaTeX document:
# column spacing, then the color definitions, then the table itself.
print('\n'.join([
    r'\setlength{\tabcolsep}{1.9pt}',
    color_def,
    latex,
]))

# Appendix license/cite table

In [None]:
# Marker column for collections whose license list contains 'OpenAI'.
has_openai = license_table['License'].apply(lambda names: 'OpenAI' in names)
license_table['OpenAI NC'] = has_openai \
    .replace(True, r'\redcross') \
    .replace(False, r'\emojiblank')


def _normalize_license_list(names):
    """Clean one collection's license list for display in the appendix table."""
    names = [name for name in names if name != 'OpenAI']
    names = [
        'Academic Only' if name == 'Academic Research Purposes Only' else name
        for name in names
    ]
    if 'Various' in names:
        # An explicit 'Various' entry replaces the whole list.
        names = ['Various']
    if len(names) == 0:
        names = ['Unspecified']
    if len(names) > 3:
        # More than three licenses are summarized as 'Various'.
        names = ['Various']
    return names


license_table['License'] = license_table['License'].apply(_normalize_license_list)

In [None]:
# Keep only the columns shown in the appendix table; the 'OpenAI NC' marker is
# deliberately left out. The explicit copy makes the result an independent frame,
# so the column assignments that follow do not raise SettingWithCopyWarning.
license_table = license_table[[
    'Cite',
    'License',
]].copy()

In [None]:
def _format_citation(keys):
    r"""Turn a '|'-separated string of BibTeX keys into one \citet command.

    natbib expects comma-separated keys, so the '|' separators are converted.
    Repeated keys are dropped (first occurrence wins) and an empty key list is
    rendered as '--'. Values that are already formatted are returned unchanged,
    which keeps the cell safe to re-run.
    """
    if keys == '--' or keys.startswith(r'\citet{'):
        return keys
    unique_keys = list(dict.fromkeys(k.strip() for k in keys.split('|') if k.strip()))
    if not unique_keys:
        return '--'
    return r'\citet{' + ','.join(unique_keys) + '}'


license_table['Cite'] = license_table['Cite'].apply(_format_citation)

# Sort before joining so the rendered list does not depend on set iteration
# order; values that are already joined strings are left alone.
license_table['License'] = license_table['License'].apply(
    lambda s: s if isinstance(s, str) else ', '.join(sorted(s))
)

In [None]:
# Plural header for the appendix table.
license_table.rename(columns={'License': 'Licenses'}, inplace=True)

In [None]:
# Styler.to_latex options for the appendix table (index, citation, licenses).
kwargs = {
    'environment': 'table*',
    'label': 'tab:licenses',
    'position_float': 'centering',
    'column_format': 'l|lp{5.5cm}',

    # LaTeX quotes open with two backticks and close with two apostrophes.
    'caption': r'''
    \textbf{Licenses and citations} for the dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details.
    '''.strip(),

    'hrules': True,
    'convert_css': True,
}

latex = license_table \
    .sort_index() \
    .style \
    .to_latex(**kwargs)

# Printed (not displayed) so the output can be pasted into the LaTeX document.
print(latex)