In [None]:
!pip3 install -q -U pandas altair vega_datasets iso3166

In [None]:
import sys
# Append system path
sys.path = [p for p in sys.path if not p.endswith("../..")]  # Cleans duplicated '../..'
sys.path.insert(0, "../")  # This adds `src` to the path
import os
import logging
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows() # Allow using more than 5000 rows, for now
from collections import defaultdict
from vega_datasets import data
from iso3166 import countries
from helpers import io, filters
from typing import Any


%load_ext autoreload
%autoreload 2

## Helper Functions

Utility functions to process and transform data summaries:

---
```python
def invert_dict_of_lists(
  d: dict[str, list[str]]
) -> dict[str, str]
```
- Inverts a dictionary of lists for easier mapping of constants.
---
```python
def remap_licenses_with_paraphrases(
  summaries: list[dict[str, Any]],
  paraphrases: dict[str, str]
) -> list[dict[str, Any]]
``` 
- Standardizes inconsistent license names in data summaries using predefined paraphrases.
---
```python
def map_license_criteria_multimodal(
  data_summary: list[dict[str, Any]],
  all_constants: dict[str, dict[str, list[str]]]
) -> list[dict[str, Any]]
```
- Maps license criteria for multimodal datasets, resolving them according to predefined constants.
---
```python
def get_country(x: str) -> list[int]
```
- Takes a country name as input and returns a list of ISO3166 numeric codes (usually of length 1). It handles a special case that appears in some text annotations ("African Continent" -> list of ISO codes for the countries of the continent). For any country not found in the mapping, it logs a warning and returns an empty list.
---

In [None]:
def invert_dict_of_lists(d: dict[str, list[str]]) -> dict[str, str]:
    """Flip a "category -> items" mapping into an "item -> category" lookup.

    Constants, paraphrases, etc. are normally stored as:
        { "Category": ["item1", "item2", … ] }
    and this returns the reverse lookup:
        { "item1": "Category", "item2": "Category", … }

    If the same item is listed under several categories, the category that
    comes last in `d` wins.
    """
    return {
        member: category
        for category, members in d.items()
        for member in members
    }

In [None]:
def remap_licenses_with_paraphrases(
        summaries: list[dict[str, Any]],
        paraphrases: dict[str, str]
    ) -> list[dict[str, Any]]:
    """Map inconsistent license names to shared paraphrases using the constants.
    E.g. "CC-BY-SA 4.0", "CC BY SA 4.0" -> "CC BY-SA 4.0"

    Args:
        summaries: Dataset summaries; each must have a "Licenses" list whose
            entries are dicts with a "License" name.
        paraphrases: Lookup from a license spelling to its canonical name.
            Names that are not in the lookup are left unchanged.

    Returns:
        The same `summaries` list that was passed in. The license names are
        rewritten in place, so the caller's objects are modified.
    """
    for summary in summaries:
        for license_info in summary["Licenses"]:
            # Avoid calling the local `license`: that would shadow the builtin.
            name = license_info["License"]
            license_info["License"] = paraphrases.get(name, name)
    return summaries


In [None]:
def classify_and_resolve_licenses(
    license_infos: list[tuple[str, str]],
    all_constants: dict[str, dict[str, list[str]]]
) -> list[str]:
    """Classify every license of a dataset and collapse them into one result.

    Function taken from `text_ft_plots.ipynb`.

    Each `(license_name, license_url)` pair is classified on its own with
    `filters.classify_license`; the individual classifications are then
    combined by `filters.resolve_multiple_licenses` (by default, multiple
    licenses yield to the most restrictive one).
    """
    per_license_classes = [
        filters.classify_license(name, url, all_constants)
        for name, url in license_infos
    ]
    return filters.resolve_multiple_licenses(per_license_classes)


def add_license_classes_to_summaries(
    data_summary: list[dict[str, Any]],
    resolved_classes: dict[str, list[str]],
    aggregator: str
):
    """Attach the resolved license classes to each summary row, in place.

    Function taken from `text_ft_plots.ipynb`.

    For every row, the entry of `resolved_classes` keyed by the row's
    "Unique Dataset Identifier" is read positionally: index 0 is stored as the
    use class, index 1 as attribution and index 2 as share-alike. The three
    keys written carry the `aggregator` name in parentheses. The same
    `data_summary` list is returned.
    """
    field_names = (
        f"License Use ({aggregator})",
        f"License Attribution ({aggregator})",
        f"License Share Alike ({aggregator})",
    )
    for row in data_summary:
        classes = resolved_classes[row["Unique Dataset Identifier"]]
        for position, field_name in enumerate(field_names):
            row[field_name] = classes[position]
    return data_summary


def map_license_criteria_multimodal(
    data_summary: list[dict[str, Any]],
    all_constants: dict[str, dict[str, list[str]]]
) -> list[dict[str, Any]]:
    """Variant of `map_license_criteria` that works with multimodal datasets.
    Simplified to only include `Licenses` (not HF, etc.).

    Function adapted from `text_ft_plots.ipynb`.

    Two resolutions are computed per dataset and written onto each row by
    `add_license_classes_to_summaries`:

    - "DataProvenance": based on all listed licenses.
    - "DataProvenance IgnoreOpenAI": same, but entries named "OpenAI" are ignored.

    Whenever a dataset ends up with no license to classify (its "Licenses"
    list is empty, or -- for the IgnoreOpenAI variant -- it only listed
    "OpenAI"), a single ("Unspecified", None) placeholder is classified
    instead. Without the placeholder, a row with an empty "Licenses" list had
    no resolved entry and the final lookup by dataset id raised `KeyError`.

    Args:
        data_summary: Rows with a "Unique Dataset Identifier" and a "Licenses"
            list of dicts ("License" required, "License URL" optional).
        all_constants: Constants forwarded to `classify_and_resolve_licenses`.

    Returns:
        The same `data_summary` list, with the license class keys added to
        every row.
    """
    unspecified = ("Unspecified", None)

    # Unpack licenses for each dataset. {uid --> [(license_name, license_url), ...]}
    our_uid_to_license_infos = defaultdict(list)

    # Same as ours, but excludes OpenAI Terms:
    our_uid_to_license_infos_no_openai = defaultdict(list)

    for row in data_summary:
        uid = row["Unique Dataset Identifier"]
        for license_info in row["Licenses"]:
            license_name = license_info["License"]
            license_url = license_info.get("License URL", None) # FOR NOW
            our_uid_to_license_infos[uid].append((license_name, license_url))
            if license_name != "OpenAI":
                our_uid_to_license_infos_no_openai[uid].append((license_name, license_url))

        # Make sure every uid has something to classify in both variants.
        # Indexing the defaultdicts here also registers the uid as a key.
        if len(our_uid_to_license_infos[uid]) == 0:
            our_uid_to_license_infos[uid].append(unspecified)
        # If OpenAI was the only license, we add Unspecified so there isn't nothing there.
        if len(our_uid_to_license_infos_no_openai[uid]) == 0:
            our_uid_to_license_infos_no_openai[uid].append(unspecified)

    # Classify and resolve licenses for each dataset and each aggregator
    ours_resolved, ours_no_openai_resolved = {}, {}
    for uid, license_infos in our_uid_to_license_infos.items():
        ours_resolved[uid] = classify_and_resolve_licenses(license_infos, all_constants)
        ours_no_openai_resolved[uid] = classify_and_resolve_licenses(
            our_uid_to_license_infos_no_openai[uid], all_constants
        )

    data_summary = add_license_classes_to_summaries(data_summary, ours_resolved, "DataProvenance")
    data_summary = add_license_classes_to_summaries(data_summary, ours_no_openai_resolved, "DataProvenance IgnoreOpenAI")

    return data_summary

In [None]:
countries_replace = { # These names need to be remapped from the original set to ISO3166
    "South Korea": "KOREA, REPUBLIC OF",
    "United Kingdom": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND",
    "Czech Republic": "CZECHIA",
    "Vietnam": "VIET NAM",
    "Iran": "IRAN, ISLAMIC REPUBLIC OF",
    "Russia": "RUSSIAN FEDERATION",
    "UAE": "UNITED ARAB EMIRATES",
    "United States": "UNITED STATES OF AMERICA",
    "Scotland": "UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND",
    "Turkey": "TÜRKIYE",
    "International/Other/Unknown": ""  # Empty name: never resolves, so it maps to no country
}

# Text annotations contain "African Continent" in several cases
# This is a list of ISO3166 codes for the countries in the African Continent, for mapping purposes
african_continent_iso_codes = [12, 24, 204, 72, 86, 854, 108, 132, 120, 140, 148, 174, 178, 180, 384, 262, 818, 226, 232, 748, 231, 260, 266, 270, 288, 324, 624, 404, 426, 430, 434, 450, 454, 466, 478, 480, 175, 504, 508, 516, 562, 566, 638, 646, 654, 678, 686, 690, 694, 706, 710, 728, 729, 834, 768, 788, 800, 732, 894, 716]

def get_country(x: str) -> list[int]:
    """Get the ISO3166 numeric code(s) for a country name.

    A list is returned for compatibility with x == "African Continent", which
    expands to every code in `african_continent_iso_codes`.

    Args:
        x: Country name as annotated. Names listed in `countries_replace` are
            first translated to the name used for the ISO3166 lookup.

    Returns:
        A list of integer codes: one element for a resolved country, a copy of
        `african_continent_iso_codes` for "African Continent", and an empty
        list when the name cannot be resolved (a warning is logged).
    """
    if x == "African Continent":
        # Return a copy so callers cannot alter the module-level list.
        return list(african_continent_iso_codes)
    try:
        country = countries.get(countries_replace.get(x, x))
    except KeyError:
        logging.warning("Could not find country for %s", x)
        return []
    # Read the code by field name rather than by position, and convert it to
    # int: the iso3166 package exposes `numeric` as a zero-padded string
    # (e.g. "076"), which neither matches the `list[int]` contract nor the
    # integer codes used for the African continent above.
    return [int(country.numeric)]

## Read Constants and Summaries

Load constants and data summaries from JSON files. Constants provide mappings and criteria for licenses, creator groups, and various other categories. Data summaries contain modality-specific information about datasets.

- `all_constants`: Dictionary containing all predefined constants.
- `{text/speech/video}_summaries`: Data summaries by modality.

In [None]:
# Read constants. Later cells index the result by group name, e.g.
# "LICENSE_PARAPHRASES", "CREATOR_GROUPS", "CREATOR_COUNTRY_GROUPS",
# "DOMAIN_GROUPS" and "DOMAIN_TYPES".
# All paths here are relative, so they resolve against the working directory
# the notebook is run from.
all_constants = io.read_all_constants("../../constants/")

# Read individual modality summaries (one directory per modality).
# Later cells treat each result as a list of per-dataset dicts that have at
# least "Unique Dataset Identifier" and "Licenses".
text_summaries = io.read_data_summary_json("../../data_summaries/")
speech_summaries = io.read_data_summary_json("../../data_summaries-speech/")
video_summaries = io.read_data_summary_json("../../data_summaries-video/")

## License Use by Modality

Show the proportion of license types by modality using a stacked bar chart (normalized). Licenses are mapped to a higher-level categorization: either **Non-Commercial/Academic**, **Unspecified**, or **Commercial** depending on the permissiveness of the original license.

In [None]:
# Plotting constants for the "License Use by Modality" chart
# Category order: used for the ordered categorical column and the legend sort.
LICENSE_ORDER = ["Non-Commercial/\nAcademic", "Unspecified", "Commercial"]
# Colour range passed to the chart's colour scale (one entry per category;
# presumably paired with LICENSE_ORDER position by position -- confirm).
LICENSE_PALETTE = ["#e04c71", "#e0cd92", "#82b5cf"]
# Chart width and height, passed to `.properties(...)`.
LICENSE_PLOTW = 600
LICENSE_PLOTH = 200

In [None]:
# Invert the paraphrase constants so each listed spelling of a license maps to
# the key it is grouped under.
license_paraphrases = invert_dict_of_lists(all_constants["LICENSE_PARAPHRASES"])

In [None]:
def _standardize_and_classify_licenses(summaries):
    """Apply the license paraphrases, then attach the resolved license classes."""
    return map_license_criteria_multimodal(
        remap_licenses_with_paraphrases(summaries, license_paraphrases),
        all_constants
    )


# Both steps update the summaries in place and hand the same list back.
text_summaries = _standardize_and_classify_licenses(text_summaries)
speech_summaries = _standardize_and_classify_licenses(speech_summaries)
video_summaries = _standardize_and_classify_licenses(video_summaries)

# One frame per modality, tagged with its name. Modality-specific columns are
# renamed to the shared names used by the later cells.
df_text = pd.DataFrame(text_summaries).assign(Modality="Text")
df_speech = (
    pd.DataFrame(speech_summaries)
    .assign(Modality="Speech")
    .rename(columns={"Location": "Countries"})
)
df_video = (
    pd.DataFrame(video_summaries)
    .assign(Modality="Video")
    .rename(columns={"Video Sources": "Source Category"})
)

# Stack the modalities; each frame keeps its own row labels.
df = pd.concat([df_text, df_speech, df_video])

In [None]:
# Collapse the resolved license-use classes into the three plotted groups.
# Classes that are not listed here become missing values.
license_use_to_type = {
    "academic-only": "Non-Commercial/\nAcademic",
    "non-commercial": "Non-Commercial/\nAcademic",
    "unspecified": "Unspecified",
    "commercial": "Commercial"
}
license_type = df["License Use (DataProvenance)"].map(license_use_to_type)

# Ordered categorical, so that sorting follows LICENSE_ORDER (not the alphabet)
df["License Type"] = pd.Categorical(license_type, categories=LICENSE_ORDER, ordered=True)
df = df.sort_values(by="License Type")

In [None]:
# Normalized stacked bars: one bar per modality, segments are license types.
# Each channel uses its matching wrapper class (alt.X for `x`, alt.Y for `y`);
# they were swapped before.
alt.Chart(df).mark_bar().encode(
    x=alt.X(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    y=alt.Y("Modality:N"),
    color=alt.Color(
        "License Type:N",
        scale=alt.Scale(range=LICENSE_PALETTE),
        title="License Use",
        sort=LICENSE_ORDER
    ),
    # NOTE(review): no "order" column is created in this notebook; presumably
    # the segments then follow the row order of `df` (sorted by license type
    # in the previous cell) -- confirm before relying on it.
    order="order:Q"
).properties(
    title="License Use by Modality",
    width=LICENSE_PLOTW,
    height=LICENSE_PLOTH
)

## Visualize Creator Categories (e.g. Academic, Industry) by Modality

Show the distribution of creator categories across modalities. Plots a normalized stacked bar chart, and also donut/pie for comparison (for now).

- `df_categories`: DataFrame unlisted to handle multiple creator categories.

In [None]:
# Plotting constants
# Category order: used for the ordered categorical column and the legend sort.
CREATORCATEGORY_ORDER = ["Academic", "Research Group", "Industry Lab", "Corporation", "Startup", "Other", "Government", "Unspecified"]
# Colour range passed to the colour scale (eight entries, as many as the
# categories above; presumably paired position by position -- confirm).
CREATORCATEGORY_PALETTE = ["#CF4E9CFF", "#8C57A2FF", "#358DB9FF", "#82581FFF", "#2F509EFF", "#E5614CFF", "#97A1A7FF", "#2E2A2BFF"]
# Chart width and height; the donut chart reuses the height for both sides.
CREATORCATEGORY_PLOTW = 600
CREATORCATEGORY_PLOTH = 200

In [None]:
# Reverse lookups built from the constants:
# creator name -> creator group (used for the "Creator Categories" column)
creator_groupmap = invert_dict_of_lists(all_constants["CREATOR_GROUPS"])
# creator name -> country group (used to derive "Countries" for Text datasets)
creator_countrymap = invert_dict_of_lists(all_constants["CREATOR_COUNTRY_GROUPS"])

In [None]:
def _creator_categories(creators):
    """Look up the creator group of every creator of one dataset."""
    return [creator_groupmap[creator] for creator in creators]


# Map creators to categories (all modalities from constants, for this)
df["Creator Categories"] = df["Creators"].map(_creator_categories)

# One row per creator category; missing values in any column become "Unspecified" (for now)
df_categories = df.explode("Creator Categories").fillna("Unspecified")

In [None]:
# For Text, we can infer the country from the creator group using the constants
# For other modalities, they're taken from the summaries (annotated independently)
is_text = df_categories["Modality"] == "Text"
df_categories.loc[is_text, "Countries"] = df_categories.loc[is_text, "Creators"].map(
    lambda creators: [creator_countrymap[creator] for creator in creators]
)

# Ordered categorical, so that sorting follows CREATORCATEGORY_ORDER
creator_categories = pd.Categorical(
    df_categories["Creator Categories"],
    categories=CREATORCATEGORY_ORDER,
    ordered=True
)
df_categories["Creator Categories"] = creator_categories
df_categories = df_categories.sort_values(by="Creator Categories")

In [None]:
# Normalized stacked bars: one bar per modality, segments are creator categories.
# Each channel uses its matching wrapper class (alt.X for `x`, alt.Y for `y`);
# they were swapped before.
alt.Chart(df_categories).mark_bar().encode(
    x=alt.X(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    y=alt.Y("Modality:N"),
    color=alt.Color(
        "Creator Categories:N",
        scale=alt.Scale(range=CREATORCATEGORY_PALETTE),
        title="Creator Category",
        sort=CREATORCATEGORY_ORDER
    ),
    # NOTE(review): no "order" column is created in this notebook; presumably
    # the segments then follow the row order of `df_categories` (sorted by
    # creator category in the previous cell) -- confirm before relying on it.
    order="order:Q"
).properties(
    title="Creator Categories by Modality",
    width=CREATORCATEGORY_PLOTW,
    height=CREATORCATEGORY_PLOTH
)

In [None]:
# Donut chart as alternate, to test
creator_category_color = alt.Color(
    "Creator Categories:N",
    scale=alt.Scale(range=CREATORCATEGORY_PALETTE),
    title="Creator Category",
    sort=CREATORCATEGORY_ORDER
)

# A single donut: arc sizes are row counts, coloured by creator category
creator_category_donut = (
    alt.Chart(df_categories)
    .mark_arc(innerRadius=40)
    .encode(
        theta="count():Q",
        color=creator_category_color,
        order="order:Q"
    )
    .properties(
        title="Creator Categories by Modality",
        width=CREATORCATEGORY_PLOTH, # Use height as width for square aspect ratio
        height=CREATORCATEGORY_PLOTH
    )
)

# One donut per modality, laid out side by side
creator_category_donut.facet("Modality:N", columns=3)

## Visualize Dataset Count by Creator Country/Region and Modality

Global distribution of datasets by modality. Uses a world map with color-coded regions to indicate the count of datasets from different regions.

- `df_countries`: DataFrame with country codes for plotting.
- `base`: Base map (shared across all modalities).
- `charts`: List of modality-specific maps (to concatenate into facets).

In [None]:
# Plotting constants for the world maps
# Width of each map; the height is derived from it as half this value.
CREATORCOUNTRY_PLOTDIM = 600
# Modality -> name of the colour scheme passed to that modality's colour scale.
# The key order is also the order in which the maps are stacked.
MODALITY_COLORS = {
    "Text": "reds",
    "Speech": "blues",
    "Video": "greens"
}

In [None]:
countries_src = data.world_110m.url # World map for plotting

# `df_categories` holds one row per creator category of a dataset, so a dataset
# with several creators appears several times. Keep a single row per
# (dataset, country, modality) so that "Count" really counts datasets.
df_countries = (
    df_categories
    .explode("Countries")
    .dropna(subset=["Countries"]) # Drop rows with no country for the moment (only that column is checked)
    .drop_duplicates(subset=["Unique Dataset Identifier", "Countries", "Modality"])
)
df_countries = df_countries[["Countries", "Modality"]].value_counts().reset_index(name="Count")

# Resolve names to ISO3166 numeric codes; unresolved names yield no rows
df_countries["Country ID"] = df_countries["Countries"].map(get_country)
df_countries = df_countries.explode("Country ID").dropna(subset=["Country ID"])
# Use plain integers as the join key, whatever form the codes were returned in
df_countries["Country ID"] = df_countries["Country ID"].astype(int)

# Several names can resolve to the same code (e.g. "Scotland" and
# "United Kingdom", or "African Continent" and one of its countries). The map
# looks rows up by "Country ID", so merge them into one row per code and
# modality: counts are summed and the contributing names are listed together.
df_countries = (
    df_countries
    .groupby(["Country ID", "Modality"], as_index=False)
    .agg(
        Countries=("Countries", lambda names: ", ".join(sorted(set(names)))),
        Count=("Count", "sum")
    )
)

In [None]:
# Base map (shared across all modalities)
base = (
    alt.Chart(alt.topo_feature(countries_src, "countries"))
    .mark_geoshape(stroke="white")
    .project(type="equalEarth")
)


def _modality_map(modality, scheme):
    """Build the world map for one modality, coloured with the given scheme."""
    modality_counts = df_countries[df_countries["Modality"] == modality]
    dataset_color = alt.Color(
        "Count:Q",
        # log scale
        scale=alt.Scale(scheme=scheme, type="symlog"),
        title="Datasets"
    )
    # Join the per-country counts onto the map shapes by their id
    country_lookup = alt.LookupData(
        modality_counts, "Country ID", ["Count", "Modality", "Countries"]
    )
    return (
        base
        .encode(
            color=dataset_color,
            tooltip=["Countries:N", "Count:Q", "Modality:N"]
        )
        .properties(
            width=CREATORCOUNTRY_PLOTDIM,
            height=CREATORCOUNTRY_PLOTDIM//2
        )
        .transform_lookup(lookup="id", from_=country_lookup)
        # Shapes without a matching row get a zero count and empty labels
        .transform_calculate(
            Count="isValid(datum.Count) ? datum.Count : 0",
            Modality="isValid(datum.Modality) ? datum.Modality : ''",
            Countries="isValid(datum.Countries) ? datum.Countries : ''"
        )
        .properties(title=modality)
    )


# One map per modality, in the order of MODALITY_COLORS
charts = [_modality_map(modality, scheme) for modality, scheme in MODALITY_COLORS.items()]

# Stack the maps vertically; each keeps its own colour scale
alt.vconcat(
    *charts
).resolve_scale(
    color="independent"
).properties(
    title="Dataset Count by Country and Modality"
)

## Visualize Data Source Categories by Modality

Distribution of source categories, mapped to higher-level groups in `domain_types.json`, across modalities.

- `df_sources`: DataFrame with grouped sources.

In [None]:
# Reverse lookups built from the constants:
# individual text source -> domain group (applied to "Text Sources" below)
domain_groupmap = invert_dict_of_lists(all_constants["DOMAIN_GROUPS"])
# source category -> higher-level domain type (applied to "Source Category" below)
domain_typemap = invert_dict_of_lists(all_constants["DOMAIN_TYPES"])

In [None]:
# Plotting constants for the source category charts
# Chart width and height, passed to `.properties(...)`.
SOURCECATEGORY_PLOTW = 600
SOURCECATEGORY_PLOTH = 200
# Category order: every domain type in sorted order, except that "Other" is
# moved to the end, followed by the "Unspecified" filler category.
SOURCECATEGORY_ORDER = sorted(set(domain_typemap.values()) - {"Other"}) + ["Other", "Unspecified"]

In [None]:
# For Text, we can infer the domain from the text sources using the constants
# For other modalities, they're taken from the summaries (renamed columns)
is_text = df["Modality"] == "Text"
df.loc[is_text, "Source Category"] = df.loc[is_text, "Text Sources"].map(
    lambda sources: [domain_groupmap[source] for source in sources]
)

In [None]:
# Unlist to have one row per source category (atomic components)
df_sources = df.explode("Source Category")
# Map each source category to its higher-level domain type; anything without a
# mapping (including missing values) becomes "Unspecified", for now
df_sources["Source Category"] = df_sources["Source Category"].map(
    domain_typemap
).fillna("Unspecified")

# Ordered categorical, so that sorting follows SOURCECATEGORY_ORDER
df_sources["Source Category"] = pd.Categorical(
    df_sources["Source Category"],
    categories=SOURCECATEGORY_ORDER,
    ordered=True
)
df_sources = df_sources.sort_values(by="Source Category")

# Normalized stacked bars: one bar per modality, segments are source categories.
# Each channel uses its matching wrapper class (alt.X for `x`, alt.Y for `y`);
# they were swapped before.
alt.Chart(df_sources).mark_bar().encode(
    x=alt.X(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    y=alt.Y("Modality:N"),
    color=alt.Color(
        "Source Category:N",
        title="Source Category",
        sort=SOURCECATEGORY_ORDER
    ),
    # NOTE(review): no "order" column is created in this notebook; presumably
    # the segments then follow the row order of `df_sources` (sorted just
    # above) -- confirm before relying on it.
    order="order:Q"
).configure_legend(
    orient="bottom",
    columns=3,
    labelLimit=0
).properties(
    title="Source Categories by Modality",
    width=SOURCECATEGORY_PLOTW,
    height=SOURCECATEGORY_PLOTH
)

Here, we re-plot the source category distributions, but aggregating within collections for text data (i.e. we use the majority source within each collection). This is intended for a possible ToS mapping, since those annotations are made at the collection level.

In [None]:
df_text = df_sources[df_sources["Modality"] == "Text"].copy()
df_nontext = df_sources[df_sources["Modality"] != "Text"]


def _majority_source_category(categories):
    """Most frequent value of a group, or "Unspecified" if there is none."""
    modes = categories.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else "Unspecified"


# Give every Text row the majority source category of its collection ...
df_text.loc[:, "Source Category"] = df_text.groupby("Collection")["Source Category"].transform(
    _majority_source_category
)

# ... then keep a single row per collection
df_text = df_text.drop_duplicates(subset="Collection")

df_sources = pd.concat([df_nontext, df_text], ignore_index=True)

# Logged at warning level so the message is visible with the default logging setup
logging.warning("Aggregating to %d collections" % len(df_sources.loc[df_sources["Modality"] == "Text", "Collection"].unique()))

df_sources = df_sources.sort_values(by="Source Category")

# Normalized stacked bars: one bar per modality, segments are source categories.
# Each channel uses its matching wrapper class (alt.X for `x`, alt.Y for `y`);
# they were swapped before.
alt.Chart(df_sources).mark_bar().encode(
    x=alt.X(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    y=alt.Y("Modality:N"),
    color=alt.Color(
        "Source Category:N",
        title="Source Category",
        sort=SOURCECATEGORY_ORDER
    ),
    # NOTE(review): no "order" column is created in this notebook; presumably
    # the segments then follow the row order of `df_sources` (sorted just
    # above) -- confirm before relying on it.
    order="order:Q"
).configure_legend(
    orient="bottom",
    columns=3,
    labelLimit=0
).properties(
    title="Source Categories by Modality (Aggregated Collections)",
    width=SOURCECATEGORY_PLOTW,
    height=SOURCECATEGORY_PLOTH
)