In [None]:
!pip3 install -q -U pandas altair vega_datasets iso3166 vl-convert-python matplotlib seaborn

In [None]:
import sys
# Append system path
sys.path = [p for p in sys.path if not p.endswith("../..")]  # Cleans duplicated '../..'
sys.path.insert(0, "../")  # This adds `src` to the path
import os
import logging
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows() # Allow using more than 5000 rows, for now
logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler(stream=sys.stdout)])
from vega_datasets import data
from helpers import io
from analysis import multimodal_util


%load_ext autoreload
%autoreload 2

# Plotting Constants (All Plots)

In [None]:
def times_newroman():
    """Build an Altair theme config that sets Times New Roman on all chart text.

    Returns:
        dict: A ``{"config": {...}}`` mapping that sets the font for titles,
        axes, headers, legends and text marks.
    """
    typeface = "Times New Roman"

    def label_and_title():
        # Fresh dict per section so the sections never share one object.
        return {"labelFont": typeface, "titleFont": typeface}

    theme_config = {
        "title": {"font": typeface},
        "axis": label_and_title(),
        "header": label_and_title(),
        "legend": label_and_title(),
        "text": {"font": typeface},
    }
    return {"config": theme_config}

# Register the theme function under the name "times_newroman", then make it
# the active Altair theme.
alt.themes.register("times_newroman", times_newroman)
alt.themes.enable("times_newroman")

In [None]:
# Typography / layout defaults
FONT_SIZE = 16
LEGEND_POSITION = "bottom"
MAX_LABELLIMIT = 1000  # Large number to avoid label summarization in plots

# Whether and where to output plots
PLOT_TOFILE = True
PLOT_PPI = 300
PLOT_DIR = os.path.expanduser("~/dpi-plotsmultimodal/")  # "~" resolved to the user's home directory

# Create directory if needed
if PLOT_TOFILE:
    os.makedirs(PLOT_DIR, exist_ok=True)

## Read Constants and Summaries

Load constants and data summaries from JSON files. Constants provide mappings and criteria for licenses, creator groups, and various other categories. Data summaries contain modality-specific information about datasets.

- `all_constants`: Dictionary containing all predefined constants.
- `{text/speech/video}_summaries`: Data summaries by modality.

In [None]:
# Output settings (PLOT_TOFILE, PLOT_DIR, PLOT_PPI, MAX_LABELLIMIT) are defined
# once in the "Plotting Constants (All Plots)" cell above, which also creates
# PLOT_DIR. They are deliberately not repeated here so there is a single place
# to change them.

# Plotting constants
LICENSE_ORDER = ["Non-Commercial/\nAcademic", "Unspecified", "Commercial"]
LICENSE_PALETTE = ["#e04c71", "#e0cd92", "#82b5cf"]
LICENSE_PLOTW = 600
LICENSE_PLOTH = 200
# "Unknown" and "<2013" buckets followed by one label per year, 2013 through 2024.
YEAR_CATEGORIES = ["Unknown", "<2013", *(str(year) for year in range(2013, 2025))]

# Read constants
all_constants = io.read_all_constants("../../constants/")

# Read Terms data
collection_to_terms_mapper = multimodal_util.load_terms_metadata("multimodal_terms_data")

# Read individual modality summaries.
# NOTE(review): the log messages below announce a consistency check, but the
# check calls are commented out and `analysis_util` is not imported in the
# import cell, so no check actually runs. Re-enable or reword.
text_summaries = io.read_data_summary_json("../../data_summaries/")
logging.info("Checking Text Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(text_summaries, all_constants)

speech_summaries = io.read_data_summary_json("../../data_summaries-speech/")
logging.info("Checking Speech Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(speech_summaries, all_constants)

video_summaries = io.read_data_summary_json("../../data_summaries-video/")
logging.info("Checking Video Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(video_summaries, all_constants)


In [None]:
# Prep dataframes
# Hands the three per-modality summaries, the constants, the terms mapper and
# the year / license orderings to the project helper. The result `df` is the
# input to every plotting cell below.
# NOTE(review): the columns of `df` are defined inside `multimodal_util`; later
# cells rely on at least "Modality", "Collection" and "License Type".
df = multimodal_util.prep_summaries_for_visualization(
    text_summaries,
    speech_summaries,
    video_summaries,
    all_constants,
    collection_to_terms_mapper,
    YEAR_CATEGORIES,
    LICENSE_ORDER,
)

## License Use by Modality

Show the proportion of license types by modality using a stacked bar chart (normalized). Licenses are mapped to a higher-level categorization: either **Non-Commercial/Academic**, **Unspecified**, or **Commercial** depending on the permissiveness of the original license.

In [None]:
# License-type chart per modality, drawn by the project helper from `df` with
# the license palette, ordering and plot dimensions defined earlier.
# PLOT_DIR / PLOT_PPI are forwarded too, presumably for exporting the figure;
# confirm in `multimodal_util`.
multimodal_util.plot_license_terms_stacked_bar_chart(
    df, LICENSE_PALETTE, LICENSE_ORDER, LICENSE_PLOTW, LICENSE_PLOTH, PLOT_DIR, PLOT_PPI
)


In [None]:
# Same arguments as the license chart cell above, passed to the `_collections`
# variant of the helper. Presumably this aggregates at the collection level, as
# the name suggests; confirm in `multimodal_util`.
multimodal_util.plot_license_terms_stacked_bar_chart_collections(
    df, LICENSE_PALETTE, LICENSE_ORDER, LICENSE_PLOTW, LICENSE_PLOTH, PLOT_DIR, PLOT_PPI
)

## Visualize Creator Categories (e.g. Academic, Industry) by Modality

Show the distribution of creator categories across modalities. Plots a normalized stacked bar chart, and also donut/pie for comparison (for now).

- `df_categories`: DataFrame unlisted to handle multiple creator categories.

In [None]:
# Plotting constants
# Eight creator categories and eight colours (8-digit hex codes), in the order
# they are passed to the plotting helper.
CREATORCATEGORY_ORDER = [
    "Academic",
    "Research Group",
    "Industry Lab",
    "Corporation",
    "Startup",
    "Other",
    "Government",
    "Unspecified",
]
CREATORCATEGORY_PALETTE = [
    "#CF4E9CFF",
    "#8C57A2FF",
    "#358DB9FF",
    "#82581FFF",
    "#2F509EFF",
    "#E5614CFF",
    "#97A1A7FF",
    "#2E2A2BFF",
]
CREATORCATEGORY_PLOTW, CREATORCATEGORY_PLOTH = 600, 100

In [None]:
# Creator-category chart per modality, drawn by the project helper from `df`
# with the category ordering, palette and plot dimensions defined above.
# PLOT_DIR is forwarded too, presumably for exporting the figure; confirm in
# `multimodal_util`.
multimodal_util.plot_stacked_creator_categories(
    df, CREATORCATEGORY_ORDER, CREATORCATEGORY_PALETTE, CREATORCATEGORY_PLOTW, CREATORCATEGORY_PLOTH, PLOT_DIR
)

## Visualize Dataset Count by Creator Country/Region and Modality

Global distribution of datasets by modality. Uses a world map with color-coded regions to indicate the count of datasets from different regions.

- `df_countries`: DataFrame with country codes for plotting.
- `base`: Base map (shared across all modalities).
- `charts`: List of modality-specific maps (to concatenate into facets).

In [None]:
# Plotting constants
CREATORCOUNTRY_PLOTDIM = 600
# One colour-scheme name per modality, handed to the world-map helper.
MODALITY_COLORS = dict(
    Text="reds",
    Speech="blues",
    Video="greens",
)

In [None]:
# Build the world-map figure with the project helper: `df`, the URL of the
# vega_datasets world_110m map data, the per-modality colour schemes, the plot
# dimension and the plot directory are passed in.
map_charts = multimodal_util.plot_altair_worldmap(
    df,
    data.world_110m.url, # World map for plotting
    MODALITY_COLORS,
    CREATORCOUNTRY_PLOTDIM,
    PLOT_DIR
)

# Last expression of the cell, so the returned object is displayed as output.
map_charts

## Visualize Data Source Categories by Modality

Distribution of source categories, mapped to higher-level groups in `domain_types.json`, across modalities.

- `df_sources`: DataFrame with grouped sources.

In [None]:
# Plotting constants
# Inverted form of the DOMAIN_TYPES constants; its values are the higher-level
# group names used below. Presumably it maps each source to its group; confirm
# in `invert_dict_of_lists`.
DOMAIN_TYPEMAP = multimodal_util.invert_dict_of_lists(all_constants["DOMAIN_TYPES"])
SOURCECATEGORY_PLOTW = 600
SOURCECATEGORY_PLOTH = 100
# Groups in alphabetical order, with the two catch-all labels pinned to the end.
# Both pinned labels are excluded from the sorted part, so "Unspecified" cannot
# appear twice if it is also a group name in DOMAIN_TYPES.
_SOURCECATEGORY_PINNED_LAST = ["Other", "Unspecified"]
SOURCECATEGORY_ORDER = (
    sorted(set(DOMAIN_TYPEMAP.values()) - set(_SOURCECATEGORY_PINNED_LAST))
    + _SOURCECATEGORY_PINNED_LAST
)

In [None]:
# Source-category chart per modality, drawn by the project helper from `df`
# using the domain type map, category ordering and plot dimensions defined
# above. PLOT_DIR is forwarded too, presumably for exporting the figure;
# confirm in `multimodal_util`.
multimodal_util.plot_source_domain_stacked_chart(
    df, DOMAIN_TYPEMAP, SOURCECATEGORY_ORDER, SOURCECATEGORY_PLOTW, SOURCECATEGORY_PLOTH, PLOT_DIR
)

Here, we plot source/domain type distributions by year and modality.

Here, we re-plot the source category distributions, but aggregating within collections for text data (i.e. we use the majority source within each collection). This supports a possible ToS mapping, since those annotations are at the collection level.

In [None]:
# Same arguments as the source-category chart cell, passed to the
# `_collections` variant of the helper. Presumably this aggregates at the
# collection level, as the name suggests; confirm in `multimodal_util`.
multimodal_util.plot_source_domain_stacked_chart_collections(
    df, DOMAIN_TYPEMAP, SOURCECATEGORY_ORDER, SOURCECATEGORY_PLOTW, SOURCECATEGORY_PLOTH, PLOT_DIR
)

## Combined Plots

In [None]:
# Split rows by modality. The text rows are copied because they are modified
# below; the remaining rows are only read.
is_text = df["Modality"] == "Text"
df_text = df[is_text].copy()
df_nontext = df[~is_text]

def pick_licensetype(x: pd.Series) -> str:
    """Pick a single license type to represent a collection.

    The non-commercial/academic type wins whenever any entry carries it, as it
    is the most restrictive type. Otherwise the most frequent value is
    returned; on a tie, the first value reported by ``Series.mode`` is used.

    Args:
        x: License types of the entries belonging to one collection.

    Returns:
        The chosen license type, or "Unspecified" when ``x`` is empty or holds
        only null values.
    """
    # The plots label this category with an embedded line break (see
    # LICENSE_ORDER), so accept both spellings and return whichever one the
    # data actually uses.
    for noncommercial in ("Non-Commercial/Academic", "Non-Commercial/\nAcademic"):
        if noncommercial in x.values:
            return noncommercial
    # mode() ignores nulls, so it is empty for empty or all-null input;
    # indexing it blindly would raise in that case.
    modes = x.mode()
    return modes.iloc[0] if not modes.empty else "Unspecified"

# Give every row of a text collection the same, collection-level license type.
df_text.loc[:, "License Type"] = df_text.groupby("Collection")["License Type"].transform(pick_licensetype)

# Keep a single row per text collection.
df_text = df_text.drop_duplicates(subset="Collection")

# Non-text rows are kept as they are; text rows are now one per collection.
df_licensescondensed = pd.concat([df_nontext, df_text], ignore_index=True)

# Pass the count as a logging argument instead of pre-formatting with `%`.
n_text_collections = len(
    df_licensescondensed.loc[df_licensescondensed["Modality"] == "Text", "Collection"].unique()
)
logging.warning("Aggregating to %d collections", n_text_collections)

df_licensescondensed = df_licensescondensed.sort_values(by="License Type")
# Normalized stacked bars: one bar per modality, split by license type.
chart_licensesaggregated = alt.Chart(df_licensescondensed).mark_bar().encode(
    x=alt.X(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%", title=None, labels=False, ticks=False)
    ),
    y=alt.Y("Modality:N", title=""),
    color=alt.Color(
        "License Type:N",
        title="License Use",
        scale=alt.Scale(range=LICENSE_PALETTE),
        sort=LICENSE_ORDER
    ),
    # NOTE(review): no "order" column is created anywhere in this notebook.
    # Confirm that `prep_summaries_for_visualization` adds one; otherwise this
    # encoding refers to a field that does not exist in the data.
    order="order:Q"
)

chart_licensesaggregated