In [None]:
!pip3 install -q -U pandas altair vega_datasets iso3166 vl-convert-python matplotlib seaborn scipy scikit-learn

In [None]:
import sys
# Make `src` importable. Entries left behind by an earlier run of this cell are
# dropped first so re-running it never stacks duplicates on sys.path; entries
# ending in "../.." are still cleaned, as before.
sys.path = [p for p in sys.path if not p.endswith(("../..", "../"))]
sys.path.insert(0, "../")  # This adds `src` to the path
import os
import logging
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows() # Allow using more than 5000 rows, for now
# Send log records (DEBUG and above) to stdout.
logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler(stream=sys.stdout)])
from vega_datasets import data
from helpers import io
from analysis import multimodal_util


%load_ext autoreload
%autoreload 2

# Plotting Constants (All Plots)

In [None]:
def times_newroman():
    """Build an Altair theme config that uses Times New Roman everywhere.

    Returns:
        dict: A theme dictionary whose ``"config"`` entry sets the font for
        titles, axes, headers, legends and text marks.
    """
    font = "Times New Roman"

    def label_and_title():
        # Fresh dict per section so the sections never share one object.
        return {"labelFont": font, "titleFont": font}

    config = {
        "title": {"font": font},
        "axis": label_and_title(),
        "header": label_and_title(),
        "legend": label_and_title(),
        "text": {"font": font},
    }
    return {"config": config}

# Register the theme function under the name "times_newroman", then enable it.
# NOTE(review): newer Altair releases expose `alt.theme` as the successor to
# `alt.themes` — check the installed version for deprecation warnings.
alt.themes.register("times_newroman", times_newroman)
alt.themes.enable("times_newroman")

In [None]:
# Typography / layout shared by all plots
FONT_SIZE = 16
LEGEND_POSITION = "bottom"
MAX_LABELLIMIT = 400  # Large number to avoid label summarization in plots

# Export settings: whether to write plots to disk, where, and at what resolution
PLOT_TOFILE = True
PLOT_PPI = 300
PLOT_DIR = os.path.expanduser("~/dpi-plotsmultimodal/")  # "~" resolved to the user's home

# Create directory if needed
if PLOT_TOFILE:
    os.makedirs(PLOT_DIR, exist_ok=True)

## Read Constants and Summaries

Load constants and data summaries from JSON files. Constants provide mappings and criteria for licenses, creator groups, and various other categories. Data summaries contain modality-specific information about datasets.

- `all_constants`: Dictionary containing all predefined constants.
- `{text/speech/video}_summaries`: Data summaries by modality.

In [None]:
# Export settings (PLOT_TOFILE, PLOT_DIR, PLOT_PPI, MAX_LABELLIMIT) are defined
# once in the "Plotting Constants (All Plots)" cell; they are deliberately not
# repeated here so there is a single place to change them.

# Plotting constants
# License categories and their colours (presumably paired by position — confirm
# against the plotting helpers that receive both lists).
LICENSE_ORDER = ["NC/Acad", "Unspecified", "Commercial"]
LICENSE_PALETTE = ["#e04c71", "#e0cd92", "#82b5cf"]

# One row per license category, four terms variants each; the palette below
# follows the same row layout.
LICENSE_TERMS_ORDER = [
    "NC/Acad | Model Closed", "NC/Acad | Source Closed", "NC/Acad | Unspecified", "NC/Acad | Unrestricted",
    "Unspecified | Model Closed", "Unspecified | Source Closed", "Unspecified | Unspecified", "Unspecified | Unrestricted",
    "Commercial | Model Closed", "Commercial | Source Closed", "Commercial | Unspecified", "Commercial | Unrestricted",
]
LICENSE_TERMS_PALETTE = [
    '#9d354f', '#c24262', '#e04c71', '#e04c71',  # Shades of #e04c71
    '#9d9066', '#c2b27f', '#e0cd92', '#e0cd92',  # Shades of #e0cd92
    '#5b7f91', '#719db3', '#82b5cf', '#82b5cf',   # Shades of #82b5cf
]

LICENSE_PLOTW = 400
LICENSE_PLOTH = 100
# "Unknown", "<2013", then each year 2013..2024 as a string.
YEAR_CATEGORIES = ["Unknown", "<2013", *map(str, range(2013, 2025))]

# Read constants
all_constants = io.read_all_constants("../../constants/")

# Read Terms data
collection_to_terms_mapper = multimodal_util.load_terms_metadata("multimodal_terms_data")

# Read individual modality summaries
# NOTE(review): the check_datasummary_in_constants calls below are commented
# out, so each "Checking ..." log line is emitted without a check running.
# `analysis_util` is also not imported in the visible import cell.
text_summaries = io.read_data_summary_json("../../data_summaries/")
logging.info("Checking Text Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(text_summaries, all_constants)

speech_summaries = io.read_data_summary_json("../../data_summaries-speech/")
logging.info("Checking Speech Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(speech_summaries, all_constants)

video_summaries = io.read_data_summary_json("../../data_summaries-video/")
logging.info("Checking Video Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(video_summaries, all_constants)


In [None]:
# Prep dataframes
# Combines the three modality summaries with the constants, the terms mapping,
# and the year / license orderings; `df` is the input to every plot that follows.
df = multimodal_util.prep_summaries_for_visualization(
    text_summaries,
    speech_summaries,
    video_summaries,
    all_constants,
    collection_to_terms_mapper,
    YEAR_CATEGORIES,
    LICENSE_ORDER,
)

In [None]:
# video_summaries

In [None]:
# Frequency of each distinct value in the "Data Terms" column.
df["Data Terms"].value_counts()

In [None]:
# df["License Type"].unique()

## License Use by Modality

Show the proportion of license types by modality using a stacked bar chart (normalized). Licenses are mapped to a higher-level categorization: either **Non-Commercial/Academic**, **Unspecified**, or **Commercial** depending on the permissiveness of the original license.

In [None]:
# multimodal_util.plot_license_terms_stacked_bar_chart(
#     df, LICENSE_PALETTE, LICENSE_ORDER, LICENSE_PLOTW, LICENSE_PLOTH, PLOT_DIR, PLOT_PPI
# )


In [None]:
# The helper returns a (chart, table) pair for the "License Type" column; the
# chart is reused by the combined figure later in the notebook.
license_chart, license_table = multimodal_util.plot_license_terms_stacked_bar_chart_collections(
    df, "License Type", LICENSE_PALETTE, LICENSE_ORDER, LICENSE_PLOTW, LICENSE_PLOTH, PLOT_DIR, PLOT_PPI
)

license_chart

In [None]:
# Same helper, applied to the combined "License | Terms" column.
# NOTE(review): 800 and 200 sit where LICENSE_PLOTW / LICENSE_PLOTH are passed
# in the license-type call, so they are presumably plot width and height.
license_terms_chart, license_terms_table = multimodal_util.plot_license_terms_stacked_bar_chart_collections(
    df, "License | Terms", LICENSE_TERMS_PALETTE, LICENSE_TERMS_ORDER, 800, 200, PLOT_DIR, PLOT_PPI
)

license_terms_chart

In [None]:
# Plain-text dump of the "Video" entry of the license/terms table.
print(license_terms_table["Video"])

## Visualize Creator Categories (e.g. Academic, Industry) by Modality

Show the distribution of creator categories across modalities. Plots a normalized stacked bar chart, and also donut/pie for comparison (for now).

- `df_categories`: DataFrame unlisted to handle multiple creator categories.

In [None]:
# Creator-category plot settings: category order, matching palette, and size
CREATORCATEGORY_ORDER = [
    "Academic",
    "Research Group",
    "Industry Lab",
    "Corporation",
    "Startup",
    "Other",
    "Government",
    "Unspecified",
]
CREATORCATEGORY_PALETTE = [
    "#CF4E9CFF",
    "#8C57A2FF",
    "#358DB9FF",
    "#82581FFF",
    "#2F509EFF",
    "#E5614CFF",
    "#97A1A7FF",
    "#2E2A2BFF",
]
CREATORCATEGORY_PLOTW, CREATORCATEGORY_PLOTH = 600, 200

In [None]:
# Keep the returned chart: the "Combined Plots" cell concatenates
# `creator_chart` with the license and source charts, and would otherwise fail
# with a NameError on a fresh kernel.
# NOTE(review): assumes plot_stacked_creator_categories returns the Altair
# chart, as plot_source_domain_stacked_chart_collections does — confirm in
# multimodal_util.
creator_chart = multimodal_util.plot_stacked_creator_categories(
    df, CREATORCATEGORY_ORDER, CREATORCATEGORY_PALETTE, CREATORCATEGORY_PLOTW, CREATORCATEGORY_PLOTH, PLOT_DIR
)

creator_chart

In [None]:
# multimodal_util.plot_donut_creator_categories(
#     df, CREATORCATEGORY_ORDER, CREATORCATEGORY_PALETTE, CREATORCATEGORY_PLOTH, PLOT_DIR
# )

## Visualize Dataset Count by Creator Country/Region and Modality

Global distribution of datasets by modality. Uses a world map with color-coded regions to indicate the count of datasets from different regions.

- `df_countries`: DataFrame with country codes for plotting.
- `base`: Base map (shared across all modalities).
- `charts`: List of modality-specific maps (to concatenate into facets).

In [None]:
# World-map plot settings: plot dimension and one colour-scheme name per modality
CREATORCOUNTRY_PLOTDIM = 400
MODALITY_COLORS = dict(Text="reds", Speech="blues", Video="greens")

In [None]:
# Build the world-map charts from `df`, the vega_datasets world topology URL,
# the per-modality colour schemes, and the plot dimension.
map_charts = multimodal_util.plot_altair_worldmap(
    df,
    data.world_110m.url, # World map for plotting
    MODALITY_COLORS,
    CREATORCOUNTRY_PLOTDIM,
    PLOT_DIR
)

map_charts

## Visualize Data Source Categories by Modality

Distribution of source categories, mapped to higher-level groups in `domain_types.json`, across modalities.

- `df_sources`: DataFrame with grouped sources.

In [None]:
# Source-category plot settings
DOMAIN_TYPEMAP = multimodal_util.invert_dict_of_lists(all_constants["DOMAIN_TYPES"])
SOURCECATEGORY_PLOTW, SOURCECATEGORY_PLOTH = 400, 100

# Named groups in alphabetical order, then "Other" and "Unspecified" at the end.
_named_source_groups = set(DOMAIN_TYPEMAP.values())
_named_source_groups.discard("Other")
SOURCECATEGORY_ORDER = [*sorted(_named_source_groups), "Other", "Unspecified"]

In [None]:
# Stacked source-domain chart; the call is the cell's last expression, so
# whatever the helper returns is what the notebook displays.
multimodal_util.plot_source_domain_stacked_chart(
    df, DOMAIN_TYPEMAP, SOURCECATEGORY_ORDER, SOURCECATEGORY_PLOTW, SOURCECATEGORY_PLOTH, PLOT_DIR
)

Here, we plot source/domain type distributions by year and modality.

Here, we re-plot the source category distributions, but aggregating within collections for text data (i.e. we use the majority source within collections). For possible ToS mapping (since those annotations are collection level).

In [None]:
# Collection-level variant of the source-domain chart; `source_chart` is reused
# by the combined figure later in the notebook.
source_chart = multimodal_util.plot_source_domain_stacked_chart_collections(
    df, DOMAIN_TYPEMAP, SOURCECATEGORY_ORDER, SOURCECATEGORY_PLOTW, SOURCECATEGORY_PLOTH, PLOT_DIR
)

source_chart

## Combined Plots

In [None]:
# Place the license, creator and source charts side by side with shared axes
# and independent colour scales. Legend position and export resolution come
# from the notebook-wide constants (LEGEND_POSITION, PLOT_PPI) rather than
# repeated literals.
combined_chart = alt.hconcat(
    license_chart,
    creator_chart,
    source_chart
).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    titleFontSize=FONT_SIZE,
    labelFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=2,
    labelLimit=MAX_LABELLIMIT
).configure_title(
    fontSize=FONT_SIZE
).resolve_scale(
    color="independent",
    x="shared",
    y="shared"
).resolve_axis(
    x="shared",
    y="shared"
)

# Patch per-panel axes on the serialized spec. Panel indices follow the hconcat
# order above: 0 = license, 1 = creator, 2 = source.
combined_chart_json = combined_chart.to_dict()

combined_chart_json["hconcat"][0]["encoding"]["x"]["axis"] = {"title": "\n"} # For alignment
combined_chart_json["hconcat"][2]["encoding"]["x"]["axis"] = {"title": "\n"}
# Only the first panel keeps its y axis; the other two drop title, ticks and labels.
combined_chart_json["hconcat"][1]["encoding"]["y"]["axis"] = {"title": None, "ticks": False, "labels": False}
combined_chart_json["hconcat"][2]["encoding"]["y"]["axis"] = {"title": None, "ticks": False, "labels": False}

combined_chart = alt.Chart.from_dict(combined_chart_json)

# Save the chart if required
if PLOT_TOFILE:
    combined_chart.save(
        os.path.join(PLOT_DIR, "multimodal-combined_chart.png"),
        ppi=PLOT_PPI
    )

combined_chart

In [None]:
# Task-chart settings
TASK_TYPEMAP = multimodal_util.invert_dict_of_lists(all_constants["TASK_GROUPS"])
TASKCATEGORY_PLOTW = TASKCATEGORY_PLOTH = 400
TASKCATEGORY_FONT_SIZE = 16
# Every distinct task group except "null", in alphabetical order
TASKCATEGORY_ORDER = sorted({group for group in TASK_TYPEMAP.values() if group != "null"})
PLOT_TOFILE_TASKS = True

In [None]:
# Task chart for speech.
# NOTE(review): the trailing "Speech", "Tasks", "Datasets" arguments presumably
# select modality, task column and aggregation level — confirm against
# multimodal_util.plot_tasks_chart.
task_chart_speech_datasets = multimodal_util.plot_tasks_chart(
    df, TASK_TYPEMAP, TASKCATEGORY_ORDER, TASKCATEGORY_PLOTW, TASKCATEGORY_PLOTH, PLOT_DIR, TASKCATEGORY_FONT_SIZE, "Speech", "Tasks", "Datasets"
)

task_chart_speech_datasets

In [None]:
# Task chart for text, using "Task Categories" with "Datasets" as the last argument.
task_chart_text_datasets = multimodal_util.plot_tasks_chart(
    df, TASK_TYPEMAP, TASKCATEGORY_ORDER, TASKCATEGORY_PLOTW, TASKCATEGORY_PLOTH, PLOT_DIR, TASKCATEGORY_FONT_SIZE, "Text", "Task Categories", "Datasets"
)

task_chart_text_datasets

In [None]:
# Task chart for text again, this time with "Collections" as the last argument.
task_chart_text_collections = multimodal_util.plot_tasks_chart(
    df, TASK_TYPEMAP, TASKCATEGORY_ORDER, TASKCATEGORY_PLOTW, TASKCATEGORY_PLOTH, PLOT_DIR, TASKCATEGORY_FONT_SIZE, "Text", "Task Categories", "Collections"
)

task_chart_text_collections

In [None]:
# Merge the three task charts into one figure.
# NOTE(review): the trailing 14 is presumably a font size — confirm against
# multimodal_util.concatenate_task_charts.
combined_tasks_chart = multimodal_util.concatenate_task_charts(task_chart_speech_datasets, task_chart_text_datasets, task_chart_text_collections, 14)

# Export at the notebook-wide resolution (PLOT_PPI) instead of a repeated literal.
if PLOT_TOFILE_TASKS:
    combined_tasks_chart.save(
        os.path.join(PLOT_DIR, "multimodal-combined_tasks_chart.png"),
        ppi=PLOT_PPI
    )

### Visualize Dataset dimensions by tokens (Datasets and Collections)

#### Tokens calculation

# Derive `df_tokens` from `df`; it feeds every size-distribution chart that follows.
df_tokens = multimodal_util.tokens_calculation(df)

#### Datasets split by tokens bins

In [None]:
# Token-count bin edges (7 edges) and the 6 labels for the intervals between them.
# Label casing fixed ('500m-1B' -> '500M-1B') to match the other labels and labels_text.
bins_dataset = [0, 1000000, 50000000, 100000000, 500000000, 1000000000, float('inf')]
labels_dataset = ['0-1M', '1M-50M', '50M-100M', '100M-500M', '500M-1B', '1B+']

# Aggregate text rows at dataset level (by_collection=False) over 'Total Tokens'.
# The helper returns two values; despite its name, `dataset_chart` is passed to
# chart_creation later as the data argument, alongside the returned max count.
dataset_chart, max_count_dataset = multimodal_util.data_aggregation_for_chart(
    df_tokens,
    'Text',
    bins_dataset,
    labels_dataset,
    by_collection=False,
    measure_column='Total Tokens',
    group_column='Token Groups'
)

#### Collections split by tokens bins

In [None]:
# Token-count bin edges (7 edges) and the 6 labels for the intervals between them.
# Label casing fixed ('500m-1B' -> '500M-1B') to match the other labels and labels_text.
bins_collection = [0, 1000000, 50000000, 100000000, 500000000, 1000000000, float('inf')]
labels_collection = ['0-1M', '1M-50M', '50M-100M', '100M-500M', '500M-1B', '1B+']

# Same aggregation as the dataset-level cell, but with by_collection=True.
collection_chart, max_count_collection = multimodal_util.data_aggregation_for_chart(df_tokens,'Text', bins_collection, labels_collection, by_collection=True, measure_column='Total Tokens', group_column='Token Groups')

#### Combine graphs for dataset and collections split

In [None]:
# Dataset-level token distribution chart.
chart1 = multimodal_util.chart_creation(
    dataset_chart, 
    max_count_dataset, 
    x_field='Token Groups',  
    labels=labels_dataset,
    ratio=1.15,
    title='Token distributions by Datasets',
    width=400,
    height=300,
    color='skyblue'
)

# Collection-level token distribution chart.
chart2 = multimodal_util.chart_creation(
    collection_chart, 
    max_count_collection, 
    x_field='Token Groups',  
    labels=labels_collection,
    ratio=1.15,
    title='Token distributions by Datasets Collections',
    width=400,
    height=300,
    color='salmon'
)

# Concatenate the two charts horizontally with different scales for the y-axes
# NOTE(review): this rebinds `combined_chart`, which the "Combined Plots" cell
# already uses for the license/creator/source figure.
combined_chart = multimodal_util.combined_charts(chart1, chart2)

# Save the plot if PLOT_TOFILE is True
if PLOT_TOFILE:
    output_file = os.path.join(PLOT_DIR, "Token_&_collection.png")
    # scale_factor is PLOT_PPI / 100 here, whereas other cells pass ppi= to save().
    combined_chart.save(output_file, scale_factor=PLOT_PPI/100)

# Display the chart
combined_chart.display()

### Visualize Datasets dimensions by modalities and tokens/hours splits

# Text is bucketed by token count.
bins_text = [0, 10**6, 5 * 10**7, 10**8, 5 * 10**8, 10**9, float('inf')]
labels_text = ['0-1M', '1M-50M', '50M-100M', '100M-500M', '500M-1B', '1B+']

# Speech and video use the same hour buckets; each gets its own list object.
_hour_edges = (0, 50, 100, 500, 1000, 5000, float('inf'))
_hour_labels = ('0h-50h', '50h-100h', '100h-500h', '500h-1000h', '1000h-5000h', '5000h+')
bins_speech = list(_hour_edges)
labels_speech = list(_hour_labels)
bins_video = list(_hour_edges)
labels_video = list(_hour_labels)

# Dataset-level (by_collection=False) aggregation per modality. Each call names
# its own measure column and the group column that the charts later read as x_field.
df_text_dist, max_count_text = multimodal_util.data_aggregation_for_chart(df_tokens,'Text', bins_text, labels_text, by_collection=False, measure_column='Total Tokens',group_column='Token Groups')
df_speech_dist, max_count_speech = multimodal_util.data_aggregation_for_chart(df_tokens,'Speech', bins_speech, labels_speech, by_collection=False, measure_column='Hours',group_column='Speech Hours Groups')
df_video_dist, max_count_video = multimodal_util.data_aggregation_for_chart(df_tokens,'Video', bins_video, labels_video, by_collection=False, measure_column='Video Hours',group_column='Video Hours Groups')


# Keyword arguments shared by the three per-modality charts
_modality_chart_kwargs = {"ratio": 1.1, "width": 400, "height": 300}

chart_text = multimodal_util.chart_creation(
    df_text_dist,
    max_count_text,
    x_field='Token Groups',  # same name as the group_column used to build df_text_dist
    labels=labels_text,
    title='Text datasets distribution by tokens',
    color='skyblue',
    **_modality_chart_kwargs,
)

chart_speech = multimodal_util.chart_creation(
    df_speech_dist,
    max_count_speech,
    x_field='Speech Hours Groups',
    labels=labels_speech,
    title='Speech datasets distribution by hours',
    color='salmon',
    **_modality_chart_kwargs,
)

chart_video = multimodal_util.chart_creation(
    df_video_dist,
    max_count_video,
    x_field='Video Hours Groups',
    labels=labels_video,
    title='Video datasets distribution by hours',
    color='lightgreen',
    **_modality_chart_kwargs,
)


# Concatenate the three charts horizontally with different scales for the y-axes
# Combine the text, speech and video charts via the shared helper.
combined_chart1 = multimodal_util.combined_charts(chart_text, chart_speech, chart_video)


# Save the plot if PLOT_TOFILE is True
if PLOT_TOFILE:
    output_file = os.path.join(PLOT_DIR, "combined_modality_chart.png")
    # scale_factor is PLOT_PPI / 100 here, whereas other cells pass ppi= to save().
    combined_chart1.save(output_file, scale_factor=PLOT_PPI/100)

# Display the chart
combined_chart1.display()