In [1]:
!pip3 install -q -U pandas altair langcodes


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import sys
# Append system path
sys.path = [p for p in sys.path if not p.endswith("../..")]  # Cleans duplicated '../..'
sys.path.insert(0, "../")  # This adds `src` to the path
import os
import numpy as np
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows() # Allow using more than 5000 rows, for now
import langcodes
from helpers import io
from analysis import multimodal_util
from typing import Any


%load_ext autoreload
%autoreload 2

## Read Constants and Summaries

Load constants and data summaries from JSON files. Constants provide mappings and criteria for licenses, creator groups, various other categories. Data summaries contain modality-specific information about datasets.

- `all_constants`: Dictionary containing all predefined constants.
- `speech_summaries`: Data summaries for speech.

In [3]:
all_constants = io.read_all_constants("../../constants/")
speech_summaries = io.read_data_summary_json("../../data_summaries-speech/")

license_paraphrases = multimodal_util.invert_dict_of_lists(all_constants["LICENSE_PARAPHRASES"])
speech_summaries = multimodal_util.map_license_criteria_multimodal(
    multimodal_util.remap_licenses_with_paraphrases(
        speech_summaries,
        license_paraphrases
    ),
    all_constants
)

df_speech = pd.DataFrame(speech_summaries)
df_speech, YEARS_ORDER = multimodal_util.factor_year(df_speech)

In [4]:
# Overall Gini coefficient (hours by dataset)
multimodal_util.gini(df_speech["Hours"].values)

0.9246717350669285

In [5]:
# Plotting constants
FONT_SIZE = 20
LEGEND_POSITION = "bottom"
PLOT_TOFILE = True # Whether and where to output plots
PLOT_DIR = "~/Dropbox (MIT)/dpi-plotsspeech/"
PLOT_PPI = 300
MAX_LABELLIMIT = 1000 # Large number to avoid label summarization in plots

if PLOT_TOFILE:
    PLOT_DIR = os.path.expanduser(PLOT_DIR)
    os.makedirs(PLOT_DIR, exist_ok=True)

In [6]:
def times_newroman():
    font = "Times New Roman"

    return {
          "config" : {
               "title": {"font": font},
               "axis": {
               "labelFont": font,
               "titleFont": font
          },
          "header": {
               "labelFont": font,
               "titleFont": font
          },
          "legend": {
               "labelFont": font,
               "titleFont": font
          },
          "text": {
               "font": font
          }
     }
}

alt.themes.register("times_newroman", times_newroman)
alt.themes.enable("times_newroman")

ThemeRegistry.enable('times_newroman')

## License Use by Language Family and Source Category

In [7]:
# Plotting constants
LICENSE_ORDER = ["Non-Commercial/Academic", "Unspecified", "Commercial"]
LICENSE_PALETTE = ["#e04c71", "#e0cd92", "#82b5cf"]
LICENSE_PLOTW = 600
LICENSE_PLOTH = 200

In [8]:
# Map to main DPI license types
df_speech["License Type"] = df_speech["License Use (DataProvenance)"].map({
    "academic-only": "Non-Commercial/Academic",
    "non-commercial": "Non-Commercial/Academic",
    "unspecified": "Unspecified",
    "commercial": "Commercial"
})

df_speech["License Type"] = pd.Categorical(
    df_speech["License Type"],
    categories=LICENSE_ORDER,
    ordered=True
)
df_speech = df_speech.sort_values(by="License Type")

In [9]:
# Remap language families for condensed plots
df_speechlanguages = df_speech.explode("Language Families")
df_speechlanguages["Language Families"] = df_speechlanguages["Language Families"].map(
    lambda x : {
        "English": "Germanic",
        "Hindustani": "Indo-European"
    }.get(x, x)
)

# Calculate permissiveness by language family (defined as the proportion of commercial licenses)
permisiveness = df_speechlanguages.groupby("Language Families").apply(
    lambda x: (x["License Type"] == "Commercial").mean()
).reset_index(name="Permissiveness")

# Sort by computed permisiveness
languagefamily_order = permisiveness.sort_values(by="Permissiveness")["Language Families"].tolist()

# Make factor
df_speechlanguages["Language Families"] = pd.Categorical(
    df_speechlanguages["Language Families"],
    categories=languagefamily_order,
    ordered=True
)

# Sort by language families
df_speechlanguages = df_speechlanguages.sort_values(by="Language Families")

In [10]:
base = alt.Chart(df_speechlanguages).mark_bar().encode(
    x=alt.X(
        "Language Families:N",
        title="Language Family",
        sort=languagefamily_order,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "License Type:N",
        scale=alt.Scale(
            domain=LICENSE_ORDER,
            range=LICENSE_PALETTE
        ),
        title="License Type"
    )
).properties(
    width=600,
    height=100
)

text = alt.Chart(df_speechlanguages).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Language Families:N",
        title="Language Family",
        sort=languagefamily_order
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    labelLimit=MAX_LABELLIMIT
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_languagefamilies-licenses.png"),
        ppi=PLOT_PPI
    )

chart

In [11]:
INCLUDE_TOP_N_CATEGORIES = 10 # Number of top categories to include, rest will be grouped as "Other"

df_sources = df_speech.explode("Source Category")
df_sources = multimodal_util.reduce_categories_to_topk(df_sources, "Source Category", INCLUDE_TOP_N_CATEGORIES)

sourcecategory_order = multimodal_util.order_by_grouped_permisiveness(df_sources, "Source Category")

df_sources["Source Category"] = pd.Categorical(
    df_sources["Source Category"],
    categories=sourcecategory_order,
    ordered=True
)

df_sources = df_sources.sort_values(by="Source Category")

In [12]:
base = alt.Chart(
    df_sources
).mark_bar().encode(
    x=alt.X(
        "Source Category:N",
        title="Source Category",
        axis=alt.Axis(labelAngle=-30),
        sort=sourcecategory_order
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "License Type:N",
        scale=alt.Scale(
            domain=LICENSE_ORDER,
            range=LICENSE_PALETTE
        ),
        title="License Type"
    )
).properties(
    width=800,
    height=100
)

text = alt.Chart(df_sources).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Source Category:N",
        title="Source Category",
        sort=sourcecategory_order
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
        labelFontSize=FONT_SIZE,
        titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    labelLimit=MAX_LABELLIMIT
).configure_header(
    titleFontSize=FONT_SIZE,
    labelFontSize=FONT_SIZE
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_sourcecategories-licenses.png"),
        ppi=PLOT_PPI
    )

chart

## Sources by Language Family

In [13]:
INCLUDE_TOP_N_CATEGORIES = 6 # Number of top categories to include, rest will be grouped as "Other"

# # Further unlist the categories of sources
df_speechlanguagessources = multimodal_util.reduce_categories_to_topk(
    df_speechlanguages.explode("Source Category"),
    "Source Category",
    INCLUDE_TOP_N_CATEGORIES
)

In [14]:
base = alt.Chart(
    df_speechlanguagessources
).mark_bar().encode(
    x=alt.X(
        "Language Families:N",
        title="Language Family",
        sort=languagefamily_order,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Source Category:N",
        title="Source Category"
    )
).properties(
    width=600,
    height=100
)

text = alt.Chart(df_speechlanguagessources).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Language Families:N",
        title="Language Family",
        sort=languagefamily_order
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_languagefamilies-sources.png"),
        ppi=PLOT_PPI
    )

chart

## Dataset Preprocessing for Languages
Retrieve precise hours per language from metadata for large-scale, highly multilingual datasets. Specifically: YODAS, Common Voice, Multilingual LibriSpeech, Bloom Speech, and FLEURS (for now).

In [15]:
df_speechlanguagenames = df_speech.explode("Languages (ISO)")
df_speechlanguagenames["Language (Name)"] = df_speechlanguagenames["Languages (ISO)"].map(
    lambda x: langcodes.Language.make(language=langcodes.standardize_tag(x.split("-")[0].split("_")[0], macro=True)).language_name()
)

In [16]:
df_yodashours = pd.read_csv("data/speech_supporting_data/yodas_splithours.csv").rename({"hours": "Hours"}, axis=1)
df_yodashours["Languages (ISO)"] = df_yodashours["split"].map(lambda x : x[:2])
df_yodashours["Language (Name)"] = df_yodashours["Languages (ISO)"].map(
    lambda x: langcodes.Language.make(language=langcodes.standardize_tag(x, macro=True)).language_name()
)

In [17]:
df_commonvoicehours = pd.read_json("data/speech_supporting_data/commonvoice_splithours.json").T

language_codes_to_aggregate = {}
for langcode in df_commonvoicehours.index:
    if "-" in langcode or "_" in langcode:
        langcode_simplified = langcode.split("-")[0].split("_")[0]
        print("Will aggregate language %s to %s" % (langcode, langcode_simplified))
        language_codes_to_aggregate.setdefault(langcode_simplified, [])
        language_codes_to_aggregate[langcode_simplified].append(langcode)

for langcode_simplified, langcode_data_to_aggregate in language_codes_to_aggregate.items():
    df_commonvoicehours.loc[langcode_simplified, "total_clips_duration"] = df_commonvoicehours.loc[langcode_data_to_aggregate, "total_clips_duration"].sum()


df_commonvoicehours["Hours"] = df_commonvoicehours["total_clips_duration"] / 60 / 60 / 1000
df_commonvoicehours = df_commonvoicehours.reset_index(names=["Languages (ISO)"])

Will aggregate language ga-IE to ga
Will aggregate language zh-TW to zh
Will aggregate language zh-CN to zh
Will aggregate language zh-HK to zh
Will aggregate language fy-NL to fy
Will aggregate language ne-NP to ne
Will aggregate language nn-NO to nn
Will aggregate language rm-sursilv to rm
Will aggregate language sv-SE to sv
Will aggregate language rm-vallader to rm
Will aggregate language pa-IN to pa
Will aggregate language hy-AM to hy
Will aggregate language nan-tw to nan


In [18]:
df_multilinguallibrispeechhours = pd.read_csv("data/speech_supporting_data/multilinguallibrispeech_splithours.csv")
df_multilinguallibrispeechhours["Hours"] = df_multilinguallibrispeechhours[df_multilinguallibrispeechhours.columns[1:]].sum(axis=1)
df_multilinguallibrispeechhours["Languages (ISO)"] = df_multilinguallibrispeechhours.language.map(lambda x: langcodes.find(x).to_tag())

In [19]:
df_bloomspeechhours = pd.read_csv("data/speech_supporting_data/bloomspeech_splithours.csv")
df_bloomspeechhours["Hours"] = df_bloomspeechhours[df_bloomspeechhours.columns[2:]].sum(axis=1) / 60
df_bloomspeechhours["Languages (ISO)"] = df_bloomspeechhours["ISO-639-3"].map(lambda x: langcodes.standardize_tag(x, macro=True))

In [20]:
# Preprocess from raw data (once only)
# df_fleursspeechhours = pd.read_csv("speech_supportingdata/fleurs_splithoursorig.csv")
# df_fleursspeechhours = df_fleursspeechhours.groupby("language")["num_samples"].sum().reset_index()
# df_fleursspeechhours["Hours"] = df_fleursspeechhours["num_samples"] / 16000 / 60 / 60
# df_fleursspeechhours.columns = ["Languages (ISO)", *df_fleursspeechhours.columns[1:]]

# df_fleursspeechhours.to_csv("speech_supportingdata/fleurs_splithours.csv", index=False)


df_fleursspeechhours = pd.read_csv("data/speech_supporting_data/fleurs_splithours.csv")

In [21]:
dataset_hoursmapping = {
    "yodas": df_yodashours,
    "common-voice-corpus-170": df_commonvoicehours,
    "multilingual-librispeech": df_multilinguallibrispeechhours,
    "bloom-speech": df_bloomspeechhours,
    "fleurs": df_fleursspeechhours
}

special_cases = {
    "yodas": {
        "sr-Latn": "sr", # YODAS metadata doesn't specify the script
        "he": "iw" # YODAS appears to use the old ISO639-1 code
    },
    "fleurs": {
        "no": "nb" # FLEURS specifies locale code
    }
}

def get_hours_for_dataset_and_language(row: pd.Series) -> float:
    dataset = row["Unique Dataset Identifier"]
    language = row["Languages (ISO)"]
    language = langcodes.standardize_tag(language)

    if dataset in dataset_hoursmapping:
        if language in special_cases.get(dataset, {}):
            language = special_cases[dataset][language]

        hours_df = dataset_hoursmapping[dataset]
        hours = hours_df[hours_df["Languages (ISO)"] == language]["Hours"].sum()
        if hours == 0:
            print("Hours not found for language code %s in dataset %s" % (language, dataset))
        return hours

    return row["Hours"]

In [47]:
# special

## Gini Coefficient Across Languages by (Cumulative) Total Hours

In [22]:
lang_codes_to_families = {}
lang_codes_to_names = {}
iso_codes_to_langs = {}
lang_families = {}

df_langsglottolog = pd.read_csv("data/speech_supporting_data/languages.csv")
for i, row in df_langsglottolog.iterrows():
    lang_id = row["ID"]

    iso_code = row["Closest_ISO369P3code"]

    iso_codes_to_langs[iso_code] = lang_id
    lang_codes_to_names[lang_id] = row["Name"]

    if row["Level"] == "family":
        lang_families[lang_id] = row["Name"]

    if pd.isna(iso_code):
        continue

    family_id = row["Family_ID"]
    if pd.isna(family_id):
        continue

    lang_codes_to_families[lang_id] = family_id


def get_langfamily(lang: str) -> str:
    lang = lang.split("-")[0].split("_")[0]
    try:
        # Need to iteratively seek to the top level
        lang = langcodes.get(langcodes.standardize_tag(lang, macro=True)).to_alpha3()
        lang = iso_codes_to_langs.get(lang, lang)

        while lang in lang_codes_to_families:
            lang = lang_codes_to_families[lang]

        lang = lang_families[lang]

    except:
        lang = "Other"

    return lang

In [23]:
df_speechlanguagesn = df_speechlanguagenames.copy()

df_speechlanguagesn["Languages (ISO)"] = df_speechlanguagesn["Languages (ISO)"].map(lambda x : x.split("_")[0].split("-")[0])
df_speechlanguagesn["Language Family"] = df_speechlanguagesn["Languages (ISO)"].map(get_langfamily)

# Subdivide hours evenly across the languages given in each dataset
df_speechlanguagesn["Hours"] = df_speechlanguagesn.groupby(["Unique Dataset Identifier", "Languages (ISO)"])["Hours"].transform(
    lambda x: x / x.count()
)
df_speechlanguagesn["Hours"] = df_speechlanguagesn.apply(get_hours_for_dataset_and_language, axis=1)

df_speechlanguagesn = df_speechlanguagesn.sort_values(by="Year Released")

In [24]:
# Ensure that, for each of those datasets, we have heterogenous language hours
for dataset in ["yodas", "common-voice-corpus-170", "multilingual-librispeech", "bloom-speech", "fleurs"]:
    assert df_speechlanguagesn[
            df_speechlanguagesn["Unique Dataset Identifier"] == dataset
        ]["Hours"].nunique() > 1, "Dataset %s has homogenous language hours" % dataset

In [25]:
# Gini coefficient for hours across languages
speechlanguages_totalhours = df_speechlanguagesn.groupby("Languages (ISO)")["Hours"].sum().reset_index(name="Total Hours")

multimodal_util.gini(speechlanguages_totalhours["Total Hours"].values)

0.9299532996970992

In [26]:
# Gini coefficient for hours across language-families
speechlanguagesf_totalhours = df_speechlanguagesn.groupby("Language Family")["Hours"].sum().reset_index(name="Total Hours")

multimodal_util.gini(speechlanguagesf_totalhours["Total Hours"].values)

0.928910419969393

In [27]:
# Get the cumulative hours by language over time
df_speechlanguagescumulativehours = df_speechlanguagesn.groupby(
    ["Year Released", "Languages (ISO)"]
)["Hours"].sum().groupby(
    "Languages (ISO)"
).cumsum().reset_index(name="Cumulative Hours")

# Calculate Gini coefficient and CIs for cumulative hours by language
df_speechlanguagescumulativehoursgini = df_speechlanguagescumulativehours.groupby(
    "Year Released"
)["Cumulative Hours"].apply(
    lambda x: multimodal_util.bootstrap_cis_for_gini(x.values)
).reset_index(
    name="Gini"
)

df_speechlanguagescumulativehoursgini["Gini Mean"] = df_speechlanguagescumulativehoursgini["Gini"].map(lambda x: x[0])
df_speechlanguagescumulativehoursgini["Gini Lower"] = df_speechlanguagescumulativehoursgini["Gini"].map(lambda x: x[1])
df_speechlanguagescumulativehoursgini["Gini Upper"] = df_speechlanguagescumulativehoursgini["Gini"].map(lambda x: x[2])

# Get the cumulative hours by language-family over time
df_speechlanguagesfamilycumulativehours = df_speechlanguagesn.groupby(
    ["Year Released", "Language Family"]
)["Hours"].sum().groupby(
    "Language Family"
).cumsum().reset_index(name="Cumulative Hours")

# Calculate Gini coefficient and CIs for cumulative hours by language
df_speechlanguagesfamilycumulativehoursgini = df_speechlanguagesfamilycumulativehours.groupby(
    "Year Released"
)["Cumulative Hours"].apply(
    lambda x: multimodal_util.bootstrap_cis_for_gini(x.values)
).reset_index(
    name="Gini"
)

df_speechlanguagesfamilycumulativehoursgini["Gini Mean"] = df_speechlanguagesfamilycumulativehoursgini["Gini"].map(lambda x: x[0])
df_speechlanguagesfamilycumulativehoursgini["Gini Lower"] = df_speechlanguagesfamilycumulativehoursgini["Gini"].map(lambda x: x[1])
df_speechlanguagesfamilycumulativehoursgini["Gini Upper"] = df_speechlanguagesfamilycumulativehoursgini["Gini"].map(lambda x: x[2])

df_speechlanguagescumulativehoursgini["Type"] = "Languages"
df_speechlanguagesfamilycumulativehoursgini["Type"] = "Families"

df_speechlanguagesfamilycumulativehoursgini = pd.concat(
    [df_speechlanguagescumulativehoursgini, df_speechlanguagesfamilycumulativehoursgini]
)

chart_meanlangf = alt.Chart(
    df_speechlanguagesfamilycumulativehoursgini
).mark_line().encode(
    x=alt.X(
        "Year Released:N",
        title="",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30),
        scale=alt.Scale(padding=0)
    ),
    y=alt.Y(
        "Gini Mean:Q",
        title="Gini (Cumulative)",
        scale=alt.Scale(zero=False)
    ),
    color=alt.Color(
        "Type:N",
        title="Type",
        scale=alt.Scale(
            domain=["Languages", "Families"],
            range=["#82b5cf", "#ff7fde"]
        )
    )
)

chart_meanpointslangf = alt.Chart(
    df_speechlanguagesfamilycumulativehoursgini
).mark_point().encode(
    x=alt.X(
        "Year Released:N",
        title="",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30),
        scale=alt.Scale(padding=0)
    ),
    y=alt.Y(
        "Gini Mean:Q",
        title="Gini (Cumulative)",
        scale=alt.Scale(zero=False)
    ),
    color="Type:N"
)

chart_cislangf = alt.Chart(
    df_speechlanguagesfamilycumulativehoursgini
).mark_area(
    opacity=0.2
).encode(
    x=alt.X(
        "Year Released:N",
        title="",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y="Gini Lower:Q",
    y2="Gini Upper:Q",
    color="Type:N"
)

chart_langf = (chart_cislangf + chart_meanlangf + chart_meanpointslangf).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    grid=False
).properties(
    width=600,
    height=200,
    title="For %d Languages (ISO) and %d Language Families" % (
        df_speechlanguagesn["Languages (ISO)"].nunique(),
        df_speechlanguagesn["Language Family"].nunique()
    )
)

if PLOT_TOFILE:
    chart_langf.save(
        os.path.join(PLOT_DIR, "speech_languagefamilies-giniyears.png"),
        ppi=PLOT_PPI
    )

chart_langf

In [28]:
df_speechlanguagesn["First Year Released"] = df_speechlanguagesn.groupby("Languages (ISO)")["Year Released"].transform("min")
df_speechlanguagesn = df_speechlanguagesn.drop_duplicates(subset=["Languages (ISO)"])

chart_langcount = alt.Chart(
    df_speechlanguagesn.groupby("First Year Released")["Languages (ISO)"].count().cumsum().reset_index(name="Total Languages")
).mark_bar().encode(
    x=alt.X(
        "First Year Released:N",
        title="",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "Total Languages:Q",
        title="Total Languages"
    ),
    color=alt.value("#82b5cf")
)

if PLOT_TOFILE:
    chart_langcount.save(
        os.path.join(PLOT_DIR, "speech_languages-years.png"),
        ppi=PLOT_PPI
    )

chart_langcount

In [51]:
# df_speech

In [29]:
chart_multilingualdatasets = alt.Chart(df_speech).mark_bar().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "Languages (Count)",
        title="Languages (Count)",
        scale=alt.Scale(
            domain=[1, 199]
        )
    ),
    color=alt.Color(
        "Unique Dataset Identifier:O",
        legend=None
    )
)

chart_multilingualdatasets

In [30]:
chart_langcombined = alt.hconcat(
    chart_langcount.properties(
        width=400,
        height=200,
        title="A"
    ),
    chart_multilingualdatasets.properties(
        width=400,
        height=200,
        title="B"
    ),
    (chart_cislangf + chart_meanlangf + chart_meanpointslangf).properties(
        width=400,
        height=200,
        title="C"
    ),
    padding=0
).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    grid=False
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient="bottom-right",
    labelLimit=MAX_LABELLIMIT
).configure_header(
    titleFontSize=FONT_SIZE,
    labelFontSize=FONT_SIZE
).configure_title(
    fontSize=FONT_SIZE,
    anchor="start"
).resolve_scale(
    x="independent",
    y="independent",
    color="independent"
)

if PLOT_TOFILE:
    chart_langcombined.save(
        os.path.join(PLOT_DIR, "speech_languages-yearscombinedgini.pdf"),
        ppi=PLOT_PPI
    )

chart_langcombined

## Hours by Language Family

In [31]:
df_speechlanguagesf = speechlanguagesf_totalhours.copy()
topk_langf = df_speechlanguagesf.sort_values(by="Total Hours", ascending=False).head(10)["Language Family"]
df_speechlanguagesf["Language Family"] = df_speechlanguagesf["Language Family"].map(
    lambda x: x if x in topk_langf.values else "Other"
)

In [32]:
alt.Chart(df_speechlanguagesf).mark_bar().encode(
    x=alt.X(
        "Language Family:N",
        title="Language Family",
        sort=[item for item in topk_langf.tolist() if item != "Other"] + ["Other"],
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "Total Hours:Q",
        title="Hours",
        axis=alt.Axis(format="~s")
    )
).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    grid=False
).properties(
    width=600,
    height=200
)

## Source Category by Year

In [33]:
INCLUDE_TOP_N_CATEGORIES = 6
df_speechsourceyears = df_speech.explode("Source Category")
df_speechsourceyears = multimodal_util.reduce_categories_to_topk(df_speechsourceyears, "Source Category", INCLUDE_TOP_N_CATEGORIES)

df_speechsourceyears = df_speechsourceyears.sort_values(by="Year Released")

In [34]:
base_sourceyear = alt.Chart(
    df_speechsourceyears
).mark_bar().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Source Category:N",
        title="Source Category"
    )
).properties(
    width=600,
    height=160
)

text_sourceyear = alt.Chart(df_speechsourceyears).mark_text(
    dy=-90,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER
    ),
    text="count():Q"
)

chart_sourceyear = (base_sourceyear + text_sourceyear).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)


if PLOT_TOFILE:
    chart_sourceyear.save(
        os.path.join(PLOT_DIR, "speech_sourcecategories-years.png"),
        ppi=PLOT_PPI
    )

chart_sourceyear

## Total Hours by Source Category (Cumulative)

In [35]:
INCLUDE_TOP_N_CATEGORIES = 6

df_speechsourceyears = df_speech.explode("Source Category")
df_speechsourceyears = multimodal_util.reduce_categories_to_topk(df_speechsourceyears, "Source Category", INCLUDE_TOP_N_CATEGORIES)

df_speechsourceyearscumulativehours = df_speechsourceyears.groupby(
    ["Year Released", "Source Category"]
)["Hours"].sum().groupby(
    "Source Category"
).cumsum().reset_index(name="Cumulative Hours")

df_speechsourceyearscumulativehours = df_speechsourceyearscumulativehours.sort_values(by="Year Released")

In [36]:
chart_sourceyearhours = alt.Chart(
    df_speechsourceyearscumulativehours
).mark_line().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "Cumulative Hours:Q",
        title="Cumulative Hours",
        scale=alt.Scale(
            type="symlog",
            constant=1000,
            domain=[1, 1000000]
        ),
        axis=alt.Axis(
            values=[0, 1, 1000, 10000, 100000, 1000000],
            labelExpr="datum.value >= 1000000 ? datum.value / 1000000 + 'M' : datum.value >= 1000 ? datum.value / 1000 + 'K' : datum.value"
        )
    ),
    color=alt.Color(
        "Source Category:N",
        title="Source Category"
    )
).properties(
    width=400,
    height=160
)

if PLOT_TOFILE:
    chart_sourceyearhours.save(
        os.path.join(PLOT_DIR, "speech_sourcecategories-cumulativehours.png"),
        ppi=PLOT_PPI
    )

chart_sourceyearhours

#### Combine Source Years-Based Plots

In [37]:
chart_sourceyearcombined = alt.hconcat(
    base_sourceyear,
    chart_sourceyearhours
).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    grid=False
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    labelLimit=MAX_LABELLIMIT
).configure_header(
    titleFontSize=FONT_SIZE,
    labelFontSize=FONT_SIZE
).configure_title(
    fontSize=FONT_SIZE
).resolve_scale(
    x="independent",
    y="independent"
)

if PLOT_TOFILE:
    chart_sourceyearcombined.save(
        os.path.join(PLOT_DIR, "speech_sourcecategories-yearscombined.png"),
        ppi=PLOT_PPI
    )

chart_sourceyearcombined

## Creator Categories by Year

Note: we use the original annotations here instead of the DPI constants, for a different view.

In [38]:
df_speechcategoriesyears = df_speech.explode("Creator Categories")
df_speechcategoriesyears = df_speechcategoriesyears.sort_values(by="Year Released")

In [39]:
base = alt.Chart(
    df_speechcategoriesyears
).mark_bar().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Creator Categories:N",
        title="Creator Category"
    )
).properties(
    width=600,
    height=100
)

text = alt.Chart(df_speechsourceyears).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "speech_categories-years.png"),
        ppi=PLOT_PPI
    )

chart

## Summary Tables
#### Table of License Type

In [40]:
licensetype_counts = df_speech["License Type"].value_counts()
df_licensetypes = pd.concat([
    licensetype_counts,
    (licensetype_counts / licensetype_counts.sum()).round(4) * 100
], axis=1)

df_licensetypes.columns = ["Count", "Pct."]

df_licensetypes

Unnamed: 0_level_0,Count,Pct.
License Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Commercial,51,53.68
Non-Commercial/Academic,31,32.63
Unspecified,13,13.68


#### Tables of YouTube-Sourced Portions (By Count and Hour, Per License-Type)

In [41]:
# By count
df_speech["YouTube"] = df_speech["Source"].map(
    lambda x: "YouTube" if any("youtube" in xi.lower() for xi in x) else "Other"
)

df_youtube = df_speech.groupby(["License Type", "YouTube"]).size().reset_index(name="Count")
df_youtube = df_youtube.sort_values(by="Count")
df_youtube["Pct."] = df_youtube.groupby("License Type")["Count"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_youtube

Unnamed: 0,License Type,YouTube,Count,Pct.
1,Non-Commercial/Academic,YouTube,2,6.45
3,Unspecified,YouTube,2,15.38
5,Commercial,YouTube,5,9.8
2,Unspecified,Other,11,84.62
0,Non-Commercial/Academic,Other,29,93.55
4,Commercial,Other,46,90.2


In [42]:
# By hours
df_youtubehours = df_speech.groupby(["License Type", "YouTube"])["Hours"].sum().reset_index(name="Total Hours")
df_youtubehours = df_youtubehours.sort_values(by="Total Hours")
df_youtubehours["Pct."] = df_youtubehours.groupby("License Type")["Total Hours"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_youtubehours

Unnamed: 0,License Type,YouTube,Total Hours,Pct.
3,Unspecified,YouTube,1308.65,19.02
2,Unspecified,Other,5573.11,80.98
0,Non-Commercial/Academic,Other,28830.51,14.21
1,Non-Commercial/Academic,YouTube,174041.0,85.79
4,Commercial,Other,175251.14,30.97
5,Commercial,YouTube,390596.0,69.03


#### Tables of Creator Categories (By Count and Hour)

In [43]:
df_creatorcategories = df_speech.explode("Creator Categories").groupby("Creator Categories").size().reset_index(name="Count")
df_creatorcategories["Pct."] = df_creatorcategories["Count"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_creatorcategories

Unnamed: 0,Creator Categories,Count,Pct.
0,Academia,59,44.7
1,Government,11,8.33
2,Independent,11,8.33
3,Industry,43,32.58
4,Nonprofit,8,6.06


In [44]:
df_creatorcategories = df_speech.explode("Creator Categories").groupby("Creator Categories")["Hours"].sum().reset_index(name="Total Hours")
df_creatorcategories["Pct."] = df_creatorcategories["Total Hours"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_creatorcategories

Unnamed: 0,Creator Categories,Total Hours,Pct.
0,Academia,668894.76,68.73
1,Government,10499.3,1.08
2,Independent,3631.16,0.37
3,Industry,222579.49,22.87
4,Nonprofit,67686.0,6.95
