In [138]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel, rather than whichever `pip3` happens to be first on the shell PATH.
%pip install langcodes


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import sys
# Append system path
# Drop entries left behind by earlier runs of this cell before inserting, so that
# re-running the cell does not pile up duplicates on sys.path. Both the legacy
# '../..' entry and the '../' entry inserted just below are cleaned.
sys.path = [p for p in sys.path if not p.endswith(("../..", "../"))]
sys.path.insert(0, "../")  # This adds `src` to the path
# Standard-library and third-party dependencies used by the notebook.
import os
import logging
import pandas as pd
import numpy as np
import altair as alt
import langcodes
from collections import Counter, defaultdict
# Lift Altair's default cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows() # Allow using more than 5000 rows, for now
# Root logger at DEBUG, streamed to stdout so messages appear in the cell output.
# DEBUG level also surfaces third-party messages (e.g. the matplotlib lines in
# this cell's saved output).
logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler(stream=sys.stdout)])
from vega_datasets import data
# Project-local modules; presumably resolved through the "../" entry put on sys.path above.
# NOTE: this binds the name `io` to `helpers.io`, not the standard-library `io` module.
from helpers import io
from analysis import multimodal_util

# Show floats with three decimal places when DataFrames are displayed.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

%load_ext autoreload
%autoreload 2

DEBUG:matplotlib:matplotlib data path: /Users/shayne/Documents/research/notebooks/venv/lib/python3.8/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/Users/shayne/.matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is darwin
DEBUG:matplotlib:CACHEDIR=/Users/shayne/.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /Users/shayne/.matplotlib/fontlist-v330.json


In [2]:
# Plot export settings: on/off switch, resolution, and label width limit.
PLOT_TOFILE = True
PLOT_PPI = 300
MAX_LABELLIMIT = 400  # Large number to avoid label summarization in plots

# Output directory, with a leading "~" expanded to the user's home directory.
PLOT_DIR = os.path.expanduser("~/dpi-plotsmultimodal/")

# The directory is only created when plots are actually written to disk.
if PLOT_TOFILE:
    os.makedirs(PLOT_DIR, exist_ok=True)

# Plotting constants
# License categories and the colour used for each (matched by position).
LICENSE_ORDER = ["NC/Acad", "Unspecified", "Commercial"]
LICENSE_PALETTE = ["#e04c71", "#e0cd92", "#82b5cf"]

# "<license> | <terms>" labels: every license category crossed with every
# terms category, license-major, in display order.
LICENSE_TERMS_ORDER = [
    f"{license_kind} | {terms_kind}"
    for license_kind in LICENSE_ORDER
    for terms_kind in ("Model Closed", "Source Closed", "Unspecified", "Unrestricted")
]
# Variant with a single "Restricted" terms bucket instead of the two "Closed" ones.
LICENSE_TERMS_ORDER_VARIANT = [
    f"{license_kind} | {terms_kind}"
    for license_kind in LICENSE_ORDER
    for terms_kind in ("Restricted", "Unspecified", "Unrestricted")
]
# Nine colours, three shades per row; the last shade of each row is the
# corresponding LICENSE_PALETTE colour.
LICENSE_TERMS_PALETTE = [
    "#9d354f", "#c24262", "#e04c71",
    "#9d9066", "#c2b27f", "#e0cd92",
    "#5b7f91", "#719db3", "#82b5cf",
]

# Default license plot dimensions (width, height).
LICENSE_PLOTW = 400
LICENSE_PLOTH = 100

# Release-year buckets: two catch-all labels followed by 2013..2024 as strings.
YEAR_CATEGORIES = ["Unknown", "<2013"] + [str(year) for year in range(2013, 2025)]

# Read constants
# (`io` is the project module imported with `from helpers import io`, not the stdlib one.)
all_constants = io.read_all_constants("../../constants/")

# Read Terms data
# NOTE(review): this path is relative to the working directory, unlike the "../../" paths
# around it — confirm the notebook is run from the directory that contains `data/`.
collection_to_terms_mapper = multimodal_util.load_terms_metadata("data/multimodal_terms_data")

# Read individual modality summaries
# NOTE: the "Checking ... against Constants" log lines below are still emitted, but the
# checks themselves are commented out (and `analysis_util` is not imported in this
# notebook), so no validation against the constants actually runs here.
text_summaries = io.read_data_summary_json("../../data_summaries/")
logging.info("Checking Text Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(text_summaries, all_constants)

speech_summaries = io.read_data_summary_json("../../data_summaries-speech/")
logging.info("Checking Speech Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(speech_summaries, all_constants)

video_summaries = io.read_data_summary_json("../../data_summaries-video/")
logging.info("Checking Video Data Summaries against Constants")
# analysis_util.check_datasummary_in_constants(video_summaries, all_constants)


INFO:root:Checking Text Data Summaries against Constants
INFO:root:Checking Speech Data Summaries against Constants
INFO:root:Checking Video Data Summaries against Constants


In [155]:
def modality_task_annotation(df, modality, task_col, dset_col):
    """Return the rows of ``df`` for one modality with a 'Task Groups' list column added.

    Args:
        df: DataFrame with at least 'Modality', 'Unique Dataset Identifier'
            and ``task_col`` columns.
        modality: Value of 'Modality' to keep (also forwarded to
            ``multimodal_util.categorize_tasks``).
        task_col: Name of the column holding the tasks for this modality.
        dset_col: Forwarded unchanged to ``multimodal_util.categorize_tasks``.

    Returns:
        The modality's rows inner-joined with one list of task groups per
        'Unique Dataset Identifier'; rows without any categorized task are
        therefore dropped by the join.
    """
    # Task -> group lookup built from the constants; the "null" group is
    # excluded from the ordering handed to the categorizer.
    task_to_group = multimodal_util.invert_dict_of_lists(all_constants["TASK_GROUPS"])
    group_order = sorted(set(task_to_group.values()) - {"null"})

    modality_rows = df[df['Modality'] == modality]

    # Categorize the tasks into their respective groups
    categorized = multimodal_util.categorize_tasks(
        modality_rows, group_order, task_to_group, task_col, modality, dset_col, False)

    # Collapse the categorized rows to a single list of groups per dataset.
    groups_per_dataset = (
        categorized[['Unique Dataset Identifier', task_col]]
        .rename(columns={task_col: 'Task Groups'})
        .groupby('Unique Dataset Identifier')['Task Groups']
        .apply(list)
        .reset_index()
    )
    return pd.merge(modality_rows, groups_per_dataset, on='Unique Dataset Identifier', how='inner')

def get_terms_licenses(df):
    """Return each dataset's license and collection-terms category as separate columns.

    Calls ``multimodal_util.plot_license_terms_stacked_bar_chart_collections``
    with ``return_df=True`` and ``early_return=True`` to obtain a frame that
    carries a combined 'License | Terms' label, then splits that label on '|'.

    Args:
        df: The combined dataset DataFrame handed to the plotting helper.

    Returns:
        DataFrame with the columns 'Unique Dataset Identifier',
        'Dataset License' and 'Collection Terms'.
    """
    modality_order = ["Text (Collections)", "Text (Datasets)", "Speech", "Video"]
    combined_col = "License | Terms"
    split_cols = ['Dataset License', 'Collection Terms']

    license_terms = multimodal_util.plot_license_terms_stacked_bar_chart_collections(
        df, combined_col, LICENSE_TERMS_PALETTE, LICENSE_TERMS_ORDER_VARIANT,
        modality_order, 800, 140,
        # save_dir=PLOT_DIR,  # svg saving package compatibility issues, and you can save from the notebook anyway
        plot_ppi=PLOT_PPI,
        title="Dataset & Source Restrictions (Dataset Count)",
        no_legend=True,
        split_text_mod=False,
        return_df=True,
        early_return=True,
    )

    # "<license> | <terms>" -> two columns, then trim the spaces around the pipe.
    license_terms[split_cols] = license_terms[combined_col].str.split('|', expand=True)
    for col in split_cols:
        license_terms[col] = license_terms[col].str.strip()

    return license_terms[["Unique Dataset Identifier", *split_cols]]

def get_creators_info(df):
    """Attach a per-dataset 'Creator Country ID' list and rename 'Countries'.

    Args:
        df: DataFrame with 'Unique Dataset Identifier' and a list-like
            'Countries' column.

    Returns:
        ``df`` outer-joined with one flattened list of country IDs per
        'Unique Dataset Identifier', with 'Countries' renamed to
        'Creator Countries'. Datasets without any country get no ID list
        (missing value after the join).
    """
    def _flatten(id_lists):
        # get_country yields an iterable per country; concatenate them per dataset.
        return [item for sublist in id_lists for item in sublist]

    # One row per (dataset, country); drop rows with no country for the moment.
    per_country = df.explode("Countries").dropna(subset=["Countries"])
    per_country["Creator Country ID"] = per_country["Countries"].map(multimodal_util.get_country)

    ids_per_dataset = (
        per_country
        .groupby('Unique Dataset Identifier')['Creator Country ID']
        .apply(_flatten)
        .reset_index()
    )

    with_ids = pd.merge(df, ids_per_dataset, on='Unique Dataset Identifier', how='outer')
    return with_ids.rename(columns={"Countries": 'Creator Countries'})

def get_langs_info(df):
    """Collect per-dataset language lists for the text and speech modalities.

    Args:
        df: The combined dataset DataFrame, passed to
            ``multimodal_util.prep_text_for_lang_gini`` and
            ``multimodal_util.prepare_speech_for_gini``.

    Returns:
        DataFrame with 'Unique Dataset Identifier', 'Language (ISO)' and
        'Language Family' (the latter two as lists), text rows followed by
        speech rows. The original indices of both parts are kept.
    """
    def _language_lists(exploded):
        # One list of ISO codes and one list of families per dataset, joined on the ID.
        by_dataset = exploded.groupby('Unique Dataset Identifier')
        iso_lists = by_dataset['Language (ISO)'].apply(list).reset_index()
        family_lists = by_dataset['Language Family'].apply(list).reset_index()
        return pd.merge(iso_lists, family_lists, on='Unique Dataset Identifier', how='outer')

    # Only the second element returned by each helper is used here.
    _, text_exploded = multimodal_util.prep_text_for_lang_gini(df, all_constants)
    text_langs = _language_lists(text_exploded)

    _, speech_exploded = multimodal_util.prepare_speech_for_gini(df)
    speech_langs = _language_lists(speech_exploded)

    return pd.concat([text_langs, speech_langs])

In [156]:
# Prep dataframes
# Build the single `df` that the remaining cells annotate, from the three
# modality summaries, the constants, the terms mapping, and the year/license
# category orderings defined in the configuration cell.
df = multimodal_util.prep_summaries_for_visualization(
    text_summaries,
    speech_summaries,
    video_summaries,
    all_constants,
    collection_to_terms_mapper,
    YEAR_CATEGORIES,
    LICENSE_ORDER,
)

In [157]:
# Task groups per modality. Each modality keeps its tasks in a different column
# and uses a different value for the dataset-column argument.
df_speech = modality_task_annotation(df, "Speech", "Tasks", "Datasets")
df_video = modality_task_annotation(df, "Video", "Task Categories", "Datasets")
df_text = modality_task_annotation(df, "Text", "Task Categories", "Collections")
df_w_tasks = pd.concat([df_text, df_speech, df_video])

# Creator countries, languages, and license/terms categories.
df_w_creators = get_creators_info(df)
df_w_langs = get_langs_info(df)
df_w_licterms = get_terms_licenses(df)

# Outer-join every annotation frame onto `df` by dataset identifier.
# NOTE(review): df_w_tasks and df_w_creators still carry df's own columns, so
# these merges add `_x`/`_y`-suffixed duplicates of them (pandas' default
# suffixes) — confirm this is intended before relying on column names here.
df_merged = (
    df
    .merge(df_w_tasks, on='Unique Dataset Identifier', how='outer')
    .merge(df_w_creators, on='Unique Dataset Identifier', how='outer')
    .merge(df_w_langs, on='Unique Dataset Identifier', how='outer')
    .merge(df_w_licterms, on='Unique Dataset Identifier', how='outer')
)

In [173]:
# Keep only the columns that go into the exported table, in export order.
df_merged_short = df_merged.loc[:, [
    'Unique Dataset Identifier',
    'Dataset Name',
    'Paper Title',
    'Dataset URL',
    'GitHub URL',
    'Hugging Face URL',
    'Papers with Code URL',
    'ArXiv URL',
    'Semantic Scholar Corpus ID',
    'Collection',
    'Collection URL',
    'Text Sources',
    'Model Generated',
    'Human Annotation',
    'Derived from Datasets',
    'Creators',
    'Licenses',
    'Bibtex',
    'Inferred Metadata',
    'Text Metrics',
    'GitHub License',
    'HF Yaml License',
    'HF Config License',
    'PwC License',
    'License Use (DataProvenance)',
    'License Attribution (DataProvenance)',
    'License Share Alike (DataProvenance)',
    'Modality',
    'Total Tokens',
    'Year Released',
    'Hours',
    'Speakers',
    'Source Category',
    'Paper URL',
    'Website URL',
    'Language (ISO)',
    'Language Family',
    'Task Groups',
    'Dataset License',
    'Collection Terms',
    'Creator Countries',
]]

In [174]:
# Write the trimmed table to the current working directory, without the index column.
# NOTE(review): several columns hold Python lists (e.g. 'Task Groups'); CSV will store
# them as their string representation — confirm readers of the file expect that.
df_merged_short.to_csv("multimodal_df.csv", index=False)