In [52]:
!pip3 install -q -U pandas altair langcodes
!pip3 install -q -U install semanticscholar
!pip install vl-convert-python==1.6.0

[31mERROR: Could not find a version that satisfies the requirement install (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for install[0m[31m
[0m

In [53]:
import sys
# Append system path
sys.path = [p for p in sys.path if not p.endswith("../..")]  # Cleans duplicated '../..'
sys.path.insert(0, "../")  # This adds `src` to the path
import os
import numpy as np
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows() # Allow using more than 5000 rows, for now
import langcodes
from collections import defaultdict
from helpers import io, filters
from typing import Any


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Helper Functions

Utility functions to process and transform data summaries:

---
```python
def invert_dict_of_lists(
  d: dict[str, list[str]]
) -> dict[str, str]
```
- Inverts a dictionary of lists for easier mapping of constants.
---
```python
def remap_licenses_with_paraphrases(
  summaries: list[dict[str, Any]],
  paraphrases: dict[str, str]
) -> dict[str, Any]
``` 
- Standardizes inconsistent license names in data summaries using predefined paraphrases.
---
```python
def map_license_criteria_multimodal(
  data_summary: list[dict[str, Any]],
  all_constants: dict[str, dict[str, list[str]]]
) -> list[dict[str, Any]]
```
- Maps license criteria for multimodal datasets, resolving them according to predefined constants.
---
```python
def get_country(x: str) -> list[int]
```
- Takes a country name as input and returns a list of ISO3166 codes (mostly, of length 1). It handles a special case that appears in some text annotations ("African Continent" -> list of ISO codes) and logs a warning for any countries not found in the mapping.
---
```python
def gini(array: np.ndarray) -> float:
```
- Takes an array of values and computes the Gini coefficient.
---
```python
def factor_year(
  df: pd.DataFrame,
  column: str = "Year Released",
  min_year: int = 2013
) -> pd.DataFrame:
```
- Converts the year column into a categorical variable (with years before a given value grouped together).
---
```python
def order_by_grouped_permisiveness(
        df: pd.DataFrame,
        group_column: str,
        licensetype_column: str = "License Type",
        permissive_licensetypes: list[str] = ["Commercial"]
) -> pd.Series:
```
- Computes permisiveness (proportion of license types in a given set, by default only those marked `Commercial`) by a given grouping factor and returns an order for that factor.
---
```python
def reduce_categories_to_topk(
    df: pd.DataFrame,
    column: str,
    k: int = 6
) -> pd.DataFrame:
```
- Reduces the number of categories in a column to `k`, with the rest grouped under `Other`. So returns a `DataFrame` with a version of that column with `k + 1` total categories.
---

In [54]:
def invert_dict_of_lists(d: dict[str, list[str]]) -> dict[str, str]:
    """Useful for mapping constants, paraphrases, etc.
    These are normally in the form:
        { "Category": ["item1", "item2", … ] }
    Whereas we want to invert it to:
        { "item1": "Category", "item2": "Category", … }
    """
    inverted = {}
    for k, v in d.items():
        for item in v:
            inverted[item] = k
    return inverted

In [55]:
def remap_licenses_with_paraphrases(
        summaries: list[dict[str, Any]],
        paraphrases: dict[str, str]
    ) -> dict[str, Any]:
    """Map inconsistent license names to shared paraphrases using the constants.
    E.g. "CC-BY-SA 4.0", "CC BY SA 4.0" -> "CC BY-SA 4.0"
    """

    for i, summary in enumerate(summaries):
        for j, license in enumerate(summary["Licenses"]):
            license = license["License"]
            summaries[i]["Licenses"][j]["License"] = paraphrases.get(
                license,
                license
            )
    return summaries

In [56]:
def classify_and_resolve_licenses(
    license_infos: list[tuple[str, str]],
    all_constants: dict[str, dict[str, list[str]]]
) -> list[str]:
    """Function taken from `text_ft_plots.ipynb`"""
    classified_licenses = []
    for (license_name, license_url) in license_infos:
        # Classify an individual license
        classifications = filters.classify_license(license_name, license_url, all_constants)
        classified_licenses.append(classifications)

    # By default, multiple licenses yield to the most restrictive one
    resolved_criteria = filters.resolve_multiple_licenses(classified_licenses)
    return resolved_criteria


def add_license_classes_to_summaries(
    data_summary: list[dict[str, Any]],
    resolved_classes: dict[str, list[str]],
    aggregator: str
):
    """Function taken from `text_ft_plots.ipynb`"""
    # Update DataFrame with columns for use, attribution, share_alike
    for row in data_summary:
        row[f"License Use ({aggregator})"] = resolved_classes[row["Unique Dataset Identifier"]][0]
        row[f"License Attribution ({aggregator})"] = resolved_classes[row["Unique Dataset Identifier"]][1]
        row[f"License Share Alike ({aggregator})"] = resolved_classes[row["Unique Dataset Identifier"]][2]
    return data_summary


def map_license_criteria_multimodal(
    data_summary: list[dict[str, Any]],
    all_constants: dict[str, dict[str, list[str]]]
) -> list[dict[str, Any]]:
    """Variant of `map_license_criteria` that works with multimodal datasets.
    Simplified to only include `Licenses` (not HF, etc.).

    Function adapted from `text_ft_plots.ipynb`.
    """

    # Unpack licenses for each dataset. {uid --> (license_name, license_url)}
    our_uid_to_license_infos = defaultdict(list)

    # Same as ours, but excludes OpenAI Terms:
    our_uid_to_license_infos_no_openai = defaultdict(list)

    for row in data_summary:
        uid = row["Unique Dataset Identifier"]
        for license_info in row["Licenses"]:
            license_name = license_info["License"]
            license_url = license_info.get("License URL", None) # FOR NOW
            our_uid_to_license_infos[uid].append((license_name, license_url))
            if license_info["License"] != "OpenAI":
                our_uid_to_license_infos_no_openai[uid].append((license_name, license_url))

        # If OpenAI was the only license, we add Unspecified so there isn't nothing there.
        if len(our_uid_to_license_infos_no_openai[uid]) == 0:
            our_uid_to_license_infos_no_openai[uid].append(("Unspecified", None))


    # classify and resolve licenses for each dataset and each aggregator
    ours_resolved, ours_openai_resolved = {}, {}
    for uid in our_uid_to_license_infos.keys():
        ours_resolved[uid] = classify_and_resolve_licenses(our_uid_to_license_infos[uid], all_constants)
        ours_openai_resolved[uid] = classify_and_resolve_licenses(our_uid_to_license_infos_no_openai[uid], all_constants)


    data_summary = add_license_classes_to_summaries(data_summary, ours_resolved, "DataProvenance")
    data_summary = add_license_classes_to_summaries(data_summary, ours_openai_resolved, "DataProvenance IgnoreOpenAI")

    return data_summary

In [57]:
def gini(array: np.ndarray) -> float:
    """Calculate the Gini coefficient of a numpy array.

    Implementation taken from: https://github.com/oliviaguest/gini
    """
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d:
    array = array.flatten()
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array = array + 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1,array.shape[0]+1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array)))

In [58]:
def bootstrap_cis_for_gini(
    data: np.ndarray,
    n_samples: int = 1000,
    alpha: float = 0.05
) -> tuple[float, float]:
    """Calculate the confidence interval for the Gini coefficient using bootstrapping.
    """

    ginis = []
    for _ in range(n_samples):
        sample = np.random.choice(data, size=len(data), replace=True)
        ginis.append(gini(sample))

    ginis = np.array(ginis)
    lower_bound = np.percentile(ginis, alpha / 2 * 100)
    upper_bound = np.percentile(ginis, (1 - alpha / 2) * 100)

    return np.mean(ginis), lower_bound, upper_bound

In [59]:
def factor_year(
    df: pd.DataFrame,
    column: str = "Year Released",
    min_year: int = 2004
) -> pd.DataFrame:
    """Transform the year column into a categorical column.

    Years before `min_year` are grouped into a category, i.e. "<`min_year`" (e.g. )
    """
    df = df.copy()

    min_yeartext = "<%d" % min_year
    max_year = df[column].max().astype(int)

    df[column] = df[column].map(
        lambda x: min_yeartext if (x < min_year) else str(x)
    )

    order = [min_yeartext, *map(str, range(min_year, max_year + 1))]

    df[column] = pd.Categorical(
        df[column],
        categories=order,
        ordered=True
    )

    return df, order

In [60]:
def order_by_grouped_permisiveness(
        df: pd.DataFrame,
        group_column: str,
        licensetype_column: str = "License Type",
        permissive_licensetypes: list[str] = ["Commercial"]
) -> pd.Series:
    """Given a DataFrame, group it by `group_column` and calculate the permissiveness of each group.

    Permisiveness is calculated as the proportion of licenses that are in `permissive_licensetypes`.
    """
    permisiveness = df.groupby(group_column).apply(
        lambda x: (x[licensetype_column].isin(permissive_licensetypes)).mean()
    ).reset_index(name="Permissiveness")

    permisiveness_order = permisiveness.sort_values(by="Permissiveness")[group_column].tolist()

    return permisiveness_order

In [61]:
def reduce_categories_to_topk(
    df: pd.DataFrame,
    column: str,
    k: int = 6
) -> pd.DataFrame:
    """Reduce the number of categories in a column to the top `k` categories.

    The rest are grouped into an "Other" category.
    """
    df = df.copy()
    topk = df[column].value_counts().head(k).index.tolist()
    df[column] = df[column].map(
        lambda x: x if x in topk else "Other"
    )

    return df

## Read Constants and Summaries

Load constants and data summaries from JSON files. Constants provide mappings and criteria for licenses, creator groups, various other categories. Data summaries contain modality-specific information about datasets.

- `all_constants`: Dictionary containing all predefined constants.
- `video_summaries`: Data summaries for speech.

In [62]:
all_constants = io.read_all_constants("../../constants/")
video_summaries = io.read_data_summary_json("../..//data_summaries-video")
license_paraphrases = invert_dict_of_lists(all_constants["LICENSE_PARAPHRASES"])
creator_categories = invert_dict_of_lists(all_constants["CREATOR_GROUPS"])


video_summaries = map_license_criteria_multimodal(
    remap_licenses_with_paraphrases(
        video_summaries,
        license_paraphrases
    ),
    all_constants
)

df_video = pd.DataFrame(video_summaries)
# do some checks on the video dataset
# raise if there are any missing values in the license, year released, video sources, tasks
assert df_video["Licenses"].apply(lambda x: len(x) == 0).sum() == 0, "print the rows with missing licenses: \n" + str(df_video[df_video["Licenses"].apply(lambda x: len(x) == 0)])
assert df_video["Year Released"].isna().sum() == 0, "print the rows with missing year released: \n" + str(df_video[df_video["Year Released"].isna()])
assert df_video["Video Sources"].apply(lambda x: len(x) == 0).sum() == 0, "print the rows with missing video sources: \n" + str(df_video[df_video["Video Sources"].apply(lambda x: len(x) == 0)])
assert df_video["Task Categories"].apply(lambda x: len(x) == 0).sum() == 0, "print the rows with missing tasks: \n" + str(df_video[df_video["Tasks"].apply(lambda x: len(x) == 0)])

df_video, YEARS_ORDER = factor_year(df_video)

In [63]:
# Overall Gini coefficient (hours by dataset)
gini(df_video["Video Hours"].values)

0.9389346359741161

In [64]:
# Plotting constants
FONT_SIZE = 20
LEGEND_POSITION = "bottom"
PLOT_TOFILE = False # Whether and where to output plots
PLOT_DIR = "/home/gridsan/ktiwary/src/dpi-ktiwary-fork/dpi-plots/video"
PLOT_PPI = 300
MAX_LABELLIMIT = 1000 # Large number to avoid label summarization in plots

if PLOT_TOFILE:
    PLOT_DIR = os.path.expanduser(PLOT_DIR)
    os.makedirs(PLOT_DIR, exist_ok=True)

In [65]:
def times_newroman():
    font = "Times New Roman"

    return {
          "config" : {
               "title": {"font": font},
               "axis": {
               "labelFont": font,
               "titleFont": font
          },
          "header": {
               "labelFont": font,
               "titleFont": font
          },
          "legend": {
               "labelFont": font,
               "titleFont": font
          },
          "text": {
               "font": font
          }
     }
}

alt.themes.register("times_newroman", times_newroman)
alt.themes.enable("times_newroman")

ThemeRegistry.enable('times_newroman')

## License Use Vs. Source Category

In [66]:
# Plotting constants
LICENSE_ORDER = ["Non-Commercial/Academic", "Unspecified", "Commercial"]
LICENSE_PALETTE = ["#e04c71", "#e0cd92", "#82b5cf"]
LICENSE_PLOTW = 600
LICENSE_PLOTH = 200

In [67]:
# Map to main DPI license types
df_video["License Type"] = df_video["License Use (DataProvenance)"].map({
    "academic-only": "Non-Commercial/Academic",
    "non-commercial": "Non-Commercial/Academic",
    "unspecified": "Unspecified",
    "commercial": "Commercial"
})

df_video["License Type"] = pd.Categorical(
    df_video["License Type"],
    categories=LICENSE_ORDER,
    ordered=True
)
df_video = df_video.sort_values(by="License Type")

In [68]:
# Remap language families for condensed plots
INCLUDE_TOP_N_CATEGORIES = 8
df_videosourcelicences = df_video.explode("Video Sources")
df_videosourcelicences = reduce_categories_to_topk(df_videosourcelicences, "Video Sources", INCLUDE_TOP_N_CATEGORIES)


# Calculate permissiveness by language family (defined as the proportion of commercial licenses)
permisiveness = df_videosourcelicences.groupby("Video Sources").apply(
    lambda x: (x["License Type"] == "Commercial").mean()
).reset_index(name="Permissiveness")

# Sort by computed permisiveness
videosources_order = permisiveness.sort_values(by="Permissiveness")["Video Sources"].tolist()

# Make factor
df_videosourcelicences["Video Sources"] = pd.Categorical(
    df_videosourcelicences["Video Sources"],
    categories=videosources_order,
    ordered=True
)

# Sort by Video Sources
df_videosourcelicences = df_videosourcelicences.sort_values(by="Video Sources")
df_videosourcelicences.head()[['Video Sources', 'License Type']]

  permisiveness = df_videosourcelicences.groupby("Video Sources").apply(


Unnamed: 0,Video Sources,License Type
4,crowdsourced,Unspecified
79,crowdsourced,Unspecified
90,crowdsourced,Non-Commercial/Academic
60,crowdsourced,Unspecified
37,crowdsourced,Unspecified


In [69]:
base = alt.Chart(df_videosourcelicences).mark_bar().encode(
    x=alt.X(
        "Video Sources:N",
        title="Video Sources",
        sort=videosources_order,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "License Type:N",
        scale=alt.Scale(
            domain=LICENSE_ORDER,
            range=LICENSE_PALETTE
        ),
        title="License Type"
    )
).properties(
    width=600,
    height=100
)

text = alt.Chart(df_videosourcelicences).mark_text(
    dy=-68,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Video Sources:N",
        title="Video Sources",
        sort=videosources_order
    ),
    text="count():Q"
)

chart = (base + text).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    labelLimit=MAX_LABELLIMIT
)

if PLOT_TOFILE:
    chart.save(
        os.path.join(PLOT_DIR, "video_sources-licenses.png"),
        ppi=PLOT_PPI
    )

chart

## Source Category by Year

In [70]:
INCLUDE_TOP_N_CATEGORIES = 6
df_videosourceyears = df_video.explode("Video Sources")
df_videosourceyears = reduce_categories_to_topk(df_videosourceyears, "Video Sources", INCLUDE_TOP_N_CATEGORIES)

df_videosourceyears = df_videosourceyears.sort_values(by="Year Released")
df_videosourceyears.head()['Year Released']

5     2009
3     2009
68    2011
68    2011
68    2011
Name: Year Released, dtype: category
Categories (22, object): ['<2004' < '2004' < '2005' < '2006' ... '2021' < '2022' < '2023' < '2024']

In [71]:
base_sourceyear = alt.Chart(
    df_videosourceyears
).mark_bar().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Video Sources:N",
        title="Video Sources"
    )
).properties(
    width=600,
    height=160
)

text_sourceyear = alt.Chart(df_videosourceyears).mark_text(
    dy=-90,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER
    ),
    text="count():Q"
)

chart_sourceyear = (base_sourceyear + text_sourceyear).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)


if PLOT_TOFILE:
    chart_sourceyear.save(
        os.path.join(PLOT_DIR, "video_sourcecategories-years.png"),
        ppi=PLOT_PPI
    )

chart_sourceyear

# Creator Category by Year

In [72]:
df_video['Creators'] = df_video['Creators'].apply(
    lambda x: [creator_categories.get(item, item) for item in x]
)

print(df_video['Creators'].explode().value_counts())

INCLUDE_TOP_N_CATEGORIES = 6
df_videocreatoryears = df_video.explode("Creators")
df_videocreatoryears = reduce_categories_to_topk(df_videocreatoryears, "Creators", INCLUDE_TOP_N_CATEGORIES)

df_videocreatoryears = df_videocreatoryears.sort_values(by="Year Released")
df_videocreatoryears.head()[['Creators', 'Year Released']]

Creators
Academic          132
Corporation        20
Industry Lab       17
Research Group     12
Other               6
Government          1
Startup             1
Name: count, dtype: int64


Unnamed: 0,Creators,Year Released
3,Academic,2009
5,Research Group,2009
68,Academic,2011
68,Academic,2011
68,Academic,2011


In [73]:
base_creatoryear = alt.Chart(
    df_videocreatoryears
).mark_bar().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Creators:N",
        title="Video Creator Cateogies"
    )
).properties(
    width=600,
    height=160
)

text_creatoryear = alt.Chart(df_videocreatoryears).mark_text(
    dy=-90,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER
    ),
    text="count():Q"
)

chart_creatoryear = (base_creatoryear + text_creatoryear).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)


if PLOT_TOFILE:
    chart_creatoryear.save(
        os.path.join(PLOT_DIR, "video_creatorcategories-years.png"),
        ppi=PLOT_PPI
    )

chart_creatoryear

## Total Hours by Source Category (Cumulative)

In [74]:
INCLUDE_TOP_N_CATEGORIES = 6

df_speechsourceyears = df_video.explode("Video Sources")
df_speechsourceyears = reduce_categories_to_topk(df_speechsourceyears, "Video Sources", INCLUDE_TOP_N_CATEGORIES)

df_speechsourceyearscumulativehours = df_speechsourceyears.groupby(
    ["Year Released", "Video Sources"]
)["Video Hours"].sum().groupby(
    "Video Sources"
).cumsum().reset_index(name="Cumulative Hours")

df_speechsourceyearscumulativehours = df_speechsourceyearscumulativehours.sort_values(by="Year Released")

  df_speechsourceyearscumulativehours = df_speechsourceyears.groupby(


In [75]:
chart_sourceyearhours = alt.Chart(
    df_speechsourceyearscumulativehours
).mark_line().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "Cumulative Hours:Q",
        title="Cumulative Hours",
        scale=alt.Scale(
            type="symlog",
            constant=1000,
            domain=[1, 1000000]
        ),
        axis=alt.Axis(
            values=[0, 1, 1000, 10000, 100000, 1000000],
            labelExpr="datum.value >= 1000000 ? datum.value / 1000000 + 'M' : datum.value >= 1000 ? datum.value / 1000 + 'K' : datum.value"
        )
    ),
    color=alt.Color(
        "Video Sources:N",
        title="Video Sources"
    )
).properties(
    width=400,
    height=160
)

if PLOT_TOFILE:
    chart_sourceyearhours.save(
        os.path.join(PLOT_DIR, "video_sourcecategories-cumulativehours.png"),
        ppi=PLOT_PPI
    )

chart_sourceyearhours

#### Combine Source Years-Based Plots

In [76]:
chart_sourceyearcombined = alt.hconcat(
    base_sourceyear,
    chart_sourceyearhours
).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    grid=False
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    labelLimit=MAX_LABELLIMIT
).configure_header(
    titleFontSize=FONT_SIZE,
    labelFontSize=FONT_SIZE
).configure_title(
    fontSize=FONT_SIZE
).resolve_scale(
    x="independent",
    y="independent"
)

if PLOT_TOFILE:
    chart_sourceyearcombined.save(
        os.path.join(PLOT_DIR, "video_sourcecategories-yearscombined.png"),
        ppi=PLOT_PPI
    )

chart_sourceyearcombined

# Video Task Vs Year Release 

In [77]:

task_categories: dict[str, list[str]] = {}
# categories['Video Q&A'] = ['Video Question Answering', "Video Summarization", "Video Q&A", "Video Captioning"]
task_categories['Video Q&A'] = ["Video Question Answering"]
task_categories['Misc'] = ["Misc (Hand/Object Detection)", "Misc", "Misc (Scene Reconstruction)", "Misc (video retrieval", 
                        "Misc (Locomotion Prediction", "Misc", "interaction understanding via ordering", 
                        "reason prediction)", "Misc (Scene Segmentation)", ]

# reverse the categories
task_categories = invert_dict_of_lists(task_categories)

# Task Categories are a list of categories, but applu the mapping to each item in the list
df_video['Task Categories'] = df_video['Task Categories'].apply(
    lambda x: [task_categories.get(item, item) for item in x]
)
print(df_video['Task Categories'].explode().value_counts())

INCLUDE_TOP_N_CATEGORIES = 6
df_videotaskyears = df_video.explode("Task Categories")
df_videotaskyears = reduce_categories_to_topk(df_videotaskyears, "Task Categories", INCLUDE_TOP_N_CATEGORIES)

df_videotaskyears = df_videotaskyears.sort_values(by="Year Released")
df_videotaskyears.head()['Year Released']

Task Categories
Video Classification                    51
Video Captioning                        25
Misc                                    12
Video Summarization                      7
Video Q&A                                5
Video Segmentation                       3
Action Segmentation                      2
Temporal Action Detection                2
Temporal Action Segmentation             2
Temporal Localization                    2
Action Recognition                       2
Group Activity Recognition               2
Temporal Action Localization             2
Visual Interaction Prediction            1
Vision to Proprioception Prediction)     1
Spatial-Temporal Action Localization     1
Action Localization                      1
Video Object Detection                   1
Pose Estimation                          1
Name: count, dtype: int64


5     2009
3     2009
68    2011
47    2012
86    2013
Name: Year Released, dtype: category
Categories (22, object): ['<2004' < '2004' < '2005' < '2006' ... '2021' < '2022' < '2023' < '2024']

In [78]:
base_taskyear = alt.Chart(
    df_videotaskyears
).mark_bar().encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER,
        axis=alt.Axis(labelAngle=-30)
    ),
    y=alt.Y(
        "count():Q",
        stack="normalize",
        axis=alt.Axis(format="%"),
        title="Pct. Datasets"
    ),
    color=alt.Color(
        "Task Categories:N",
        title="Video Task Categories"
    )
).properties(
    width=600,
    height=160
)

text_sourceyear = alt.Chart(df_videotaskyears).mark_text(
    dy=-90,
    align="center",
    baseline="top",
    fontSize=12
).encode(
    x=alt.X(
        "Year Released:N",
        title="Year Released",
        sort=YEARS_ORDER
    ),
    text="count():Q"
)

chart_taskyear = (base_taskyear + text_sourceyear).configure_axis(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE
).configure_legend(
    labelFontSize=FONT_SIZE,
    titleFontSize=FONT_SIZE,
    orient=LEGEND_POSITION,
    columns=4,
    labelLimit=MAX_LABELLIMIT
)


if PLOT_TOFILE:
    chart_taskyear.save(
        os.path.join(PLOT_DIR, "video_taskcategories-years.png"),
        ppi=PLOT_PPI
    )

chart_taskyear

## Source Category (YouTube or Other) by License Type

In [79]:
# By count
df_counts_by_license_source = df_video.explode("Video Sources").groupby(["License Type", "Video Sources"]).size().reset_index(name="Count")
df_counts_by_license_source = df_counts_by_license_source.sort_values(by="Count")
df_counts_by_license_source["YouTube"] = df_counts_by_license_source["Video Sources"].map(
    lambda x: "YouTube" if "youtube" in x.lower() else "Other"
)

# # By hours
# df_hours_by_license_source = df_speech.explode("Source").groupby(["License Type", "Source"])["Hours"].sum().reset_index(name="Total Hours")
# df_hours_by_license_source = df_hours_by_license_source.sort_values(by="Total Hours")
# df_hours_by_license_source["YouTube"] = df_hours_by_license_source["Source"].map(
#     lambda x: "YouTube" if "youtube" in x.lower() else "Other"
# )

  df_counts_by_license_source = df_video.explode("Video Sources").groupby(["License Type", "Video Sources"]).size().reset_index(name="Count")


## Creator Categories by Year

Note: we use the original annotations here instead of the DPI constants, for a different view.

## Table of License Type

In [80]:
df_video['Licenses'].apply(lambda x: [item['License'] for item in x][0])

0              Custom
48       CC BY-NC 4.0
52            Various
54    CC BY-NC-SA 4.0
56     Non Commercial
           ...       
11          CC BY 4.0
45        MIT License
74          CC BY 4.0
13             Custom
31             Custom
Name: Licenses, Length: 99, dtype: object

In [81]:
licensetype_counts = df_video["Licenses"].value_counts()
licensetype_counts = df_video['Licenses'].apply(lambda x: [item['License'] for item in x][0]).value_counts()
df_licensetypes = pd.concat([
    licensetype_counts,
    (licensetype_counts / licensetype_counts.sum()).round(4) * 100
], axis=1)

df_licensetypes.columns = ["Count", "Pct."]

df_licensetypes

Unnamed: 0_level_0,Count,Pct.
Licenses,Unnamed: 1_level_1,Unnamed: 2_level_1
Unspecified,38,38.38
Custom,27,27.27
CC BY 4.0,9,9.09
MIT License,8,8.08
CC BY-NC 4.0,5,5.05
Apache License 2.0,3,3.03
CC BY-NC-SA 4.0,3,3.03
CC BY 3.0,2,2.02
Non Commercial,2,2.02
Various,1,1.01


# Video Task Categories by Licence Type

### Youtuve to Licence Type

In [82]:
# By count
df_video["YouTube"] = df_video["Video Sources"].map(
    lambda x: "YouTube" if any("youtube" in xi.lower() for xi in x) else "Other"
)

df_youtube = df_video.groupby(["License Type", "YouTube"]).size().reset_index(name="Count")
df_youtube = df_youtube.sort_values(by="Count")
df_youtube["Pct."] = df_youtube.groupby("License Type")["Count"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_youtube

  df_youtube = df_video.groupby(["License Type", "YouTube"]).size().reset_index(name="Count")
  df_youtube["Pct."] = df_youtube.groupby("License Type")["Count"].transform(lambda x: (x / x.sum()).round(4) * 100)


Unnamed: 0,License Type,YouTube,Count,Pct.
1,Non-Commercial/Academic,YouTube,13,41.94
3,Unspecified,YouTube,13,34.21
5,Commercial,YouTube,13,43.33
4,Commercial,Other,17,56.67
0,Non-Commercial/Academic,Other,18,58.06
2,Unspecified,Other,25,65.79


### Video Hours by Licence Type

In [83]:
# By hours
df_youtubehours = df_video.groupby(["License Type", "YouTube"])["Video Hours"].sum().reset_index(name="Total Hours")
df_youtubehours = df_youtubehours.sort_values(by="Total Hours")
df_youtubehours["Pct."] = df_youtubehours.groupby("License Type")["Total Hours"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_youtubehours

  df_youtubehours = df_video.groupby(["License Type", "YouTube"])["Video Hours"].sum().reset_index(name="Total Hours")
  df_youtubehours["Pct."] = df_youtubehours.groupby("License Type")["Total Hours"].transform(lambda x: (x / x.sum()).round(4) * 100)


Unnamed: 0,License Type,YouTube,Total Hours,Pct.
2,Unspecified,Other,8380.77,1.68
4,Commercial,Other,9847.75,5.07
0,Non-Commercial/Academic,Other,16920.19,12.18
1,Non-Commercial/Academic,YouTube,121946.43,87.82
5,Commercial,YouTube,184436.86,94.93
3,Unspecified,YouTube,490965.17,98.32


#### Tables of Creator Categories (By Count and Hour)

In [84]:
df_creatorcategories = df_video.explode("Creators").groupby("Creators").size().reset_index(name="Count")
df_creatorcategories["Pct."] = df_creatorcategories["Count"].transform(lambda x: (x / x.sum()).round(4) * 100)

df_creatorcategories

Unnamed: 0,Creators,Count,Pct.
0,Academic,132,69.84
1,Corporation,20,10.58
2,Government,1,0.53
3,Industry Lab,17,8.99
4,Other,6,3.17
5,Research Group,12,6.35
6,Startup,1,0.53


In [85]:
df_creatorcategories = df_video.explode("Creators").groupby("Creators")["Video Hours"].sum().reset_index(name="Video Hours")
df_creatorcategories["Pct."] = df_creatorcategories["Video Hours"].transform(lambda x: (x / x.sum()).round(4) * 100)
df_creatorcategories

Unnamed: 0,Creators,Video Hours,Pct.
0,Academic,1057588.71,58.13
1,Corporation,6518.77,0.36
2,Government,1000.0,0.05
3,Industry Lab,359428.15,19.76
4,Other,149003.99,8.19
5,Research Group,149517.24,8.22
6,Startup,96166.67,5.29


In [86]:
# do video hours vs. video source categories
df_videohours = df_video.explode("Video Sources")
df_videohours = df_videohours.groupby("Video Sources")["Video Hours"].sum().reset_index(name="Total Hours")
df_videohours = df_videohours.sort_values(by="Total Hours")
df_videohours["Pct."] = df_videohours["Total Hours"].transform(lambda x: (x / x.sum()).round(4) * 100)
df_videohours.head()

Unnamed: 0,Video Sources,Total Hours,Pct.
0,Not Prohibited,38.5,0.0
15,tiktok,84.0,0.01
16,tumblr,86.1,0.01
22,youdescribe,207.0,0.02
1,bbc,1000.0,0.11
