In [None]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Make the project's `src` directory importable by putting '../' (relative to
# the working directory) at the front of sys.path.
# Stale entries are dropped first so that re-running this cell is idempotent:
#   - paths ending in '../..' (the original cleanup, kept for compatibility)
#   - any '../' left behind by an earlier run of this cell, which the original
#     filter missed and therefore duplicated on every re-run
sys.path = [p for p in sys.path if not p.endswith('../..') and p != '../']
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util as util
from analysis import visualization_util as viz_util


# Enable IPython's autoreload extension in mode 2, which reloads modules before
# each cell executes, so edits to the imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Data Loading

In [None]:
# Load the data summaries and the constants from their directories, then run
# the summaries through the license-criteria mapping.
summaries_dir = "../../data_summaries/"
constants_dir = "../../constants/"

data_summary_json = io.read_data_summary_json(summaries_dir)
all_constants = io.read_all_constants(constants_dir)
data_summary = filters.map_license_criteria(data_summary_json, all_constants)

# Report the length of the raw loaded summary (before the mapping step).
print(len(data_summary_json))

# Test Data

Checks every row of the data summary to confirm that all of its values appear in the constants. Any missing values are printed, along with the data collections they came from.

In [None]:
# Validate the mapped data summary against the constants before categorizing.
util.check_datasummary_in_constants(
    data_summary,
    all_constants,
)

# Extract & Categorize Data

Runs through all rows of the data summary and applies the constants files to categorize the licenses, creators, tasks, sources, languages, and other metadata, so that the data is ready for plotting.

In [None]:
def _select_by_license_use(rows, license_uses):
    """Return the entries of `rows` whose "License Use (DataProvenance)" value is in `license_uses`.

    `rows` is a dict; keys and values are passed through unchanged and the
    original iteration order is preserved.
    """
    return {
        name: entry
        for name, entry in rows.items()
        if entry["License Use (DataProvenance)"] in license_uses
    }


# Categorize every row, then split the result into the three license-use
# buckets used by the plotting cells.
all_info = util.extract_info(data_summary, all_constants)
commercial_info = _select_by_license_use(all_info, ["commercial"])
unspecified_info = _select_by_license_use(all_info, ["unspecified"])
ncao_info = _select_by_license_use(all_info, ["non-commercial", "academic-only"])

# Row counts: overall, then per bucket.
print(f"Total Rows = {len(all_info)}")
print(f"Total Rows w/ Commercially permissible licenses (according to DPI) = {len(commercial_info)}")
print(f"Total Rows w/ Unspecified licenses (according to DPI) = {len(unspecified_info)}")
print(f"Total Rows w/ Non-Commercial/Academic-Only licenses (according to DPI) = {len(ncao_info)}")

# Plot License Breakdowns, by different Categories

In [None]:
# Grouped chart of the three license-use buckets across language groups.
license_buckets = {
    "Non-Commercial/\nAcademic": ncao_info,
    "Unspecified": unspecified_info,
    "Commercial": commercial_info,
}
language_groups = all_constants["LANGUAGE_GROUPS"]

# NOTE(review): the Domains cell passes a list for exclude_groups; the empty
# dict is kept here unchanged — confirm which type plot_grouped_chart expects.
viz_util.plot_grouped_chart(
    license_buckets,
    language_groups,
    "Language Groups",
    name_remapper=None,
    exclude_groups={},
    savename="langs_stacked-altair.json",
)

In [None]:
# Shorter display names for the longer task-group labels.
# NOTE(review): "Toxicicity" looks like a misspelling kept alongside the
# correctly spelled key — presumably to match a group name spelled that way in
# the constants; verify before removing either entry.
task_name_remapper = dict([
    ("Short Text Generation", "Short Text Gen"),
    ("Bias & Toxicity Detection", "Bias/Toxic Detect"),
    ("Bias & Toxicicity Detection", "Bias/Toxic Detect"),
    ("Natural Language Inference", "NLI"),
    ("Commonsense Reasoning", "Commonsense Reas."),
    ("Logical and Mathematical Reasoning", "Logic & Math"),
])

# Grouped chart of the three license-use buckets across task groups.
license_buckets = {
    "Non-Commercial/\nAcademic": ncao_info,
    "Unspecified": unspecified_info,
    "Commercial": commercial_info,
}
task_groups = all_constants["TASK_GROUPS"]

viz_util.plot_grouped_chart(
    license_buckets,
    task_groups,
    "Task Groups",
    name_remapper=task_name_remapper,
    exclude_groups={},
    savename="tasks_stacked-altair.json",
)

In [None]:
# Domain group names passed to the chart as `exclude_groups`.
excludes = [
    "Other",
    "Others",
    "Unsure",
    "Template Gen",
    "Human",
    "Reviews",
    "Unknown",
    "ML Datasets",
    "Conversations",
    "Search Queries",
]

# Grouped chart of the three license-use buckets across domain groups.
license_buckets = {
    "Non-Commercial/\nAcademic": ncao_info,
    "Unspecified": unspecified_info,
    "Commercial": commercial_info,
}
domain_groups = all_constants["DOMAIN_GROUPS"]

viz_util.plot_grouped_chart(
    license_buckets,
    domain_groups,
    "Domains",
    name_remapper=None,
    exclude_groups=excludes,
    savename="sources_stacked-altair.json",
)

In [None]:
# Time chart of the three license-use buckets, keyed on "Preparation Times".
license_buckets = {
    "Non-Commercial/\nAcademic": ncao_info,
    "Unspecified": unspecified_info,
    "Commercial": commercial_info,
}

viz_util.plot_grouped_time_chart(
    license_buckets,
    "Preparation Times",
    disallow_repeat_dsetnames=True,
    savename="times_stacked-altair.json",
)

In [None]:
# License breakdown over all categorized rows (not split by license-use bucket).
license_classes = all_constants["LICENSE_CLASSES"]

viz_util.plot_license_breakdown(
    all_info,
    license_classes,
    disallow_repeat_dsetnames=True,
    savename="license_dist.pdf",
)