In [265]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
import typing
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Make the project's source directory importable from this notebook.
# Drop stale '../..' entries AND any earlier '../' entry before inserting, so that
# re-running this cell is idempotent instead of stacking one more '../' per run.
sys.path = [p for p in sys.path if not p.endswith('../..') and p != '../']
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util, forecasting_util


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Define Paths to all relevant files

In [118]:
# --- Input files (paths are relative to the notebook's working directory) ---
# Token counts per URL for the pretraining corpora.
FPATH_TO_RELEVANT_URL_TOKENS = 'pretrain_data/relevant_url_token_counts.csv'
# Temporal robots.txt snapshots for the head and the random URL samples.
FPATH_to_HEAD_ROBOTS = "robots_data/temporal_robots_head.json"
FPATH_TO_RAND_ROBOTS = "robots_data/temporal_robots_rand_10k.json"
# Terms-of-service verdict files.
FPATH_TO_TOS_DATA = "robots_data/tos_ai_scraping_policies.json"
FPATH_TO_TOS_LICENSE_DATA = "robots_data/tos_license_type_verdicts.json"
FPATH_TO_TOS_COMPETE_DATA = "robots_data/tos_competing_services_policies.json"
# Directories holding the manual website annotations.
DIRPATHS_TO_ANNOTATED_TASKS = [
    "annotated_websites/Task 1",
    "annotated_websites/Task 2",
]
START_DATES = "robots_data/domain_start_dates.json"

# --- Companies ---
# Companies whose agents are analyzed in the per-company results.
COMPANIES_TO_ANALYZE = [
    "Google",
    "OpenAI",
    "Anthropic",
    "Cohere",
    "Common Crawl",
    "Meta",
]
# Everything that is tracked: the analyzed companies followed by the extra groups.
ALL_COMPANIES_TO_TRACK = COMPANIES_TO_ANALYZE + [
    "Internet Archive",
    "Google Search",
    "False Anthropic",
]

# --- Time window of the temporal analysis (inclusive date strings) ---
TEMPORAL_ANALYSIS_START_DATE = '2016-01-01'
TEMPORAL_ANALYSIS_END_DATE = '2024-04-30'

### Load all URL splits (top vs. random) and their token-count maps

In [3]:
# Lookup object built from the URL token-count CSV; queried below with the corpus keys.
url_token_lookup = robots_util.URLTokenLookup(FPATH_TO_RELEVANT_URL_TOKENS) # 'c4', 'rf', 'dolma'
# Per-corpus maps (presumably URL -> token count, judging by the method name — confirm in robots_util).
c4_url_to_counts = url_token_lookup.get_url_to_token_map("c4")
rf_url_to_counts = url_token_lookup.get_url_to_token_map("rf")
dolma_url_to_counts = url_token_lookup.get_url_to_token_map("dolma")
# Head split: top 2000 URLs of each corpus. The same k is repeated three times; keep them in sync.
top_c4_urls = url_token_lookup.top_k_urls("c4", 2000)
top_rf_urls = url_token_lookup.top_k_urls("rf", 2000)
top_dolma_urls = url_token_lookup.top_k_urls("dolma", 2000)
# Random split.
random_10k_urls = url_token_lookup.get_10k_random_sample()
# Union of every URL used anywhere below (the `+` requires all four to be lists).
all_urls = set(random_10k_urls + top_c4_urls + top_rf_urls + top_dolma_urls)

# Load website snapshots for relevant URLs
website_start_dates = robots_util.read_start_dates(START_DATES, all_urls) # THIS WON'T WORK FOR THE 10k SAMPLE

Number of tokens in 2000 URLs: 18447797380 | 10.85% of c4
Number of tokens in 2000 URLs: 67098747294 | 15.56% of rf
Number of tokens in 2000 URLs: 429152555144 | 21.74% of dolma


### Define Agents and Agent Groups

In [4]:
# Group -> agents mapping for every tracked company (its .values() are iterated as
# collections of agent names further down).
agent_groups_to_track = robots_util.get_bot_groups(ALL_COMPANIES_TO_TRACK)
# Same lookup restricted to the companies that are analyzed.
agent_subset_to_track = robots_util.get_bot_groups(COMPANIES_TO_ANALYZE)
# Called without a company filter; only referenced from commented-out alternatives below.
agents_to_track = robots_util.get_bots()

### Load Robots.txt info

In [5]:
# URL -> Date -> Robots.txt raw text
# NOTE: `io` here is the project's `helpers.io` module, not the standard-library `io`.
head_robots = io.read_json(FPATH_to_HEAD_ROBOTS)
random_10k_robots = io.read_json(FPATH_TO_RAND_ROBOTS)
# Merge both splits without mutating `head_robots`; for a URL present in both,
# the random-sample entry replaces the head entry (dict.update semantics).
joined_robots = copy.deepcopy(head_robots)
joined_robots.update(random_10k_robots)
# Prints summary info for each split (number of URLs, earliest and last time).
robots_util.print_out_robots_info(head_robots)
robots_util.print_out_robots_info(random_10k_robots)

# {URL --> Date --> Agent --> Status}
url_robots_summary, agent_counter_df = robots_util.compute_url_date_agent_status(
    data=joined_robots,
    # relevant_agents=agents_to_track)
    # Flatten the group -> agents mapping into one list of agent names.
    relevant_agents=[v for vs in agent_groups_to_track.values() for v in vs])

# Side effect: writes the agent counter table into the current working directory.
agent_counter_df.to_csv("all_agents_counter.csv", index=False)

Num robot URLs loaded: 2631
Earliest time: 2016-01-01
Last time: 2024-04-19
Num robot URLs loaded: 6331
Earliest time: 2016-01-01
Last time: 2024-04-19


In [6]:
# Detailed variant of the robots summary, computed over the same merged robots data
# and the same flattened agent list as `url_robots_summary`.
url_robots_summary_detailed = robots_util.compute_url_date_agent_status_detailed(
    data=joined_robots,
    relevant_agents=[v for vs in agent_groups_to_track.values() for v in vs]
)

### Load ToS info

In [119]:
# URL --> Date --> ToS-suburl --> {"verdict": X, "evidence": Y}
# Three ToS verdict files: AI/scraping policies, license types, competing-services policies.
tos_policies = io.read_json(FPATH_TO_TOS_DATA)
tos_license_policies = io.read_json(FPATH_TO_TOS_LICENSE_DATA)
tos_compete_policies = io.read_json(FPATH_TO_TOS_COMPETE_DATA)
# tos_license_policies = robots_util.switch_dates_yearly_to_monthly(tos_license_policies)
# Sanity check: number of top-level entries in each file.
print(f"Num ToS AI/Scraping URLs: {len(tos_policies)}")
print(f"Num ToS License URLs: {len(tos_license_policies)}")
print(f"Num ToS Compete URLs: {len(tos_compete_policies)}")

Num ToS AI/Scraping URLs: 3068
Num ToS License URLs: 3070
Num ToS Compete URLs: 3070


In [443]:
# verdicts_cc = []
# for url, date_suburl_verdicts in tos_license_policies.items():
#     for suburl_verdicts in date_suburl_verdicts.values():
#         for verdicts in suburl_verdicts.values():
#             verdicts_cc.append(verdicts["verdict"])
# print(Counter(verdicts_cc))

In [483]:
# url_results_df

### Load Manual Pretraining Annotations

In [484]:
# Build the per-URL annotation table from the manually annotated task directories.
url_to_info = analysis_util.extract_url_annotations(DIRPATHS_TO_ANNOTATED_TASKS)
url_results_df = analysis_util.process_url_annotations(url_to_info)
# Each step below rebinds `url_results_df`, so this cell must be run as a whole.
url_results_df = analysis_util.encode_size_columns(url_results_df, url_token_lookup)
# Adds ToS / robots information for the analyzed companies to the table.
url_results_df = robots_util.encode_latest_tos_robots_into_df(
    url_results_df, tos_policies, tos_license_policies, tos_compete_policies, url_robots_summary,
    COMPANIES_TO_ANALYZE
)

6664 rows before filtering.
4029 rows after filtering. 1580 issues, 1055 unannotated.
<class 'collections.defaultdict'>
9312


# Create Plots

### Preprocessing for Robots Head & Random URL splits

In [7]:
### DECISION POINT: Use C4, Dolma, or RefinedWeb here?

CHOSEN_CORPUS = "c4" # 'c4', 'rf', 'dolma'

# Corpus key -> (head URL list, per-corpus url_to_counts map), both loaded earlier.
_CORPUS_CHOICES = {
    "c4": (top_c4_urls, c4_url_to_counts),
    "rf": (top_rf_urls, rf_url_to_counts),
    "dolma": (top_dolma_urls, dolma_url_to_counts),
}
if CHOSEN_CORPUS not in _CORPUS_CHOICES:
    # Fail loudly: an unknown key used to fall through every branch, leaving
    # HEAD_URL_SET / URL_TO_COUNTS undefined (or stale from a previous run).
    raise ValueError(
        f"Unknown CHOSEN_CORPUS {CHOSEN_CORPUS!r}; expected one of {sorted(_CORPUS_CHOICES)}"
    )
HEAD_URL_SET, URL_TO_COUNTS = _CORPUS_CHOICES[CHOSEN_CORPUS]

In [8]:
def _restrict_to_urls(summary, urls):
    """Return the entries of `summary` whose key appears in `urls`, following `urls` order."""
    return {url: summary[url] for url in urls if url in summary}


# Head split (plain and detailed) and random split of the robots summaries.
url_robots_summary_head = _restrict_to_urls(url_robots_summary, HEAD_URL_SET)
url_robots_summary_head_detailed = _restrict_to_urls(url_robots_summary_detailed, HEAD_URL_SET)
url_robots_summary_rand = _restrict_to_urls(url_robots_summary, random_10k_urls)

In [9]:
# Keyword arguments shared by all three temporal summaries below.
shared_temporal_kwargs = dict(
    # group_to_agents={k: [k] for k in agents_to_track},
    group_to_agents=agent_groups_to_track,
    start_time=TEMPORAL_ANALYSIS_START_DATE,
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_start_dates,
)

# HEAD URL SPLIT
# {Period --> Agent --> Status --> set(URLs)}
robots_filled_status_head_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_head,
    **shared_temporal_kwargs,
)

# RANDOM URL SPLIT
robots_filled_status_rand_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_rand,
    **shared_temporal_kwargs,
)

# DETAILED HEAD
robots_filled_status_head_summary_detailed = robots_util.prepare_robots_temporal_summary_detailed(
    url_robots_summary=url_robots_summary_head_detailed,
    **shared_temporal_kwargs,
)

In [10]:
# Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)] for the head split.
# Uses URL_TO_COUNTS (the map of CHOSEN_CORPUS) like the other two frames below;
# this call previously hard-coded `c4_url_to_counts`, which gave the wrong token
# counts whenever CHOSEN_CORPUS was "rf" or "dolma".
robots_temporal_head_summary = robots_util.robots_temporal_to_df(
    robots_filled_status_head_summary,
    strictness_order=['no_robots', 'none', 'some', 'all'],
    url_to_counts=URL_TO_COUNTS,
)
# Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)]
robots_temporal_rand_summary = robots_util.robots_temporal_to_df(
    robots_filled_status_rand_summary,
    strictness_order=['no_robots', 'none', 'some', 'all'],
    url_to_counts=URL_TO_COUNTS,
)

# Status order for the detailed breakdown.
# NOTE(review): the three `some_*` statuses appear here in the reverse of the order
# used by the plotting cell's ORDERED_STATUSES — confirm this is intended.
detailed_strictness_order = [
    'no_robots', 'none', 'none_sitemap', 'none_crawl_delay', 'some_pattern_restrictions', 'some_disallow_important_dir', 'some_other', 'all'
]
robots_temporal_head_summary_detailed = robots_util.robots_temporal_to_df(
    robots_filled_status_head_summary_detailed,
    strictness_order=detailed_strictness_order,
    url_to_counts=URL_TO_COUNTS,
)

### Preprocessing for ToS

In [121]:
# URL --> time --> ToS verdict string.
# Combines the three ToS verdict sources into one verdict per URL and time.
url_to_time_to_tos_verdict = robots_util.get_tos_url_time_verdicts(
    tos_policies, tos_license_policies, tos_compete_policies)
# Period --> Status --> set(URLs)
# Same window and frequency as the robots temporal summaries.
period_tos_verdict_urls = robots_util.prepare_tos_temporal_summary(
    url_to_time_to_tos_verdict,
    start_time=TEMPORAL_ANALYSIS_START_DATE,
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_start_dates,
)
# Dataframe: [Period, Status, Count, Tokens]
# Restricted to the head URLs of CHOSEN_CORPUS and weighted with its token counts.
tos_summary_df = robots_util.tos_temporal_to_df(
    period_tos_verdict_urls,
    url_set=HEAD_URL_SET,
    url_to_counts=URL_TO_COUNTS,
)

6 / 26494 dates missed due to time mismatches.


## Plotting Constants

#### Event Markers

In [12]:
# Event markers passed to the plots as `vertical_line_dates` (via PLOT_KWARGS).
# Each entry is a (date string, label text) pair, as in the example below.
# Test Events (Date, Text)
# EVENTS = [
#     ("2016-08-01", "Event A"),
#     ("2020-06-01", "Event B"),
#     ("2021-06-01", "Event C"),
#     ("2022-07-01", "Event D"),
#     ("2023-02-01", "Event E")
# ]

# No event markers are used by default.
EVENTS = []

#### Plot Style

In [13]:
# Size of each individual chart panel.
PLOT_WIDTH = 1000
PLOT_HEIGHT = 160
# Font sizes for axis/legend labels and titles.
LABEL_FONTSIZE = 14
TITLE_FONTSIZE = 16
# Number of columns in each legend.
LEGEND_COLS = 6

# Bundle of the settings above, splatted into every plotting call.
PLOT_KWARGS = dict(
    width=PLOT_WIDTH,
    height=PLOT_HEIGHT,
    label_fontsize=LABEL_FONTSIZE,
    title_fontsize=TITLE_FONTSIZE,
    legend_cols=LEGEND_COLS,
    vertical_line_dates=EVENTS,
)

#### Forecasting Hyperparameters

In [246]:
# Forecasting backend read by `forecast_plot` and by the company-comparison cell.
# Keep exactly one assignment uncommented.
# ANALYSIS_TYPE = "autoregression"
# ANALYSIS_TYPE = "prophet"
# ANALYSIS_TYPE = "arima"
ANALYSIS_TYPE = "sarima"
# ANALYSIS_TYPE = None # No forecasting
# Lags passed to the autoregression / prophet / arima helpers.
LAGS = [1, 3, 6, 12, 24, 48]
# Passed as `order` to the SARIMA helpers — presumably (p, d, q); confirm in forecasting_util.
ORDER = (2, 1, 2)
# Passed as `seasonal_order` to the SARIMA helpers — presumably (P, D, Q, s); confirm in forecasting_util.
SEASONAL_ORDER = (1, 1, 1, 6)
N_PERIODS = 12 # Number of periods to forecast (months, by default)

#### Helper Function for Forecasting

In [23]:
def forecast_plot(
    df: pd.DataFrame,
    agent: str,
    lags: typing.List[int],
    val_col: str = "tokens", # "count" / "tokens"
    n_periods: int = 6,
    seaonal_order: typing.Optional[typing.Tuple[int, ...]] = None, # Overrides SEASONAL_ORDER for SARIMA
    ordered_statuses: typing.Optional[typing.List[str]] = None,
    status_colors: typing.Optional[typing.Dict[str, str]] = None,
    **kwargs
) -> alt.Chart:
    """Forecast and plot time series data (robots plots).

    The forecasting backend is chosen by the notebook-level ``ANALYSIS_TYPE``:
    "autoregression", "prophet", "arima" or "sarima". Any other value
    (including ``None``) plots the data without a forecast.

    Args:
        df (pd.DataFrame): Dataframe with columns like period, agent, status, count
        agent (str): Which agent to use
        lags (list[int]): List of lags, forwarded to the autoregression, prophet
            and arima helpers. Unused for SARIMA and for the no-forecast plot.
        val_col (str, optional): Value column. Defaults to "tokens". It is
            forwarded only to the autoregression helper and to the no-forecast
            plot; the prophet / arima / sarima helpers are called without it.
        n_periods (int, optional): Number of periods to forecast. Defaults to 6.
        seaonal_order (tuple[int], optional): Seasonal order for SARIMA. Defaults
            to None, which means the notebook-level ``SEASONAL_ORDER`` is used.
            (The misspelled name is kept for backward compatibility.)
        ordered_statuses (list[str], optional): Order of statuses in the legend.
        status_colors (dict[str, str], optional): Mapping of status to plot colors.
        **kwargs: Extra keyword arguments forwarded unchanged to the selected
            plotting helper (e.g. the entries of ``PLOT_KWARGS``).

    Returns:
        alt.Chart: Altair chart
    """
    if ANALYSIS_TYPE == "autoregression":
        # This helper returns a pair; only the chart is kept.
        chart, _ = forecasting_util.forecast_and_plot(
            df,
            agent=agent,
            lags=lags,
            status_colors=status_colors,
            ordered_statuses=ordered_statuses,
            val_col=val_col,
            n_periods=n_periods,
            detailed=True,
            **kwargs
        )
    elif ANALYSIS_TYPE == "prophet":
        chart = forecasting_util.forecast_and_plot_prophet(
            df,
            agent=agent,
            lags=lags,
            status_colors=status_colors,
            ordered_statuses=ordered_statuses,
            n_periods=n_periods,
            **kwargs
        )
    elif ANALYSIS_TYPE == "arima":
        chart = forecasting_util.forecast_and_plot_arima(
            df,
            agent=agent,
            lags=lags,
            status_colors=status_colors,
            ordered_statuses=ordered_statuses,
            n_periods=n_periods,
            **kwargs
        )
    elif ANALYSIS_TYPE == "sarima":
        # Honor an explicitly passed seasonal order; otherwise keep the previous
        # behavior of using the notebook-level SEASONAL_ORDER.
        seasonal_order = SEASONAL_ORDER if seaonal_order is None else seaonal_order
        chart = forecasting_util.forecast_and_plot_sarima(
            df,
            agent=agent,
            order=ORDER,
            seasonal_order=seasonal_order,
            status_colors=status_colors,
            ordered_statuses=ordered_statuses,
            n_periods=n_periods,
            **kwargs
        )
    else:
        # No forecasting: plot the observed data only. `val_col` is now respected
        # here (it used to be hard-coded to "tokens", silently ignoring the argument).
        chart = robots_util.plot_robots_time_map_altair_detailed(
            df,
            agent_type=agent,
            period_col="period",
            status_col="status",
            val_col=val_col, # "count" / "tokens"
            ordered_statuses=ordered_statuses,
            status_colors=status_colors,
            **kwargs
        )

    return chart

#### Other Parameters

In [29]:
# Agent passed to `forecast_plot` for the temporal robots chart.
AGENT = "Combined Agent"

In [17]:
def times_newroman():
    """Build a chart theme that sets every text element to Times New Roman.

    Returns:
        dict: ``{"config": {...}}`` with font settings for the chart title,
        axes, headers, legends and text marks.
    """
    typeface = "Times New Roman"

    config = {"title": {"font": typeface}}
    # Axis, header and legend take the same pair of font keys; each section
    # gets its own dict so the sections remain independent objects.
    for section in ("axis", "header", "legend"):
        config[section] = {"labelFont": typeface, "titleFont": typeface}
    config["text"] = {"font": typeface}

    return {"config": config}

# Register the theme function under a name and make it the active Altair theme.
# NOTE(review): newer Altair releases deprecate `alt.themes` in favor of `alt.theme` —
# confirm against the installed version.
alt.themes.register("times_newroman", times_newroman)
alt.themes.enable("times_newroman")

ThemeRegistry.enable('times_newroman')

## Plot Temporal Robots Chart

In [266]:
# One row per robots.txt status, in legend order: (raw status key, legend label, color).
_ROBOTS_STATUS_ROWS = [
    ("no_robots", "No Robots.txt", "#d9d9d9"),
    ("none", "No restrictions or sitemap", "#5891bf"),
    ("none_sitemap", "Sitemap provided", "#9dc4db"),
    ("none_crawl_delay", "Crawl delay specified", "#d5e4ef"),
    ("some_other", "Other restrictions", "#f7f7f7"),
    ("some_disallow_important_dir", "Disallow private directories", "#f7dcca"),
    # ("some_disallow_file_types", "Disallow specific file types", "#e8a888"),
    ("some_pattern_restrictions", "Pattern-based restrictions", "#c76753"),
    ("all", "Full restrictions", "#a32a31"),
]

# Raw status keys in legend order.
ORDERED_STATUSES = [key for key, _label, _color in _ROBOTS_STATUS_ROWS]
# Both lookups are keyed by raw status and listed from "all" down to "no_robots".
STATUS_COLORS = {key: color for key, _label, color in reversed(_ROBOTS_STATUS_ROWS)}
LEGEND_MAPPING = {key: label for key, label, _color in reversed(_ROBOTS_STATUS_ROWS)}


# Copy and remap data for plotting: raw status keys become legend labels.
df = robots_temporal_head_summary_detailed.copy()
df["status"] = df["status"].map(LEGEND_MAPPING)
# Label-keyed equivalents of ORDERED_STATUSES / STATUS_COLORS.
ordered_statuses = [label for _key, label, _color in _ROBOTS_STATUS_ROWS]
status_colors = {label: color for _key, label, color in reversed(_ROBOTS_STATUS_ROWS)}

chart_robots = forecast_plot(
    df,
    agent=AGENT,
    lags=LAGS,
    val_col="tokens",
    n_periods=N_PERIODS,
    ordered_statuses=ordered_statuses,
    status_colors=status_colors,
    legend_title="Robots.txt Agent Restrictions",
    **PLOT_KWARGS
)


#chart_robots.save(
#    "%s_%s-%s.pdf" % (AGENT, CHOSEN_CORPUS, ANALYSIS_TYPE),
#    ppi=300
#)

chart_robots

## Plot Temporal ToS Area Chart + Forecast

In [267]:
# NOTE: ORDERED_STATUSES and STATUS_COLORS are rebound here with ToS statuses,
# replacing the robots.txt versions defined in the previous plotting cell.
# Re-run that cell before re-plotting the robots chart.
ORDERED_STATUSES = [
    "No Terms Pages", "Unrestricted Use", "Conditional Use", "No Re-Distribution", "Non-Compete", "NC Only", "No AI", "No Scraping", "No Scraping & AI"
]

# ToS status -> plot color. Four statuses ("No Re-Distribution", "Non-Compete",
# "NC Only", "No AI") share the same color.
STATUS_COLORS = {
    "No Terms Pages": "#d9d9d9",
    "Unrestricted Use": "#5891bf",
    "Conditional Use": "#d5e4ef",
    "No Re-Distribution": "#f7dcca",
    "Non-Compete": "#f7dcca",
    "NC Only": "#f7dcca",
    "No AI": "#f7dcca",
    "No Scraping": "#e8a888",
    "No Scraping & AI": "#a32a31"
}

# Status values recorded from an earlier run of `df["status"].unique()`:
# 'No Terms Pages' 'Unrestricted Use' 'No Scraping + NC Only'
#  'Conditional Use' 'NC Only' 'No Scraping' 'No Scraping & AI + NC Only'
#  'No Scraping & AI' 'No AI']
# NOTE(review): 'No Scraping + NC Only' and 'No Scraping & AI + NC Only' are not in
# ORDERED_STATUSES / STATUS_COLORS — confirm they no longer occur in the data.


df = tos_summary_df.copy()
# print(df["status"].unique())


# Area chart of ToS statuses over time plus a SARIMA forecast, using the
# notebook-level ORDER / SEASONAL_ORDER.
chart_tos = forecasting_util.plot_and_forecast_tos_sarima(
    df,
    period_col="period",
    status_col="status",
    val_col="tokens",  # "count" / "tokens"
    title="",
    ordered_statuses=ORDERED_STATUSES,
    status_colors=STATUS_COLORS,
    order=ORDER,
    seasonal_order=SEASONAL_ORDER,
    configure=False,
    legend_title="Web Domain Terms of Service Restrictions",
    **PLOT_KWARGS
)

#chart_tos.save(
#    "TOS_%s.png" % (ANALYSIS_TYPE),
#    ppi=300
#)

chart_tos

## Plot Restrictions by Company

In [268]:
# Work on a copy so the forecast steps below do not touch the source frame.
df = robots_temporal_head_summary.copy()

# Latest period in the frame before any forecast is applied; passed to the plot
# as `forecast_startdate`.
forecast_startdate = df["period"].max().strftime("%Y-%m-%d")
# Only "autoregression" and "sarima" are handled here; for any other ANALYSIS_TYPE
# (e.g. "prophet", "arima", None) `df` is plotted without a forecast step.
if ANALYSIS_TYPE == "autoregression":
    df = forecasting_util.forecast_company_comparisons_autoregression(
        df,
        lags=LAGS,
        val_col="tokens",
        n_periods=N_PERIODS
    )
elif ANALYSIS_TYPE == "sarima":
    df = forecasting_util.forecast_company_comparisons_sarima(
        df,
        val_col="tokens",
        n_periods=N_PERIODS,
        order=ORDER,
        seasonal_order=SEASONAL_ORDER,
    )

# Line color per company / agent group (color names describe the hex values).
AGENT_COLORMAP = {
    "Google": "#4D58D5",          # indigo blue
    "OpenAI": "#519BEE",          # light blue
    "Anthropic": "#68D4C4",       # turquoise
    "Cohere": "#83F590",          # light green
    "Common Crawl": "#DEDD56",    # yellow
    "Meta": "#ADF86C",            # lime green
    "Internet Archive": "#5D6D7E",# slate gray
    "Google Search": "#D7EDEE",   # pale cyan
    "False Anthropic": "#D9D9D9"  # light gray
}

chart_companies = visualization_util.plot_company_comparisons_altair(
    df,
    color_mapping=AGENT_COLORMAP,
    scale_y="log",
    configure=False,
    eventline_scaling=4, # How tall the event marker lines should be (scaling is relative to the span of data at the event time)
    forecast_startdate=forecast_startdate,
    skip_pct=True,
    legend_title="Restrictions by Organization Agent",
    **PLOT_KWARGS
)

#chart_companies.save(
#    "Company_Comparisons_%s.png" % (ANALYSIS_TYPE),
#    ppi=300
#)

chart_companies

## Combine Charts

In [269]:
# Axis tick count = number of years spanned by `df["period"]`. `df` is whatever
# frame the previous plotting cell left behind, so run that cell first.
period_years = df["period"].dt.year
year_tick_count = len(range(period_years.min(), period_years.max()))

label_size = PLOT_KWARGS["label_fontsize"]
title_size = PLOT_KWARGS["title_fontsize"]

# Stack the three charts vertically, then apply the shared styling step by step.
stacked_charts = alt.vconcat(chart_robots, chart_tos, chart_companies)
stacked_charts = stacked_charts.configure_axis(
    grid=False,
    labelFontSize=label_size,
    titleFontSize=title_size,
    labelAngle=0,
    tickCount=year_tick_count,
)
stacked_charts = stacked_charts.configure_legend(
    labelFontSize=label_size,
    titleFontSize=title_size,
    columns=PLOT_KWARGS["legend_cols"],
)
stacked_charts = stacked_charts.configure_view(strokeWidth=0)
# Each panel keeps its own color scale while all panels share the x axis.
combined_chart = stacked_charts.resolve_scale(color="independent", x="shared")

#combined_chart.save(
#    "Combined_%s.png" % (ANALYSIS_TYPE),
#    ppi=300
#)

combined_chart

## Create Correlations Tables

In [463]:
# Population analysis per corpus: annotation table, that corpus's head URLs, the
# corpus key, its token-bucket CSV and the shared token lookup.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: cannot convert float NaN to integer"; the source is inside
# run_population_analysis and is not visible from this notebook.
c4_estimates = analysis_util.run_population_analysis(
    url_results_df,
    top_c4_urls,
    "c4",
    "pretrain_data/corpus_token_bucket_counts/c4_buckets.csv",
    url_token_lookup,
    verbose=False,
)
rf_estimates = analysis_util.run_population_analysis(
    url_results_df,
    top_rf_urls,
    "rf",
    "pretrain_data/corpus_token_bucket_counts/rf_buckets.csv",
    url_token_lookup,
    verbose=False,
)
dolma_estimates = analysis_util.run_population_analysis(
    url_results_df,
    top_dolma_urls,
    "dolma",
    "pretrain_data/corpus_token_bucket_counts/dolma_buckets.csv",
    url_token_lookup,
    verbose=False,
)

Head sample size: 1634
Rand sample size: 1342


ValueError: cannot convert float NaN to integer

In [None]:
# Correlation table built from the annotation table, the list [100, 500, 2000]
# and the three per-corpus estimates from the previous cell (which must have
# completed successfully).
url_correlation_df = analysis_util.analyze_url_variable_correlations(
    url_results_df,
    [100, 500, 2000],
    c4_estimates,
    rf_estimates,
    dolma_estimates
)

# Convert the dataframe to a LaTeX table
# Floats are rendered with one decimal place; special characters are escaped.
latex_table = url_correlation_df.to_latex(index=True, escape=True, float_format="{:.1f}".format)
print(latex_table)

## Robots & ToS Confusion Matrix

In [464]:
# NOTE(review): the recorded run of this cell failed with
# "TypeError: ... missing 1 required positional argument: 'url_token_lookup'",
# so the helper expects one more positional argument than is passed here.
# Presumably an additional ToS input (cf. encode_latest_tos_robots_into_df, which
# also takes tos_license_policies and tos_compete_policies) — confirm against the
# current signature in robots_util before fixing the call.
robots_util.prepare_tos_robots_confusion_matrix(
    tos_policies,
    url_robots_summary,
    COMPANIES_TO_ANALYZE,
    url_token_lookup,
    corpora_choice="dolma",
)

TypeError: prepare_tos_robots_confusion_matrix() missing 1 required positional argument: 'url_token_lookup'

In [None]:
# Debugging aid: show the container type of `url_robots_summary`. Safe to delete.
type(url_robots_summary)

In [None]:
def company_to_company_restrictions(url_robots_summary, companies):
    """Plot how often company B is fully restricted where company A is.

    For every ordered pair (A, B) of distinct companies, computes the percentage
    of URLs whose latest robots status for A is 'all' that also have latest
    status 'all' for B, i.e. P(B restricted | A restricted) * 100, rounded to
    one decimal. Pairs where A has no fully restricted URL get 0.

    Args:
        url_robots_summary: Robots summary passed through to
            ``robots_util.get_latest_url_robot_statuses``.
        companies: Iterable of company names; also used as the order of both
            axes of the plot.

    Returns:
        Whatever ``visualization_util.plot_confusion_matrix`` returns for the
        pairwise table (rows: 'Company A', columns: 'Company B').
    """
    # Name of the value column. It previously read 'pct_a_restricted_if_b_restricted',
    # which stated the conditional the wrong way round for the value stored in it.
    pct_col = 'pct_b_restricted_if_a_restricted'

    # Latest URL -> status mapping per company, looked up via that company's agents.
    latest_status_by_company = {
        company: robots_util.get_latest_url_robot_statuses(
            url_robots_summary, robots_util.get_bots(company)
        )
        for company in companies
    }

    conditional_prob_data = []
    for company_a in companies:
        # URLs fully restricted for A; computed once per A rather than once per pair.
        restricted_urls_a = [
            url for url, status in latest_status_by_company[company_a].items()
            if status == 'all'
        ]

        for company_b in companies:
            if company_a == company_b:
                continue  # Skip self-comparison

            status_b = latest_status_by_company[company_b]
            if restricted_urls_a:
                restricted_in_both = sum(
                    1 for url in restricted_urls_a if status_b.get(url) == 'all'
                )
                pct = round((restricted_in_both / len(restricted_urls_a)) * 100, 1)
            else:
                # Nothing is fully restricted for A: report 0 instead of dividing by zero.
                pct = 0

            conditional_prob_data.append({
                'Company A': company_a,
                'Company B': company_b,
                pct_col: pct,
            })

    # Explicit columns keep the frame well-formed even when there are no pairs.
    df = pd.DataFrame(conditional_prob_data, columns=['Company A', 'Company B', pct_col])

    # Plot the confusion matrix
    return visualization_util.plot_confusion_matrix(
        df,
        yaxis_order=companies,
        xaxis_order=companies,
        text_axis=pct_col,
        color_axis=pct_col,
        yaxis_title="Company A",
        xaxis_title="Company B",
        font_size=20,
        font_style='sans-serif',
        width=600,
        height=400,
    )

# Assuming robots_util and visualization_util are already imported and configured correctly


In [None]:
# Pairwise restriction overlap on the head URL split, across all tracked companies.
company_to_company_restrictions(url_robots_summary_head, ALL_COMPANIES_TO_TRACK)

## Plot Num Tokens against Robots Restrictions per Company

In [None]:
# Group URLs into size buckets using the C4 map; the last boundary is a very
# large sentinel acting as an open-ended upper bucket.
# NOTE(review): this uses `c4_url_to_counts` directly rather than URL_TO_COUNTS —
# confirm that ignoring CHOSEN_CORPUS is intended here.
size_bucket_to_urls = robots_util.bucket_urls_by_size(
    c4_url_to_counts,
    bucket_boundaries=[0, 1000, 10000, 50000, 1000000, 10000000, 50000000, 9999999999999999]
)
# Plot size buckets against robots restrictions for the "OpenAI" agents.
robots_util.plot_size_against_restrictions(
    url_robots_summary,
    size_bucket_to_urls,
    "OpenAI",
    setting="train"
)