In [None]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy

from datetime import datetime
from collections import Counter, defaultdict

from scipy.stats import norm, normaltest, percentileofscore

import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

from tqdm.notebook import tqdm

# Append system path
sys.path = [p for p in sys.path if not p.endswith('../..')]  # Cleans duplicated '../..'
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util, forecasting_util

%load_ext autoreload
%autoreload 2

In [None]:
# os.chdir(os.path.expanduser('~/github/Data-Provenance-Collection/'))

In [None]:
def times_newroman():
    """Altair theme that sets Times New Roman on every text-bearing element.

    Returns:
        dict: a theme spec of the form ``{"config": {...}}`` covering the
        chart title, axes, headers, legends and text marks.
    """
    typeface = "Times New Roman"

    theme_config = {"title": {"font": typeface}}
    # Axes, headers and legends each carry separate label and title fonts.
    for section in ("axis", "header", "legend"):
        theme_config[section] = {"labelFont": typeface, "titleFont": typeface}
    theme_config["text"] = {"font": typeface}

    return {"config": theme_config}

# Register the theme under the name "times_newroman" and make it the active
# Altair theme for the charts built later in this notebook.
alt.themes.register("times_newroman", times_newroman)
alt.themes.enable("times_newroman")

In [None]:
# Render floats with three decimal places wherever pandas displays them.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Define Paths to all relevant files

In [None]:
# --- File paths (all relative to the current working directory) ---
EXCEL_FNAME_ROBOTS = "data/forecasted_robots_data.xlsx"
EXCEL_FNAME_TOS = "data/forecasted_tos_data.xlsx"
EXCEL_FNAME_FINAL = "data/forecasted_final_data.xlsx"
FPATH_TO_RELEVANT_URL_TOKENS = 'data/pretrain_data/relevant_url_token_counts.csv'
# NOTE(review): lowercase "to" is inconsistent with the other FPATH_TO_* names; kept because later cells reference it.
FPATH_to_HEAD_ROBOTS = "data/robots/temporal_robots_head.json"
FPATH_TO_RAND_ROBOTS = "data/robots/temporal_robots_rand_10k.json"
FPATH_TO_TOS_DATA = "data/GPT_analysis_results/tos_ai_scraping_policies.json"
FPATH_TO_TOS_LICENSE_DATA = "data/GPT_analysis_results/tos_license_policies.json"
FPATH_TO_TOS_COMPETE_DATA = "data/GPT_analysis_results/tos_competing_services_policies.json"
FPATH_TO_C4_TOKEN_ESTIMATES = "data/raw_annotations/c4_total_token_estimates.csv" 
FPATH_TO_DOLMA_TOKEN_ESTIMATES = "data/raw_annotations/dolma_total_token_estimates.csv"
FPATH_TO_RF_TOKEN_ESTIMATES = "data/raw_annotations/rf_total_token_estimates.csv" 
DIRPATHS_TO_ANNOTATED_TASKS = ["data/raw_annotations/task_1", "data/raw_annotations/task_2"]
START_DATES = "data/raw_annotations/domain_start_dates.json"

# --- Analysis scope ---
# Companies whose agent groups are tracked in robots.txt; the shorter list below is the subset used in the analyses.
ALL_COMPANIES_TO_TRACK = ["Google", "OpenAI", "Anthropic", "Cohere", "Common Crawl", "Meta", "Internet Archive", "Google Search", "False Anthropic"]
COMPANIES_TO_ANALYZE = ["Google", "OpenAI", "Anthropic", "Cohere", "Common Crawl", "Meta"]
# Date window (YYYY-MM-DD strings) passed to the temporal robots/ToS preparation calls.
TEMPORAL_ANALYSIS_START_DATE = '2016-01-01'
TEMPORAL_ANALYSIS_END_DATE = '2024-04-30'

### Load all URL splits (top vs. random) and map them to token counts

In [None]:
# Token-count lookup shared by all three corpora.
url_token_lookup = robots_util.URLTokenLookup(FPATH_TO_RELEVANT_URL_TOKENS) # 'c4', 'rf', 'dolma'
# Per-corpus URL -> token-count maps.
c4_url_to_counts = url_token_lookup.get_url_to_token_map("c4")
rf_url_to_counts = url_token_lookup.get_url_to_token_map("rf")
dolma_url_to_counts = url_token_lookup.get_url_to_token_map("dolma")
# "Head" sets: the top 2000 URLs of each corpus.
top_c4_urls = url_token_lookup.top_k_urls("c4", 2000)
top_rf_urls = url_token_lookup.top_k_urls("rf", 2000)
top_dolma_urls = url_token_lookup.top_k_urls("dolma", 2000)
random_10k_urls = url_token_lookup.get_10k_random_sample()
# Union of the random sample and all three head sets (the `+` requires each to be a list).
all_urls = set(random_10k_urls + top_c4_urls + top_rf_urls + top_dolma_urls)

# Load website snapshots for relevant URLs
website_start_dates = robots_util.read_start_dates(START_DATES, all_urls) # THIS WON'T WORK FOR THE 10k SAMPLE

### Define Agents and Agent Groups

In [None]:
# Agent groups for the tracked companies; later cells iterate `.values()` and flatten them into one agent list.
agent_groups_to_track = robots_util.get_bot_groups(ALL_COMPANIES_TO_TRACK)
# NOTE(review): called with no argument here — presumably returns every known agent; confirm in robots_util.
agents_to_track = robots_util.get_bots()

### Load Robots.txt info

In [None]:
# URL -> Date -> Robots.txt raw text
head_robots = io.read_json(FPATH_to_HEAD_ROBOTS)
random_10k_robots = io.read_json(FPATH_TO_RAND_ROBOTS)
# Merge both sources without mutating `head_robots`. Assuming both are dicts,
# a URL present in both keeps the random-sample entry (dict.update semantics).
joined_robots = copy.deepcopy(head_robots)
joined_robots.update(random_10k_robots)
robots_util.print_out_robots_info(head_robots)
robots_util.print_out_robots_info(random_10k_robots)

# {URL --> Date --> Agent --> Status}
# Only agents belonging to a tracked company group are considered.
url_robots_summary, agent_counter_df = robots_util.compute_url_date_agent_status(
    data=joined_robots, 
    # relevant_agents=agents_to_track)
    relevant_agents=[v for vs in agent_groups_to_track.values() for v in vs])

# agent_counter_df.to_csv("src/analysis/all_agents_counter.csv", index=False)

In [None]:
# Detailed variant of the robots summary, computed on the same merged data and
# the same flattened agent list as `url_robots_summary`.
url_robots_summary_detailed = robots_util.compute_url_date_agent_status_detailed(
    data=joined_robots, 
    relevant_agents=[v for vs in agent_groups_to_track.values() for v in vs]
)
# Sanity check: number of top-level entries (presumably URLs — confirm).
print(len(url_robots_summary_detailed))

### Load ToS info

In [None]:
# URL --> Date --> ToS-suburl --> {"verdict": X, "evidence": Y}
# Keys are passed through robots_util.normalize_url. If two raw URLs normalize
# to the same key, the later one silently replaces the earlier one.
tos_policies = {robots_util.normalize_url(url): info for url, info in io.read_json(FPATH_TO_TOS_DATA).items()}
tos_license_policies = {robots_util.normalize_url(url): info for url, info in io.read_json(FPATH_TO_TOS_LICENSE_DATA).items()}
tos_compete_policies = {robots_util.normalize_url(url): info for url, info in io.read_json(FPATH_TO_TOS_COMPETE_DATA).items()}
# tos_license_policies = robots_util.switch_dates_yearly_to_monthly(tos_license_policies)
print(f"Num ToS AI/Scraping URLs: {len(tos_policies)}")
print(f"Num ToS License URLs: {len(tos_license_policies)}")
print(f"Num ToS Compete URLs: {len(tos_compete_policies)}")

### Load Manual Pretraining Annotations

In [None]:
# Build the per-URL annotation table from the raw annotation task directories.
url_to_info, unannotated_urls = analysis_util.extract_url_annotations(DIRPATHS_TO_ANNOTATED_TASKS)
url_results_df = analysis_util.process_url_annotations(url_to_info)
url_results_df = analysis_util.encode_size_columns(url_results_df, url_token_lookup)
# Captured before the ToS/robots columns are added; reused by the ToS temporal preparation below.
manually_annotated_urls = url_results_df["URL"].tolist()
# NOTE(review): the meaning of the trailing positional `True` is not visible here — confirm against robots_util.
url_results_df = robots_util.encode_latest_tos_robots_into_df(
    url_results_df, tos_policies, tos_license_policies, tos_compete_policies, url_robots_summary_detailed,
    COMPANIES_TO_ANALYZE, True
)
service_to_urls = analysis_util.map_services_to_urls(url_results_df)

# Invariant relied on downstream: exactly one row per URL.
assert url_results_df['URL'].nunique() == url_results_df.shape[0]

### DECISION POINT: Use C4, Dolma, or RefinedWeb here?

In [None]:
CHOSEN_CORPUS = "c4" # 'c4', 'rf', 'dolma'

# Corpus name -> (head URL list, URL -> token-count map).
_CORPUS_RESOURCES = {
    "c4": (top_c4_urls, c4_url_to_counts),
    "rf": (top_rf_urls, rf_url_to_counts),
    "dolma": (top_dolma_urls, dolma_url_to_counts),
}

# Fail loudly on a typo: otherwise HEAD_URL_SET / URL_TO_COUNTS would stay
# undefined, or silently keep the values from an earlier run of this cell.
if CHOSEN_CORPUS not in _CORPUS_RESOURCES:
    raise ValueError(
        f"Unknown CHOSEN_CORPUS {CHOSEN_CORPUS!r}; expected one of {sorted(_CORPUS_RESOURCES)}"
    )

HEAD_URL_SET, URL_TO_COUNTS = _CORPUS_RESOURCES[CHOSEN_CORPUS]

In [None]:
# Restrict both robots summaries to (a) the chosen corpus' head URLs and
# (b) the random 10k sample. URLs without a robots summary are dropped.
url_robots_summary_head = {url: url_robots_summary[url] for url in HEAD_URL_SET if url in url_robots_summary}
url_robots_summary_head_detailed = {url: url_robots_summary_detailed[url] for url in HEAD_URL_SET if url in url_robots_summary_detailed}
url_robots_summary_rand = {url: url_robots_summary[url] for url in random_10k_urls if url in url_robots_summary}
url_robots_summary_rand_detailed = {url: url_robots_summary_detailed[url] for url in random_10k_urls if url in url_robots_summary_detailed}

# Robots Data for Ariel

In [None]:
DETAILED_ROBOTS_STRICTNESS_ORDER = [
    'no_robots', 'none', 'none_sitemap', 'none_crawl_delay', 'some_pattern_restrictions', 'some_disallow_important_dir', 'some_other', 'all'
]


def _temporal_robots_for(head_urls, url_to_counts):
    """Run the temporal robots preparation for one corpus.

    Only the head URL list and the token-count map differ between corpora;
    every other argument is the shared notebook-level state.
    """
    return robots_util.prepare_temporal_robots_for_corpus(
        url_robots_summary_detailed,
        head_urls,
        random_10k_urls,
        service_to_urls,
        url_to_counts,
        agent_groups_to_track,
        DETAILED_ROBOTS_STRICTNESS_ORDER,
        TEMPORAL_ANALYSIS_START_DATE,
        TEMPORAL_ANALYSIS_END_DATE,
        website_start_dates,
    )


# C4
urlsubset_to_robots_summary_c4, c4_url_subsets = _temporal_robots_for(top_c4_urls, c4_url_to_counts)
# REFINEDWEB
urlsubset_to_robots_summary_rf, rf_url_subsets = _temporal_robots_for(top_rf_urls, rf_url_to_counts)
# DOLMA
urlsubset_to_robots_summary_dolma, dolma_url_subsets = _temporal_robots_for(top_dolma_urls, dolma_url_to_counts)

# ToS Data for Ariel

In [None]:
def _temporal_tos_for(head_urls, url_to_counts):
    """Run the temporal ToS preparation for one corpus.

    Only the head URL list and the token-count map differ between corpora;
    every other argument is the shared notebook-level state.
    """
    return robots_util.prepare_temporal_tos_for_corpus(
        tos_policies,
        tos_license_policies,
        tos_compete_policies,
        head_urls,
        random_10k_urls,
        service_to_urls,
        url_to_counts,
        agent_groups_to_track,
        TEMPORAL_ANALYSIS_START_DATE,
        TEMPORAL_ANALYSIS_END_DATE,
        manually_annotated_urls,
        website_start_dates,
    )


# C4
urlsubset_to_tos_summary_c4, c4_tos_url_subsets = _temporal_tos_for(top_c4_urls, c4_url_to_counts)
# REFINEDWEB
urlsubset_to_tos_summary_rf, rf_tos_url_subsets = _temporal_tos_for(top_rf_urls, rf_url_to_counts)
# DOLMA
urlsubset_to_tos_summary_dolma, dolma_tos_url_subsets = _temporal_tos_for(top_dolma_urls, dolma_url_to_counts)

In [None]:
# Settings shared by the restriction-estimate cells below.
# statuses_to_include = ['all', 'some_pattern_restrictions', 'some_disallow_important_dir']
# Only the strictest robots status ('all') is passed as the status list.
ROBOTS_STATUSES_TO_INCLUDE = ['all']
# Agent name passed to the estimate generator for both the robots and the ToS runs.
TARGET_AGENT = 'Combined Agent'

In [None]:
# One run per corpus (C4, RefinedWeb, Dolma); the call has no return value
# that is used here, and is given a save directory for its output.
for corpus_name, robots_summary, robots_url_subsets in (
    ("c4", urlsubset_to_robots_summary_c4, c4_url_subsets),
    ("rf", urlsubset_to_robots_summary_rf, rf_url_subsets),
    ("dolma", urlsubset_to_robots_summary_dolma, dolma_url_subsets),
):
    robots_util.generate_corpus_restriction_estimates_per_url_split(
        robots_summary,
        robots_url_subsets,
        corpus_name,
        url_token_lookup,
        TARGET_AGENT,
        ROBOTS_STATUSES_TO_INCLUDE,
        save_dir="data/forecasting/robots",
    )

In [None]:
# NOTE(review): despite the "EXCLUDE" name, this list is passed in the same
# argument position as ROBOTS_STATUSES_TO_INCLUDE — confirm the intended meaning.
TOS_STATUSES_TO_EXCLUDE = [
    'No Scraping & AI', 'NC Only', 'No Scraping', 'No Re-Distribution', 'Non-Compete', 'No AI', 'Conditional Use'
]

# One run per corpus (C4, RefinedWeb, Dolma), this time on the ToS summaries.
for corpus_name, tos_summary, tos_url_subsets in (
    ("c4", urlsubset_to_tos_summary_c4, c4_tos_url_subsets),
    ("rf", urlsubset_to_tos_summary_rf, rf_tos_url_subsets),
    ("dolma", urlsubset_to_tos_summary_dolma, dolma_tos_url_subsets),
):
    robots_util.generate_corpus_restriction_estimates_per_url_split(
        tos_summary,
        tos_url_subsets,
        corpus_name,
        url_token_lookup,
        TARGET_AGENT,
        TOS_STATUSES_TO_EXCLUDE,
        save_dir="data/forecasting/tos",
    )

# Table

In [None]:
from sklearn.linear_model import LogisticRegression

# Columns of `url_results_df` analysed in the table below. The 'services_'
# prefix is stripped from the row labels before the table is rendered.
ALL_VARS = [
    'User Content', 'Paywall', 'Ads','Modality: Image', 'Modality: Video', 'Modality: Audio',
    'Sensitive Content', 'services_Academic', 'services_Blogs',
    'services_E-Commerce', 'services_Encyclopedia/Database',
    'services_Government', 'services_News/Periodicals',
    'services_Organization/Personal Website', 'services_Other',
    'services_Social Media/Forums', 'Restrictive Robots.txt', 'Restrictive Terms'
]



def url_variable_instance(url_token_lookup, url_results_df, n_resamples=100000, n_pa_resamples=200, seed=None):
    """Build the feature-incidence table and bootstrap resamples for it.

    Args:
        url_token_lookup: token lookup passed to the population analysis;
            also used for the top-2000 URL list of each corpus.
        url_results_df: per-URL annotation frame with a 'URL' column, a
            'sample' column (rows equal to 'random' form the bootstrap
            population) and every column named in ``ALL_VARS``.
        n_resamples: number of bootstrap draws of the random sample used for
            the mean incidence rates. ``None`` or ``0`` skips this step and
            yields an empty ``resamples`` frame (the original crashed here).
        n_pa_resamples: number of bootstrap draws used for the population
            analysis. ``None`` or ``0`` yields an all-NaN std frame.
        seed: optional seed. When given, a dedicated numpy ``Generator`` is
            used so results are reproducible; when ``None`` the global
            ``np.random`` state is used, as before.

    Returns:
        tuple: ``(url_correlation_df, resamples, pct_resamples)`` where
        ``url_correlation_df`` is the table with two-level columns,
        ``resamples`` has one row per variable and one column per bootstrap
        draw, and ``pct_resamples`` holds the per-corpus bootstrap standard
        deviation of the token-percentage estimates.

    Raises:
        ValueError: if ``url_results_df`` has no rows with sample == 'random'.
    """
    # Both np.random and a Generator expose .choice(a, size, replace=...).
    rng = np.random.default_rng(seed) if seed is not None else np.random

    url_correlation_df = analysis_util.analyze_url_variable_correlations(url_results_df, ALL_VARS)

    def pop_analysis(df, domains=None):
        """Return {corpus label: est_tokens_pct} for C4, RefinedWeb and Dolma."""
        estimates = {}
        for label, corpus_key in (('C4', 'c4'), ('RW', 'rf'), ('Dolma', 'dolma')):
            # Without explicit domains, fall back to each corpus' own top-2000 URLs.
            corpus_domains = domains if domains is not None else url_token_lookup.top_k_urls(corpus_key, 2000)
            estimates[label] = analysis_util.run_population_analysis(
                df,
                url_token_lookup,
                corpus_key,
                ALL_VARS,
                corpus_domains,
            )['est_tokens_pct']
        return estimates

    pct_tokens_in_corpus = pop_analysis(url_results_df)
    for label in ('C4', 'RW', 'Dolma'):
        url_correlation_df[label] = pct_tokens_in_corpus[label]

    # Bootstrap population: the URLs of the random sample. Built once, since
    # it does not change between draws.
    indexed_df = url_results_df.set_index('URL')
    random_df = indexed_df.loc[indexed_df['sample'] == 'random']
    keys = random_df.index.unique().tolist()
    if not keys:
        raise ValueError("url_results_df has no rows with sample == 'random' to bootstrap from")

    # Bootstrap 1: mean incidence of every variable over a resampled URL set.
    resample_means = {}
    if n_resamples is not None and n_resamples > 0:
        var_frame = indexed_df[ALL_VARS]
        for i in tqdm(range(n_resamples)):
            domains = rng.choice(keys, len(keys), replace=True)
            resample_means[i] = var_frame.loc[domains].mean()
    if resample_means:
        resamples = pd.DataFrame(resample_means)
    else:
        # Keep the variable index so the relabelling below still works.
        resamples = pd.DataFrame(index=pd.Index(ALL_VARS))

    # Bootstrap 2: re-run the population analysis on resampled random URLs.
    pct_frames = []
    for i in tqdm(range(n_pa_resamples or 0)):
        domains = rng.choice(keys, len(keys), replace=True)
        boot_df = random_df.loc[domains].reset_index()

        try:
            pct_frames += [pd.DataFrame(pop_analysis(boot_df, domains)).T.reset_index().rename({'index': 'corpus'}, axis=1).assign(i=i)]
        except ValueError:  # <2 classes for LogisticRegression
            continue
    # Number of population-analysis resamples that succeeded.
    print(len(pct_frames))

    url_correlation_df.columns = pd.MultiIndex.from_tuples([
        ('URL Group', 'Top 100'),
        ('URL Group', 'Top 500'),
        ('URL Group', 'Top 2000'),
        ('URL Group', 'Random'),
        ('Pct. Tokens in Corpus', 'C4'),
        ('Pct. Tokens in Corpus', 'RW'),
        ('Pct. Tokens in Corpus', 'Dolma'),
    ])

    url_correlation_df[('Stats', 'Diff')] = url_correlation_df[('URL Group', 'Top 2000')] - url_correlation_df[('URL Group', 'Random')]

    url_correlation_df = url_correlation_df[[
        ('URL Group', 'Top 100'),
        ('URL Group', 'Top 500'),
        ('URL Group', 'Top 2000'),
        ('URL Group', 'Random'),
        ('Stats', 'Diff'),
        ('Pct. Tokens in Corpus', 'C4'),
        ('Pct. Tokens in Corpus', 'RW'),
        ('Pct. Tokens in Corpus', 'Dolma'),
    ]]

    url_correlation_df.index = url_correlation_df.index.str.replace('services_', '')
    url_correlation_df.index.name = 'Variable'

    resamples.index = resamples.index.str.replace('services_', '')
    resamples.index.name = 'Variable'

    if pct_frames:
        # Std across resamples, per corpus; transposed so variables are rows.
        pct_resamples = pd.concat(pct_frames).drop('i', axis=1).groupby('corpus').std().T
    else:
        # No successful resample: return NaN uncertainties instead of crashing in pd.concat.
        pct_resamples = pd.DataFrame(np.nan, index=pd.Index(ALL_VARS), columns=['C4', 'Dolma', 'RW'])
    pct_resamples.index.name = 'Variable'
    pct_resamples.columns = pd.MultiIndex.from_tuples([('Pct. Tokens in Corpus', c) for c in pct_resamples.columns])
    pct_resamples.index = pct_resamples.index.str.replace('services_', '')

    return url_correlation_df, resamples, pct_resamples

In [None]:
# Expensive with the defaults: 100000 mean resamples plus 200 population-analysis resamples.
url_correlation_df, url_correlation_resamples, url_correlation_pct_resamples = url_variable_instance(url_token_lookup, url_results_df)

In [None]:
url_correlation_df

In [None]:
## Sanity-check the resampling results

url_correlation_resamples.T.describe().T

In [None]:
# These will usually fail, but it's fine -- exact normality will generally
# not happen for large n, the histograms look good and show that there's
# nothing wrong with the resampling process
url_correlation_resamples.apply(lambda s: normaltest(s).pvalue, axis=1)

In [None]:
# One histogram of the bootstrap means per variable, titled with the variable name.
for i in url_correlation_resamples.index:
    fig, ax = plt.subplots()
    url_correlation_resamples.loc[i].hist(ax=ax)
    ax.set_title(i)

In [None]:
def run_tests(point, boot, alpha=None):
    """Flag observed incidence rates that fall outside a bootstrap interval.

    Args:
        point: frame indexed by variable, with the columns
            ('URL Group', 'Top 100' | 'Top 500' | 'Top 2000').
        boot: frame indexed by variable; each row holds that variable's
            bootstrap draws.
        alpha: two-sided level of the interval. Defaults to the 5-sigma tail
            probability divided by three times the number of rows in ``point``.

    Returns:
        pandas.DataFrame indexed by ('var', 'col') with the columns
        'observed', 'upper', 'lower' and 'reject'.
    """
    if alpha is None:
        alpha = norm.sf(5) / (3 * point.shape[0]) # 5 sigma level

    url_groups = ['Top 100', 'Top 500', 'Top 2000']
    records = []
    for variable in point.index:
        draws = boot.loc[variable]
        # Percentile interval of the bootstrap distribution for this variable.
        upper = draws.quantile(1 - alpha / 2)
        lower = draws.quantile(alpha / 2)

        for group in url_groups:
            observed = point.loc[variable, ('URL Group', group)]
            records.append({
                'var': variable,
                'col': group,
                'observed': observed,
                'upper': upper,
                'lower': lower,
                'reject': (observed > upper or observed < lower),
            })

    result = pd.DataFrame(records)
    return result.set_index(['var', 'col'])

# `tests` uses the default (5-sigma based) level and is read by `bold_rejects`
# when styling the table; `tests_05` repeats the test at alpha = 0.05.
tests = run_tests(url_correlation_df, url_correlation_resamples)
tests_05 = run_tests(url_correlation_df, url_correlation_resamples, alpha=0.05)

# Fraction of (variable, URL group) pairs rejected at each level.
(
    tests['reject'].sum() / tests.shape[0],
    tests_05['reject'].sum() / tests_05.shape[0],
)

In [None]:
# Keyword arguments for `Styler.to_latex`. The caption is a (full, short) tuple.
kwargs = {
    'environment': 'table',
    
    'label': 'tab:correlations',
    # One left-aligned label column plus one right-aligned column per table column.
    'column_format': 'l' + 'r' * url_correlation_df.shape[1],
    'multicol_align': 'c',
    
    # `\pm` is a math-mode command, so it is wrapped in $...$ here; the bare
    # text-mode `\pm` in the original caption is a LaTeX error.
    # NOTE(review): the 81% / 93% figures are hard-coded; re-check them against
    # the rejection rates computed from `tests` / `tests_05` whenever the data changes.
    'caption': (r'''
    \textbf{Mean incidence rates of web source features across C4, RefinedWeb, and Dolma.} We measure incidence rates for the top 100, 500, and 2000 URLs, ranked by number of tokens, as well as the random sample. The `Diff' column reports the \% difference between the top 2k and random samples. We test for significant differences between the overall corpus and each of the top-100, top-500 and top-2000 sets with a Bonferroni-corrected two-sided permutation test, where differences significant at the Bonferroni-corrected $5 \sigma$ level are indicated in bold. 81\% of differences are significant at this level, while 93\% are significant at the less strict $p = 0.05$ level. We also estimate the percentage of tokens in each corpus, C4, RefinedWeb, and Dolma, for which the web feature is present ($\pm$ 95\% bootstrap CI shown in gray).
    '''.strip(), r'\textbf{Mean incidence rates of web source features across C4, RefinedWeb, and Dolma.}'),
    
    'hrules': True,
    # Translate the CSS produced by the styling functions into LaTeX commands.
    'convert_css': True,
}

def color_values(val):
    """Return a CSS background colour keyed on the sign of ``val``.

    Negative values map to red, positive values to green, and anything else
    (zero, or a value for which both comparisons are false) to black.
    """
    if val < 0:
        shade = 'red'
    elif val > 0:
        shade = 'green'
    else:
        shade = 'black'
    return 'background-color: ' + shade

def bold_rejects(column):
    """Column-wise Styler function that bolds cells whose test was rejected.

    Only the ('URL Group', 'Top 100' | 'Top 500' | 'Top 2000') columns are
    styled. For those, each row label is looked up in the notebook-level
    `tests` frame and the cell is bolded when its 'reject' flag is truthy.

    Returns:
        list[str]: one CSS string per cell of ``column`` ('' for no styling).
    """
    testable_groups = ('Top 100', 'Top 500', 'Top 2000')

    def css_for(row_label):
        column_name = column.name
        if column_name[0] != 'URL Group':
            return ''
        if column_name[1] not in testable_groups:
            return ''
        rejected = tests.loc[(row_label, column_name[1]), 'reject']
        return 'font-weight: bold;' if rejected else ''

    styles = []
    for _, row_label in zip(column, column.index):
        styles.append(css_for(row_label))
    return styles

# Two-sided 95% normal quantile, used to turn bootstrap std into a +/- half-width.
z = norm.ppf(1 - 0.05 / 2)
tmp = url_correlation_df.copy()


def _as_percent(s):
    """Format a fraction as a percentage with one decimal place."""
    return '{:.1f}'.format(100*s)


# Replace each token-percentage column by "<estimate> +/- <half-width>" LaTeX text.
for corpus_label in ('C4', 'RW', 'Dolma'):
    column_key = ('Pct. Tokens in Corpus', corpus_label)
    estimate_text = tmp[column_key].apply(_as_percent)
    half_width_text = (z*url_correlation_pct_resamples[column_key]).apply(_as_percent)
    tmp[column_key] = estimate_text + r'\scriptsize{\color{gray} \textpm ' + half_width_text + '}'


def _formatter_for(column_key):
    """Pick the cell formatter for one table column."""
    if column_key == ('Stats', 'Diff'):
        # Signed percentage difference.
        return lambda s: '{:+.1f}'.format(100 * s)
    if column_key[0] == 'Pct. Tokens in Corpus':
        # Already formatted as LaTeX text above.
        return lambda s: s
    return lambda s: '{:.1f}'.format(100 * s)


formatters = {column_key: _formatter_for(column_key) for column_key in url_correlation_df.columns}

latex_table = (
    tmp.style
    .apply(bold_rejects, axis=0)
    .applymap(color_values, subset=[('Stats', 'Diff')])
    .format(formatter=formatters)
    .to_latex(**kwargs)
)
print(latex_table)

# Overall confusion matrix

In [None]:
def plot_confusion_matrix(
    df,
    yaxis_order=None, 
    xaxis_order=None,
    text_axis=None,
    color_axis=None,
    color_scale=None,
    yaxis_title="",
    xaxis_title="",
    font_size=20,
    text_font_size=None,
    font_style='sans-serif',
    width=400,
    height=400,
):
    """Draw a labelled heatmap ("confusion matrix") with Altair.

    Args:
        df: long-format frame with one row per cell; must contain the columns
            named by ``yaxis_title``, ``xaxis_title``, ``text_axis`` and
            ``color_axis``.
        yaxis_order, xaxis_order: category order for each axis. When omitted,
            the order of first appearance in ``df`` is used.
        text_axis: column whose value is printed in each cell. Values must be
            numeric or numeric strings (an optional '%' is stripped).
        color_axis: numeric column driving the cell colour. May be the same
            column as ``text_axis``.
        color_scale: optional ``alt.Scale``; defaults to a two-colour scale
            spanning the min/max of ``color_axis``.
        yaxis_title, xaxis_title: column names, also used as axis titles.
        font_size, text_font_size, font_style: font settings; the cell text
            size defaults to ``font_size``.
        width, height: chart size in pixels.

    Returns:
        The layered Altair chart (heatmap plus text).
    """
    if text_font_size is None:
        text_font_size = font_size
    
    if color_scale is None:
        # color_scale = alt.Scale(scheme='blues')
        color_scale = alt.Scale(domain=[df[color_axis].min(), df[color_axis].max()], range=['#7ec1be', '#101f5b'])
    
    # Normalise the orders to plain lists. `.unique()` returns a numpy array,
    # whose truth value is ambiguous and made the `sort=` expressions below
    # raise whenever an order was left at its default.
    if yaxis_order is None:
        yaxis_order = df[yaxis_title].unique().tolist()
    else:
        yaxis_order = list(yaxis_order)
    if xaxis_order is None:
        xaxis_order = df[xaxis_title].unique().tolist()
    else:
        xaxis_order = list(xaxis_order)

    # Expand to the full grid so combinations absent from `df` still get a cell.
    full_matrix = pd.MultiIndex.from_product([yaxis_order, xaxis_order], names=[yaxis_title, xaxis_title]).to_frame(index=False)
    df_full = pd.merge(full_matrix, df, on=[yaxis_title, xaxis_title], how='outer')

    # Keep the displayed text in its own column. Filling the source column in
    # place would also overwrite the colour column when text_axis == color_axis,
    # making empty cells look "valid" and defeating the lightgray fallback.
    text_field = 'cell_text'
    df_full[text_field] = df_full[text_axis].fillna('--')
    # White text on cells whose value exceeds 10, black otherwise (including '--').
    df_full['text_color'] = df_full[text_field].astype(str).str.replace('%', '').str.strip().apply(lambda s: float(s) if s != '--' else np.nan) > 10
    
    # Create the heatmap
    heatmap = alt.Chart(df_full).mark_rect(invalid=None).encode(
        x=alt.X(f'{xaxis_title}:N', title=xaxis_title, sort=xaxis_order if xaxis_order else None),
        y=alt.Y(f'{yaxis_title}:N', title=yaxis_title, sort=yaxis_order if yaxis_order else None),
        # Cells without a colour value are drawn light gray.
        color=alt.condition(
            f"isValid(datum['{color_axis}'])",
            alt.Color(f'{color_axis}:Q', scale=color_scale),
            alt.value('lightgray'),
        ),
        # NOTE(review): no 'order' column is created in this function — this
        # encoding looks vestigial; confirm before removing it.
        order="order:Q"
    )

    text = heatmap.mark_text(
        align='center',
        baseline='middle',
        fontSize=text_font_size,
        font=font_style,
    ).encode(
        text=alt.Text(f'{text_field}:N'),
        color=alt.condition(
            alt.datum.text_color,
            alt.value('white'),
            alt.value('black')
        )
    )
    
    final_plot = (heatmap + text).properties(
        width=width,
        height=height,
    ).configure_axis(
        labelFontSize=font_size,
        labelFont=font_style,
        titleFontSize=font_size,
        titleFont=font_style,
        domain=True,
    ).configure_axisX(
        labelAngle=0,
        domain=True
    ).configure_axisY(
        domain=True     # Ensure the Y-axis domain line is shown
    ).configure_view(
        stroke='black'  # Add borders around the entire plot
    ).configure_legend(
        disable=True,
    )
    
    return final_plot

In [None]:
def prepare_tos_robots_confusion_matrix(
    tos_policies,
    tos_license_policies,
    tos_compete_policies,
    url_robots_summary,
    companies,
    url_token_lookup,
    use_token_counts=True,
    corpora_choice="c4",
    font_size=20,
    text_font_size=None,
    font_style='sans-serif',
    width=400,
    height=400,
):
    """Plot robots.txt restriction level against Terms-of-Service policy.

    Each URL that has both a recent robots status and a recent ToS verdict is
    placed in one (robots, ToS) cell. Cells show either the share of URLs or,
    when ``use_token_counts`` is true, the share of tokens in the chosen corpus.

    Args:
        tos_policies, tos_license_policies, tos_compete_policies,
        url_robots_summary, companies: forwarded unchanged to
            ``robots_util.prepare_recent_robots_tos_info``.
        url_token_lookup: provides the URL -> token-count map for the corpus.
        use_token_counts: weight cells by tokens (True) or by URL count (False).
        corpora_choice: corpus key passed to ``get_url_to_token_map``.
        font_size, text_font_size, font_style, width, height: plot styling,
            forwarded to ``plot_confusion_matrix``.

    Returns:
        The Altair chart produced by ``plot_confusion_matrix``.

    Raises:
        ValueError: if no URL has both robots and ToS information.
        KeyError: if a status or verdict is missing from the label maps below.
    """
    recent_url_robots, recent_tos_verdicts = robots_util.prepare_recent_robots_tos_info(
        tos_policies, tos_license_policies, tos_compete_policies, url_robots_summary, companies,
    )

    ROBOTS_LABELS = {
        "none": "None",
        "some": "Partial",
        "all": "Restricted",
    }

    TOS_LABELS = {
        "Unrestricted Use": "None",
        "Conditional Use": "Conditional",
        "NC Only": "NC Only",
        "No Re-Distribution": "No Distribution",
        "Non-Compete": "Non-Compete",
        "No AI": "No AI",
        "No Scraping": "No Crawling",
        "No Scraping & AI": "No Crawling or AI",
    }
    
    yaxis_order = ["Restricted", "Partial", "None"]
    xaxis_order = [TOS_LABELS["Unrestricted Use"],
                   TOS_LABELS["Conditional Use"],
                   TOS_LABELS["No Re-Distribution"],
                   TOS_LABELS["Non-Compete"],
                   TOS_LABELS["NC Only"],
                   TOS_LABELS["No AI"],
                   TOS_LABELS["No Scraping"],
                   TOS_LABELS["No Scraping & AI"]
                  ]
    
    # (robots label -> ToS label -> URL count / token count)
    counts = defaultdict(lambda: defaultdict(int))
    token_counts = defaultdict(lambda: defaultdict(int))
    
    # Count the occurrences of each (status, policy) pair
    total_instances, total_tokens = 0, 0
    url_token_counts = url_token_lookup.get_url_to_token_map(corpora_choice)
    shared_urls = set(recent_url_robots.keys()).intersection(set(recent_tos_verdicts.keys()))
    for url in shared_urls:
        status = ROBOTS_LABELS[recent_url_robots.get(url, "none")]
        # The fallback must be a real TOS_LABELS key; the original
        # "No Restrictions" was not and would have raised KeyError if ever hit.
        policy = TOS_LABELS[recent_tos_verdicts.get(url, "Unrestricted Use")]
        # URLs missing from the chosen corpus' token map contribute zero
        # tokens instead of aborting the whole plot with a KeyError.
        url_tokens = url_token_counts.get(url, 0)
        counts[status][policy] += 1
        total_instances += 1
        token_counts[status][policy] += url_tokens
        total_tokens += url_tokens

    if total_instances == 0:
        raise ValueError("No URL has both robots.txt and Terms-of-Service information; nothing to plot.")
    
    # One record per non-empty (status, policy) cell, in axis order.
    data = []
    for status in yaxis_order:
        for policy in xaxis_order:
            count = counts[status][policy]
            if count <= 0:
                continue
            cell_tokens = token_counts[status][policy]
            data.append({
                "Robots Restrictions": status,
                "Terms of Service Policies": policy,
                "Count": count,
                "Token Counts": cell_tokens,
                "Percent": round(100 * count / total_instances, 2),
                # Guard against a corpus in which none of these URLs has tokens.
                "Percent Tokens": round(100 * cell_tokens / total_tokens, 2) if total_tokens else 0.0,
            })
    
    df = pd.DataFrame(data)
    df['Formatted Percent'] = df['Percent'].apply(lambda x: f"{x:.1f} %")
    df['Formatted Percent Tokens'] = df['Percent Tokens'].apply(lambda x: f"{x:.1f} %")
    
    if use_token_counts:
        color_axis, text_axis = "Percent Tokens", "Formatted Percent Tokens"
    else:
        color_axis, text_axis = "Percent", "Formatted Percent"

    return plot_confusion_matrix(
        df,
        yaxis_order=yaxis_order, 
        xaxis_order=xaxis_order,
        text_axis=text_axis,
        color_axis=color_axis,
        yaxis_title="Robots Restrictions",
        xaxis_title="Terms of Service Policies",
        font_size=font_size,
        text_font_size=text_font_size,
        font_style=font_style,
        width=width,
        height=height,
    )

In [None]:
# Token-weighted (the default) robots-vs-ToS matrix, using Dolma token counts.
prepare_tos_robots_confusion_matrix(
    tos_policies,
    tos_license_policies,
    tos_compete_policies,
    url_robots_summary,
    COMPANIES_TO_ANALYZE,
    url_token_lookup,
    corpora_choice="dolma",
    font_size=18,
    text_font_size=24,
    width=1000,
    height=220,
)

# Company to company

In [None]:
def company_to_company_restrictions(url_robots_summary, companies, font_size=20, font_style='sans-serif', width=600, height=400):
    """Plot how often company B is fully restricted given that company A is.

    For every ordered pair (A, B) with A != B, the cell value is the
    percentage of URLs whose latest robots status for A is 'all' that also
    have status 'all' for B. Company A is on the y-axis, company B on the
    x-axis. When A has no fully restricted URL the value is 0.

    Args:
        url_robots_summary: robots summary passed to
            ``robots_util.get_latest_url_robot_statuses``.
        companies: company names; their agents come from ``robots_util.get_bots``.
        font_size, font_style, width, height: plot styling.

    Returns:
        The Altair chart produced by ``plot_confusion_matrix``.
    """
    # Latest per-URL robots status for each company's agents.
    url_status_dict = {
        company: robots_util.get_latest_url_robot_statuses(url_robots_summary, robots_util.get_bots(company))
        for company in companies
    }

    # Named for what is actually computed: P(B restricted | A restricted).
    # The original column name stated the inverse conditional.
    value_col = 'pct_b_restricted_if_a_restricted'

    conditional_prob_data = []
    for company_a in companies:
        # Independent of company_b, so computed once per company_a.
        restricted_a = [url for url, status in url_status_dict[company_a].items() if status == 'all']

        for company_b in companies:
            if company_a == company_b:
                continue  # Skip self-comparison

            status_b = url_status_dict[company_b]
            restricted_both = sum(1 for url in restricted_a if status_b.get(url) == 'all')

            if restricted_a:
                pct = round((restricted_both / len(restricted_a)) * 100, 1)
            else:
                pct = 0

            conditional_prob_data.append({
                'Company A': company_a,
                'Company B': company_b,
                value_col: pct,
            })

    # Explicit columns keep the frame usable even when there are no pairs.
    df = pd.DataFrame(conditional_prob_data, columns=['Company A', 'Company B', value_col])

    # Short display names for the axis labels.
    COMPANY_MAP = {
        'Google': 'Google',
        'OpenAI': 'OpenAI',
        'Anthropic': 'Anthr',
        'Cohere': 'Cohere',
        'Common Crawl': 'CC',
        'Meta': 'Meta',
        'Internet Archive': 'IA',
        'Google Search': 'Search',
        'False Anthropic': 'F. Anthr',
    }

    def short_name(company):
        # Companies without an abbreviation keep their full name rather than
        # raising KeyError or becoming NaN.
        return COMPANY_MAP.get(company, company)

    df['Company A'] = df['Company A'].map(short_name)
    df['Company B'] = df['Company B'].map(short_name)
    axis_order = [short_name(c) for c in companies]

    return plot_confusion_matrix(
        df,
        yaxis_order=axis_order,
        xaxis_order=list(axis_order),
        text_axis=value_col,
        color_axis=value_col,
        color_scale=alt.Scale(
            domain=[
                df[value_col].min(),
                df[value_col].max()
            ],
            range=['#7ec1be', '#101f5b']
        ),
        yaxis_title="Company A",
        xaxis_title="Company B",
        font_size=font_size, 
        font_style=font_style,
        width=width,
        height=height,
    )

In [None]:
# Pairwise matrix over all tracked companies, restricted to the head URLs of the chosen corpus.
company_to_company_restrictions(url_robots_summary_head, ALL_COMPANIES_TO_TRACK, width=800, height=400)