In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Append system path
# Drop stale '../..' entries and any '../' left by an earlier run of this cell, so that
# re-running the cell does not stack duplicate entries at the front of sys.path.
sys.path = [p for p in sys.path if not p.endswith('../..') and p != '../']
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util


%load_ext autoreload
%autoreload 2

### Define Paths to all relevant files

In [2]:
# Input data locations (relative paths).
FPATH_TO_RELEVANT_URL_TOKENS = "pretrain_data/relevant_url_token_counts.csv"
FPATH_to_HEAD_ROBOTS = 'robots_data/temporal_robots_head.json'
FPATH_TO_RAND_ROBOTS = 'robots_data/temporal_robots_rand_10k.json'
FPATH_TO_TOS_DATA = 'robots_data/tos_ai_scraping_policies.json'
DIRPATHS_TO_ANNOTATED_TASKS = [
    'annotated_websites/Task 1',
    'annotated_websites/Task 2',
]
START_DATES = 'robots_data/domain_start_dates.json'

# Companies whose restrictions are analyzed, and the wider list that is tracked
# (the analyzed companies followed by three additional groups).
COMPANIES_TO_ANALYZE = [
    'Google',
    'OpenAI',
    'Anthropic',
    'Cohere',
    'Common Crawl',
    'Meta',
]
ALL_COMPANIES_TO_TRACK = COMPANIES_TO_ANALYZE + [
    'Internet Archive',
    'Google Search',
    'False Anthropic',
]

# Inclusive date window handed to the temporal summaries.
TEMPORAL_ANALYSIS_START_DATE, TEMPORAL_ANALYSIS_END_DATE = "2016-01-01", "2024-04-30"

### Load all URL splits (top vs random) and maps to Token Counts

In [3]:
# Lookup built from the token-count CSV; the corpus keys used below are 'c4', 'rf', 'dolma'.
url_token_lookup = robots_util.URLTokenLookup(FPATH_TO_RELEVANT_URL_TOKENS) # 'c4', 'rf', 'dolma'
# Per-corpus URL -> token-count maps.
c4_url_to_counts = url_token_lookup.get_url_to_token_map("c4")
rf_url_to_counts = url_token_lookup.get_url_to_token_map("rf")
dolma_url_to_counts = url_token_lookup.get_url_to_token_map("dolma")
# "Head" split: the top 2000 URLs of each corpus.
top_c4_urls = url_token_lookup.top_k_urls("c4", 2000)
top_rf_urls = url_token_lookup.top_k_urls("rf", 2000)
top_dolma_urls = url_token_lookup.top_k_urls("dolma", 2000)
# "Random" split: the 10k random URL sample.
random_10k_urls = url_token_lookup.get_10k_random_sample()
# Union of every URL used anywhere below (the `+` implies the four splits are lists).
all_urls = set(random_10k_urls + top_c4_urls + top_rf_urls + top_dolma_urls)

# Load website start dates for the relevant URLs
# NOTE (original author): this won't work for the 10k sample.
website_start_dates = robots_util.read_start_dates(START_DATES, all_urls) # THIS WON'T WORK FOR THE 10k SAMPLE

Number of tokens in 2000 URLs: 18447797380 | 10.85% of c4
Number of tokens in 2000 URLs: 67098747294 | 15.56% of rf
Number of tokens in 2000 URLs: 429152555144 | 21.74% of dolma


### Define Agents and Agent Groups

In [4]:
# Group name -> collection of agents, for the companies in ALL_COMPANIES_TO_TRACK
# (presumably crawler user-agent strings -- confirm in robots_util).
agent_groups_to_track = robots_util.get_bot_groups(ALL_COMPANIES_TO_TRACK)
# Only referenced from commented-out alternatives further down.
agents_to_track = robots_util.get_bots()

### Load Robots.txt info

In [5]:
# URL -> Date -> Robots.txt raw text
head_robots = io.read_json(FPATH_to_HEAD_ROBOTS)
random_10k_robots = io.read_json(FPATH_TO_RAND_ROBOTS)
# Merge both splits without touching `head_robots`; for a URL present in both files the
# random-sample entry wins (dict.update semantics, assuming read_json returns dicts).
joined_robots = copy.deepcopy(head_robots)
joined_robots.update(random_10k_robots)
robots_util.print_out_robots_info(head_robots)
robots_util.print_out_robots_info(random_10k_robots)

# {URL --> Date --> Agent --> Status}
# Only the agents that belong to one of the tracked groups are considered.
url_robots_summary, agent_counter_df = robots_util.compute_url_date_agent_status(
    data=joined_robots, 
    # relevant_agents=agents_to_track)
    relevant_agents=[v for vs in agent_groups_to_track.values() for v in vs])

# Side effect: writes the agent counter table into the current working directory.
agent_counter_df.to_csv("all_agents_counter.csv", index=False)

Num robot URLs loaded: 2631
Earliest time: 2016-01-01
Last time: 2024-04-19
Num robot URLs loaded: 6331
Earliest time: 2016-01-01
Last time: 2024-04-19


In [6]:
# Same inputs as compute_url_date_agent_status, fed to the *_detailed variant.
# Presumably yields the finer-grained statuses (e.g. "none_sitemap") used by the
# detailed chart further down -- confirm in robots_util.
url_robots_summary_detailed = robots_util.compute_url_date_agent_status_detailed(
    data=joined_robots, 
    relevant_agents=[v for vs in agent_groups_to_track.values() for v in vs]
)

### Load ToS info

In [None]:
# URL --> Date --> ToS-suburl --> {"verdict": X, "evidence": Y}
tos_policies = io.read_json(FPATH_TO_TOS_DATA)
# Sanity check: number of top-level entries (one per URL, per the schema above).
print(f"Num ToS URLs: {len(tos_policies)}")

### Load Manual Pretraining Annotations

In [None]:
# Build the per-URL annotation table from the annotated task directories, then enrich it
# in two steps: size columns from the token lookup, and ToS / robots information for the
# analyzed companies. (What each helper adds exactly is defined in analysis_util /
# robots_util -- not visible here.)
url_to_info = analysis_util.extract_url_annotations(DIRPATHS_TO_ANNOTATED_TASKS)
url_results_df = analysis_util.process_url_annotations(url_to_info)
url_results_df = analysis_util.encode_size_columns(url_results_df, url_token_lookup)
url_results_df = robots_util.encode_latest_tos_robots_into_df(
    url_results_df, tos_policies, url_robots_summary,
    COMPANIES_TO_ANALYZE
)

# Create Plots

### Preprocessing for Robots Head & Random URL splits

In [7]:
### DECISION POINT: Use C4, Dolma, or RefinedWeb here?

CHOSEN_CORPUS = "c4" # 'c4', 'rf', 'dolma'
# HEAD_URL_SET: top-2000 URLs of the chosen corpus; URL_TO_COUNTS: its URL -> token-count map.
if CHOSEN_CORPUS == "c4":
    HEAD_URL_SET = top_c4_urls
    URL_TO_COUNTS = c4_url_to_counts
elif CHOSEN_CORPUS == "rf":
    HEAD_URL_SET = top_rf_urls
    URL_TO_COUNTS = rf_url_to_counts
elif CHOSEN_CORPUS == "dolma":
    HEAD_URL_SET = top_dolma_urls
    URL_TO_COUNTS = dolma_url_to_counts
else:
    # Fail loudly: otherwise a typo would leave HEAD_URL_SET / URL_TO_COUNTS undefined,
    # or silently keep the values from a previous run of this cell.
    raise ValueError(f"Unknown CHOSEN_CORPUS {CHOSEN_CORPUS!r}; expected 'c4', 'rf' or 'dolma'.")

In [8]:
# Restrict the robots summaries to the head split (plain and detailed) and to the random
# split, skipping URLs that have no robots summary.
url_robots_summary_head = {
    head_url: url_robots_summary[head_url]
    for head_url in HEAD_URL_SET
    if head_url in url_robots_summary
}
url_robots_summary_head_detailed = {
    head_url: url_robots_summary_detailed[head_url]
    for head_url in HEAD_URL_SET
    if head_url in url_robots_summary_detailed
}
url_robots_summary_rand = {
    rand_url: url_robots_summary[rand_url]
    for rand_url in random_10k_urls
    if rand_url in url_robots_summary
}

In [9]:
# All three summaries share the same analysis window, monthly frequency ("M"),
# agent grouping and website start dates; only the URL split / detail level differs.

# HEAD URL SPLIT
# {Period --> Agent --> Status --> set(URLs)}
robots_filled_status_head_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_head, 
    # group_to_agents={k: [k] for k in agents_to_track},
    group_to_agents=agent_groups_to_track,
    start_time=TEMPORAL_ANALYSIS_START_DATE, 
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_start_dates,
)
# RANDOM URL SPLIT
robots_filled_status_rand_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_rand, 
    # group_to_agents={k: [k] for k in agents_to_track},
    group_to_agents=agent_groups_to_track,
    start_time=TEMPORAL_ANALYSIS_START_DATE, 
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_start_dates,
)

# DETAILED HEAD
# Built from the detailed per-URL summary of the head split.
robots_filled_status_head_summary_detailed = robots_util.prepare_robots_temporal_summary_detailed(
    url_robots_summary=url_robots_summary_head_detailed, 
    group_to_agents=agent_groups_to_track,
    start_time=TEMPORAL_ANALYSIS_START_DATE, 
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_start_dates,
)

  target_end_date = target_period.end_time.to_pydatetime().date()


In [10]:
# Dataframes w/ [Period, Agent, Status, count(URLs), count(tokens)].
# Token counts always come from URL_TO_COUNTS so that all three frames follow
# CHOSEN_CORPUS (the head frame used to be pinned to the C4 counts).
robots_temporal_head_summary = robots_util.robots_temporal_to_df(
    robots_filled_status_head_summary,
    url_to_counts=URL_TO_COUNTS,
)
robots_temporal_rand_summary = robots_util.robots_temporal_to_df(
    robots_filled_status_rand_summary,
    url_to_counts=URL_TO_COUNTS,
)

# Same conversion for the detailed head summary.
robots_temporal_head_summary_detailed = robots_util.robots_temporal_to_df(
    robots_filled_status_head_summary_detailed,
    url_to_counts=URL_TO_COUNTS,
)

### Preprocessing for ToS

In [None]:
# URL --> time --> ToS verdict string. 
url_to_time_to_tos_verdict = robots_util.get_tos_url_time_verdicts(tos_policies)
# Period --> Status --> set(URLs)
# Uses the same analysis window and monthly frequency as the robots summaries.
period_tos_verdict_urls = robots_util.prepare_tos_temporal_summary(
    url_to_time_to_tos_verdict,
    start_time=TEMPORAL_ANALYSIS_START_DATE, 
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_start_dates,
)
# Dataframe: [Period, Status, Count, Tokens]
# Restricted to the head URL set of the chosen corpus, with that corpus' token counts.
tos_summary_df = robots_util.tos_temporal_to_df(
    period_tos_verdict_urls,
    url_set=HEAD_URL_SET,
    url_to_counts=URL_TO_COUNTS,
)

## TODO: Add Forecasting code here:

In [None]:
# robots_temporal_head_summary --> function that extends for forecasting?
# robots_temporal_rand_summary --> function that extends for forecasting?
# tos_summary_df --> function that extends for forecasting?

In [12]:
import pandas as pd
import altair as alt
from statsmodels.tsa.ar_model import AutoReg
from prophet import Prophet
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

#disable user warning
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


def create_lagged_features(df, lags):
    """Return a copy of ``df`` with one ``lag_<k>`` column per entry of ``lags``.

    Each ``lag_<k>`` column is ``df['y']`` shifted down by ``k`` rows. Rows that
    contain any missing value afterwards (which includes the first ``max(lags)``
    rows) are dropped.

    Parameters:
    - df (pd.DataFrame): Frame with a ``y`` column. It is left unmodified.
    - lags (iterable of int): Shift sizes to materialise as columns.

    Returns:
    - pd.DataFrame: New frame with the lag columns added and incomplete rows removed.
    """
    # Work on a copy so the lag columns never leak into the caller's frame.
    lagged = df.copy()
    for lag in lags:
        lagged[f'lag_{lag}'] = lagged['y'].shift(lag)
    return lagged.dropna()


### Autoregressive robots temporal (best forecasting)

In [13]:
def forecast_and_plot(df, agent, lags):
    """Fit one autoregressive model per restriction status of ``agent`` and chart a 12-month forecast.

    Parameters:
    - df (pd.DataFrame): Frame with 'agent', 'period', 'status' and 'count' columns.
    - agent (str): Value of the 'agent' column to forecast.
    - lags (int or list of int): Passed straight to ``AutoReg(..., lags=lags)``.

    Returns:
    - tuple: ``(chart, predicted_df)``. ``chart`` is what
      ``robots_util.plot_robots_time_map_altair`` returns for the history plus the
      forecast. ``predicted_df`` has the columns 'period', 'status', 'count'
      (forecast share in percent), 'agent', 'lower' and 'upper' (confidence bounds).
    """
    # Filter the DataFrame for the specific agent
    agent_df = df[df['agent'] == agent].copy()

    # Convert 'period' to timestamp if it's a Period object
    agent_df.loc[:, 'period'] = agent_df['period'].apply(lambda x: x.to_timestamp() if isinstance(x, pd.Period) else x)

    # Reshape to one row per period and one column per status
    pivoted_df = agent_df.pivot_table(index='period', columns='status', values='count')

    # Normalize the counts to percentages
    pivoted_df = pivoted_df.div(pivoted_df.sum(axis=1), axis=0) * 100

    # Periods to predict (months)
    n_periods = 12

    # History is stamped at month starts (the default of Period.to_timestamp()), so stamp
    # the forecast the same way, beginning with the month after the last observation.
    first_forecast_month = pd.Timestamp(agent_df['period'].max()) + pd.offsets.MonthBegin(1)
    future_periods = pd.date_range(start=first_forecast_month, periods=n_periods, freq='MS')

    # Fit and forecast each status on its own. The out-of-sample window is derived from
    # that status' own series length rather than from a variable left over by another loop.
    predictions = {}
    conf_intervals = {}
    for status in pivoted_df.columns:
        series = pivoted_df[status]
        fitted = AutoReg(series, lags=lags).fit()
        n_obs = len(series)
        prediction = fitted.get_prediction(start=n_obs, end=n_obs + n_periods - 1)
        predictions[status] = prediction.predicted_mean.values
        conf_intervals[status] = prediction.conf_int()

    # Combine the predictions into a single long-format DataFrame
    predicted_df = pd.DataFrame(predictions, index=future_periods)
    predicted_df = predicted_df.reset_index().melt(id_vars='index', var_name='status', value_name='count')
    predicted_df.columns = ['period', 'status', 'count']
    predicted_df['agent'] = agent

    # Concatenate the original and predicted DataFrames
    # NOTE(review): history rows keep their raw 'count' while forecast rows carry
    # percentages; this relies on the plotting helper normalising each period -- confirm.
    combined_df = pd.concat([agent_df, predicted_df], ignore_index=True)

    # Define the color scheme for the statuses
    status_colors = {'no_robots': 'gray', 'none': 'blue', 'some': 'orange', 'all': 'red'}

    chart = robots_util.plot_robots_time_map_altair(
        combined_df,
        agent_type=agent,
        period_col='period',
        status_col='status',
        val_col='count',
        title='Restriction Status over Time',
        ordered_statuses=['no_robots', 'none', 'some', 'all'],
        status_colors=status_colors,
        datetime_swap=True,
    )

    # Attach the confidence bounds to the forecast rows (added after the chart is built,
    # so the charted frame does not carry these columns).
    for status, conf_int in conf_intervals.items():
        status_rows = predicted_df['status'] == status
        predicted_df.loc[status_rows, 'lower'] = conf_int['lower'].values
        predicted_df.loc[status_rows, 'upper'] = conf_int['upper'].values

    return chart, predicted_df


### Prophet 

In [None]:
def forecast_and_plot_prophet(df, agent, lags):
    """Fit one Prophet model per restriction status of ``agent`` and chart a 12-month forecast.

    Parameters:
    - df (pd.DataFrame): Frame with 'agent', 'period', 'status' and 'count' columns.
    - agent (str): Value of the 'agent' column to forecast.
    - lags (list of int): Lags of the status share that are added as extra regressors.

    Returns:
    - The chart returned by ``robots_util.plot_robots_time_map_altair`` for the
      forecast rows only (the history is not part of the charted frame).
    """
    # Pick agent
    agent_df = df[df['agent'] == agent].copy()

    # Convert 'period' to timestamp if it's a Period object
    agent_df.loc[:, 'period'] = agent_df['period'].apply(lambda x: x.to_timestamp() if isinstance(x, pd.Period) else x)

    # Reshape to one row per period and one column per status
    pivoted_df = agent_df.pivot_table(index='period', columns='status', values='count')

    # Normalize the counts to percentages
    pivoted_df = pivoted_df.div(pivoted_df.sum(axis=1), axis=0) * 100

    # Define the number of future periods
    n_periods = 12

    # History is stamped at month starts (the default of Period.to_timestamp()), so stamp
    # the forecast the same way, beginning with the month after the last observation.
    first_forecast_month = pd.Timestamp(agent_df['period'].max()) + pd.offsets.MonthBegin(1)
    future_periods = pd.date_range(start=first_forecast_month, periods=n_periods, freq='MS')

    lag_columns = [f'lag_{lag}' for lag in lags]
    predictions = {}
    for status in pivoted_df.columns:
        # Prophet expects the columns 'ds' (date) and 'y' (value)
        status_df = pivoted_df[[status]].reset_index()
        status_df.columns = ['ds', 'y']
        status_df = create_lagged_features(status_df, lags)

        model = Prophet()
        for column in lag_columns:
            model.add_regressor(column)
        model.fit(status_df)

        # NOTE(review): every future row reuses the last observed value of each lag
        # regressor, i.e. the regressors are held constant over the forecast horizon.
        future_df = pd.DataFrame({'ds': future_periods})
        for column in lag_columns:
            future_df[column] = status_df[column].iloc[-1]
        predictions[status] = model.predict(future_df)['yhat'].values

    # Combine the predictions into a single long-format DataFrame
    predicted_df = pd.DataFrame(predictions, index=future_periods)
    predicted_df = predicted_df.reset_index().melt(id_vars='index', var_name='status', value_name='predicted_value')
    predicted_df.columns = ['period', 'status', 'predicted_value']
    # add agent column
    predicted_df['agent'] = agent
    # add tokens column: a copy of the forecast, used as the plotted value column
    predicted_df['tokens'] = predicted_df['predicted_value']
    # Define the color scheme for the statuses
    status_colors = {'no_robots': 'gray', 'none': 'blue', 'some': 'orange', 'all': 'red'}

    chart = robots_util.plot_robots_time_map_altair(
        predicted_df,
        agent_type=agent,
        period_col='period',
        status_col='status',
        val_col='tokens',  # "count" / "tokens"
        title='Restriction Status over Time',
        ordered_statuses=['no_robots', 'none', 'some', 'all'],
        status_colors=status_colors,
        datetime_swap=True,
    )

    return chart

### ARIMA

In [None]:
def forecast_and_plot_arima(df, agent, lags):
    """Fit one ARIMA(p, 0, 0) model per restriction status of ``agent`` and chart a 12-month forecast.

    Parameters:
    - df (pd.DataFrame): Frame with 'agent', 'period', 'status' and 'count' columns.
    - agent (str): Value of the 'agent' column to forecast.
    - lags (list of int): Only ``max(lags)`` is used, as the autoregressive order p.

    Returns:
    - The chart returned by ``robots_util.plot_robots_time_map_altair`` for the
      forecast rows only (the history is not part of the charted frame).
    """
    # Pick agent
    agent_df = df[df['agent'] == agent].copy()

    # Convert 'period' to timestamp if it's a Period object
    agent_df.loc[:, 'period'] = agent_df['period'].apply(lambda x: x.to_timestamp() if isinstance(x, pd.Period) else x)

    # Reshape to one row per period and one column per status
    pivoted_df = agent_df.pivot_table(index='period', columns='status', values='count')

    # Normalize the counts to percentages
    pivoted_df = pivoted_df.div(pivoted_df.sum(axis=1), axis=0) * 100

    # Train ARIMA models for each status.
    # The model builds its own lags from the series, so no explicit lag columns are
    # created: doing so would only discard the first max(lags) observations before fitting.
    ar_order = max(lags)
    models = {}
    for status in pivoted_df.columns:
        series = pivoted_df[status].dropna()
        models[status] = ARIMA(series, order=(ar_order, 0, 0)).fit()

    # Define the number of future periods
    n_periods = 12

    # History is stamped at month starts (the default of Period.to_timestamp()), so stamp
    # the forecast the same way, beginning with the month after the last observation.
    first_forecast_month = pd.Timestamp(agent_df['period'].max()) + pd.offsets.MonthBegin(1)
    future_periods = pd.date_range(start=first_forecast_month, periods=n_periods, freq='MS')

    # Make future predictions
    predictions = {
        status: fitted.forecast(steps=n_periods).values
        for status, fitted in models.items()
    }

    # Combine the predictions into a single long-format DataFrame
    predicted_df = pd.DataFrame(predictions, index=future_periods)
    predicted_df = predicted_df.reset_index().melt(id_vars='index', var_name='status', value_name='predicted_value')
    predicted_df.columns = ['period', 'status', 'predicted_value']
    # add agent column
    predicted_df['agent'] = agent
    # add tokens column: a copy of the forecast, used as the plotted value column
    predicted_df['tokens'] = predicted_df['predicted_value']
    # Define the color scheme for the statuses
    status_colors = {'no_robots': 'gray', 'none': 'blue', 'some': 'orange', 'all': 'red'}

    chart = robots_util.plot_robots_time_map_altair(
        predicted_df,
        agent_type=agent,
        period_col='period',
        status_col='status',
        val_col='tokens',
        title='Restriction Status over Time',
        ordered_statuses=['no_robots', 'none', 'some', 'all'],
        status_colors=status_colors,
        datetime_swap=True,
    )

    return chart


### SARIMA

In [None]:
def forecast_and_plot_sarima(df, agent, lags, seasonal_order):
    """Fit one SARIMAX model per restriction status of ``agent`` and chart a 12-month forecast.

    Parameters:
    - df (pd.DataFrame): Frame with 'agent', 'period', 'status' and 'count' columns.
    - agent (str): Value of the 'agent' column to forecast.
    - lags (list of int): Only ``max(lags)`` is used, as the autoregressive order p of
      ``order=(p, 0, 0)``.
    - seasonal_order (tuple or None): Seasonal (P, D, Q, s) order for SARIMAX. ``None``
      is treated as "no seasonal component", i.e. (0, 0, 0, 0).

    Returns:
    - The chart returned by ``robots_util.plot_robots_time_map_altair`` for the
      forecast rows only (the history is not part of the charted frame).
    """
    # The caller's default is None; SARIMAX needs a 4-tuple.
    if seasonal_order is None:
        seasonal_order = (0, 0, 0, 0)

    # Pick agent
    agent_df = df[df['agent'] == agent].copy()

    # Convert 'period' to timestamp if it's a Period object
    agent_df.loc[:, 'period'] = agent_df['period'].apply(lambda x: x.to_timestamp() if isinstance(x, pd.Period) else x)

    # Reshape to one row per period and one column per status
    pivoted_df = agent_df.pivot_table(index='period', columns='status', values='count')

    # Normalize the counts to percentages
    pivoted_df = pivoted_df.div(pivoted_df.sum(axis=1), axis=0) * 100

    # Train SARIMA models for each status.
    # The model builds its own lags from the series, so no explicit lag columns are
    # created: doing so would only discard the first max(lags) observations before fitting.
    ar_order = max(lags)
    models = {}
    for status in pivoted_df.columns:
        series = pivoted_df[status].dropna()
        model = SARIMAX(series, order=(ar_order, 0, 0), seasonal_order=seasonal_order)
        models[status] = model.fit(disp=False)

    # Define the number of future periods
    n_periods = 12

    # History is stamped at month starts (the default of Period.to_timestamp()), so stamp
    # the forecast the same way, beginning with the month after the last observation.
    first_forecast_month = pd.Timestamp(agent_df['period'].max()) + pd.offsets.MonthBegin(1)
    future_periods = pd.date_range(start=first_forecast_month, periods=n_periods, freq='MS')

    # Make future predictions
    predictions = {
        status: fitted.get_forecast(steps=n_periods).predicted_mean.values
        for status, fitted in models.items()
    }

    # Combine the predictions into a single long-format DataFrame
    predicted_df = pd.DataFrame(predictions, index=future_periods)
    predicted_df = predicted_df.reset_index().melt(id_vars='index', var_name='status', value_name='predicted_value')
    predicted_df.columns = ['period', 'status', 'predicted_value']
    # add agent column
    predicted_df['agent'] = agent
    # add tokens column: a copy of the forecast, used as the plotted value column
    predicted_df['tokens'] = predicted_df['predicted_value']
    # Define the color scheme for the statuses
    status_colors = {'no_robots': 'gray', 'none': 'blue', 'some': 'orange', 'all': 'red'}

    chart = robots_util.plot_robots_time_map_altair(
        predicted_df,
        agent_type=agent,
        period_col='period',
        status_col='status',
        val_col='tokens',
        title='Restriction Status over Time',
        ordered_statuses=['no_robots', 'none', 'some', 'all'],
        status_colors=status_colors,
        datetime_swap=True,
    )

    return chart


### Run Forecasts

In [14]:
def analyze_robots(df, analysis_type, lags, seasonal_order=None, display=False):
    """
    Runs one forecasting method for every agent in `df` and shows the resulting charts.

    Parameters:
    - df (pd.DataFrame): Frame with an 'agent' column; passed unchanged to the chosen forecast function.
    - analysis_type (str): Type of analysis to perform. Options are 'autoregression', 'prophet', 'arima', 'sarima'.
    - lags (list): List of lag values to be used in the forecasting models.
    - seasonal_order (tuple, optional): Seasonal order parameters for SARIMA model. Default is None.
    - display (bool, optional): If True and analysis_type is 'autoregression', also displays the
      predicted DataFrame. Default is False.

    Returns:
    - None: Prints the corpus and agent, then shows the chart for each agent.

    Raises:
    - ValueError: If analysis_type is not one of the supported options.
    """
    # The `display` parameter shadows IPython's `display` function inside this body, so
    # fetch the renderer under another name. Plain print is the fallback outside IPython.
    show_frame = None
    if display:
        try:
            from IPython.display import display as show_frame
        except ImportError:
            show_frame = print

    for agent in df['agent'].unique():
        print(f"CHOSEN_CORPUS: {CHOSEN_CORPUS}")
        print(f"AGENT: {agent}")

        # Only the autoregressive forecast returns the predicted frame.
        predicted_df = None
        if analysis_type == 'autoregression':
            chart, predicted_df = forecast_and_plot(df, agent, lags)
        elif analysis_type == 'prophet':
            chart = forecast_and_plot_prophet(df, agent, lags)
        elif analysis_type == 'arima':
            chart = forecast_and_plot_arima(df, agent, lags)
        elif analysis_type == 'sarima':
            chart = forecast_and_plot_sarima(df, agent, lags, seasonal_order)
        else:
            raise ValueError("Invalid analysis type specified.")

        if show_frame is not None and predicted_df is not None:
            show_frame(predicted_df)
        chart.show()

# Example usage: autoregressive forecast for every agent of the head split.
# Swap in robots_temporal_rand_summary to forecast the random split instead.
df_to_analyze = robots_temporal_head_summary  # or robots_temporal_rand_summary
analyze_robots(df_to_analyze, analysis_type='autoregression', lags=[1, 3, 6, 9])

CHOSEN_CORPUS: c4
AGENT: Google


CHOSEN_CORPUS: c4
AGENT: OpenAI


CHOSEN_CORPUS: c4
AGENT: Anthropic


CHOSEN_CORPUS: c4
AGENT: Cohere


CHOSEN_CORPUS: c4
AGENT: Common Crawl


CHOSEN_CORPUS: c4
AGENT: Meta


CHOSEN_CORPUS: c4
AGENT: Internet Archive


CHOSEN_CORPUS: c4
AGENT: Google Search


CHOSEN_CORPUS: c4
AGENT: False Anthropic


### Plot Temporal Robots Area Chart

In [None]:
# for agent in agents_to_track:
#     robots_util.plot_robots_time_map(robots_temporal_summary, agent)

# One chart per tracked agent group, built from the head-split summary and using
# the 'tokens' column as the plotted value.
for group in agent_groups_to_track:
    print(group)
    chart = robots_util.plot_robots_time_map_altair(
        robots_temporal_head_summary, 
        agent_type=group, 
        period_col='period', 
        status_col='status', 
        val_col='tokens',  # "count" / "tokens"
        title='Restriction Status over Time', 
        ordered_statuses=['no_robots', 'none', 'some', 'all'], 
        status_colors={'no_robots': 'gray', 'none': 'blue', 'some': 'orange', 'all': 'red'}
    )
    chart.show()
#     robots_util.plot_robots_time_map(robots_temporal_head_summary, group, val_key="count")

In [15]:
# Detailed status keys, in the order handed to the plotting helper.
ordered_statuses = [
    "no_robots", "none", "none_sitemap", "none_crawl_delay", "some_other",
    "some_disallow_important_dir", "some_disallow_file_types", "some_pattern_restrictions", "all"
]

# Colour per detailed status: orange/amber shades for the "some_*" variants,
# blue shades for the "none*" variants.
status_colors = {
    "all": "red",
    "some_pattern_restrictions": "#FFA500",
    "some_disallow_file_types": "#FF8C00",
    "some_disallow_important_dir": "#FF4500",
    "some_other": "#FFC107",
    "none_crawl_delay": "lightskyblue",
    "none_sitemap": "navy",
    "none": "blue",
    "no_robots": "gray"
}

# Human-readable label per detailed status.
# NOTE(review): this mapping is not passed to the plotting call below -- confirm whether
# plot_robots_time_map_altair_detailed should receive it.
legend_mapping = {
    "all": "Full restrictions",
    "some_pattern_restrictions": "Pattern-based restrictions",
    "some_disallow_file_types": "Disallow specific file types",
    "some_disallow_important_dir": "Disallow private directories",
    "some_other": "Other restrictions",
    "none_crawl_delay": "Crawl delay specified",
    "none_sitemap": "Sitemap provided",
    "none": "No restrictions or sitemap",
    "no_robots": "No Robots.txt"
}

# One detailed chart per tracked agent group, using the 'tokens' column as the value.
for group in agent_groups_to_track:
    print(group)
    chart = robots_util.plot_robots_time_map_altair_detailed(
        robots_temporal_head_summary_detailed,
        agent_type=group,
        period_col='period',
        status_col='status',
        val_col='tokens',
        title='Restriction Status over Time',
        ordered_statuses=ordered_statuses,
        status_colors=status_colors,
    )
    chart.show()

Google


OpenAI


Anthropic


Cohere


Common Crawl


Meta


Internet Archive


Google Search


False Anthropic


### Plot Temporal ToS Area Chart

In [None]:
# Area chart of ToS verdicts over time, using the 'tokens' column as the value.
# Every status gets its own colour: 'Prohibits Scraping & AI' uses a darker red than
# 'Prohibits Scraping' so the two stacked areas can be told apart.
chart = robots_util.plot_temporal_area_map_altair(
    tos_summary_df,
    period_col='period', 
    status_col='status', 
    val_col='tokens',  # "count" / "tokens"
    title='Restriction Status over Time', 
    ordered_statuses=['No Terms Pages', 'No Restrictions', 'Conditional Restrictions', 'Prohibits AI', 'Prohibits Scraping', 'Prohibits Scraping & AI'], 
    status_colors= {
        'No Terms Pages': 'gray', 'No Restrictions': 'blue', 
        'Conditional Restrictions': 'yellow', 'Prohibits AI': 'orange', 
        'Prohibits Scraping': 'red', 'Prohibits Scraping & AI': 'darkred'
    },
)
chart.show()

In [None]:

# robots_temporal_head_summary

Restrictions by Company

In [None]:
# # Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)]


In [None]:
# Fixed colour per tracked agent group, so each company keeps the same colour in the comparison chart.
agent_color_mapping = {
    "Google": "#1f77b4",          # blue
    "OpenAI": "#ff7f0e",          # orange
    "Anthropic": "#2ca02c",       # green
    "Cohere": "#d62728",          # red
    "Common Crawl": "#9467bd",    # purple
    "Meta": "#8c564b",            # brown
    "Internet Archive": "#e377c2",# pink
    "Google Search": "#7f7f7f",   # gray
    "False Anthropic": "#bcbd22"  # yellow
}

# Company comparison built from the head-split temporal summary.
visualization_util.plot_company_comparisons_altair(
    robots_temporal_head_summary, color_mapping=agent_color_mapping)


In [None]:
# log scale
# x-axis fix

### Create Correlations Tables

In [None]:
# NOTE(review): the corpus is fixed to "dolma" here, independent of CHOSEN_CORPUS;
# [100, 500, 2000] are presumably top-k cut-offs -- confirm in analysis_util.
url_correlation_df = analysis_util.analyze_url_variable_correlations(url_results_df, [100, 500, 2000], "dolma")

# Convert the dataframe to a LaTeX table (index kept, special characters escaped,
# floats formatted with one decimal place)
latex_table = url_correlation_df.to_latex(index=True, escape=True, float_format="{:.1f}".format)
print(latex_table)

### Robots & ToS Confusion Matrix

In [None]:
# Robots vs. ToS confusion matrix for the analyzed companies.
# NOTE(review): the corpus is fixed to "dolma" here, independent of CHOSEN_CORPUS.
robots_util.prepare_tos_robots_confusion_matrix(
    tos_policies,
    url_robots_summary,
    COMPANIES_TO_ANALYZE,
    url_token_lookup,
    corpora_choice="dolma",
)

In [None]:
# companies = ["Google", "OpenAI", "Anthropic", "Cohere", "Common Crawl", "Meta"]
# for company in companies:
#     # {URL --> Date --> Agent --> Status}
#     url_robots_summary[url][date][company]
#     # get latest
#     # df: [company1, company2, status1, status2]
#     # plot confusion matrix.
    

### Plot Num Tokens against Robots Restrictions per Company

In [None]:
# Bucket URLs by their C4 token count; the last boundary is a very large sentinel that
# acts as an open upper end.
# NOTE(review): uses c4_url_to_counts directly rather than URL_TO_COUNTS -- confirm intended.
size_bucket_to_urls = robots_util.bucket_urls_by_size(
    c4_url_to_counts, 
    bucket_boundaries=[0, 1000, 10000, 50000, 1000000, 10000000, 50000000, 9999999999999999]
)
# Plot bucket size against robots restrictions for the "OpenAI" group.
robots_util.plot_size_against_restrictions(
    url_robots_summary,
    size_bucket_to_urls,
    "OpenAI",
    setting="train"
)

# Scratch / Notes

In [None]:
Notes:

* Take All Agents and subtract it from the other plots to see the diffs between agents.
* How do these charts vary with different quantiles for number of tokens. (Behavior diffs for token rich and token poor)

* Incompatibility between Robots.txt and ToS? Robots.txt is an encoded ToS for scrapers. Is there more intention detailed in the ToS than in the robots.txt?
* Robots and ToS update rate. --> When one of them is updated, how often is the other updated within time T (robots and ToS)?
* Analysis: Of all websites that restrict at least one AI bot, what other bots do they restrict? E.g. if you restrict Cohere, you probably also restrict OpenAI
P(Cohere restricted | OpenAI restricted)
* Include Midjourney, CCBot, IAbot, etc
* Vertical lines that show when bots get introduced.

* Restrictions are rising across the board.
* Company-wise restrictions
* Compare for each company their scraping restrictions vs RAG restrictions vs AI bot restrictions (OpenAI, Anthropic)

In [None]:
Data Sources Analysis:

0. Table: Methodology -- what metadata we collected/annotated (and how). [WIP]
1. Figure: Temporal changes in Robots / ToS (somehow over a collection or multiple companies?) [Waiting on ToS]
    C4/RF/(Dolma) vs (2k-Head)/Random vs Company(6)
2. Figure: Robots / ToS contradiction matrix [Waiting on ToS]
    Creative Commons.
3. Table: Robots permission differences by company + Stella(pairwise differences by confusion matrix) [WIP]
4. Table: Robots / ToS / other indicators variability by website num tokens (i.e. head vs tail) [WIP]
5. Figure: Commercial/market copyright concerns and comparison to WildChat + EU AI Act [WIP, waiting on WildChat]

In [None]:
Datasets Analysis:

(License: NC, C, Unspecified) x (Terms: Unspecified/None, NC, C).
Text =~ 200
Video = 11
Speech ~= 50

0. Tables for Text, Video, Speech [Will: WIP]
1. License & source (terms) restriction differences between Text, Video, Speech. (normalized stacked bar chart) [WIP]
2. Source domains by Text, Video, Speech: scraped, synthetic, crowdsourced, ....  (normalized stacked bar chart) [WIP]
2b. Include pretraining (broken down by modality + user content + illicit content)
3. Creator distribution for Text, Video, Speech by geography and organization type [WIP]

Extension: 


4. Temporal breakdown of license restriction categories by Text, Video, Speech [WIP]
5. License Type by modality for Text, Video, Speech [WIP]
6. Languages / Tasks.
(Will run statistical tests checking whether the differences between Text, Video, and Speech are significant on all of these.) [WIP]

In [None]:


# People who restrict Anthropic but not OpenAI. <-- public awareness of organizations is the driving force here. 
# Ordered by notoriety. (Confusion matrix)

In [None]:
# WildChat annotation table; sample_nayan reads its "WildChat Example Prompt" and
# "Content Domain" columns.
nayan = pd.read_csv("test_data/wildchat_annotations.csv")

In [None]:
# nayan

In [None]:
def sample_nayan(df, i, show_response=False):
    """Print the prompt and content domain of row ``i`` of a WildChat annotation frame.

    Parameters:
    - df (pd.DataFrame): Frame with "WildChat Example Prompt" and "Content Domain" columns
      (plus "WildChat Example Response" when ``show_response`` is True).
    - i (int): Positional row index (used with ``df.iloc``).
    - show_response (bool, optional): If True, also print the first 100 characters of the
      response, prefixed with a row of asterisks. Default is False.

    Returns:
    - None: Everything is printed to stdout, followed by a blank line.
    """
    row = df.iloc[i]
    # Only the columns that are actually printed are looked up, so frames without the
    # response / service-type columns work too.
    print(row["WildChat Example Prompt"])
    if show_response:
        print("*************" + row["WildChat Example Response"][:100])
    print(row["Content Domain"])
    print()

In [None]:
# for i in range(30, 60):
#     sample_nayan(nayan, i)