In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Append system path
# Drop stale '../..' entries and any '../' entry left by a previous run of this
# cell, so re-executing it does not keep stacking duplicates on sys.path.
sys.path = [p for p in sys.path if not p.endswith('../..') and p != '../']
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util


%load_ext autoreload
%autoreload 2

### Define Paths to all relevant files

In [168]:
# Input files and directories (relative paths, resolved against the working directory).
FPATH_TO_RELEVANT_URL_TOKENS = 'pretrain_data/relevant_url_token_counts.csv'
FPATH_TO_HEAD_ROBOTS = "robots_data/temporal_robots_head.json"
# Backward-compatible alias: the robots-loading cell still uses the original mixed-case name.
FPATH_to_HEAD_ROBOTS = FPATH_TO_HEAD_ROBOTS
FPATH_TO_RAND_ROBOTS = "robots_data/temporal_robots_rand_10k.json"
FPATH_TO_TOS_DATA = "robots_data/tos_ai_scraping_policies.json"
DIRPATHS_TO_ANNOTATED_TASKS = ["annotated_websites/Task 1", "annotated_websites/Task 2"]
FPATH_SNAPSHOT_DATA = "robots_data/temporal_main_sites_current.json"

# Companies used for the ToS/robots encoding and the confusion matrix.
COMPANIES_TO_ANALYZE = ["Google", "OpenAI", "Anthropic", "Cohere", "Common Crawl", "Meta"]
# Every company whose bot group is tracked: the analyzed companies plus extra
# groups. Built from COMPANIES_TO_ANALYZE so the two lists cannot drift apart.
ALL_COMPANIES_TO_TRACK = COMPANIES_TO_ANALYZE + ["Internet Archive", "Google Search", "False Anthropic"]

# Date window passed as start_time / end_time to the temporal summaries.
TEMPORAL_ANALYSIS_START_DATE = '2016-01-01'
TEMPORAL_ANALYSIS_END_DATE = '2024-04-30'

### Load all URL splits (top vs random) and maps to Token Counts

In [169]:
# Token-count lookup over the three corpora keys: 'c4', 'rf', 'dolma'.
url_token_lookup = robots_util.URLTokenLookup(FPATH_TO_RELEVANT_URL_TOKENS)

# Per-corpus URL -> token-count maps (same call order as the key tuple).
c4_url_to_counts, rf_url_to_counts, dolma_url_to_counts = (
    url_token_lookup.get_url_to_token_map(corpus_key)
    for corpus_key in ("c4", "rf", "dolma")
)

# Top-2000 ("head") URLs per corpus.
top_c4_urls, top_rf_urls, top_dolma_urls = (
    url_token_lookup.top_k_urls(corpus_key, 2000)
    for corpus_key in ("c4", "rf", "dolma")
)

# Random split, then the union of every URL used by either split.
random_10k_urls = url_token_lookup.get_10k_random_sample()
all_urls = set(
    random_10k_urls
    + top_c4_urls
    + top_rf_urls
    + top_dolma_urls
)

# Website snapshots restricted to the URLs gathered above.
website_snapshots = robots_util.read_snapshots(FPATH_SNAPSHOT_DATA, all_urls)

Number of tokens in 2000 URLs: 18447797380 | 10.85% of c4
Number of tokens in 2000 URLs: 67098747294 | 15.56% of rf
Number of tokens in 2000 URLs: 429152555144 | 21.74% of dolma


### Define Agents and Agent Groups

In [170]:
# Bot groups for every tracked company. Used below as a mapping of
# group name -> iterable of agents (iterated via .values() and passed as
# `group_to_agents`).
agent_groups_to_track = robots_util.get_bot_groups(ALL_COMPANIES_TO_TRACK)
# NOTE(review): in the visible cells `agents_to_track` is referenced only from
# commented-out code -- confirm it is still needed.
agents_to_track = robots_util.get_bots()

### Load Robots.txt info

In [171]:
# Raw robots.txt history for each split: URL -> Date -> robots.txt raw text.
head_robots, random_10k_robots = (
    io.read_json(robots_fpath)
    for robots_fpath in (FPATH_to_HEAD_ROBOTS, FPATH_TO_RAND_ROBOTS)
)

# Merge both splits without touching `head_robots`; for a URL present in both,
# the random-split entry replaces the head-split entry.
joined_robots = copy.deepcopy(head_robots)
joined_robots.update(random_10k_robots)

for robots_split in (head_robots, random_10k_robots):
    robots_util.print_out_robots_info(robots_split)

# Flatten the agents of every tracked group into one list.
# (Alternative input kept from the original cell: relevant_agents=agents_to_track.)
grouped_agents = [
    agent
    for group_agents in agent_groups_to_track.values()
    for agent in group_agents
]

# {URL --> Date --> Agent --> Status}
url_robots_summary, agent_counter_df = robots_util.compute_url_date_agent_status(
    data=joined_robots,
    relevant_agents=grouped_agents,
)

# Persist the agent counter next to the notebook.
agent_counter_df.to_csv("all_agents_counter.csv", index=False)

Num robot URLs loaded: 830
Earliest time: 2016-01-01
Last time: 2024-04-19
Num robot URLs loaded: 6230
Earliest time: 2016-01-01
Last time: 2024-04-19


### Load ToS info

In [144]:
# Load the Terms-of-Service policy annotations from FPATH_TO_TOS_DATA.
# URL --> Date --> ToS-suburl --> {"verdict": X, "evidence": Y}
tos_policies = io.read_json(FPATH_TO_TOS_DATA)
# Sanity check: number of top-level entries (one per URL, per the schema above).
print(f"Num ToS URLs: {len(tos_policies)}")

Num ToS URLs: 3068


### Load Manual Pretraining Annotations

In [172]:
# Manual website annotations, built up in named stages so each intermediate
# stays inspectable; `url_results_df` is the final frame used further down.
url_to_info = analysis_util.extract_url_annotations(DIRPATHS_TO_ANNOTATED_TASKS)
url_annotations_df = analysis_util.process_url_annotations(url_to_info)
url_sized_df = analysis_util.encode_size_columns(url_annotations_df, url_token_lookup)
# Attach the latest ToS / robots.txt information for the analyzed companies.
url_results_df = robots_util.encode_latest_tos_robots_into_df(
    url_sized_df,
    tos_policies,
    url_robots_summary,
    COMPANIES_TO_ANALYZE,
)

6663 rows before filtering.
3017 rows after filtering. 1550 issues, 2096 unannotated.
7059


# Create Plots

### Preprocessing for Robots Head & Random URL splits

In [173]:
### DECISION POINT: Use C4, Dolma, or RefinedWeb here?

CHOSEN_CORPUS = "c4" # 'c4', 'rf', 'dolma'

# Corpus key -> (top-2000 URLs of that corpus, its URL -> token-count map).
corpus_to_resources = {
    "c4": (top_c4_urls, c4_url_to_counts),
    "rf": (top_rf_urls, rf_url_to_counts),
    "dolma": (top_dolma_urls, dolma_url_to_counts),
}
if CHOSEN_CORPUS not in corpus_to_resources:
    # Fail fast: without this, a typo would silently leave HEAD_URL_SET and
    # URL_TO_COUNTS undefined, or stale from an earlier run of this cell.
    raise ValueError(
        f"Unknown CHOSEN_CORPUS {CHOSEN_CORPUS!r}; expected one of {sorted(corpus_to_resources)}"
    )
HEAD_URL_SET, URL_TO_COUNTS = corpus_to_resources[CHOSEN_CORPUS]

In [174]:
# Restrict the robots summary to each URL split; URLs without robots data are skipped.
url_robots_summary_head = {
    site_url: url_robots_summary[site_url]
    for site_url in HEAD_URL_SET
    if site_url in url_robots_summary
}
url_robots_summary_rand = {
    site_url: url_robots_summary[site_url]
    for site_url in random_10k_urls
    if site_url in url_robots_summary
}

In [175]:
# Arguments shared by both URL splits, so the two calls cannot drift apart.
# (Per-agent alternative kept from the original cell:
#  group_to_agents={k: [k] for k in agents_to_track}.)
temporal_summary_kwargs = dict(
    group_to_agents=agent_groups_to_track,
    start_time=TEMPORAL_ANALYSIS_START_DATE,
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_snapshots,
)

# {Period --> Agent --> Status --> set(URLs)} for the head split ...
robots_filled_status_head_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_head,
    **temporal_summary_kwargs,
)
# ... and for the random split.
robots_filled_status_rand_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_rand,
    **temporal_summary_kwargs,
)


  target_end_date = target_period.end_time.to_pydatetime().date()


In [176]:
# Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)]
# Token counts come from URL_TO_COUNTS so they follow CHOSEN_CORPUS, like
# HEAD_URL_SET does. Hard-coding the C4 map here would mis-weight the head
# split whenever another corpus is chosen.
robots_temporal_head_summary = robots_util.robots_temporal_to_df(
    robots_filled_status_head_summary,
    url_to_counts=URL_TO_COUNTS,
)
# Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)]
robots_temporal_rand_summary = robots_util.robots_temporal_to_df(
    robots_filled_status_rand_summary,
    url_to_counts=URL_TO_COUNTS,
)

### Preprocessing for ToS

In [78]:
# Three-step ToS pipeline: per-URL verdicts -> per-period URL sets -> summary dataframe.
# URL --> time --> ToS verdict string. 
url_to_time_to_tos_verdict = robots_util.get_tos_url_time_verdicts(tos_policies)
# Period --> Status --> set(URLs)
# Same date window, frequency and snapshots as the robots temporal summaries.
period_tos_verdict_urls = robots_util.prepare_tos_temporal_summary(
    url_to_time_to_tos_verdict,
    start_time=TEMPORAL_ANALYSIS_START_DATE, 
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_snapshots,
)
# Dataframe: [Period, Status, Count, Tokens]
# Uses the head URL set and token counts of the corpus selected by CHOSEN_CORPUS.
tos_summary_df = robots_util.tos_temporal_to_df(
    period_tos_verdict_urls,
    url_set=HEAD_URL_SET,
    url_to_counts=URL_TO_COUNTS,
)

  target_end_date = target_period.end_time.to_pydatetime().date()


## TODO: Add Forecasting code here:

In [None]:
# robots_temporal_head_summary --> function that extends for forecasting?
# robots_temporal_rand_summary --> function that extends for forecasting?
# tos_summary_df --> function that extends for forecasting?

### Plot Temporal Robots Area Chart

In [166]:
# Status order and colours shared by every per-group chart.
ROBOTS_STATUS_ORDER = ('no_robots', 'none', 'some', 'all')
ROBOTS_STATUS_COLORS = (
    ('no_robots', 'gray'),
    ('none', 'blue'),
    ('some', 'orange'),
    ('all', 'red'),
)

# One chart per tracked agent group, drawn from the head-split summary.
for group in agent_groups_to_track:
    print(group)
    chart = robots_util.plot_robots_time_map_altair(
        robots_temporal_head_summary,
        agent_type=group,
        period_col='period',
        status_col='status',
        val_col='tokens',  # "count" / "tokens"
        title='Restriction Status over Time',
        # Fresh list/dict per call, exactly as if the literals were written inline.
        ordered_statuses=list(ROBOTS_STATUS_ORDER),
        status_colors=dict(ROBOTS_STATUS_COLORS),
    )
    chart.show()

Google


OpenAI


Anthropic


Cohere


Common Crawl


Meta


Internet Archive


Google Search


### Plot Temporal ToS Area Chart

In [84]:
# Area chart of ToS restriction status over time, weighted by tokens.
chart = robots_util.plot_temporal_area_map_altair(
    tos_summary_df,
    period_col='period', 
    status_col='status', 
    val_col='tokens',  # "count" / "tokens"
    title='Restriction Status over Time', 
    ordered_statuses=['No Terms Pages', 'No Restrictions', 'Conditional Restrictions', 'Prohibits AI', 'Prohibits Scraping', 'Prohibits Scraping & AI'], 
    # Each status needs its own colour: with 'red' used twice, the last two
    # statuses could not be told apart in the chart or its legend.
    status_colors={
        'No Terms Pages': 'gray', 'No Restrictions': 'blue',
        'Conditional Restrictions': 'yellow', 'Prohibits AI': 'orange',
        'Prohibits Scraping': 'red', 'Prohibits Scraping & AI': 'darkred'
    },
)
chart.show()

In [182]:

# robots_temporal_head_summary

### Restrictions by Company

In [120]:
# # Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)]


In [181]:
# Fixed colour per tracked company / agent group, so a given company is drawn
# in the same colour wherever this mapping is used. Keys match ALL_COMPANIES_TO_TRACK.
agent_color_mapping = {
    "Google": "#1f77b4",          # blue
    "OpenAI": "#ff7f0e",          # orange
    "Anthropic": "#2ca02c",       # green
    "Cohere": "#d62728",          # red
    "Common Crawl": "#9467bd",    # purple
    "Meta": "#8c564b",            # brown
    "Internet Archive": "#e377c2",# pink
    "Google Search": "#7f7f7f",   # gray
    "False Anthropic": "#bcbd22"  # olive (yellow-green)
}

# Company comparison chart over the head-split temporal summary; left as the
# last expression so the notebook displays whatever the helper returns.
visualization_util.plot_company_comparisons_altair(
    robots_temporal_head_summary, color_mapping=agent_color_mapping)


In [113]:
# log scale
# x-axis fix

4.645388439379847

### Create Correlations Tables

In [449]:
# Head cut-offs (top-k URLs) compared by the correlation analysis.
# NOTE(review): the corpus is fixed to "dolma" here, independent of CHOSEN_CORPUS -- confirm intended.
top_k_cutoffs = [100, 500, 2000]
url_correlation_df = analysis_util.analyze_url_variable_correlations(
    url_results_df,
    top_k_cutoffs,
    "dolma",
)

# Render the dataframe as a LaTeX table; float cells get one decimal place.
one_decimal = "{:.1f}".format
latex_table = url_correlation_df.to_latex(
    index=True,
    escape=True,
    float_format=one_decimal,
)
print(latex_table)

Num URLs in Top-100: 68
Num URLs in Top-500: 323
Num URLs in Top-2000: 1013
Num URLs in random sample: 1056
\begin{tabular}{lrrrrll}
\toprule
 & Top 100 & Top 500 & Top 2000 & Random & Chi-Squared Stat & P-value \\
\midrule
User Content & 22.1 & 19.5 & 18.9 & 19.1 & 2.44 & 0.12 \\
Paywall & 30.9 & 35.9 & 29.8 & 1.8 & 0.00 & 1.00 \\
Ads & 51.5 & 56.0 & 52.0 & 5.8 & 9.35 & 0.00 \\
Modality: Image & 92.7 & 98.1 & 98.3 & 94.0 & 0.03 & 0.86 \\
Modality: Video & 85.3 & 81.7 & 68.2 & 19.9 & 9.14 & 0.00 \\
Modality: Audio & 82.3 & 75.2 & 53.7 & 3.2 & 2.77 & 0.10 \\
Sensitive Content & 0.0 & 0.6 & 0.9 & 0.6 & 0.00 & 1.00 \\
Restrictive Robots.txt & 27.9 & 18.6 & 10.6 & 1.1 & 0.61 & 0.43 \\
Restrictive Terms & 23.5 & 23.2 & 17.5 & 3.9 & 2.89 & 0.09 \\
domain\_Blogs & 1.5 & 1.9 & 2.8 & 16.5 & 2.65 & 0.10 \\
domain\_Books & 7.3 & 4.3 & 3.8 & 2.9 & 0.05 & 0.82 \\
domain\_Business \& E-Commerce & 13.2 & 18.0 & 20.4 & 58.1 & 0.66 & 0.42 \\
domain\_Entertainment \& Culture & 17.6 & 25.1 & 31.6 & 40.9 

### Robots & ToS Confusion Matrix

In [448]:
# Robots.txt vs. ToS confusion matrix for the analyzed companies
# (presumably built from the latest policies -- confirm in robots_util).
# NOTE(review): corpora_choice is fixed to "dolma", independent of CHOSEN_CORPUS -- confirm intended.
robots_util.prepare_tos_robots_confusion_matrix(
    tos_policies,
    url_robots_summary,
    COMPANIES_TO_ANALYZE,
    url_token_lookup,
    corpora_choice="dolma",
)

7059


In [None]:
# companies = ["Google", "OpenAI", "Anthropic", "Cohere", "Common Crawl", "Meta"]
# for company in companies:
#     # {URL --> Date --> Agent --> Status}
#     url_robots_summary[url][date][company]
#     # get latest
#     # df: [company1, company2, status1, status2]
#     # plot confusion matrix.
    

### Plot Num Tokens against Robots Restrictions per Company

In [450]:
# Token-count bucket edges; the final edge is a very large sentinel upper bound.
# NOTE(review): sizes come from the C4 counts regardless of CHOSEN_CORPUS -- confirm intended.
size_bucket_edges = [
    0,
    1_000,
    10_000,
    50_000,
    1_000_000,
    10_000_000,
    50_000_000,
    9_999_999_999_999_999,
]
size_bucket_to_urls = robots_util.bucket_urls_by_size(
    c4_url_to_counts,
    bucket_boundaries=size_bucket_edges,
)

# Robots restrictions per size bucket for the OpenAI agent group ("train" setting).
robots_util.plot_size_against_restrictions(
    url_robots_summary,
    size_bucket_to_urls,
    "OpenAI",
    setting="train",
)

Bucket 0-1000: 4729
Bucket 1000-10000: 4064
Bucket 10000-50000: 1411
Bucket 50000-1000000: 896
Bucket 1000000-10000000: 2441
Bucket 10000000-50000000: 429
Bucket 50000000-9999999999999999: 19
7059
defaultdict(<function plot_size_against_restrictions.<locals>.<lambda> at 0x37eb5ea60>, {'10000000-50000000': [77, 167, 109], '1000000-10000000': [73, 223, 172], '50000000-9999999999999999': [5, 8, 4], '10000-50000': [16, 636, 327], '0-1000': [36, 1315, 886], '50000-1000000': [11, 220, 133], '1000-10000': [43, 1684, 914]})


# Scratch / Notes

In [None]:
Notes:

* Take All Agents and subtract it from the other plots to see the diffs between agents.
* How do these charts vary across quantiles of token count? (Behavior differences between token-rich and token-poor websites.)

* Incompatibility between Robots.txt and ToS? Robots.txt is an encoded ToS for scrapers. Is there more intent detailed in the ToS than in the robots.txt?
* Robots and ToS update rate. --> How often is the other updated within time T after one of them changes (robots and ToS)?
* Analysis: Of all websites that restrict at least one AI bot, what other bots do they restrict? E.g. if you restrict Cohere, you probably also restrict OpenAI:
P(OpenAI restricted | Cohere restricted)
* Include Midjourney, CCBot, IAbot, etc
* Vertical lines that show when bots get introduced.

* Restrictions are rising across the board.
* Company-wise restrictions
* Compare for each company their scraping restrictions vs RAG restrictions vs AI bot restrictions (OpenAI, Anthropic)

In [None]:
Data Sources Analysis:

0. Table: Methodology -- what metadata we collected/annotated (and how). [WIP]
1. Figure: Temporal changes in Robots / ToS (somehow over a collection or multiple companies?) [Waiting on ToS]
    C4/RF/(Dolma) vs (2k-Head)/Random vs Company(6)
2. Figure: Robots / ToS contradiction matrix [Waiting on ToS]
    Creative Commons.
3. Table: Robots permission differences by company + Stella(pairwise differences by confusion matrix) [WIP]
4. Table: Robots / ToS / other indicators variability by website num tokens (i.e. head vs tail) [WIP]
5. Figure: Commercial/market copyright concerns and comparison to WildChat + EU AI Act [WIP, waiting on WildChat]

In [None]:
Datasets Analysis:

(License: NC, C, Unspecified) x (Terms: Unspecified/None, NC, C).
Text ~= 200
Video = 11
Speech ~= 50

0. Tables for Text, Video, Speech [Will: WIP]
1. License & source (terms) restriction differences between Text, Video, Speech. (normalized stacked bar chart) [WIP]
2. Source domains by Text, Video, Speech: scraped, synthetic, crowdsourced, ....  (normalized stacked bar chart) [WIP]
2b. Include pretraining (broken down by modality + user content + illicit content)
3. Creator distribution for Text, Video, Speech by geography and organization type [WIP]

Extension: 


4. Temporal breakdown of license restriction categories by Text, Video, Speech [WIP]
5. License Type by modality for Text, Video, Speech [WIP]
6. Languages / Tasks.
(Will run statistical tests checking if diffs between Text, Video, Speech are significant on all of these.) [WIP]

In [None]:


# People who restrict Anthropic but not OpenAI. <-- public awareness of organizations is the driving force here. 
# Ordered by notoriety. (Confusion matrix)

In [31]:
# Load the WildChat annotations CSV into a DataFrame.
# NOTE(review): presumably the frame handed to `sample_nayan` (see the
# commented-out loop at the end of the notebook) -- a more descriptive name
# than `nayan` would help readers.
nayan = pd.read_csv("test_data/wildchat_annotations.csv")

In [3]:
# nayan

In [400]:
def sample_nayan(df, i, show_response=False, max_response_chars=100):
    """Print the prompt and content domain of one annotated WildChat example.

    Args:
        df: DataFrame with "WildChat Example Prompt" and "Content Domain"
            columns (plus "WildChat Example Response" when `show_response`
            is set).
        i: Positional row index, passed to `DataFrame.iloc`.
        show_response: If True, also print a preview of the response, prefixed
            with a row of asterisks, between the prompt and the domain.
            Defaults to False, which reproduces the original output.
        max_response_chars: Maximum number of response characters to preview.

    Raises:
        IndexError: If `i` is out of bounds for `df`.
        KeyError: If a column that is needed is missing.
    """
    row = df.iloc[i]
    print(row["WildChat Example Prompt"])
    if show_response:
        # str() keeps the preview working when the cell is not a string (e.g. NaN).
        print("*************" + str(row["WildChat Example Response"])[:max_response_chars])
    print(row["Content Domain"])
    print()

In [1]:
# for i in range(30, 60):
#     sample_nayan(nayan, i)