In [312]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Append system path
# Drop stale entries first (both the legacy '../..' form and any earlier '../')
# so that re-running this cell never stacks duplicate copies on sys.path.
sys.path = [p for p in sys.path if not p.endswith('../..') and p != '../']
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [370]:
# Companies whose bot groups are requested from robots_util.get_bot_groups below.
COMPANIES_TO_ANALYZE = [
    "Google",
    "OpenAI",
    "Anthropic",
    "Cohere",
    "Common Crawl",
    "Meta",
]

In [288]:
# Build the URL -> token-count lookup from the CSV, then pull out the map for the "c4" corpus.
# Corpus names noted by the author: 'c4', 'rf', 'dolma'.
url_token_counts_csv = 'pretrain_data/relevant_url_token_counts.csv'
url_token_lookup = robots_util.URLTokenLookup(url_token_counts_csv)
c4_url_to_counts = url_token_lookup.get_url_to_token_map("c4")

In [8]:
# Each JSON file maps URL -> Date -> Robots.txt raw text.
head_robots = io.read_json("robots_data/temporal_robots_head.json")
random_10k_robots = io.read_json("robots_data/temporal_robots_rand_10k.json")

# Union of both samples. The head sample is deep-copied so it is never mutated;
# for a URL present in both, the random-sample entry replaces the head entry.
joined_robots = copy.deepcopy(head_robots)
joined_robots.update(random_10k_robots)

# Sizes of the head sample, the random sample, and their union (in that order).
for robots_sample in (head_robots, random_10k_robots, joined_robots):
    print(len(robots_sample))

830
6230
7059


In [9]:
# Print out start and end years of the data:

# Every distinct snapshot date that appears under any URL.
all_times = {
    snapshot_date
    for date_to_robots in joined_robots.values()
    for snapshot_date in date_to_robots
}
print(min(all_times))
print(max(all_times))

# Show the ten earliest and ten latest dates as a sanity check.
ordered_times = sorted(all_times)
print(ordered_times[:10])
print(ordered_times[-10:])

2016-01-01
2024-04-19
['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04', '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08', '2016-01-09', '2016-01-10']
['2024-04-10', '2024-04-11', '2024-04-12', '2024-04-13', '2024-04-14', '2024-04-15', '2024-04-16', '2024-04-17', '2024-04-18', '2024-04-19']


In [371]:
# Group name -> agents, restricted to the companies under analysis.
agent_groups_to_track = robots_util.get_bot_groups(COMPANIES_TO_ANALYZE)
agents_to_track = robots_util.get_bots()

# Flatten the per-group agent lists, preserving group order and any repeats.
grouped_agents = []
for group_agents in agent_groups_to_track.values():
    grouped_agents.extend(group_agents)

# {URL --> Date --> Agent --> Status}
url_robots_summary = robots_util.compute_url_date_agent_status(
    data=joined_robots,
    relevant_agents=grouped_agents,  # alternative: relevant_agents=agents_to_track
)

In [12]:
# top N sample: TODO: Top 1k vs top 800 token info.
top_c4_urls = url_token_lookup.top_k_urls("c4", 1000)

# Head sample: keep only the top-k URLs that actually have a robots summary.
# (The membership test must be against url_robots_summary; testing against
# top_c4_urls itself is always true and filters nothing.)
url_robots_summary_head = {k: url_robots_summary[k] for k in top_c4_urls if k in url_robots_summary}

# Random sample: every URL from the random-10k robots file.
url_robots_summary_rand = {k: url_robots_summary[k] for k in random_10k_robots}

Number of tokens in 1000 URLs: 13808461301 | 8.12% of c4


In [289]:
# Token-count edges of the size buckets; the final edge is a very large sentinel
# so the last bucket has no practical upper bound.
size_bucket_boundaries = [
    0,
    1_000,
    10_000,
    50_000,
    1_000_000,
    10_000_000,
    50_000_000,
    9_999_999_999_999_999,
]
size_bucket_to_urls = robots_util.bucket_urls_by_size(
    c4_url_to_counts,
    bucket_boundaries=size_bucket_boundaries,
)

Bucket 0-1000: 4729
Bucket 1000-10000: 4064
Bucket 10000-50000: 1411
Bucket 50000-1000000: 896
Bucket 1000000-10000000: 2441
Bucket 10000000-50000000: 429
Bucket 50000000-9999999999999999: 19


In [307]:
# Plot robots.txt restriction status against the URL size buckets for the "OpenAI" group.
# NOTE(review): the meaning of setting="train" is defined inside robots_util — confirm
# which agents it selects before citing this figure.
robots_util.plot_size_against_restrictions(
    url_robots_summary,
    size_bucket_to_urls,
    "OpenAI",
    setting="train"
)

7059
defaultdict(<function plot_size_against_restrictions.<locals>.<lambda> at 0x1366ec820>, {'10000000-50000000': [77, 167, 109], '1000000-10000000': [73, 223, 172], '50000000-9999999999999999': [5, 8, 4], '10000-50000': [16, 636, 327], '0-1000': [36, 1315, 886], '50000-1000000': [11, 220, 133], '1000-10000': [43, 1684, 914]})


In [296]:

# {Period --> Agent --> Status --> set(URLs)}
# The head and random samples are summarized with identical settings, so the
# shared keyword arguments are declared once and reused for both calls.
temporal_summary_options = dict(
    # group_to_agents={k: [k] for k in agents_to_track},
    group_to_agents=agent_groups_to_track,
    start_time='2016-01-01',
    end_time='2024-04-30',
    time_frequency="M",
)

robots_filled_status_head_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_head,
    **temporal_summary_options,
)
robots_filled_status_rand_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_rand,
    **temporal_summary_options,
)


2016-01


  


2016-11
2017-09
2018-07
2019-05
2020-03
2021-01
2021-11
2022-09
2023-07
2016-01
2016-11
2017-09
2018-07
2019-05
2020-03
2021-01
2021-11
2022-09
2023-07


In [297]:

# Dataframes w/ [Period, Agent, Status, count(URLs or tokens)] — one for the head
# sample and one for the random sample, built in that order with the same C4 token map.
robots_temporal_head_summary, robots_temporal_rand_summary = (
    robots_util.robots_temporal_to_df(filled_status_summary, url_to_counts=c4_url_to_counts)
    for filled_status_summary in (
        robots_filled_status_head_summary,
        robots_filled_status_rand_summary,
    )
)

In [285]:
# robots_temporal_head_summary

In [349]:
# Load the ToS verdicts here: this cell needs them, and the only other place they
# are read is a cell further down, so a fresh top-to-bottom run would otherwise
# fail with a NameError on `tos_policies`.
tos_policies = io.read_json("robots_data/tos_ai_scraping_policies.json")

# Human annotations for each URL, gathered from both annotation tasks.
url_to_info = analysis_util.extract_url_annotations(["annotated_websites/Task 1", "annotated_websites/Task 2"])
url_results_df = pd.DataFrame(analysis_util.process_url_annotations(url_to_info))
url_results_df = analysis_util.encode_size_columns(url_results_df, url_token_lookup)

# Use the shared company list so this table matches the agents url_robots_summary was built from.
url_results_df = robots_util.encode_latest_tos_robots_into_df(
    url_results_df, tos_policies, url_robots_summary,
    COMPANIES_TO_ANALYZE,
)

url_correlation_df = analysis_util.analyze_url_variable_correlations(url_results_df, [100, 500, 2000], "dolma")

# Convert the dataframe to a LaTeX table
latex_table = url_correlation_df.to_latex(index=True, escape=True, float_format="{:.1f}".format)
print(latex_table)

6663 rows before filtering.
3017 rows after filtering. 1550 issues, 2096 unannotated.
7059
Num URLs in Top-100: 68
Num URLs in Top-500: 323
Num URLs in Top-2000: 1013
Num URLs in random sample: 1056
\begin{tabular}{lrrrrll}
\toprule
 & Top 100 & Top 500 & Top 2000 & Random & Chi-Squared Stat & P-value \\
\midrule
User Content & 22.1 & 19.5 & 18.9 & 19.1 & 2.44 & 0.12 \\
Paywall & 30.9 & 35.9 & 29.8 & 1.8 & 0.00 & 1.00 \\
Ads & 51.5 & 56.0 & 52.0 & 5.8 & 9.35 & 0.00 \\
Modality: Image & 92.7 & 98.1 & 98.3 & 94.0 & 0.03 & 0.86 \\
Modality: Video & 85.3 & 81.7 & 68.2 & 19.9 & 9.14 & 0.00 \\
Modality: Audio & 82.3 & 75.2 & 53.7 & 3.2 & 2.77 & 0.10 \\
Sensitive Content & 0.0 & 0.6 & 0.9 & 0.6 & 0.00 & 1.00 \\
Restrictive Robots.txt & 27.9 & 18.6 & 10.6 & 1.1 & 0.61 & 0.43 \\
Restrictive Terms & 23.5 & 23.2 & 17.5 & 3.9 & 2.89 & 0.09 \\
domain\_Blogs & 1.5 & 1.9 & 2.8 & 16.5 & 2.65 & 0.10 \\
domain\_Books & 7.3 & 4.3 & 3.8 & 2.9 & 0.05 & 0.82 \\
domain\_Business \& E-Commerce & 13.2 & 18.0 &

In [None]:
# Unfinished sketch of a pairwise, company-vs-company comparison of robots status.
# NOTE(review): this cell cannot run as written — see the notes inside the loop.
companies = ["Google", "OpenAI", "Anthropic", "Cohere", "Common Crawl", "Meta"]
for company in companies:
    # {URL --> Date --> Agent --> Status}
    # NOTE(review): `url` and `date` are not defined in this cell, and the value of
    # this expression is discarded. The structure comment above says the innermost
    # key is an agent, so indexing by company name may also be wrong — confirm.
    url_robots_summary[url][date][company]
    # get latest
    # df: [company1, company2, status1, status2]
    # plot confusion matrix.
    
    



7059


In [350]:


# for agent in agents_to_track:
#     robots_util.plot_robots_time_map(robots_temporal_summary, agent)

# for group in agent_groups_to_track:
#     robots_util.plot_robots_time_map(robots_temporal_head_summary, group, val_key="count")

In [None]:
Notes:

* Take All Agents and subtract it from the other plots to see the diffs between agents.
* How do these charts vary across quantiles of token count? (Behavior differences between token-rich and token-poor sites.)

* Incompatibility between Robots.txt and ToS? Robots.txt is effectively an encoded ToS for scrapers. Is there more intent spelled out in the ToS than in the robots.txt?
* Robots and ToS update rate. --> When one of them (robots or ToS) is updated, how often is the other also updated within time T?
* Analysis: Of all websites that restrict at least one AI bot, what other bots do they restrict? E.g. if you restrict Cohere, you probably also restrict OpenAI.
P(Cohere restricted | OpenAI restricted)
* Include Midjourney, CCBot, IAbot, etc
* Vertical lines that show when bots get introduced.

* Restrictions are rising across the board.
* Company-wise restrictions
* Compare for each company their scraping restrictions vs RAG restrictions vs AI bot restrictions (OpenAI, Anthropic)

In [None]:
- company-wise analysis: by agent, by RAG, by scraping
- rate of change over time
- most added agent in each period (bi-annually?)
- token quantity-adjusted
- correlation between token size and robots.txt restrictions?
- overleaf table for agent mentions and status

In [None]:
# Read the ToS AI/scraping policy file and report how many entries it contains.
tos_policies_path = "robots_data/tos_ai_scraping_policies.json"
tos_policies = io.read_json(tos_policies_path)
print(len(tos_policies))

In [367]:
def prepare_tos_robots_confusion_matrix(
    tos_policies,
    url_robots_summary,
    companies,
    url_token_lookup,
    use_token_counts=True,
    corpora_choice="c4",
    font_size=20, 
    font_style='sans-serif',
    width=400,
    height=400,
):
    """Plot a confusion matrix of robots.txt restriction level vs. ToS policy.

    Only URLs that have both a robots status and a ToS verdict (as returned by
    ``robots_util.prepare_recent_robots_tos_info``) are counted. Each matrix
    cell holds the number of such URLs, their summed token count, and both
    expressed as a percentage of the respective totals.

    Args:
        tos_policies: Forwarded to ``robots_util.prepare_recent_robots_tos_info``.
        url_robots_summary: Forwarded to ``robots_util.prepare_recent_robots_tos_info``.
        companies: Forwarded to ``robots_util.prepare_recent_robots_tos_info``.
        url_token_lookup: Object exposing ``get_url_to_token_map(corpus)``, which
            must return a mapping indexable by URL.
        use_token_counts: If True, cells are colored/labelled by percent of
            tokens; otherwise by percent of URLs.
        corpora_choice: Corpus name passed to ``get_url_to_token_map``.
        font_size: Forwarded to ``visualization_util.plot_confusion_matrix``.
        font_style: Forwarded to ``visualization_util.plot_confusion_matrix``.
        width: Forwarded to ``visualization_util.plot_confusion_matrix``.
        height: Forwarded to ``visualization_util.plot_confusion_matrix``.

    Returns:
        Whatever ``visualization_util.plot_confusion_matrix`` returns.

    Raises:
        KeyError: If a robots status is not one of "none"/"some"/"all", or a
            counted URL is missing from the token map (unless that map
            supplies defaults itself).
    """
    recent_url_robots, recent_tos_verdicts = robots_util.prepare_recent_robots_tos_info(
        tos_policies, url_robots_summary, companies,
    )

    ROBOTS_LABELS = {
        "none": "None",
        "some": "Partial",
        "all": "Restricted",
    }
    yaxis_order = ["Restricted", "Partial", "None"]
    xaxis_order = ["No Restrictions", "Conditional Restrictions", "Prohibits AI", "Prohibits Scraping", "Prohibits Scraping & AI"]
    # Declared up front so the DataFrame keeps its columns even when no row is produced.
    columns = [
        "Robots Restrictions", "Terms of Service Policies", "Count", "Token Counts",
        "Percent", "Percent Tokens",
    ]

    # Tallies keyed by (robots label, ToS policy).
    counts = defaultdict(int)
    token_counts = defaultdict(int)
    total_instances, total_tokens = 0, 0
    url_token_counts = url_token_lookup.get_url_to_token_map(corpora_choice)

    # Only URLs present on both sides can be placed in the matrix.
    shared_urls = set(recent_url_robots.keys()).intersection(set(recent_tos_verdicts.keys()))
    for url in shared_urls:
        cell = (ROBOTS_LABELS[recent_url_robots[url]], recent_tos_verdicts[url])
        num_tokens = url_token_counts[url]
        counts[cell] += 1
        token_counts[cell] += num_tokens
        total_instances += 1
        total_tokens += num_tokens

    # One record per non-empty cell, in display order. Policies outside
    # xaxis_order still contribute to the totals but get no row of their own.
    data = []
    for status in yaxis_order:
        for policy in xaxis_order:
            count = counts.get((status, policy), 0)
            if count <= 0:
                continue
            cell_tokens = token_counts[(status, policy)]
            data.append({
                "Robots Restrictions": status,
                "Terms of Service Policies": policy,
                "Count": count,
                "Token Counts": cell_tokens,
                "Percent": round(100 * count / total_instances, 2),
                # Guard against a corpus in which every counted URL has zero tokens.
                "Percent Tokens": round(100 * cell_tokens / total_tokens, 2) if total_tokens else 0.0,
            })

    df = pd.DataFrame(data, columns=columns)
    df['Formatted Percent'] = df['Percent'].apply(lambda x: f"{x:.1f} %")
    df['Formatted Percent Tokens'] = df['Percent Tokens'].apply(lambda x: f"{x:.1f} %")

    if use_token_counts:
        color_axis, text_axis = "Percent Tokens", "Formatted Percent Tokens"
    else:
        color_axis, text_axis = "Percent", "Formatted Percent"

    # Forward the caller's styling choices instead of fixed literals.
    return visualization_util.plot_confusion_matrix(
        df,
        yaxis_order=yaxis_order, 
        xaxis_order=xaxis_order,
        text_axis=text_axis,
        color_axis=color_axis,
        yaxis_title="Robots Restrictions",
        xaxis_title="Terms of Service Policies",
        font_size=font_size,
        font_style=font_style,
        width=width,
        height=height,
    )



In [368]:
# Use the shared company list (the same one url_robots_summary was built from)
# rather than a second hard-coded copy that could drift out of sync.
prepare_tos_robots_confusion_matrix(
    tos_policies=tos_policies,
    url_robots_summary=url_robots_summary,
    companies=COMPANIES_TO_ANALYZE,
    url_token_lookup=url_token_lookup,
    corpora_choice="c4",
)

7059
   Robots Restrictions Terms of Service Policies  Count  Token Counts  \
0           Restricted           No Restrictions    124    1881478619   
1           Restricted  Conditional Restrictions      2       6100820   
2           Restricted        Prohibits Scraping     33     398210781   
3           Restricted   Prohibits Scraping & AI      9     192894509   
4              Partial           No Restrictions    538    4352218487   
5              Partial  Conditional Restrictions      9      92340616   
6              Partial        Prohibits Scraping     64     578228505   
7              Partial   Prohibits Scraping & AI      7      73522104   
8                 None           No Restrictions    290    1691396249   
9                 None  Conditional Restrictions     10     131290077   
10                None        Prohibits Scraping     35     419625075   
11                None   Prohibits Scraping & AI      4      47134951   

    Percent  Percent Tokens Formatted Percent

In [None]:
# clean-up
# Legend
# set fonts and sizes and legend

In [261]:
# c4_top2k_counts

In [None]:
Data Sources Analysis:

0. Table: Methodology -- what metadata we collected/annotated (and how). [WIP]
1. Figure: Temporal changes in Robots / ToS (somehow over a collection or multiple companies?) [Waiting on ToS]
    C4/RF/(Dolma) vs (2k-Head)/Random vs Company(6)
2. Figure: Robots / ToS contradiction matrix [Waiting on ToS]
    Creative Commons.
3. Table: Robots permission differences by company + Stella(pairwise differences by confusion matrix) [WIP]
4. Table: Robots / ToS / other indicators variability by website num tokens (i.e. head vs tail) [WIP]
5. Figure: Commercial/market copyright concerns and comparison to WildChat + EU AI Act [WIP, waiting on WildChat]

In [None]:
Datasets Analysis:

(License: NC, C, Unspecified) x (Terms: Unspecified/None, NC, C).
Text =~ 200
Video = 11
Speech ~= 50

0. Tables for Text, Video, Speech [Will: WIP]
1. License & source (terms) restriction differences between Text, Video, Speech. (normalized stacked bar chart) [WIP]
2. Source domains by Text, Video, Speech: scraped, synthetic, crowdsourced, ....  (normalized stacked bar chart) [WIP]
2b. Include pretraining (broken down by modality + user content + illicit content)
3. Creator distribution for Text, Video, Speech by geography and organization type [WIP]

Extension: 


4. Temporal breakdown of license restriction categories by Text, Video, Speech [WIP]
5. License Type by modality for Text, Video, Speech [WIP]
6. Languages / Tasks.
(Will run statistical tests checking whether the differences between Text, Video, and Speech are significant on all of these.) [WIP]

In [None]:


# People who restrict Anthropic but not OpenAI. <-- public awareness of organizations is the driving force here. 
# Ordered by notoriety. (Confusion matrix)

In [31]:
# WildChat annotation sheet; rows are inspected with sample_nayan below.
wildchat_annotations_csv = "test_data/wildchat_annotations.csv"
nayan = pd.read_csv(wildchat_annotations_csv)

In [3]:
# nayan

In [400]:
def sample_nayan(df, i, show_response=False):
    """Print the prompt and content domain of the i-th row of an annotation frame.

    Args:
        df: DataFrame with at least the columns "WildChat Example Prompt" and
            "Content Domain" (plus "WildChat Example Response" when
            ``show_response`` is True).
        i: Positional row index (used with ``df.iloc``).
        show_response: If True, also print the first 100 characters of the
            response, prefixed with a row of asterisks.

    Returns:
        None. Output goes to stdout, followed by a blank separator line.
    """
    row = df.iloc[i]
    print(row["WildChat Example Prompt"])
    if show_response:
        # str() keeps the preview working when the cell is missing (NaN).
        print("*************" + str(row["WildChat Example Response"])[:100])
    print(row["Content Domain"])
    print()

In [1]:
# for i in range(30, 60):
#     sample_nayan(nayan, i)