In [None]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Make the parent directory (`src`) importable.
# Besides the stale '../..' entries, also drop any '../' left behind by an earlier run of
# this cell, so re-executing the cell does not keep stacking duplicates onto sys.path.
sys.path = [p for p in sys.path if not p.endswith('../..') and p != '../']
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util


%load_ext autoreload
%autoreload 2

### Define Paths to all relevant files

In [None]:
# Companies whose crawler agents are analyzed; passed to robots_util.get_bot_groups and to
# the ToS/robots helpers further down.
COMPANIES_TO_ANALYZE = ["Google", "OpenAI", "Anthropic", "Cohere", "Common Crawl", "Meta"]
# Relative paths to the input data files read by the loading cells below
# (presumably resolved against the notebook's working directory — confirm).
FPATH_TO_RELEVANT_URL_TOKENS = 'pretrain_data/relevant_url_token_counts.csv'
FPATH_to_HEAD_ROBOTS = "robots_data/temporal_robots_head.json"
FPATH_TO_RAND_ROBOTS = "robots_data/temporal_robots_rand_10k.json"
FPATH_TO_TOS_DATA = "robots_data/tos_ai_scraping_policies.json"
DIRPATHS_TO_ANNOTATED_TASKS = ["annotated_websites/Task 1", "annotated_websites/Task 2"]
START_DATES = "robots_data/domain_start_dates.json"

# Start/end dates handed to the temporal robots.txt and ToS summaries as start_time / end_time.
TEMPORAL_ANALYSIS_START_DATE = '2016-01-01'
TEMPORAL_ANALYSIS_END_DATE = '2024-04-30'

### Load all URL splits (top vs. random) and their maps to token counts

In [None]:
# One lookup object serves all three corpora.
url_token_lookup = robots_util.URLTokenLookup(FPATH_TO_RELEVANT_URL_TOKENS) # 'c4', 'rf', 'dolma'
# Per-corpus URL -> token-count maps (as the helper name suggests; exact value type not visible here).
c4_url_to_counts = url_token_lookup.get_url_to_token_map("c4")
rf_url_to_counts = url_token_lookup.get_url_to_token_map("rf")
dolma_url_to_counts = url_token_lookup.get_url_to_token_map("dolma")
# "Head" split: top_k_urls with k=2000 for each corpus
# (presumably ranked by token count — TODO confirm in robots_util).
top_c4_urls = url_token_lookup.top_k_urls("c4", 2000)
top_rf_urls = url_token_lookup.top_k_urls("rf", 2000)
top_dolma_urls = url_token_lookup.top_k_urls("dolma", 2000)
# "Random" split.
random_10k_urls = url_token_lookup.get_10k_random_sample()
# De-duplicated union of every split; the `+` concatenation assumes all four are lists.
all_urls = set(random_10k_urls + top_c4_urls + top_rf_urls + top_dolma_urls)

# Load website snapshots for relevant URLs
website_start_dates = robots_util.read_start_dates(START_DATES, all_urls)

### Define Agents and Agent Groups

In [None]:
# Group name -> agents mapping for the analyzed companies (used below via `.values()` and as
# `group_to_agents`), plus the list of individual bots returned by get_bots().
agent_groups_to_track = robots_util.get_bot_groups(COMPANIES_TO_ANALYZE)
agents_to_track = robots_util.get_bots()

### Load Robots.txt info

In [None]:
# URL -> Date -> Robots.txt raw text
head_robots = io.read_json(FPATH_to_HEAD_ROBOTS)
random_10k_robots = io.read_json(FPATH_TO_RAND_ROBOTS)
# Merge both splits into one mapping without touching `head_robots`. For a URL present in
# both files, the random-10k entry replaces the head entry (dict.update semantics, assuming
# io.read_json returns plain dicts).
joined_robots = copy.deepcopy(head_robots)
joined_robots.update(random_10k_robots)
robots_util.print_out_robots_info(head_robots)
robots_util.print_out_robots_info(random_10k_robots)

# {URL --> Date --> Agent --> Status}
# Only agents that belong to one of the tracked company groups are evaluated; the
# commented-out argument is the alternative of evaluating every agent from get_bots().
url_robots_summary = robots_util.compute_url_date_agent_status(
    data=joined_robots, 
    # relevant_agents=agents_to_track)
    relevant_agents=[v for vs in agent_groups_to_track.values() for v in vs])

### Load ToS info

In [None]:
# URL --> Date --> ToS-suburl --> {"verdict": X, "evidence": Y}
tos_policies = io.read_json(FPATH_TO_TOS_DATA)
# Number of top-level entries in the loaded ToS data.
print(f"Num ToS URLs: {len(tos_policies)}")

### Load Manual Pretraining Annotations

In [None]:
# Build one dataframe from the manually annotated task directories, then enrich it step by
# step. Each step takes the previous `url_results_df` and rebinds the same name, so this cell
# must run top to bottom.
url_to_info = analysis_util.extract_url_annotations(DIRPATHS_TO_ANNOTATED_TASKS)
url_results_df = pd.DataFrame(analysis_util.process_url_annotations(url_to_info))
# Adds size-related columns using the token lookup (column names not visible from here).
url_results_df = analysis_util.encode_size_columns(url_results_df, url_token_lookup)
# Adds ToS / robots.txt information for the analyzed companies
# (per the helper name, the latest available status — TODO confirm).
url_results_df = robots_util.encode_latest_tos_robots_into_df(
    url_results_df, tos_policies, url_robots_summary,
    COMPANIES_TO_ANALYZE
)

# Create Plots

### Preprocessing for Robots Head & Random URL splits

In [None]:
### DECISION POINT: Use C4, Dolma, or RefinedWeb here?

CHOSEN_CORPUS = "c4" # 'c4', 'rf', 'dolma'

# Corpus name -> (head URL split, URL -> token-count map), both loaded in the cells above.
CORPUS_TO_URL_DATA = {
    "c4": (top_c4_urls, c4_url_to_counts),
    "rf": (top_rf_urls, rf_url_to_counts),
    "dolma": (top_dolma_urls, dolma_url_to_counts),
}

# Fail loudly on an unknown corpus name. Without this check a typo would leave HEAD_URL_SET
# and URL_TO_COUNTS undefined (fresh kernel) or silently pointing at the previously chosen
# corpus (re-run), and every downstream plot would be mislabeled.
if CHOSEN_CORPUS not in CORPUS_TO_URL_DATA:
    raise ValueError(
        f"Unknown CHOSEN_CORPUS {CHOSEN_CORPUS!r}; expected one of {sorted(CORPUS_TO_URL_DATA)}"
    )
HEAD_URL_SET, URL_TO_COUNTS = CORPUS_TO_URL_DATA[CHOSEN_CORPUS]

In [None]:
# Restrict the robots summary to each URL split. URLs without an entry in
# `url_robots_summary` are skipped rather than raising KeyError.
url_robots_summary_head = {url: url_robots_summary[url] for url in HEAD_URL_SET if url in url_robots_summary}
url_robots_summary_rand = {url: url_robots_summary[url] for url in random_10k_urls if url in url_robots_summary}

In [None]:
# Both URL splits are summarized with exactly the same settings; only the input differs.
# Result shape: {Period --> Agent --> Status --> set(URLs)}
_temporal_summary_kwargs = dict(
    # Alternative: track every agent on its own instead of company groups.
    # group_to_agents={k: [k] for k in agents_to_track},
    group_to_agents=agent_groups_to_track,
    start_time=TEMPORAL_ANALYSIS_START_DATE,
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_start_dates,
)

# HEAD URL SPLIT
robots_filled_status_head_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_head,
    **_temporal_summary_kwargs,
)
# RANDOM URL SPLIT
robots_filled_status_rand_summary = robots_util.prepare_robots_temporal_summary(
    url_robots_summary=url_robots_summary_rand,
    **_temporal_summary_kwargs,
)


In [None]:
# Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)]
# Token counts come from URL_TO_COUNTS (not a hard-coded corpus map) so the head split always
# matches CHOSEN_CORPUS, consistent with HEAD_URL_SET, the random split and the ToS frame.
robots_temporal_head_summary = robots_util.robots_temporal_to_df(
    robots_filled_status_head_summary,
    url_to_counts=URL_TO_COUNTS,
)
# Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)]
robots_temporal_rand_summary = robots_util.robots_temporal_to_df(
    robots_filled_status_rand_summary,
    url_to_counts=URL_TO_COUNTS,
)

### Preprocessing for ToS

In [None]:
# URL --> time --> ToS verdict string. 
url_to_time_to_tos_verdict = robots_util.get_tos_url_time_verdicts(tos_policies)
# Period --> Status --> set(URLs)
# Uses the same start/end dates and "M" frequency as the robots.txt temporal summaries.
period_tos_verdict_urls = robots_util.prepare_tos_temporal_summary(
    url_to_time_to_tos_verdict,
    start_time=TEMPORAL_ANALYSIS_START_DATE, 
    end_time=TEMPORAL_ANALYSIS_END_DATE,
    time_frequency="M",
    website_start_dates=website_start_dates,
)
# Dataframe: [Period, Status, Count, Tokens]
# Restricted to the head URL split of the chosen corpus, with that corpus' token counts.
tos_summary_df = robots_util.tos_temporal_to_df(
    period_tos_verdict_urls,
    url_set=HEAD_URL_SET,
    url_to_counts=URL_TO_COUNTS,
)

# Emits a single blank line in the cell output.
print()

## TODO: Add Forecasting code here:

In [None]:
# Peek at the first record of the head summary (prints nothing when the frame is empty).
first_record = next(robots_temporal_head_summary.iterrows(), None)
if first_record is not None:
    print(first_record[1])

In [197]:
import pandas as pd
import altair as alt
from statsmodels.tsa.ar_model import AutoReg

def forecast_and_plot(df, agent, lags, n_periods=12):
    """Forecast an agent's restriction-status shares with AutoReg and chart them.

    For the rows of ``df`` that belong to ``agent``, the ``count`` of every
    ``status`` is turned into a percentage of its period's total, one
    autoregressive model is fit per status, and ``n_periods`` future months are
    predicted together with their confidence intervals.

    Args:
        df: DataFrame with at least the columns ``agent``, ``period``, ``status``
            and ``count``. ``period`` may hold ``pd.Period`` objects or timestamps.
        agent: Value of the ``agent`` column to forecast for.
        lags: Lag specification forwarded unchanged to ``AutoReg(lags=...)``.
        n_periods: Number of future months to predict (default 12).

    Returns:
        ``(chart, predicted_df)``. ``chart`` is whatever
        ``robots_util.plot_robots_time_map_altair`` returns for the observed rows
        followed by the forecast rows. ``predicted_df`` has the columns ``period``,
        ``status``, ``count`` (forecast percentage), ``agent``, ``lower`` and ``upper``.

    Raises:
        ValueError: if ``df`` contains no rows for ``agent``.
    """
    # Filter the DataFrame for the specific agent
    agent_df = df[df['agent'] == agent].copy()
    if agent_df.empty:
        raise ValueError(f"No rows found for agent {agent!r}")

    # Convert 'period' to timestamp if it's a Period object. Plain column assignment (rather
    # than `.loc[:, 'period'] = ...`) replaces the column, so it can change dtype cleanly.
    agent_df['period'] = agent_df['period'].apply(
        lambda x: x.to_timestamp() if isinstance(x, pd.Period) else x
    )

    # Reshape to period x status. A status missing in some period counts as 0 instead of
    # leaving a NaN hole in the series handed to AutoReg.
    pivoted_df = agent_df.pivot_table(index='period', columns='status', values='count', fill_value=0)

    # Normalize the counts to percentages of each period's total
    pivoted_df = pivoted_df.div(pivoted_df.sum(axis=1), axis=0) * 100

    # Fit one model per status
    fitted_models = {
        status: AutoReg(pivoted_df[status], lags=lags).fit()
        for status in pivoted_df.columns
    }

    # Forecast months start with the month AFTER the last observed one and are stamped at
    # month start, which is also what Period.to_timestamp() produces for observed periods.
    n_observed = len(pivoted_df)
    first_future_period = pivoted_df.index.max() + pd.offsets.MonthBegin(1)
    future_periods = pd.date_range(start=first_future_period, periods=n_periods, freq='MS')

    # Out-of-sample predictions and their confidence intervals, one pass per status
    predictions = {}
    conf_intervals = {}
    for status, fitted in fitted_models.items():
        prediction = fitted.get_prediction(start=n_observed, end=n_observed + n_periods - 1)
        predictions[status] = np.asarray(prediction.predicted_mean)
        conf_intervals[status] = prediction.conf_int()

    # Combine the predictions into a single long DataFrame
    predicted_df = pd.DataFrame(predictions, index=future_periods)
    predicted_df = predicted_df.reset_index().melt(id_vars='index', var_name='status', value_name='count')
    predicted_df.columns = ['period', 'status', 'count']
    predicted_df['agent'] = agent

    # Express the observed rows in the same unit as the forecasts (share of the period total,
    # in percent), so the `count` column never mixes raw counts with percentages.
    period_totals = agent_df.groupby('period')['count'].transform('sum')
    observed_df = agent_df.assign(count=agent_df['count'] / period_totals * 100)

    # Concatenate the observed and predicted DataFrames
    combined_df = pd.concat([observed_df, predicted_df], ignore_index=True)

    # Define the color scheme for the statuses
    status_colors = {'no_robots': 'gray', 'none': 'blue', 'some': 'orange', 'all': 'red'}

    chart = robots_util.plot_robots_time_map_altair(
        combined_df,
        agent_type=agent,
        period_col='period',
        status_col='status',
        val_col='count',
        title='Restriction Status over Time',
        ordered_statuses=['no_robots', 'none', 'some', 'all'],
        status_colors=status_colors
    )

    # Attach the confidence bounds to the forecast rows. melt() keeps each status' rows in
    # period order, so the n_periods bounds line up positionally.
    for status, conf_int in conf_intervals.items():
        status_rows = predicted_df['status'] == status
        predicted_df.loc[status_rows, 'lower'] = conf_int['lower'].values
        predicted_df.loc[status_rows, 'upper'] = conf_int['upper'].values

    return chart, predicted_df

df = robots_temporal_head_summary

agents = df['agent'].unique()
lags = [1, 3, 6, 9, 12] 

# Toggle for showing the forecast table under each chart. Deliberately NOT named `display`:
# a boolean called `display` shadows IPython's display() for the rest of the session and
# makes `display(predicted_df)` raise "TypeError: 'bool' object is not callable".
SHOW_PREDICTIONS = False
# Plots for each agent
for agent in agents:
    print(f"CHOSEN_CORPUS: {CHOSEN_CORPUS}")
    print(f"AGENT: {agent}")
    chart, predicted_df = forecast_and_plot(df, agent, lags)
    if SHOW_PREDICTIONS:
        display(predicted_df)
    chart.show()



CHOSEN_CORPUS: c4
AGENT: Google


CHOSEN_CORPUS: c4
AGENT: OpenAI


CHOSEN_CORPUS: c4
AGENT: Anthropic


CHOSEN_CORPUS: c4
AGENT: Cohere


CHOSEN_CORPUS: c4
AGENT: Common Crawl


CHOSEN_CORPUS: c4
AGENT: Meta


### Plot Temporal Robots Area Chart

In [None]:
# One area chart per tracked company group, built from the head-URL robots summary.
# The commented-out lines are earlier variants that call plot_robots_time_map
# (first per individual agent, last per group with val_key="count").
# for agent in agents_to_track:
#     robots_util.plot_robots_time_map(robots_temporal_summary, agent)

for group in agent_groups_to_track:
    print(group)
    chart = robots_util.plot_robots_time_map_altair(
        robots_temporal_head_summary, 
        agent_type=group, 
        period_col='period', 
        status_col='status', 
        val_col='tokens',  # "count" / "tokens"
        title='Restriction Status over Time', 
        ordered_statuses=['no_robots', 'none', 'some', 'all'], 
        status_colors={'no_robots': 'gray', 'none': 'blue', 'some': 'orange', 'all': 'red'}
    )
    chart.show()
#     robots_util.plot_robots_time_map(robots_temporal_head_summary, group, val_key="count")

### Plot Temporal ToS Area Chart

In [None]:
# Area chart of ToS verdicts over time for the head URL split, measured in tokens.
# NOTE(review): 'Prohibits Scraping' and 'Prohibits Scraping & AI' are both mapped to 'red',
# so the two categories cannot be told apart by color — confirm whether this is intended.
chart = robots_util.plot_temporal_area_map_altair(
    tos_summary_df,
    period_col='period', 
    status_col='status', 
    val_col='tokens',  # "count" / "tokens"
    title='Restriction Status over Time', 
    ordered_statuses=['No Terms Pages', 'No Restrictions', 'Conditional Restrictions', 'Prohibits AI', 'Prohibits Scraping', 'Prohibits Scraping & AI'], 
    status_colors= {
        'No Terms Pages': 'gray', 'No Restrictions': 'blue', 
        'Conditional Restrictions': 'yellow', 'Prohibits AI': 'orange', 
        'Prohibits Scraping': 'red', 'Prohibits Scraping & AI': 'red'
    },
)
chart.show()

### WIP: Restrictions by Company

In [None]:
# Dataframe w/ [Period, Agent, Status, count(URLs), count(tokens)]
def plot_company_comparisons_altair(
    df,
    period_col='period',
    agent_col='agent',
    status_col='status',
    val_col='tokens',
    restricted_status='all',
):
    """Line chart of the share of tokens under the fully restrictive status, per agent.

    Args:
        df: Long-format DataFrame with one row per (period, agent, status) holding the
            value column. Defaults match the columns the other cells of this notebook
            read from the robots temporal summary (``period``, ``agent``, ``status``,
            ``tokens``).
        period_col: Name of the period column (``pd.Period``, timestamp, or a string
            that ``pd.to_datetime`` can parse).
        agent_col: Name of the agent/company column.
        status_col: Name of the restriction-status column.
        val_col: Name of the numeric column to aggregate.
        restricted_status: Status value counted as restrictive.

    Returns:
        An Altair chart with one line per agent showing, for each period, the
        percentage of ``val_col`` that falls under ``restricted_status``.
    """
    df = df.copy()  # Create a copy to avoid modifying the original DataFrame

    # Convert the period column to datetime (Period objects first become timestamps)
    periods = df[period_col].apply(lambda x: x.to_timestamp() if isinstance(x, pd.Period) else x)
    df[period_col] = pd.to_datetime(periods)

    # Total value per (period, agent) and status. The status must be part of the
    # aggregation, otherwise there is nothing left to pivot on.
    status_totals = df.pivot_table(
        index=[period_col, agent_col],
        columns=status_col,
        values=val_col,
        aggfunc='sum',
        fill_value=0,
    )

    # Restrictive share: sum over the status columns only (period/agent live in the index).
    if restricted_status in status_totals.columns:
        restricted = status_totals[restricted_status]
    else:
        restricted = pd.Series(0, index=status_totals.index)
    percent_restrictive = restricted / status_totals.sum(axis=1) * 100

    # Tidy frame for plotting; rows whose total is zero yield NaN and are dropped.
    data = (
        percent_restrictive.rename('percent_Restrictive')
        .reset_index()
        .rename(columns={period_col: 'Period', agent_col: 'Agent'})
        .dropna()
    )

    # Create the Altair chart
    chart = alt.Chart(data).mark_line(point=True).encode(
        x=alt.X('yearmonth(Period):T', title='Year', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('percent_Restrictive:Q', title='Percentage of Tokens'),
        color=alt.Color('Agent:N', legend=alt.Legend(title='Agent')),
        tooltip=['Agent', alt.Tooltip('percent_Restrictive:Q', format='.2f')]
    ).properties(
        title='Percentage of Tokens with Restrictive Status',
        width=800,
        height=500
    ).configure_axis(
        grid=False
    ).configure_view(
        strokeWidth=0
    )

    return chart

In [None]:
# Company comparison chart for the head-URL robots summary (shown as the cell's output).
plot_company_comparisons_altair(robots_temporal_head_summary)

### Create Correlations Tables

In [None]:
# NOTE(review): the corpus is fixed to "dolma" here, independent of CHOSEN_CORPUS — confirm
# this is intended. The list [100, 500, 2000] is passed through to the helper; its meaning
# (presumably top-k cutoffs) is not visible from this notebook.
url_correlation_df = analysis_util.analyze_url_variable_correlations(url_results_df, [100, 500, 2000], "dolma")

# Convert the dataframe to a LaTeX table
# (index kept, special characters escaped, floats printed with one decimal).
latex_table = url_correlation_df.to_latex(index=True, escape=True, float_format="{:.1f}".format)
print(latex_table)

### Robots & ToS Confusion Matrix

In [None]:
# Robots.txt vs. ToS confusion matrix for the analyzed companies.
# NOTE(review): corpora_choice is fixed to "dolma", independent of CHOSEN_CORPUS — confirm.
robots_util.prepare_tos_robots_confusion_matrix(
    tos_policies,
    url_robots_summary,
    COMPANIES_TO_ANALYZE,
    url_token_lookup,
    corpora_choice="dolma",
)

In [None]:
# companies = ["Google", "OpenAI", "Anthropic", "Cohere", "Common Crawl", "Meta"]
# for company in companies:
#     # {URL --> Date --> Agent --> Status}
#     url_robots_summary[url][date][company]
#     # get latest
#     # df: [company1, company2, status1, status2]
#     # plot confusion matrix.
    

### Plot Num Tokens against Robots Restrictions per Company

In [None]:
# Bucket URLs by their C4 token counts, then plot restrictions per size bucket for OpenAI.
# NOTE(review): uses c4_url_to_counts directly rather than URL_TO_COUNTS — confirm intended.
# The final boundary (9999999999999999) presumably acts as an open-ended upper limit.
size_bucket_to_urls = robots_util.bucket_urls_by_size(
    c4_url_to_counts, 
    bucket_boundaries=[0, 1000, 10000, 50000, 1000000, 10000000, 50000000, 9999999999999999]
)
robots_util.plot_size_against_restrictions(
    url_robots_summary,
    size_bucket_to_urls,
    "OpenAI",
    setting="train"
)

# Scratch / Notes

In [None]:
Notes:

* Take All Agents and subtract it from the other plots to see the diffs between agents.
* How do these charts vary across token-count quantiles? (Behavior differences between token-rich and token-poor URLs.)

* Incompatibility between Robots.txt and ToS? Robots.txt is an encoded ToS for scrapers. Is there more intention detailed in the ToS than in the robots.txt?
* Robots and ToS update rate. --> How often is one updated within time T of the other being updated (robots and ToS)?
* Analysis: Of all websites that restrict at least one AI bot, what other bots do they restrict? E.g. if you restrict Cohere, you probably also restrict OpenAI
P(Cohere restricted | OpenAI restricted)
* Include Midjourney, CCBot, IAbot, etc
* Vertical lines that show when bots get introduced.

* Restrictions are rising across the board.
* Company-wise restrictions
* Compare for each company their scraping restrictions vs RAG restrictions vs AI bot restrictions (OpenAI, Anthropic)

In [None]:
Data Sources Analysis:

0. Table: Methodology -- what metadata we collected/annotated (and how). [WIP]
1. Figure: Temporal changes in Robots / ToS (somehow over a collection or multiple companies?) [Waiting on ToS]
    C4/RF/(Dolma) vs (2k-Head)/Random vs Company(6)
2. Figure: Robots / ToS contradiction matrix [Waiting on ToS]
    Creative Commons.
3. Table: Robots permission differences by company + Stella(pairwise differences by confusion matrix) [WIP]
4. Table: Robots / ToS / other indicators variability by website num tokens (i.e. head vs tail) [WIP]
5. Figure: Commercial/market copyright concerns and comparison to WildChat + EU AI Act [WIP, waiting on WildChat]

In [None]:
Datasets Analysis:

(License: NC, C, Unspecified) x (Terms: Unspecified/None, NC, C).
Text ~= 200
Video = 11
Speech ~= 50

0. Tables for Text, Video, Speech [Will: WIP]
1. License & source (terms) restriction differences between Text, Video, Speech. (normalized stacked bar chart) [WIP]
2. Source domains by Text, Video, Speech: scraped, synthetic, crowdsourced, ....  (normalized stacked bar chart) [WIP]
2b. Include pretraining (broken down by modality + user content + illicit content)
3. Creator distribution for Text, Video, Speech by geography and organization type [WIP]

Extension: 


4. Temporal breakdown of license restriction categories by Text, Video, Speech [WIP]
5. License Type by modality for Text, Video, Speech [WIP]
6. Languages / Tasks.
(Will run statistical tests checking if diffs between Text, Video, Speech are significant on all of these.) [WIP]

In [None]:


# People who restrict Anthropic but not OpenAI. <-- public awareness of organizations is the driving force here. 
# Ordered by notoriety. (Confusion matrix)

In [None]:
# Load the WildChat annotation CSV; sample_nayan() below prints individual rows of it.
nayan = pd.read_csv("test_data/wildchat_annotations.csv")

In [None]:
# nayan

In [None]:
def sample_nayan(df, i):
    """Print the prompt and content domain of row ``i`` of the annotation frame.

    All four annotation columns are read from the row at position ``i`` before
    anything is printed, so a missing column raises ``KeyError`` with no output.
    Then the prompt, the content domain and an empty separator line are printed.
    Returns ``None``.
    """
    record = df.iloc[i]
    fields = (
        "WildChat Example Prompt",
        "WildChat Example Response",
        "Types of Service",
        "Content Domain",
    )
    prompt, response, typs, cd = [record[field] for field in fields]
    # Response preview is left disabled:
    # print("*************" + response[:100])
    print(prompt, cd, "", sep="\n")

In [None]:
# for i in range(30, 60):
#     sample_nayan(nayan, i)