In [17]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
import typing
import glob
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Append system path
sys.path = [p for p in sys.path if not p.endswith('../..')]  # Cleans duplicated '../..'
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util, forecasting_util


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
df_c4 = pd.read_csv("robots_data/full_temporal_robots/full_temporal_robots_c4.csv")
df_rw = pd.read_csv("robots_data/full_temporal_robots/full_temporal_robots_refinedweb.csv")
df_dolma = pd.read_csv("robots_data/full_temporal_robots/full_temporal_robots_dolma.csv")

In [8]:
df

Unnamed: 0,period,rand,head,combined
0,2016-01-01 00:00:00,0.076771,0.014589,0.091360
1,2016-02-01 00:00:00,0.076887,0.014452,0.091339
2,2016-03-01 00:00:00,0.080342,0.015571,0.095913
3,2016-04-01 00:00:00,0.080584,0.015436,0.096020
4,2016-05-01 00:00:00,0.080893,0.015572,0.096465
...,...,...,...,...
107,2024-12-31 00:00:00,0.165647,0.049226,0.217620
108,2025-01-31 00:00:00,0.169188,0.049837,0.224107
109,2025-02-28 00:00:00,0.159115,0.050107,0.210791
110,2025-03-31 00:00:00,0.165315,0.051852,0.220898


In [16]:
df_c4['Dataset'] = 'C4'
df_rw['Dataset'] = 'RefinedWeb'
df_dolma['Dataset'] = 'Dolma'

# Combine the dataframes
df_combined = pd.concat([df_c4, df_rw, df_dolma])

# Convert 'period' to datetime
df_combined['period'] = pd.to_datetime(df_combined['period'])

# Plotting
chart = alt.Chart(df_combined).mark_line(point=True).encode(
    x=alt.X('period:T', title='Period'),
    y=alt.Y('combined:Q', title='Combined Value (%)', scale=alt.Scale(domain=[0, 1])),
    color=alt.Color('Dataset:N', title='Dataset'),
    tooltip=['period:T', 'combined:Q', 'Dataset:N']
).properties(
    title="Temporal Trends in Combined Values",
    width=800,
    height=400
)

chart

In [20]:
os.path.exists("output_data")

True

In [119]:

def load_data(pattern):
    """ Load data from CSV files matching the pattern. """
    file_paths = glob.glob(pattern)
    data = []
    for file in file_paths:
        df = pd.read_csv(file, index_col=0, parse_dates=True)
        # Extract dataset and subset from filename
        parts = os.path.basename(file).split('_')
        df['dataset'], df['subset'] = parts[0], parts[1]
        data.append(df)
    
    concat_df = pd.concat(data).reset_index().rename(columns={'index': 'Date'})
    return concat_df

def prepare_data_for_plot1(data):
    """ Prepare data for the first plot. """
    data = data[data['subset'] == 'all']
    data[['head_frac', 'combined_tokens']] *= 100  # Multiply values by 100
    return data.reset_index().melt(id_vars=['Date', 'dataset'], value_vars=['head_frac', 'combined_tokens'],
                                   var_name='Type', value_name='Percent')

def prepare_data_for_plot2(data):
    # data['subset'] = data['subset'].map(mapping).fillna(data['subset'])
    relevant_subsets = {
        "Academic": "Academic", 
        "News": "News", 
        'E': "E-commerce", 
        'Encyclopedia': "Encyclopedia", 
        'Government': "Government", 
        "Organization": "Org/Pers. site", 
        'Social Media': "Social Media/Forum",
    }
    data_numeric = data.drop(columns=['dataset'])  # Drop the 'dataset' column
    data_numeric[['head_frac', 'combined_tokens']] *= 100  # Multiply values by 100
    grouped = data_numeric.groupby(['Date', 'subset']).mean().reset_index()
    grouped = grouped[grouped["subset"].isin(relevant_subsets.keys())]
    grouped["subset"] = grouped['subset'].map(relevant_subsets).fillna(grouped['subset'])
    return grouped.melt(id_vars=['Date', 'subset'], value_vars=['head_frac', 'combined_tokens'],
                        var_name='Type', value_name='Percent')

def create_plot(data, title, x_title, y_title, font_style, font_size):
    """ Create and return an Altair plot. """
    return alt.Chart(data, title=title).mark_line().encode(
        x=alt.X('Date:T', title=x_title, axis=alt.Axis(format='%Y', tickCount={"interval": "year", "step": 1})),  # Yearly labels, data by month
        y=alt.Y('Percent:Q', title=y_title),
        color='dataset:N',
        strokeDash='Type:N'
    ).properties(
        width=600,
        height=400
    ).configure_axis(
        labelFontSize=font_size,
        titleFontSize=font_size,
        labelFont=font_style,
        titleFont=font_style,
        grid=False  # Remove gridlines
    ).configure_legend(
        labelFont=font_style,
        labelFontSize=font_size,
        titleFont=font_style,
        titleFontSize=font_size
    )

def create_plot2(data, title, x_title, y_title, font_style, font_size):
    """ Ensure 'Date' is a datetime and data is sorted. """
    data['Date'] = pd.to_datetime(data['Date'])  # Parse 'Date' as datetime if not already
    data = data.sort_values('Date')  # Sort data by 'Date'
    
    chart = alt.Chart(data, title=title).mark_line().encode(
        x=alt.X('Date:T', title=x_title, axis=alt.Axis(format='%Y', tickCount={"interval": "year", "step": 1})),  # Yearly labels, data by month
        y=alt.Y('Percent:Q', title=y_title),
        color=alt.Color('subset:N', legend=alt.Legend(orient='none', title='Subset',
                                                      labelFont=font_style, labelFontSize=font_size,
                                                      titleFont=font_style, titleFontSize=font_size,
                                                      legendX=10, legendY=10)),  # Position for color legend
        strokeDash=alt.StrokeDash('Type:N', legend=alt.Legend(orient='none', title='Type',
                                                              labelFont=font_style, labelFontSize=font_size,
                                                              titleFont=font_style, titleFontSize=font_size,
                                                              legendX=10, legendY=150))  # Position for strokeDash legend
    ).properties(
        width=600,
        height=400
    ).configure_axis(
        labelFontSize=font_size,
        titleFontSize=font_size,
        labelFont=font_style,
        titleFont=font_style,
        grid=False  # Remove gridlines
    )

    return chart


In [120]:

# Load data
robots_df = load_data('output_data_robots/*')  # Adjust path as needed
# print(data)
# print(data.index.name)

# Prepare data for plots
robots_df_full_plot = prepare_data_for_plot1(robots_df)
robots_df_service_plot = prepare_data_for_plot2(robots_df)
# print(data_plot2["subset"].unique())

# Create plots
plot1 = create_plot(robots_df_full_plot, 'Robots: Head Tokens vs Combined Tokens for each Dataset', 'Time', 'Percent', 'Times', 14)
plot2 = create_plot2(robots_df_service_plot, 'Robots: Average Percent of Tokens by Subset', 'Time', 'Percent', 'Times', 14)

# Display the plots
plot1.display()
plot2.display()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[['head_frac', 'combined_tokens']] *= 100  # Multiply values by 100


In [112]:
# Load data
tos_df = load_data('output_data_tos/*')  # Adjust path as needed
# print(data)
# print(data.index.name)

# Prepare data for plots
tos_df_full_plot = prepare_data_for_plot1(tos_df)
tos_df_service_plot = prepare_data_for_plot2(tos_df)
# print(data_plot2["subset"].unique())

# Create plots
plot1 = create_plot(tos_df_full_plot, 'TOS: Head Tokens vs Combined Tokens for each Dataset', 'Time', 'Percent', 'Times', 14)
plot2 = create_plot2(tos_df_service_plot, 'TOS: Average Percent of Tokens by Subset', 'Time', 'Percent', 'Times', 14)

# Display the plots
plot1.display()
plot2.display()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[['head_frac', 'combined_tokens']] *= 100  # Multiply values by 100
