In [235]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
import typing
import glob
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Append system path
# Drop stale entries before inserting so that re-running this cell does not
# pile up duplicates: both the legacy '../..' entries and the '../' entry
# that this cell itself inserts below.
sys.path = [p for p in sys.path if not p.endswith('../..') and p != '../']
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util, forecasting_util


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [277]:

def load_data(pattern):
    """Load every CSV matching ``pattern`` and stack them into one DataFrame.

    Each file is read with its first column as a date-parsed index. The file
    name is split on ``'_'``: the first piece is the dataset key (mapped to a
    display name via ``c4 -> C4``, ``rf -> RefinedWeb``, ``dolma -> Dolma``;
    any other key becomes NaN) and the second piece is stored as
    ``'Domain Type'``.

    Args:
        pattern: Glob pattern selecting the CSV files to read.

    Returns:
        A DataFrame with the concatenated rows of all files, a fresh integer
        index, and the former (unnamed) index exposed as the ``'Date'`` column.

    Raises:
        ValueError: If no file matches ``pattern``.
    """
    # Sort so the row order of the result does not depend on filesystem order.
    file_paths = sorted(glob.glob(pattern))
    if not file_paths:
        raise ValueError(f"No files match pattern {pattern!r}")

    dataset_names = {'c4': "C4", 'rf': 'RefinedWeb', 'dolma': 'Dolma'}
    frames = []
    for file in file_paths:
        df = pd.read_csv(file, index_col=0, parse_dates=True)
        # Extract dataset and subset from filename
        parts = os.path.basename(file).split('_')
        df['Dataset'], df['Domain Type'] = parts[0], parts[1]
        df['Dataset'] = df['Dataset'].map(dataset_names)
        frames.append(df)

    concat_df = pd.concat(frames).reset_index().rename(columns={'index': 'Date'})
    return concat_df

def prepare_data_for_plot1(data):
    """Build the long-format table used by the per-dataset plot.

    Keeps only the rows whose ``'Domain Type'`` is ``'all'`` and unpivots the
    ``'Head'`` and ``'Full Corpus'`` columns into ``'Token Sample'`` (the
    former column name) and ``'Percent'`` (its value), keyed by ``'Date'`` and
    ``'Dataset'``. The input frame is not modified.
    """
    all_rows_mask = data['Domain Type'] == 'all'
    all_rows = data.loc[all_rows_mask].reset_index()
    return pd.melt(
        all_rows,
        id_vars=['Date', 'Dataset'],
        value_vars=['Head', 'Full Corpus'],
        var_name='Token Sample',
        value_name='Percent',
    )

def prepare_data_for_plot2(data):
    """Build the long-format table used by the per-domain-type plot.

    Rows whose ``'Domain Type'`` is one of the keys of ``relevant_subsets`` are
    kept, ``'Head'`` and ``'Full Corpus'`` are averaged per
    (``'Date'``, ``'Domain Type'``) pair, the domain types are replaced by
    their display labels, and the two value columns are unpivoted into
    ``'Token Sample'`` / ``'Percent'``.

    Args:
        data: DataFrame with at least the columns ``'Date'``, ``'Domain Type'``,
            ``'Head'`` and ``'Full Corpus'``. It is not modified.

    Returns:
        A DataFrame with columns ``'Date'``, ``'Domain Type'``,
        ``'Token Sample'`` and ``'Percent'``.
    """
    # Raw 'Domain Type' value -> label shown in the legend.
    relevant_subsets = {
        "Academic": "Academic",
        "News": "News",
        'E': "E-commerce",
        'Encyclopedia': "Encycl.",
        'Government': "Government",
        "Organization": "Org/Pers site",
        'Social Media': "Socials/Forum",
    }
    value_cols = ['Head', 'Full Corpus']

    selected = data[data['Domain Type'].isin(list(relevant_subsets))]
    # Average only the two plotted columns: taking the mean of every remaining
    # column fails on recent pandas as soon as one of them is non-numeric.
    grouped = (
        selected
        .groupby(['Date', 'Domain Type'])[value_cols]
        .mean()
        .reset_index()
    )
    # `grouped` is a fresh frame (not a slice), so this assignment is safe;
    # every remaining domain type is a key of the mapping.
    grouped['Domain Type'] = grouped['Domain Type'].map(relevant_subsets)
    return grouped.melt(id_vars=['Date', 'Domain Type'], value_vars=value_cols,
                        var_name='Token Sample', value_name='Percent')

def forecast_region(data, forecast_startdate, height, period_col="Date"):
    """Build the overlay that marks the forecasted part of a time-series chart.

    The overlay is three layered Altair charts: a light gray rectangle from
    ``forecast_startdate`` to the latest value of ``data[period_col]``, a gray
    vertical rule at ``forecast_startdate``, and a bold "Forecast" label
    centred horizontally in that span.

    Args:
        data: DataFrame holding the plotted series.
        forecast_startdate: Anything ``pd.to_datetime`` accepts; where the
            forecast begins.
        height: Chart height in pixels; used to position the label.
        period_col: Name of the column in ``data`` holding the dates.

    Returns:
        The layered chart ``shading + rule + label``.
    """
    forecast_startdate = pd.to_datetime(forecast_startdate)
    # Coerce so that a string-typed period column works too, and compute the
    # end of the span once instead of per layer.
    forecast_enddate = pd.to_datetime(data[period_col]).max()

    shading = alt.Chart(
        pd.DataFrame({"start": [forecast_startdate], "end": [forecast_enddate]})
    ).mark_rect(
        opacity=0.1,
        color="gray"
    ).encode(
        x=alt.X("start:T", title=""),
        x2="end:T"
    )

    forecast_rule = alt.Chart(
        pd.DataFrame({"period": [forecast_startdate]})
    ).mark_rule(
        color="gray"
    ).encode(
        x="period:T"
    )

    # Add a label in the middle of the forecasted region
    forecast_midpoint = forecast_startdate + (forecast_enddate - forecast_startdate) / 2
    shading_text = alt.Chart(
        pd.DataFrame({"date": [forecast_midpoint], "text": ["Forecast"]})
    ).mark_text(
        align="center",
        baseline="middle",
        dx=0,
        # y is pinned to pixel 0 below, so this offset places the label
        # 20 pixels short of `height`.
        dy=height - 20,
        color="black",
        fontWeight="bold"
    ).encode(
        x="date:T",
        y=alt.value(0),
        text="text:N"
    )

    return shading + forecast_rule + shading_text

    
def temporal_corpus_estimation_plot(
    data, title, x_title, y_title, font_style, font_size,
    forecast_startdate="2022",
    show_legend=True,
    height=400,
    width=800,
):
    """Line chart of ``Percent`` over ``Date``, one colour per ``Dataset``.

    ``Token Sample`` is encoded as the stroke dash. When ``forecast_startdate``
    is truthy, the overlay from ``forecast_region`` is layered on top.

    Args:
        data: Long-format DataFrame with ``Date``, ``Percent``, ``Dataset`` and
            ``Token Sample`` columns.
        title: Accepted for interface compatibility; not applied to the chart.
        x_title, y_title: Axis titles.
        font_style, font_size: Font family and size for axes and legends.
        forecast_startdate: Start of the shaded forecast region, or a falsy
            value to omit it.
        show_legend: Whether to draw the colour and stroke-dash legends.
        height, width: Chart size in pixels.

    Returns:
        The configured Altair chart.
    """
    # Font settings shared by both legends and the chart-level configuration.
    font_kwargs = dict(
        labelFont=font_style, labelFontSize=font_size,
        titleFont=font_style, titleFontSize=font_size,
    )

    if show_legend:
        dataset_legend = alt.Legend(orient='bottom', title='Dataset', **font_kwargs)
        token_sample_legend = alt.Legend(
            orient='none', title='Token Sample', direction='horizontal',
            legendX=0, legendY=280, **font_kwargs)
    else:
        dataset_legend = None
        token_sample_legend = None

    # Yearly labels, data by month
    year_axis = alt.Axis(format='%Y', tickCount={"interval": "year", "step": 1})
    percent_axis = alt.Axis(format="%")

    lines = alt.Chart(data).mark_line().encode(
        x=alt.X('Date:T', title=x_title, axis=year_axis),
        y=alt.Y('Percent:Q', title=y_title, axis=percent_axis),
        color=alt.Color('Dataset:N', legend=dataset_legend),
        strokeDash=alt.StrokeDash('Token Sample:N', legend=token_sample_legend),
    )

    # Shade the forecasted part of the time range, if requested.
    if forecast_startdate:
        lines = lines + forecast_region(data, forecast_startdate, height)

    sized = lines.properties(width=width, height=height)
    without_grid = sized.configure_axis(grid=False, **font_kwargs)  # Remove gridlines
    return without_grid.configure_legend(**font_kwargs)
        

def temporal_corpus_estimation_by_service_plot(
    data, title, x_title, y_title, font_style, font_size,
    forecast_startdate="2022",
    show_legend=True,
    height=400,
    width=800,
):
    """Line chart of ``Percent`` over ``Date``, one colour per ``Domain Type``.

    ``Token Sample`` is encoded as the stroke dash. When ``forecast_startdate``
    is truthy, the overlay from ``forecast_region`` is layered on top.

    Args:
        data: Long-format DataFrame with ``Date``, ``Percent``, ``Domain Type``
            and ``Token Sample`` columns. It is not modified.
        title: Accepted for interface compatibility; not applied to the chart.
        x_title, y_title: Axis titles.
        font_style, font_size: Font family and size for axes and legends.
        forecast_startdate: Start of the shaded forecast region, or a falsy
            value to omit it.
        show_legend: Whether to draw the colour and stroke-dash legends.
        height, width: Chart size in pixels.

    Returns:
        The configured Altair chart.
    """
    # Parse 'Date' and sort on a copy, so the caller's frame is left untouched
    # (assigning the column in place also warns when `data` is a slice).
    data = data.assign(Date=pd.to_datetime(data['Date'])).sort_values('Date')

    # Optional settings for legends if they are to be displayed
    legend_color = alt.Legend(orient='bottom', title='Domain Type',
                              labelFont=font_style, labelFontSize=font_size,
                              titleFont=font_style, titleFontSize=font_size, legendY=-20,
                              columns=7) if show_legend else None

    legend_stroke_dash = alt.Legend(orient='none', title='Token Sample',
                                    labelFont=font_style, labelFontSize=font_size,
                                    titleFont=font_style, titleFontSize=font_size,
                                    direction='horizontal', legendX=0, legendY=280) if show_legend else None

    # Chart code with conditional legends
    chart = alt.Chart(data).mark_line().encode(
        # Yearly labels, data by month
        x=alt.X('Date:T', title=x_title, axis=alt.Axis(format='%Y', tickCount={"interval": "year", "step": 1})),
        y=alt.Y('Percent:Q', title=y_title, axis=alt.Axis(format="%")),
        color=alt.Color('Domain Type:N', legend=legend_color),
        strokeDash=alt.StrokeDash('Token Sample:N', legend=legend_stroke_dash)
    )

    ################################################################
    # SHADE FORECASTED DATA REGIONS
    # Add a shaded region for forecasted data, if needed
    ################################################################
    if forecast_startdate:
        chart = chart + forecast_region(data, forecast_startdate, height)

    chart = chart.properties(
        width=width,
        height=height
    ).configure_axis(
        labelFontSize=font_size,
        titleFontSize=font_size,
        labelFont=font_style,
        titleFont=font_style,
        grid=False  # Remove gridlines
    )

    return chart


In [278]:

# Read every CSV in the robots output directory into one frame
robots_df = load_data('output_data_robots/*')  # Adjust path as needed

# Reshape into the long format each chart expects
robots_df_full_plot = prepare_data_for_plot1(robots_df)
robots_df_service_plot = prepare_data_for_plot2(robots_df)

# Build both charts (legends hidden for the robots panels)
robots_corpus_plot = temporal_corpus_estimation_plot(
    data=robots_df_full_plot,
    title='Robots: Head Tokens vs Combined Tokens for each Dataset',
    x_title='',
    y_title='',
    font_style='Times',
    font_size=14,
    forecast_startdate="2022",
    show_legend=False,
    height=200,
    width=450,
)
robots_services_plot = temporal_corpus_estimation_by_service_plot(
    data=robots_df_service_plot,
    title='Robots: Average Percent of Tokens by Subset',
    x_title='',
    y_title='',
    font_style='Times',
    font_size=14,
    forecast_startdate="2022",
    show_legend=False,
    height=200,
    width=450,
)

# Render the charts, per-dataset first
robots_corpus_plot.display()
robots_services_plot.display()

# Head, Combined
# Robots vs ToS

In [279]:
# Load data
tos_df = load_data('output_data_tos/*')  # Adjust path as needed

# Prepare data for plots
tos_df_full_plot = prepare_data_for_plot1(tos_df)
tos_df_service_plot = prepare_data_for_plot2(tos_df)

# Create plots
# The titles were copy-pasted from the robots cell and said 'Robots: ...';
# these charts are built from the ToS data, so label them accordingly.
tos_corpus_plot = temporal_corpus_estimation_plot(
    tos_df_full_plot, 'ToS: Head Tokens vs Combined Tokens for each Dataset', '', '', 'Times', 14,
    forecast_startdate="2022",
    show_legend=True,
    height=200, width=600,
)
tos_services_plot = temporal_corpus_estimation_by_service_plot(
    tos_df_service_plot, 'ToS: Average Percent of Tokens by Subset', '', '', 'Times', 14,
    forecast_startdate="2022",
    show_legend=True,
    height=200, width=600,
)
# Display the plots
tos_corpus_plot.display()
tos_services_plot.display()

In [252]:
# head == rand for every single one.
# E-commerce mean shouldn't be that high.

In [243]:
tos_df_service_plot

Unnamed: 0,Date,Domain Type,Token Sample,Percent
0,2016-01-01,Academic,Head,0.053086
1,2016-01-01,E-commerce,Head,0.053086
2,2016-01-01,Encyclopedia,Head,0.053086
3,2016-01-01,Government,Head,0.053086
4,2016-01-01,News,Head,0.053086
...,...,...,...,...
1395,2024-04-01,Encyclopedia,Full Corpus,0.060544
1396,2024-04-01,Government,Full Corpus,0.011189
1397,2024-04-01,News,Full Corpus,0.195563
1398,2024-04-01,Org/Pers. site,Full Corpus,0.048338
