In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
import typing
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Append system path
# Drop stale entries before inserting so that re-running this cell does not
# stack duplicates: both the legacy '../..' entries and any '../' added by a
# previous execution of this same cell are removed.
sys.path = [p for p in sys.path if p != '../' and not p.endswith('../..')]
sys.path.insert(0, '../')  # Puts the parent directory (presumably `src`) first on the path

from helpers import io, filters, constants
from analysis import analysis_util, analysis_constants, visualization_util
from web_analysis import parse_robots
from web_analysis import robots_util, forecasting_util


%load_ext autoreload
%autoreload 2

In [10]:
# Read the temporal robots CSV for each corpus (C4, RefinedWeb, Dolma),
# in that order, and bind each table to its own name.
df_c4, df_rw, df_dolma = (
    pd.read_csv(csv_path)
    for csv_path in (
        "robots_data/full_temporal_robots/full_temporal_robots_c4.csv",
        "robots_data/full_temporal_robots/full_temporal_robots_refinedweb.csv",
        "robots_data/full_temporal_robots/full_temporal_robots_dolma.csv",
    )
)

In [8]:
# Preview one of the loaded tables. This cell previously displayed `df`, a name
# that no visible cell defines, so it failed on a fresh kernel.
# NOTE(review): it is unclear which dataset the saved output below came from;
# C4 is used here — swap in df_rw / df_dolma if that was the intent.
df_c4

Unnamed: 0,period,rand,head,combined
0,2016-01-01 00:00:00,0.076771,0.014589,0.091360
1,2016-02-01 00:00:00,0.076887,0.014452,0.091339
2,2016-03-01 00:00:00,0.080342,0.015571,0.095913
3,2016-04-01 00:00:00,0.080584,0.015436,0.096020
4,2016-05-01 00:00:00,0.080893,0.015572,0.096465
...,...,...,...,...
107,2024-12-31 00:00:00,0.165647,0.049226,0.217620
108,2025-01-31 00:00:00,0.169188,0.049837,0.224107
109,2025-02-28 00:00:00,0.159115,0.050107,0.210791
110,2025-03-31 00:00:00,0.165315,0.051852,0.220898


In [16]:
# Tag each corpus table with its dataset name so rows stay distinguishable
# after stacking. Assigned in place so the loaded frames keep the column.
df_c4['Dataset'] = 'C4'
df_rw['Dataset'] = 'RefinedWeb'
df_dolma['Dataset'] = 'Dolma'

# Combine the dataframes; ignore_index gives every row a unique label instead
# of repeating each source frame's 0..n-1 index three times.
df_combined = pd.concat([df_c4, df_rw, df_dolma], ignore_index=True)

# Convert 'period' to datetime
df_combined['period'] = pd.to_datetime(df_combined['period'])

# Plotting: one line per dataset of the `combined` column over time.
# The axis title says "(%)" while the scale is fixed to [0, 1], so ticks and
# tooltips are formatted as percentages to agree with the label.
# NOTE(review): assumes `combined` holds fractions in [0, 1] — confirm.
chart = alt.Chart(df_combined).mark_line(point=True).encode(
    x=alt.X('period:T', title='Period'),
    y=alt.Y(
        'combined:Q',
        title='Combined Value (%)',
        scale=alt.Scale(domain=[0, 1]),
        axis=alt.Axis(format='%'),
    ),
    color=alt.Color('Dataset:N', title='Dataset'),
    tooltip=[
        alt.Tooltip('period:T'),
        alt.Tooltip('combined:Q', format='.1%'),
        alt.Tooltip('Dataset:N'),
    ]
).properties(
    title="Temporal Trends in Combined Values",
    width=800,
    height=400
)

chart