# 01. Put historical data in df

The data were saved as JSON files; this section combines them into a single DataFrame (df).

- Ignore folders only containing a "_SUCCESS" file but no data
- Remove duplicate rows in df that arise because of getting the same data from arXiv multiple times

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, from_json, schema_of_json, regexp_replace, udf
from pylatexenc.latex2text import LatexNodes2Text

In [None]:
# Reuse the active Spark session if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Count the "saved_data-*" directories under spark/notebooks,
# whether or not they actually contain data.

count = len([
    entry
    for entry in os.listdir("spark/notebooks")
    if os.path.isdir(os.path.join("spark/notebooks", entry))
    and entry.startswith("saved_data-")
])

print(count)

In [None]:
def has_json_file(folder_path):
    """Return True if ``folder_path`` contains at least one data entry.

    Directory entries whose name contains "SUCCESS" (e.g. the ``_SUCCESS``
    marker) are ignored; any other entry counts as data. The check is by
    name only: entries are not opened or validated as JSON.

    Args:
        folder_path: Path of the directory to inspect.

    Returns:
        bool: True if a non-"SUCCESS" entry exists; False if the folder is
        empty, holds only "SUCCESS" entries, or cannot be listed.
    """
    try:
        entries = os.listdir(folder_path)
    except OSError:
        # Missing or unreadable folder: treat it as "no data" (best effort),
        # but let unrelated programming errors surface.
        return False
    return any("SUCCESS" not in entry for entry in entries)

In [None]:
has_json_file("spark/notebooks/saved_data-1743417480000") # test on folder that only contains _SUCCESS file

In [None]:
has_json_file("spark/notebooks/saved_data-1743416280000") # test on folder that actually contains data

In [None]:
# Collect the paths of all "saved_data*" directories that actually hold data,
# in the order os.listdir returns them.
saved_root = "spark/notebooks"
candidate_paths = (
    os.path.join(saved_root, entry)
    for entry in os.listdir(saved_root)
    if entry.startswith("saved_data")
)
data_folders = [
    path
    for path in candidate_paths
    if os.path.isdir(path) and has_json_file(path)
]

In [None]:
print(data_folders[0])

In [None]:
len(data_folders)

In [None]:
# Take the first raw text line of the first data folder as a sample record.
sample_rows = spark.read.text(data_folders[0]).limit(1).collect()
first_file = sample_rows[0][0]
first_file

In [None]:
# Infer the JSON schema from the single sample record in `first_file`.
# NOTE(review): fields missing from that one record would presumably be absent
# from the schema — confirm that all records share the same keys.
schema = schema_of_json(first_file)

In [None]:
# Read every line of every data folder, parse each line as JSON using the
# inferred schema, and expand the parsed struct into top-level columns.
df_raw = spark.read.text(data_folders)
parsed_struct = from_json(col("value"), schema).alias("data")
df_parsed = df_raw.select(parsed_struct).select("data.*")
df_parsed.count()

In [None]:
# Keep one row per distinct title.
# NOTE(review): which of the duplicate rows survives is not controlled here,
# and different papers sharing a title would be merged — confirm this is intended.
df_parsed_deduped = df_parsed.dropDuplicates(subset=["title"])

df_parsed_deduped.show()

In [None]:
df_parsed_deduped.count()

In [None]:
# Persist the deduplicated data as Parquet, replacing any previous output at this path.
df_parsed_deduped.write.mode("overwrite").parquet("data/df_all_deduped")

In [None]:
# Reload the deduplicated data from its Parquet output; later cells use this saved copy.
df_parsed_deduped = spark.read.parquet("data/df_all_deduped")

In [None]:
df_parsed_deduped.show()

In [None]:
def latex_to_text(s):
    """Convert the LaTeX string ``s`` to plain text with pylatexenc.

    This version does no None check and no error handling; the next cell
    redefines the same name with both.
    """
    converter = LatexNodes2Text()
    return converter.latex_to_text(s)

In [None]:
def latex_to_text(s):
    """Convert LaTeX markup in ``s`` to single-line plain text.

    Newlines and carriage returns are replaced by spaces and "^" characters
    are removed from the converted text.

    Returns:
        The converted string, or None when ``s`` is None or when the
        conversion raises any exception.
    """
    if s is None:
        return None
    try:
        plain = LatexNodes2Text().latex_to_text(s)
        for line_break in ("\n", "\r"):
            plain = plain.replace(line_break, " ")
        return plain.replace("^", "")
    except Exception:
        return None

In [None]:
# Wrap the converter as a string-returning Spark UDF and apply it to both text columns.
latex_udf = udf(latex_to_text, StringType())

df_cleaned = df_parsed_deduped
for text_col_name in ("title", "summary"):
    df_cleaned = df_cleaned.withColumn(text_col_name, latex_udf(col(text_col_name)))
df_cleaned.show()

In [None]:
df_cleaned.filter(col("published") == "2025-03-27T09:58:07Z").select(col("summary")).collect()

In [None]:
# Persist the cleaned data as Parquet, replacing any previous output at this path.
df_cleaned.write.mode("overwrite").parquet("data/df_all_cleaned")

# 02. Scrape arXiv categories

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the arXiv category taxonomy page and parse its HTML.
url = 'https://arxiv.org/category_taxonomy'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# soup

In [None]:
# soup.find_all(['h2', 'h3', 'h4'])

In [None]:
# Walk the page headers in document order and emit one record per <h4>,
# tagged with the most recently seen <h2> (group) and <h3> (archive).
categories = []

# Defined up front so an <h4> appearing before any <h2>/<h3> cannot raise NameError.
group_name = ''
archive_name = ''

for header in soup.find_all(['h2', 'h3', 'h4']):
    header_text = header.get_text(strip=True)
    if header.name == 'h2':
        group_name = header_text
        # A new group starts: forget the previous group's archive so it is not
        # carried over into a group that has no <h3> of its own.
        archive_name = ''
    elif header.name == 'h3':
        archive_name = header_text
    elif header.name == 'h4':
        # Split "<leading text>(<parenthesised text>)" into its two parts.
        # NOTE(review): later cells split `category_name` on "." and search it
        # for "hep", which suggests the leading text is the dotted identifier
        # and `category_id` receives the descriptive name; the keys are kept
        # unchanged for compatibility.
        if '(' in header_text and ')' in header_text:
            category_id = header_text.split('(')[-1].strip(')')
            category_name = header_text.split('(')[0].strip()
        else:
            category_id = ''
            category_name = header_text
        categories.append({
            'group': group_name,
            'archive': archive_name,
            'category_id': category_id,
            'category_name': category_name,
        })

In [None]:
# Build the category table and drop rows whose group is the literal "Group Name"
# (presumably the page's legend rather than a real group — verify against the page).
df = pd.DataFrame(categories)
is_real_group = df.group.ne("Group Name")
df = df[is_real_group]

df.head(20)

In [None]:
# check High Energy Physics categories - merge them into 1 subcategory?

df[df.category_name.str.contains('hep')]

In [None]:
# Request gzip explicitly so the file content matches its ".gzip" name (pandas
# defaults to snappy). index=False matches the other to_parquet calls in this
# notebook: after the row filter the index is no longer a plain range and would
# otherwise be written as an extra column.
df.to_parquet("data/arxiv_categories.parquet.gzip", compression="gzip", index=False)

# 03. Create training, validation, and test sets

In [None]:
from pyspark.sql.functions import col, split, when, collect_set, concat, concat_ws, lit, regexp_replace, rand
from sklearn.model_selection import train_test_split

In [None]:
# Where applicable, split the main category (e.g. cs.HC) into a level 1 (e.g. cs)
# and a level 2 (e.g. HC) category.
# The split pattern is a regex and is written as a raw string: "\." in a normal
# string literal is an invalid escape sequence that newer Python versions warn about.
# NOTE(review): for categories without a "." the level 2 lookup is out of range;
# this is expected to give null with ANSI mode off — confirm on the Spark version in use.

df_pub = spark.read.parquet("data/df_all_cleaned")
main_category_parts = split(df_pub["main_category"], r"\.")
df_pub = df_pub.withColumn("level1_category", main_category_parts[0]) \
    .withColumn("level2_category", main_category_parts[1])
df_pub.show()

In [None]:
# Also split the arXiv categories into level 1 and level 2, so they can be joined with the data.
# The split pattern is a regex and is written as a raw string: "\." in a normal
# string literal is an invalid escape sequence that newer Python versions warn about.

df_categories = spark.read.parquet("data/arxiv_categories.parquet.gzip")
category_name_parts = split(df_categories["category_name"], r"\.")
df_categories = df_categories.withColumn("level1_category", category_name_parts[0]) \
    .withColumn("level2_category", category_name_parts[1])
df_categories.show()

In [None]:
df_categories.select('category_name').distinct().count()

In [None]:
# Map each level 1 abbreviation to its "group" (main category) and to a
# "subgroup" (same as the group, except that Physics is split up by archive).

level1_rows = df_categories.select(col('group'), col('level1_category'), col('archive')).drop_duplicates().sort(col('group'))

# These level 1 categories reuse the group name; all others become "<group>: <archive>".
uses_group_as_subgroup = col("level1_category").isin("cs", "econ", "eess", "math", "q-bio", "q-fin", "stat")
subgroup_raw = when(uses_group_as_subgroup, col("group")).otherwise(concat(col("group"), lit(": "), col("archive")))
df_level1_categories = level1_rows.withColumn("subgroup", subgroup_raw).drop('archive')

# Strip parenthesised text first, then everything from the first "-" onwards;
# the latter merges the 4 High Energy Physics categories into one subgroup.
subgroup_clean = regexp_replace(regexp_replace(col("subgroup"), r"\(.*?\)", ""), "-.*", "")
df_level1_categories = df_level1_categories.withColumn("subgroup", subgroup_clean)

df_level1_categories.show()

In [None]:
# Save the mapping via pandas. gzip is requested explicitly so the file content
# matches its ".gzip" name (pandas defaults to snappy).
df_level1_categories.toPandas().to_parquet("data/df_level1_categories.gzip", compression="gzip", index=False)

In [None]:
# Gather the distinct subgroup labels into a single list and print it.

distinct_subgroups = collect_set("subgroup").alias("subgroups")
row = df_level1_categories.agg(distinct_subgroups).collect()[0]

print(row['subgroups'])

In [None]:
# Attach the written-out group and subgroup to every publication by left-joining
# on the level 1 abbreviation (e.g. cs); publications without a match keep null labels.

df_labeled = df_pub.join(df_level1_categories, "level1_category", "left")
df_labeled.show()

## Stratified train / val / test set with the 8 main groups

In [None]:
# Build the model input as "<title>. <summary>" and keep only it and the group label.
combined_text = concat_ws(". ", col("title"), col("summary")).alias("text")
df_relevant = df_labeled.select(combined_text, col('group'))
df_relevant.show()

In [None]:
# Shuffle the data and convert to a pandas df to make the stratified train,
# validation, and test sets.
# The shuffle is seeded: with an unseeded rand() the row order differs on every
# run, so the fixed random_state used for the later split would still pick
# different rows each time.
# NOTE(review): a seeded rand() is reproducible only if the input partitioning
# is the same between runs — confirm if exact reproducibility matters.
df_shuffled = df_relevant.orderBy(rand(seed=16)).toPandas().reset_index(drop = True)

In [None]:
df_shuffled.groupby('group')['text'].nunique()

In [None]:
# Number of distinct texts per group, and each group's share of all distinct texts.
texts_per_group = df_shuffled.groupby('group')['text'].nunique()
df_counts = texts_per_group.reset_index(name='n_text')
total = df_shuffled['text'].nunique()
df_counts['pct'] = df_counts['n_text'].div(total)
df_counts

In [None]:
# Stratified 70 / 15 / 15 split by group: hold out 30% first, then halve the
# holdout into validation and test.
train_df, holdout_df = train_test_split(df_shuffled, test_size = 0.3, stratify = df_shuffled['group'], random_state=16)
val_df, test_df = train_test_split(holdout_df, test_size = 0.5, stratify = holdout_df['group'], random_state=16)

In [None]:
# Same per-group summary for the training set, to check that the split kept the proportions.
train_texts_per_group = train_df.groupby('group')['text'].nunique()
df_train_counts = train_texts_per_group.reset_index(name='n_text')
total_train = train_df['text'].nunique()
df_train_counts['pct'] = df_train_counts['n_text'].div(total_train)
df_train_counts

In [None]:
# Save the three splits. gzip is requested explicitly so the file content matches
# the ".gzip" file names (pandas defaults to snappy).
train_df.to_parquet("data/df_train.parquet.gzip", compression="gzip", index=False)
val_df.to_parquet("data/df_val.parquet.gzip", compression="gzip", index=False)
test_df.to_parquet("data/df_test.parquet.gzip", compression="gzip", index=False)

In [None]:
test_df.shape

## Stratified train / val / test set with subgroups for physics

In [None]:
# Build the model input as "<title>. <summary>" and keep only it and the subgroup label.
combined_text2 = concat_ws(". ", col("title"), col("summary")).alias("text")
df_relevant2 = df_labeled.select(combined_text2, col('subgroup'))
df_relevant2.show()

In [None]:
# Shuffle and convert to pandas. The shuffle is seeded: with an unseeded rand()
# the row order differs on every run, so the fixed random_state used for the
# later split would still pick different rows each time.
# NOTE(review): a seeded rand() is reproducible only if the input partitioning
# is the same between runs — confirm if exact reproducibility matters.
df_shuffled2 = df_relevant2.orderBy(rand(seed=16)).toPandas().reset_index(drop = True)
df_shuffled2.groupby('subgroup')['text'].nunique()

In [None]:
# Stratified 70 / 15 / 15 split by subgroup: hold out 30% first, then halve the
# holdout into validation and test.
train_df2, holdout_df2 = train_test_split(df_shuffled2, test_size = 0.3, stratify = df_shuffled2['subgroup'], random_state=16)
val_df2, test_df2 = train_test_split(holdout_df2, test_size = 0.5, stratify = holdout_df2['subgroup'], random_state=16)

In [None]:
# Save the three subgroup splits. gzip is requested explicitly so the file content
# matches the ".gzip" file names (pandas defaults to snappy).
train_df2.to_parquet("data/df_train_17cats.parquet.gzip", compression="gzip", index=False)
val_df2.to_parquet("data/df_val_17cats.parquet.gzip", compression="gzip", index=False)
test_df2.to_parquet("data/df_test_17cats.parquet.gzip", compression="gzip", index=False)