In [1]:
import os
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# source of data_utils
sys.path.append(os.path.join('/home/siebenschuh/Projects/dataprep/code/DPO'))

from data_utils import compile_DatasetFrames

## Script: Identify the simplest text-based predictor (one that does not require LLM embeddings)
A set of approaches:
- *Bag of Words (BoW)*: word-count features
- *N-gram*: counts of contiguous word sequences
- *fasttext*: embeddings for words

#### Important: train and test on the problem at hand (not aux. statistical problems)
- nobody has the attention span anymore

# Load Data
Use a prefix of `text` as input (e.g. `[:3200]`; the BoW cell below uses the first `n_max_chars = 600` characters) and attempt to predict `bleu_pymupdf`, `bleu_nougat`, and `bleu_marker` from it.

In [2]:
# --- configuration: input locations and split options ---
p_embeddings_root_dir = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/embeddings/emb_by_model')
p_response_csv_path = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/parser_metrics_without_text_output.csv')
p_split_yaml_path = Path('/home/siebenschuh/Projects/dataprep/code/DPO/meta_split/pymupdf.yaml')
parser = 'pymupdf'
normalized = False
predefined_split = True

# Gather the keyword arguments once so the call site stays short.
dataset_kwargs = dict(
    p_embeddings=p_embeddings_root_dir,
    p_response=p_response_csv_path,
    parser=parser,
    normalized=normalized,
    predefined_split=predefined_split,
    p_split_yaml_path=p_split_yaml_path,
)

# Build the three dataset frames via `compile_DatasetFrames`.
# NOTE(review): the result is unpacked as (train, test, val) -- confirm this
# matches the order in which `compile_DatasetFrames` returns its frames.
df_train, df_test, df_val = compile_DatasetFrames(**dataset_kwargs)


Load pre-defined split...

Train-Val Overlap: 0
Train-Test Overlap: 0
Val-Test Overlap: 0
df_train, df_test, df_val


# 1. BoW Approach

In [None]:
%%time

# BoW baseline: fit a multi-output random forest on word counts of a text
# prefix, then report per-target MSE / R² / RMSE on the validation frame.

# Record start time (the elapsed wall-clock time is stored in the metrics frame)
start_time = time.time()

# max number of chars of `text` passed to the vectorizer
n_max_chars = 600
# max number of training rows
n_rows = 15000

# Target columns. Metrics computed with `multioutput='raw_values'` come back
# in exactly this column order, so every per-parser lookup below goes through
# a name -> value mapping instead of relying on positional unpacking.
target_cols = ['bleu_marker', 'bleu_nougat', 'bleu_pymupdf']
parser_names = [col[len('bleu_'):] for col in target_cols]
# order in which the per-parser columns are written (pymupdf first)
report_order = ('pymupdf', 'nougat', 'marker')

# subset: positional slice -> exactly the first `n_rows` rows, independent of
# the index labels (`.loc[0:n_rows]` is label-based and includes both ends).
# Stored under a new name so the full training frame stays available.
df_train_sub = df_train.iloc[:n_rows]

# format data
# - raw X
X_train = df_train_sub['text'].str[:n_max_chars]
X_val = df_val['text'].str[:n_max_chars]
# - process X
vectorizer = CountVectorizer(max_features=100)  # Adjust max_features as needed
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
# - y
y_train = df_train_sub[target_cols]
y_val = df_val[target_cols]

# init model
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))

# fit the model
model.fit(X_train_vec, y_train)

# predict on the validation set
y_pred = model.predict(X_val_vec)

# calculate MSE (one value per target, in `target_cols` order)
mse = mean_squared_error(y_val, y_pred, multioutput='raw_values')
mse_by_parser = dict(zip(parser_names, mse))
print("MSE:", mse)

# Calculate R² (one value per target, in `target_cols` order)
r2 = r2_score(y_val, y_pred, multioutput='raw_values')
r2_by_parser = dict(zip(parser_names, r2))
# each scalar is broadcast to every row of `df_val`
for name in report_order:
    df_val[f'R2_{name}'] = r2_by_parser[name]
print("R²:", r2)

# Calculate RMSE
rmse = np.sqrt(mse)
rmse_by_parser = dict(zip(parser_names, rmse))
for name in report_order:
    df_val[f'rmse_{name}'] = rmse_by_parser[name]
print("RMSE:", rmse)

# Record end time and calculate duration
end_time = time.time()
training_duration = end_time - start_time

# Create DataFrame to store metrics (single row for this approach)
metrics_row = {
    'name': ['bow'],
    'n_max_char': [n_max_chars],
    'N': [len(X_train)],
    'training_duration': [training_duration],
}
for prefix, by_parser in (('MSE', mse_by_parser), ('R2', r2_by_parser), ('RMSE', rmse_by_parser)):
    for name in report_order:
        metrics_row[f'{prefix}_{name}'] = [by_parser[name]]
df_pred_metrics = pd.DataFrame(metrics_row)

# Print the resulting DataFrame
print(df_pred_metrics)

In [None]:
# display the validation-set predictions returned by `model.predict(X_val_vec)`
y_pred