In [21]:
from pathlib import Path
from tqdm import tqdm, trange
import pandas as pd
import numpy as np

from data import load_data, process_data
from model import train_model
from validate import evaluate

In [26]:
%%time

parsers=['pymupdf', 'nougat', 'marker']

# data
all_df_metrics = []
for parser in parsers:
    # get raw data frames
    df_train, df_test, df_val = load_data(parser=parser)

    # DEBUFG
    df_train = df_train.loc[:450,:]

    # subset
    for score in ['rouge']: #, 'bleu', 'car']:
        for mode in ['countvectorizer']: #, 'fasttext', 'llm']:
            # process data
            data_list = process_data(df_train, df_test, df_val, n_max_chars=3200, max_features=1000, score=score, mode=mode, parsers=parsers)
            (X_train,y_train), (X_val,y_val), (X_test,y_test) = data_list
            
            # - keep score list (for cls task to recoup BLEU regret)
            y_score_list=[data_list[i][1] for i in range(len(data_list))]
            # tasks
            # - derive cls task
            y_train_cls = np.array(y_train).argmax(1).reshape(-1, 1)
            y_val_cls = np.array(y_val).argmax(1).reshape(-1, 1)
            y_test_cls = np.array(y_test).argmax(1).reshape(-1, 1)
            # - recombine
            data_list_cls = (X_train, y_train_cls), (X_val, y_val_cls), (X_test, y_test_cls)
            # - models
            for model in ['ridge']:
                for task in ['reg', 'cls']:
                    # meta
                    info = {'mode' : mode, 'model' : model, 'score' : score, 'parser' : parser, 'task' : task}
                    # classification
                    if task=='cls':
                        # train
                        trained_model = train_model(model, X_train, y_train_cls)
                        # evaluate
                        out = evaluate(trained_model, data_list_cls, y_score_list, info, parsers)
                    else:
                        # train
                        trained_model = train_model(model, X_train, y_train)
                        # evaluate
                        out = evaluate(trained_model, data_list, y_score_list, info, parsers)
                    # append
                    all_df_metrics.append(out)
                        
                        


Load pre-defined split...

Train-Val Overlap: 0
Train-Test Overlap: 0
Val-Test Overlap: 0
df_train, df_test, df_val
(X_train_vec, y_train), (X_val_vec, y_val), (X_test_vec, y_test)





Load pre-defined split...

Train-Val Overlap: 0
Train-Test Overlap: 0
Val-Test Overlap: 0
df_train, df_test, df_val
(X_train_vec, y_train), (X_val_vec, y_val), (X_test_vec, y_test)





Load pre-defined split...

Train-Val Overlap: 0
Train-Test Overlap: 0
Val-Test Overlap: 0
df_train, df_test, df_val
(X_train_vec, y_train), (X_val_vec, y_val), (X_test_vec, y_test)




CPU times: user 14.6 s, sys: 5.33 s, total: 19.9 s
Wall time: 20 s




In [27]:
print("Done!!!", len(all_df_metrics))

Done!!! 6


In [28]:
pd.concat(all_df_metrics)

Unnamed: 0,mode,model,score,parser,task,subset,r2,rmse,rmae,rir,...,r2_marker,rmse_pymupdf,rmse_nougat,rmse_marker,rmae_pymupdf,rmae_nougat,rmae_marker,n,prec,rec
0,countvectorizer,ridge,rouge,pymupdf,reg,train,0.326752,0.135422,0.306202,0.9868,...,0.303603,0.171031,0.114173,0.112828,0.360145,0.278209,0.272352,229,,
1,countvectorizer,ridge,rouge,pymupdf,reg,val,-0.153303,0.238827,0.386197,0.853867,...,-0.278077,0.179732,0.212266,0.306193,0.354874,0.370856,0.428922,1376,,
2,countvectorizer,ridge,rouge,pymupdf,reg,test,-0.13492,0.236836,0.384211,0.851229,...,-0.258732,0.183239,0.209494,0.301346,0.35693,0.36867,0.423719,2882,,
0,countvectorizer,ridge,rouge,pymupdf,cls,train,,,,4.752753,...,,,,,,,,229,0.333333,0.720524
1,countvectorizer,ridge,rouge,pymupdf,cls,val,,,,1.02391,...,,,,,,,,1376,0.333333,0.732316
2,countvectorizer,ridge,rouge,pymupdf,cls,test,,,,1.260847,...,,,,,,,,2882,0.333333,0.730164
0,countvectorizer,ridge,rouge,nougat,reg,train,0.324264,0.136093,0.305555,0.986752,...,0.310777,0.172976,0.114072,0.112385,0.361114,0.276774,0.270341,230,,
1,countvectorizer,ridge,rouge,nougat,reg,val,-0.170717,0.234547,0.385244,0.846905,...,-0.268744,0.184954,0.19119,0.307043,0.363766,0.358767,0.429186,1349,,
2,countvectorizer,ridge,rouge,nougat,reg,test,-0.142104,0.230869,0.382286,0.84842,...,-0.253245,0.186589,0.182952,0.302679,0.364354,0.355254,0.423636,2817,,
0,countvectorizer,ridge,rouge,nougat,cls,train,,,,4.772768,...,,,,,,,,230,0.333333,0.717391


In [9]:
all_df_metrics[10]

Unnamed: 0,mode,model,score,parser,task,subset,acc,prec,rec,rir,n
0,fasttext,ridge,rouge,pymupdf,reg,train,0.615721,0.871907,0.333333,1.713679,229
1,fasttext,ridge,rouge,pymupdf,reg,val,0.412791,0.804264,0.333333,0.978152,1376
2,fasttext,ridge,rouge,pymupdf,reg,test,0.404233,0.801411,0.333333,1.020944,2882


## Script: Identify the most simplistic text-based classifier (that doesn't require LLM embeddings)
A set of approaches:
- *Bag of Words (BoW)*:
- *N-gram*:
- *fasttext*: embeddings for words

#### Important: train and test on the problem at hand (not aux. statistical problems)
- nobody has the attention span anymore

# Load Data
Use subset of `text` as input `[:3200]` and attempt to predict `bleu_pymupdf`, `bleu_nougat`, and `bleu_marker` from it

In [None]:
# path constants
p_embeddings_root_dir = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/embeddings/emb_by_model')
p_response_csv_path = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/parser_metrics_without_text_output.csv')
parser = 'pymupdf'
normalized = False
predefined_split = True
p_split_yaml_path = Path('/home/siebenschuh/Projects/dataprep/code/DPO/meta_split/pymupdf.yaml')

# compile the dataset frames (train/val/test) using `compile_DatasetFrames`
df_train, df_test, df_val = compile_DatasetFrames(
    p_embeddings=p_embeddings_root_dir,
    p_response=p_response_csv_path,
    parser=parser,
    normalized=normalized,
    predefined_split=predefined_split,
    p_split_yaml_path=p_split_yaml_path
)

In [None]:
len(df_train)

# 1. BoW Approach

In [None]:
%%time


# max number of chars
n_max_chars = 3200

# format data
# - raw X
X_train = df_train['text'].str[:n_max_chars]
X_val = df_val['text'].str[:n_max_chars]
# - process X
vectorizer = CountVectorizer(max_features=2500)  # Adjust max_features as needed
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# - y
y_train = df_train[['bleu_marker', 'bleu_nougat', 'bleu_pymupdf']]
y_val = df_val[['bleu_marker', 'bleu_nougat', 'bleu_pymupdf']]

In [None]:
%%time 

# define parameter grids for each model
ridge_param_dist = {
    'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]  # Adjust alpha values based on your data
}
lasso_param_dist = {
    'alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]
}
elastic_net_param_dist = {
    'alpha': [0.1, 1.0, 10.0, 100.0],   # Adjust alpha values based on your data
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]  # Adjust l1_ratio values based on your data
}

# init model
# Set up RandomizedSearchCV for each model
ridge_search = RandomizedSearchCV(Ridge(), ridge_param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
elastic_net_search = RandomizedSearchCV(ElasticNet(), elastic_net_param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit RandomizedSearchCV to find the best hyperparameters
ridge_model = MultiOutputRegressor(ridge_search)
elastic_net_model = MultiOutputRegressor(elastic_net_search)

lasso_param_dist = {'alpha': uniform(0.01, 10)}
lasso_search = RandomizedSearchCV(Lasso(), lasso_param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
lasso_model = MultiOutputRegressor(lasso_search)

# fit the model
ridge_model.fit(X_train_vec, y_train)
elastic_net_model.fit(X_train_vec, y_train)

In [None]:
df_metrics_list = []
for model in [ridge_model, elastic_net_model]:
    # predict on the validation set
    y_pred = model.predict(X_val_vec)
    
    # calculate MSE
    mse = mean_squared_error(y_val, y_pred, multioutput='raw_values')
    print("MSE:", mse)
    
    # Calculate R²
    r2 = r2_score(y_val, y_pred, multioutput='raw_values')
    df_val['R2_pymupdf'], df_val['R2_nougat'], df_val['R2_marker'] = r2
    print("R²:", r2)
    
    # Calculate RMSE
    rmse = np.sqrt(mse)
    df_val['rmse_pymupdf'], df_val['rmse_nougat'], df_val['rmse_marker'] = rmse
    print("RMSE:", rmse)
    
    # Record end time and calculate duration
    end_time = time.time()
    training_duration = end_time - start_time
    
    # Create DataFrame to store metrics
    df_pred_metrics = pd.DataFrame({
        'name': ['bow'],
        'n_max_char': [n_max_chars],
        'N': [len(X_train)],
        'training_duration': [training_duration],
        'MSE_pymupdf': [mse[2]],
        'MSE_nougat': [mse[1]],
        'MSE_marker': [mse[0]],
        'R2_pymupdf': [r2[2]],
        'R2_nougat': [r2[1]],
        'R2_marker': [r2[0]],
        'RMSE_pymupdf': [rmse[2]],
        'RMSE_nougat': [rmse[1]],
        'RMSE_marker': [rmse[0]]
    })
    
    # show
    df_metrics_list.append(df_pred_metrics)

In [None]:
pd.concat