In [41]:
import fasttext

import numpy as np
from scipy.stats import uniform, randint
import pandas as pd
from pathlib import Path
import json

from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

from model import train_model
from validate import evaluate

from data import load_data, process_data
from matplotlib import pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

In [39]:
!pip install matplotlib

Defaulting to user installation because normal site-packages is not writeable


## DATA

In [20]:
# load data
# exclude these: 
p = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/strong_scaling/data/zip/latest_pdfs_zip_assignments/LATEST_zip_dict_16.json')
with open(p, 'r') as f:
    assign_dict = json.load(f)

# to be excluded
l_of_l = [assign_dict[k] for k in assign_dict.keys()]
included_files = [f.split('/')[-1] for sub_list in l_of_l for f in sub_list]
len(included_files)

5120

In [21]:
%%time 

# Initialize empty lists to collect DataFrames
df_train_list = []
df_test_list = []
df_val_list = []

# List of parsers
parsers = ['pymupdf']
subsample_percentages = [1.0, 0.2, 0.2]

# load data
# exclude these: 
p = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/strong_scaling/data/zip/latest_pdfs_zip_assignments/LATEST_zip_dict_16.json')
with open(p, 'r') as f:
    assign_dict = json.load(f)

# to be excluded
l_of_l = [assign_dict[k] for k in assign_dict.keys()]
to_be_excluded_files = [f.split('/')[-1] for sub_list in l_of_l for f in sub_list]

# load data for the current parser
df_train, df_test, df_val = load_data(parser)

# exclude strong scaling data by extracting 'file_name' from 'path'
df_train['file_name'] = df_train['path'].str.split('/').str[-1]
df_test['file_name'] = df_test['path'].str.split('/').str[-1]
df_val['file_name'] = df_val['path'].str.split('/').str[-1]

# print lengths of each of the three
print(f'Parser: {parser}')
print(f'  Train rows: {len(df_train)}')
print(f'  Test rows: {len(df_test)}')
print(f'  Validation rows: {len(df_val)}\n')

# Subset the DataFrames to exclude rows where 'file_name' is in 'included_files'
df_train = df_train[~df_train['file_name'].isin(to_be_excluded_files)]
df_test = df_test[~df_test['file_name'].isin(to_be_excluded_files)]
df_val = df_val[~df_val['file_name'].isin(to_be_excluded_files)]


Load pre-defined split...

Train-Val Overlap: 0
Train-Test Overlap: 0
Val-Test Overlap: 0
df_train, df_test, df_val
Parser: pymupdf
  Train rows: 19137
  Test rows: 2882
  Validation rows: 1376

CPU times: user 3.32 s, sys: 2.14 s, total: 5.46 s
Wall time: 5.47 s


In [22]:
# MODEL
task = 'reg'
mode = 'fasttext'
model = 'ridge'
score = 'bleu'


In [24]:
df_train.columns

Index(['text', 'path', 'title', 'authors', 'creationdate', 'keywords', 'doi',
       'producer', 'format', 'first_page', 'abstract', 'bleu_nougat',
       'rouge_nougat', 'car_nougat', 'bleu_nougat_norm', 'rouge_nougat_norm',
       'car_nougat_norm', 'bleu_pymupdf', 'rouge_pymupdf', 'car_pymupdf',
       'bleu_pymupdf_norm', 'rouge_pymupdf_norm', 'car_pymupdf_norm',
       'bleu_grobid', 'rouge_grobid', 'car_grobid', 'bleu_grobid_norm',
       'rouge_grobid_norm', 'car_grobid_norm', 'bleu_pypdf', 'rouge_pypdf',
       'car_pypdf', 'bleu_pypdf_norm', 'rouge_pypdf_norm', 'car_pypdf_norm',
       'bleu_marker', 'rouge_marker', 'car_marker', 'bleu_marker_norm',
       'rouge_marker_norm', 'car_marker_norm', 'journal_cls', 'best_bleu_cls',
       'best_bleu_norm_cls', 'best_rouge_cls', 'best_rouge_norm_cls',
       'best_car_cls', 'best_car_norm_cls', 'file_name'],
      dtype='object')

In [23]:
# process data
# - X
data_list = process_data(df_train, df_test, df_val, n_max_chars=1600, max_features=1500, score=score, mode=mode, parsers=['pymupdf', 'nougat'])
(X_train, y_train), (X_val, y_val), (X_test, y_test) = data_list

# - y
y_score_list = [data_list[i][1] for i in range(len(data_list))]

# tasks
# - derive cls task
y_train_cls = np.array(y_train).argmax(1).reshape(-1, 1)
y_val_cls = np.array(y_val).argmax(1).reshape(-1, 1)
y_test_cls = np.array(y_test).argmax(1).reshape(-1, 1)

Read 3M words
Number of words:  43467
Number of labels: 0
Progress: 100.0% words/sec/thread:    1568 lr:  0.000000 avg.loss:  1.872119 ETA:   0h 0m 0s


(X_train_vec, y_train), (X_val_vec, y_val), (X_test_vec, y_test)


## Keep fix

In [25]:
all_df_metrics = []

# TRAINING
# - train
trained_model = train_model(model, X_train, y_train)

# INFO
info = {'mode': mode, 'model': model, 'score': score, 'parser': parser, 'task': task}

# - evaluate
out = evaluate(trained_model, data_list, y_score_list, info, parsers)

# append
all_df_metrics.append({'metrics' : out, 'model' : model})



## MODEL

In [26]:
# load data
out

Unnamed: 0,mode,model,score,parser,task,subset,r2,rmse,rmae,rir,acc,r2_pymupdf,rmse_pymupdf,rmae_pymupdf,n
0,fasttext,ridge,bleu,pymupdf,reg,train,0.458508,0.13972,0.320708,0.928704,0.569796,0.37668,0.160631,0.344181,16190
1,fasttext,ridge,bleu,pymupdf,reg,val,0.245811,0.164032,0.355629,0.93103,0.569075,0.308822,0.160061,0.355803,789
2,fasttext,ridge,bleu,pymupdf,reg,test,-0.034285,0.173638,0.368612,0.935256,0.5625,0.011859,0.156657,0.352913,1296


In [53]:
# save model
type(trained_model)

sklearn.multioutput.MultiOutputRegressor

In [52]:
predictions = trained_model.predict(X_train)
row_wise_differences = predictions[:, 0] - predictions[:, 1]
threshold = 0.05
exceeding_indices = np.where(row_wise_differences > threshold, 1, 0)

np.mean(exceeding_indices)

np.float64(0.016244595429277333)

In [31]:
# Assuming `predictions` is the array returned by `trained_model.predict(X_train)`
predictions = trained_model.predict(X_train)
threshold = 0.5
exceeding_indices = np.where(np.any(row_wise_differences > threshold, axis=1))[0]

print("Indices where threshold is exceeded:", len(exceeding_indices))

Indices where threshold is exceeded: 5059


In [63]:
import joblib
import os

def save_model(model, save_path):
    """Saves a scikit-learn model to the specified path."""
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # Ensure the directory exists
    joblib.dump(model, save_path)
    print(f"Model saved to {save_path}")

def load_model(load_path):
    """Loads a scikit-learn model from the specified path."""
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"No model found at {load_path}")
    return joblib.load(load_path)

# Example usage
save_path = '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/model_weights/scikit_pred/multioutput_model.pkl'
save_model(trained_model, save_path)

# PREDICTIONS

# Later when loading the model
loaded_model = load_model(save_path)
preds = loaded_model.predict(X_train)

preds

Model saved to /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/model_weights/scikit_pred/multioutput_model.pkl


array([[0.4504497 , 0.44716   ],
       [0.45007408, 0.44793683],
       [0.4570262 , 0.4821627 ],
       ...,
       [0.52012384, 0.50316215],
       [0.48782963, 0.48362923],
       [0.35260308, 0.39054576]], dtype=float32)

In [72]:
type(X_train)

numpy.ndarray

In [None]:
def load_model(load_path):
    """Loads a scikit-learn model from the specified path."""
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"No model found at {load_path}")
    return joblib.load(load_path)
loaded_model = load_model(save_path)

In [71]:
preds_diff = preds[:, 0] - preds[:, 1]

# Find the indices where the difference is greater than 0.05
indices = np.where(preds_diff > 0.1)[0]
indices

array([5525, 5958])

In [69]:
loaded_model.predict(X_train).mean(axis=0)

array([0.41305426, 0.4236159 ], dtype=float32)

In [None]:
import fasttext
import joblib
import os
import numpy as np
from sklearn.linear_model import Ridge


from data import load_data, process_data
from matplotlib import pyplot as plt