In [1]:
import os
import pandas as pd
import json
from pathlib import Path
import pymupdf
import numpy as np

from utils import mode, simulated_scanned_effect, remove_text_layer

# Hyperparameters: the closed set of top-level categories a document may be assigned to
categories = [
    'ComputerScience',
    'Engineering',
    'Physics',
    'Chemistry',
    'Mathematics',
    'Economics',
    'Biology',
    'Medicine',
]

# root directory of the joint corpus
p_joint = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint/')

# Toggle for writing the final CSV; kept off because the result was already stored on Oct 2nd
save_flag = False

## Two runs to predict categories

### 1. GPT-4
The predictions are stored in:
```
../statistical_tasks/gpt4_category_inference/category_frames/gpt4_v1.csv
```

### 2. HuggingFace
The outputs of the various runs are stored in:
```
/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/predicted_categories
```

In [2]:
# = = = = =
# 1. GPT-4
# = = = = =
# corpus root and destination directory (neither is referenced again in this cell)
p_root = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint/')
p_dst_shaved = Path('/home/siebenschuh/Projects/dataprep/code/data_assembly/')

# pipe-separated CSV holding the GPT-4 category predictions
p_categories = Path('../statistical_tasks/gpt4_category_inference/category_frames/gpt4_v1.csv')
df_gpt4 = pd.read_csv(p_categories, sep='|')

# derive the publisher from the first path component and strip spaces from the class label
df_gpt4 = df_gpt4.assign(**{
    'publisher': df_gpt4['path'].str.split('/').str[0],
    'class': df_gpt4['class'].str.replace(' ', ''),
})

# keep only rows whose class is one of the known categories
df_gpt4 = df_gpt4.loc[df_gpt4['class'].isin(categories)]


In [8]:
# Second GPT-4 run. The `sep` option belongs to `pd.read_csv`, not to `Path`:
# pathlib does not use extra keyword arguments (and deprecates passing them).
p = Path('/home/siebenschuh/Projects/dataprep/code/statistical_tasks/gpt4_category_inference/safecopy/super_and_sup_classes_v2_SECOND_RUN_2901.csv')

df_gpt4_2 = pd.read_csv(p, sep='|')
# inspect the columns of the frame that was just loaded (not an unrelated `df`)
df_gpt4_2.columns

Index(['path', 'class', 'subclass'], dtype='object')

In [19]:
# Display the `class` column of the second GPT-4 run to inspect its values.
df_gpt4_2['class']

0       Based on the provided text, which contains onl...
1                                 Broad Category: Biology
2       Based on the provided text, which contains onl...
3       Based on the provided text, which consists of ...
4       Based on the provided text, which contains onl...
                              ...                        
2896    Based on the provided text, which consists of ...
2897    Based on the provided text, which contains onl...
2898                              Broad Category: Biology
2899                              Broad Category: Biology
2900                              Broad Category: Biology
Name: class, Length: 2901, dtype: object

In [9]:
# reduce the first GPT-4 frame to the path / class / subclass columns
df_gpt4_1 = df_gpt4.loc[:, ['path', 'class', 'subclass']]

In [14]:
# Stack the two GPT-4 frames row-wise and renumber the index from 0.
# NOTE(review): this rebinds `df_gpt4`, so the result of later cells that read
# `df_gpt4` depends on whether this cell has been executed - confirm the intended order.
df_gpt4 = pd.concat(objs=[df_gpt4_1, df_gpt4_2], axis=0, ignore_index=True)

In [18]:
# persist the combined GPT-4 frame as a pipe-separated CSV without the row index
df_gpt4.to_csv(
    '../statistical_tasks/gpt4_category_inference/category_frames/gpt4_v2_reduced.csv',
    sep='|',
    index=False,
)

In [3]:
# = = = = = = = = = = = = = = =
# 2. Various HF Models
# = = = = = = = = = = = = = = =

# directory with the per-run prediction CSVs
p_LLM_frames = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/predicted_categories')
# `os.listdir` returns entries in arbitrary order; sort so the combined frame is reproducible
csv_files = sorted(p_LLM_frames / f for f in os.listdir(p_LLM_frames) if f.endswith('.csv'))
# read each CSV file into a DataFrame and concatenate them row-wise
df_list = [pd.read_csv(f) for f in csv_files]
df_combined = pd.concat(df_list, axis=0, ignore_index=True)

# handle absolute paths --> bring back to `arxiv/pdf/3748.35.pdf` etc.
# Only the rows that actually start with `/lus` are rewritten. Assigning the
# shortened Series to the whole column would turn every other path into NaN
# (index alignment), and those rows would then be dropped by the groupby below.
is_abs_path = df_combined['path'].str.startswith('/lus', na=False)
df_combined.loc[is_abs_path, 'path'] = (
    df_combined.loc[is_abs_path, 'path'].str.split('/').str[-3:].str.join('/')
)

# all entries
df_combined['publisher'] = df_combined['path'].str.split('/').str[0]
# drop rows that claim subcategory `Probability` outside of `Mathematics`
df_combined = df_combined[
    ~((df_combined['predicted_category'] != 'Mathematics') & (df_combined['predicted_subcategory'] == 'Probability'))
].copy()
# `.loc` is required here: `frame[mask]['class'] = ...` writes to a temporary copy and has no effect.
# NOTE(review): this `class` column is not part of the aggregation below, so it does not
# reach `df_unique`; Nature documents are added explicitly in section 3.
df_combined.loc[df_combined['publisher'] == 'nature', 'class'] = 'Nature'

# unique: one row per path, aggregating the individual predictions with `mode`
df_unique = df_combined.groupby('path').agg({
    'publisher': mode,
    'predicted_category': mode,
    'predicted_subcategory': mode
}).reset_index()

# filter steps ("weak human supervision"), re-applied to the aggregated predictions
df_unique = df_unique[
    ~((df_unique['predicted_category'] != 'Mathematics') & (df_unique['predicted_subcategory'] == 'Probability'))
].copy()

# assign class (spaces removed) and keep only known categories
df_unique['class'] = df_unique['predicted_category'].str.replace(' ', '')
df_unique = df_unique[df_unique['class'].isin(categories)]

# merge GPT-4 predictions (left, `class_x`) with the HF ensemble (right, `class_y`)
# NOTE(review): `df_gpt4` is whatever earlier cells left bound to that name; if the cell
# that concatenates the second GPT-4 run has been executed, those rows are merged here
# as well - confirm that this is intended.
df = pd.merge(left=df_gpt4, right=df_unique, on='path', how='outer')

# consolidate: GPT-4 wins wherever it provides a value, otherwise fall back to the HF ensemble
# - class
has_gpt4_class = df['class_x'].notna()
df['class'] = np.where(has_gpt4_class, df['class_x'], df['class_y'])
df['class_src'] = np.where(has_gpt4_class, 'gpt4', 'hf_ensemble')
# - subclass: take the mask BEFORE `subclass` is back-filled, otherwise every row
#   with any subclass would be attributed to GPT-4
has_gpt4_subclass = df['subclass'].notna()
df['subclass_src'] = np.where(has_gpt4_subclass, 'gpt4', 'hf_ensemble')
df['subclass'] = np.where(has_gpt4_subclass, df['subclass'], df['predicted_subcategory'])

# subset to non-NaN
df = df[df['class'].notna()].copy()

# attach publisher info
df['publisher'] = df['path'].str.split('/').str[0]

# subset to `path`, `class`, `subclass` (+ provenance columns)
df = df[['path', 'publisher', 'class', 'subclass', 'class_src', 'subclass_src']]

# = = = = = = = = = = = = = = =
# 3. Manual Handling of `Nature`
# = = = = = = = = = = = = = = =
nature_dir = p_joint / 'nature'
# last three path components of every PDF below the Nature directory; sorted for a stable row order
nature_paths = ["/".join(str(pdf).split('/')[-3:]) for pdf in sorted(nature_dir.rglob('*.pdf'))]

df_nature = pd.DataFrame({'path' : nature_paths,
                          'publisher' : ['Nature'] * len(nature_paths),
                          'class' : ['Nature'] * len(nature_paths),
                          'subclass': ['-'] * len(nature_paths),
                          'class_src': ['-'] * len(nature_paths),
                          'subclass_src': ['-'] * len(nature_paths)}, index=None)

# append
# NOTE(review): paths that already received a prediction above are not removed before
# appending, so a Nature path may appear twice - verify whether de-duplication is needed.
df = pd.concat([df, df_nature], axis=0, ignore_index=True)

# save
if save_flag:
    df.to_csv('./final_predicted_meta/predicted_categories_final.csv', sep='|', index=False)