In [None]:
import pandas as pd
from pathlib import Path
import os
import json
import numpy as np
import random

### Script that reads in the metadata CSV files (in subdirectory `csv`) stored along with the scraped web text (in `html` subdirectory)
- since `html` content and metadata are web-scraped per platform, the way content is stored varies slightly
- this script "unifies" them again so they can be stored in a dataset-wide data table

In [163]:
def read_meta_table(p: Path) -> pd.DataFrame:
    '''
    Read one pipe-separated metadata CSV and return it as a DataFrame.

    The layout of the file depends on the platform it was scraped from. The
    platform is detected from the directory name contained in the path:

    - `medrxiv` / `biorxiv` / `bmc`: read without header and transposed; the
      first row after transposing supplies the column names
    - `nature`: read without header; only the first two fields of the first
      row are kept, as a single key -> value column
    - `arxiv`: read as a regular table with a header row
    - `mdpi`: read with a header row and transposed; the first row after
      transposing supplies the column names and the first resulting column
      is dropped

    Parameters
    ----------
    p : Path
        Path of the metadata CSV.

    Returns
    -------
    pd.DataFrame
        Parsed metadata plus a `pdf_path` column (the CSV path with `/csv/`
        replaced by `/pdf/` and `.csv` by `.pdf`). If the platform is not
        recognised, or reading/reshaping fails, a one-row frame containing
        only `pdf_path` is returned instead.
    '''

    f_p = str(p)

    # (presumed) path of the source PDF; shared by the regular and fallback result
    pdf_path = f_p.replace('/csv/', '/pdf/').replace('.csv', '.pdf')

    try:
        if ('/medrxiv/' in f_p) or ('/biorxiv/' in f_p) or ('/bmc/' in f_p):
            df = pd.read_csv(p, sep='|', header=None)
            df = df.T
            # first transposed row holds the keys -> use as header, then remove it
            df.columns = df.iloc[0]
            df = df.drop(0)
            # discard columns whose key is missing
            df = df.loc[:, ~df.columns.isna()]
            df = df.reset_index(drop=True)

        elif ('/nature/' in f_p):
            df = pd.read_csv(p, sep='|', header=None)
            key = df.iloc[0, 0]  # 'html_url'
            value = df.iloc[0, 1]  # The URL
            df = pd.DataFrame({key: [value]})

        elif ('/arxiv/' in f_p):
            df = pd.read_csv(p, sep='|')

        elif ('/mdpi/' in f_p):
            df = pd.read_csv(p, sep='|')
            df = df.T
            # first transposed row holds the keys -> use as header, then remove it
            df.columns = df.iloc[0]
            df = df.drop('Unnamed: 0')
            # discard columns whose key is missing
            df = df.loc[:, ~df.columns.isna()]
            df = df.reset_index(drop=True)
            df = df.drop(columns=[df.columns[0]])

            # drop columns name
            df.columns.name = None

        else:
            # unknown platform: nothing to parse, only the PDF path is known
            return pd.DataFrame({'pdf_path': pdf_path}, index=[0])

        # append (presumed) path of source PDF
        df['pdf_path'] = pdf_path
    except Exception:
        # Best effort: an unreadable or malformed file still yields a row so the
        # paper is not lost. `Exception` (not a bare `except`) lets
        # KeyboardInterrupt / SystemExit propagate, so a long batch run can
        # still be interrupted.
        df = pd.DataFrame({'pdf_path': pdf_path}, index=[0])

    return df
        

In [161]:
# load the raw parser output table (pipe-delimited) from the database directory
p = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database')
df_raw = pd.read_csv(
    p.joinpath('parser_output_raw.csv'),
    sep='|',
)

In [174]:
%%time 

# source path
p = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint')

# extract all CSVs
csv_files = list(p.rglob('*.csv'))

# subset
n = len(csv_files)
out_list = [read_meta_table(csv_file) for csv_file in csv_files[:n]]

# merge
merged_df = pd.concat(out_list, axis=0, join='outer', ignore_index=True)

# path
merged_df['path'] = merged_df['pdf_path'].apply(lambda x: '/'.join(x.split('/')[-3:]))

# sort columns
merged_df = merged_df[['path', 'prim_cat', 'title', 'pdf_url', 'html_url', 'summary', 'comment', 'id', 'date_published', 'date_updated', 'doi', 'categories', 'journal_ref', 'pdf_path']]

CPU times: user 36.5 s, sys: 6.03 s, total: 42.5 s
Wall time: 45.7 s


In [175]:
# bring the merged metadata into the row order of the raw table (matched on `path`)
path_indexed = merged_df.set_index('path')
df_proc_merged_sorted = path_indexed.loc[df_raw['path']].reset_index()

In [176]:
# preview the first rows of the re-ordered metadata table
df_proc_merged_sorted.head()

Unnamed: 0,path,prim_cat,title,pdf_url,html_url,summary,comment,id,date_published,date_updated,doi,categories,journal_ref,pdf_path
0,arxiv/pdf/2207.11282v4.pdf,q-bio.NC,Communities in C.elegans connectome through th...,http://arxiv.org/pdf/2207.11282v4.pdf,http://arxiv.org/html/2207.11282v4,The fundamental relationship between the mesos...,This work previously appeared as arXiv:2207.00...,http://arxiv.org/abs/2207.11282v4,2022-07-22 18:15:55+00:00,2024-01-10 18:44:01+00:00,10.1038/s41598-023-49503-5,,,/lus/eagle/projects/argonne_tpc/siebenschuh/au...
1,arxiv/pdf/2303.02697v2.pdf,cond-mat.soft,Self-organized vortex phases and hydrodynamic ...,http://arxiv.org/pdf/2303.02697v2.pdf,http://arxiv.org/html/2303.02697v2,Flocking behavior is observed in biological sy...,"Version 2: 11 pages, 9 figures, even more data...",http://arxiv.org/abs/2303.02697v2,2023-03-05 15:22:06+00:00,2024-05-23 16:53:52+00:00,,,,/lus/eagle/projects/argonne_tpc/siebenschuh/au...
2,arxiv/pdf/2306.11599v2.pdf,q-fin.MF,Collective Arbitrage and the Value of Cooperation,http://arxiv.org/pdf/2306.11599v2.pdf,http://arxiv.org/html/2306.11599v2,We introduce the notions of Collective Arbitra...,,http://arxiv.org/abs/2306.11599v2,2023-06-20 15:21:37+00:00,2024-05-30 09:28:00+00:00,,,,/lus/eagle/projects/argonne_tpc/siebenschuh/au...
3,arxiv/pdf/2306.11872v2.pdf,eess.SY,Predicting Strategic Energy Storage Behaviors,http://arxiv.org/pdf/2306.11872v2.pdf,http://arxiv.org/html/2306.11872v2,Energy storage are strategic participants in e...,"accepted by IEEE Transactions on Smart Grid, 2023",http://arxiv.org/abs/2306.11872v2,2023-06-20 20:10:09+00:00,2024-01-31 19:43:42+00:00,10.1109/TSG.2023.3303469,,,/lus/eagle/projects/argonne_tpc/siebenschuh/au...
4,arxiv/pdf/2307.00277v1.pdf,eess.SY,Optimally Coordinated Energy Management Framew...,http://arxiv.org/pdf/2307.00277v1.pdf,http://arxiv.org/html/2307.00277v1,Contemporary distribution network can be seen ...,,http://arxiv.org/abs/2307.00277v1,2023-07-01 09:11:50+00:00,2023-07-01 09:11:50+00:00,,,,/lus/eagle/projects/argonne_tpc/siebenschuh/au...


In [178]:
# write the re-ordered metadata table to disk (pipe-delimited, index column omitted)
df_proc_merged_sorted.to_csv(
    '/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database/meta_raw_table.csv',
    sep='|',
    index=False,
)