In [3]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset

In [5]:
# Locations of the abstract texts (JSON) and their embeddings (NumPy .npy file)
abstract_texts_path = '../data/vector_store_csLG/abstract_texts.json'
abstracts_embeddings_path = '../data/vector_store_csLG/abstract_embeddings.npy'

# Read the JSON inside a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(abstract_texts_path, encoding='utf-8') as abstract_texts_file:
    abstract_texts = json.load(abstract_texts_file)
abstracts_embeddings = np.load(abstracts_embeddings_path)

In [28]:
# Fetch the csLG dataset from Hugging Face
ds = load_dataset(path='charlieoneill/csLG')

In [15]:
# IDs of every row in the Hugging Face train split, plus a set of the same
# values for set comparisons.
hf_ids = ds['train']['id']
hf_ids_set = {*hf_ids}

In [16]:
# IDs stored under 'doc_ids' in the loaded abstract-texts JSON, plus a set of
# the same values for set comparisons.
our_ids = abstract_texts['doc_ids']
our_ids_set = {*our_ids}

In [17]:
# Report the size of each ID set, then the number of IDs they share
print(len(hf_ids_set), len(our_ids_set))
print(len(hf_ids_set & our_ids_set))

153146 153146
153146


In [24]:
# Display the dataset's splits, feature names and row counts
ds

DatasetDict({
    train: Dataset({
        features: ['categories', 'doi', 'id', 'year', 'venue', 'link', 'updated', 'published', 'title', 'abstract', 'authors'],
        num_rows: 153146
    })
})

In [29]:
# Drop columns from the dataset.
# Only the columns that are still present are removed, so re-running this cell
# on an already-trimmed `ds` is a no-op instead of raising on missing names.
columns_to_drop = ['categories', 'abstract', 'venue', 'link', 'updated', 'published']
present_columns = set(ds['train'].column_names)
stale_columns = [col for col in columns_to_drop if col in present_columns]
if stale_columns:
    ds = ds.remove_columns(stale_columns)
ds

DatasetDict({
    train: Dataset({
        features: ['doi', 'id', 'year', 'title', 'authors'],
        num_rows: 153146
    })
})

In [32]:
# Build a pandas DataFrame from the train split and preview the first rows
df = pd.DataFrame(data=ds['train'])
df.head()

Unnamed: 0,doi,id,year,title,authors
0,,1004,,Multiplicative Algorithm for Orthgonal Groups ...,[['Toshinao Akuzawa']]
1,,1008,,Predicting the expected behavior of agents tha...,[['Jose M. Vidal' 'Edmund H. Durfee']]
2,,1027,,Pattern Discovery and Computational Mechanics,[['Cosma Rohilla Shalizi' 'James P. Crutchfiel...
3,,2006,,Multiplicative Nonholonomic/Newton -like Algor...,[['Toshinao Akuzawa' 'Noboru Murata']]
4,,3072,,MOO: A Methodology for Online Optimization thr...,[['Jason W. H. Lee' 'Y. C. Tay' 'Anthony K. H....


In [33]:
# Rename 'id' to 'arxiv_id' and add a 'citation_count' column set to 0 for every row
df = df.rename(columns={'id': 'arxiv_id'}).assign(citation_count=0)
df.head()

Unnamed: 0,doi,arxiv_id,year,title,authors,citation_count
0,,1004,,Multiplicative Algorithm for Orthgonal Groups ...,[['Toshinao Akuzawa']],0
1,,1008,,Predicting the expected behavior of agents tha...,[['Jose M. Vidal' 'Edmund H. Durfee']],0
2,,1027,,Pattern Discovery and Computational Mechanics,[['Cosma Rohilla Shalizi' 'James P. Crutchfiel...,0
3,,2006,,Multiplicative Nonholonomic/Newton -like Algor...,[['Toshinao Akuzawa' 'Noboru Murata']],0
4,,3072,,MOO: A Methodology for Online Optimization thr...,[['Jason W. H. Lee' 'Y. C. Tay' 'Anthony K. H....,0


In [35]:
# Fill missing years with 2023, then store the column as integers.
# NOTE(review): the rows previewed by df.head() have no year and therefore end
# up labelled 2023 — confirm this default is intended for those papers.
df['year'] = df['year'].fillna(2023).astype(int)

# Store the arXiv identifiers as strings
df['arxiv_id'] = df['arxiv_id'].astype(str)

In [36]:
# Preview the first rows to check the 'year' and 'arxiv_id' conversions
df.head()

Unnamed: 0,doi,arxiv_id,year,title,authors,citation_count
0,,1004,2023,Multiplicative Algorithm for Orthgonal Groups ...,[['Toshinao Akuzawa']],0
1,,1008,2023,Predicting the expected behavior of agents tha...,[['Jose M. Vidal' 'Edmund H. Durfee']],0
2,,1027,2023,Pattern Discovery and Computational Mechanics,[['Cosma Rohilla Shalizi' 'James P. Crutchfiel...,0
3,,2006,2023,Multiplicative Nonholonomic/Newton -like Algor...,[['Toshinao Akuzawa' 'Noboru Murata']],0
4,,3072,2023,MOO: A Methodology for Online Optimization thr...,[['Jason W. H. Lee' 'Y. C. Tay' 'Anthony K. H....,0


In [37]:
# Save the metadata table as CSV (without the index column).
# The output directory is created first so the write does not fail when it
# is missing.
# NOTE(review): this path is relative to the working directory, unlike the
# '../data/...' input paths — confirm the location is intended.
output_path = Path('sae_data_csLG/paper_metadata.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)