## Library imports

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import ujson
import requests
import re
import networkx as nx
import numpy as np
import plotly.graph_objs as go
import plotly.express as px


from tqdm.auto import tqdm
from pathlib import Path
from typing import List, Optional, Tuple, Union
from joblib import Parallel, delayed
from os import cpu_count
from bs4 import BeautifulSoup
from plotly.offline import plot
from itertools import combinations
from functools import partial
from IPython.display import IFrame

## Get full category tree of ArXiV codes
---
We can use the categories' meta-information for a more convenient analysis; e.g., given the code `astro-ph.EP`, we know:

```yaml
- category 1: Physics
- category 2: Astrophysics
- category 3: Earth and Planetary Astrophysics
- category description: Interplanetary medium, planetary physics, planetary astrobiology, extrasolar planets, comets, asteroids, meteorites. Structure and formation of the solar system
```
---
Also, HTML output (i.e. for Plotly's labels) can be seen in `html_header` column:

```python
from IPython.display import display, HTML

display(HTML(category_data.loc['astro-ph.EP', 'html_header']))
```
which leads to representation:

`<h3>Physics</h3><h4>Astrophysics</h4><h5>Earth and Planetary Astrophysics</h5><h5>astro-ph.EP</h5><h5>Interplanetary medium, planetary physics, planetary astrobiology, extrasolar planets, comets, asteroids, meteorites. Structure and formation of the solar system</h5>`

or rendered label:

<h3>Physics</h3><h4>Astrophysics</h4><h5>Earth and Planetary Astrophysics</h5><h5>astro-ph.EP</h5><h5>Interplanetary medium, planetary physics, planetary astrobiology, extrasolar planets, comets, asteroids, meteorites. Structure and formation of the solar system</h5>
   
---
**(!) We will use the `code_full` index to join the category data with the papers.**

In [3]:
# parse category tree
def get_category_tree_data(category_url: str) -> pd.DataFrame:
    """
    Get category tree for ArXiv codes 
    in convenient dataframe representation.
    Concrete codes are located in `code_full` index.
    Stores up to 3 category levels, 
    lvl 3 being the node, 
    lvl 1 being the root(s)
    
    Parameters
    ----------
    category_url: str
        What page to parse to get the taxonomy
        
    Returns
    -------
    data: pd.DataFrame
        Dataframe representation of categories, indexed by `code_full`,
        with columns
        [
            'cat1', 'cat1_code', 
            'cat2', 'cat2_code', 
            'cat3', 'cat3_code',
            'description', 'html_header'
        ]

    Raises
    ------
    requests.HTTPError
        If the taxonomy page answers with an error status.
    ValueError
        If the page has no element with id `category_taxonomy_list`.
    """
    # fail fast on network problems instead of parsing an error page
    response = requests.get(category_url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features='html.parser')
    table = soup.find(id='category_taxonomy_list')
    if table is None:
        raise ValueError(
            f'no `category_taxonomy_list` element found at {category_url}'
        )

    groups = [e.text for e in table.find_all('h2')]
    subgroups = [e.text for e in table.find_all('h3')]

    # every category block is expected to split into [title line, description]
    categories = [
        [block for block in re.split('\n+', e.text) if block]
        for e in table.find_all(class_='columns divided')
    ]

    round_brackets_regex = r'\((.*?)\)'

    # create category df
    data = pd.DataFrame(categories, columns=['cat3', 'description'])
    data['code_full'] = data['cat3'].str.split(r'\s+').str[0]
    data['cat3_code'] = data['code_full'].str.split('.').str[1]
    data['cat3'] = data['cat3'].str.extract(round_brackets_regex)
    data['cat2_code'] = data['code_full'].str.split('.').str[0]

    # add cat2 titles (physics)
    df2 = pd.DataFrame(subgroups, columns=['cat2'])
    df2['cat2_code'] = df2['cat2'].str.extract(round_brackets_regex)
    df2['cat2'] = df2['cat2'].str.split('(').str[0]
    data['cat2'] = data['cat2_code'].map(df2.set_index('cat2_code')['cat2'])

    exclude_cat1 = 'Physics' # the only parent category that has 3 levels
    groups.remove(exclude_cat1)

    # NOTE: the remaining group titles are paired with the codes purely by
    # position, i.e. this relies on both following the page order
    df1 = pd.DataFrame(
        data={
            'cat1': groups,
            'cat1_code': data.loc[data['cat2'].isnull(), 'cat2_code'].unique()
        }
    )
    data = data.merge(df1, how='left', left_on='cat2_code', right_on='cat1_code')

    # fill NA properly
    # rows without a matched root belong to the excluded 3-level category
    data.loc[data['cat1'].isnull(), 'cat1'] = exclude_cat1
    # 2-level categories reuse the root title as their cat2 title
    cat2_na_idx = data['cat2'].isnull()
    data.loc[cat2_na_idx, 'cat2'] = data.loc[cat2_na_idx, 'cat1']
    cat1_na_idx = data['cat1_code'].isnull()
    data.loc[cat1_na_idx, 'cat1_code'] = \
        data.loc[cat1_na_idx, 'cat1'].str.lower()
    # codes without a `.suffix` fall back to their cat2 code
    cat3_na_idx = data['cat3_code'].isnull()
    data.loc[cat3_na_idx, 'cat3_code'] = \
        data.loc[cat3_na_idx, 'cat2_code']

    # alphabetical column order fixes the order of the header parts below
    data = data.sort_index(axis=1)

    # heading levels go h3, h4 and are capped at h5 for the remaining parts
    data['html_header'] = data.apply(
        lambda x: ''.join(
            f'<h{min(5, i+3)}>{v}</h{min(5, i+3)}>'
            for (i, v) in enumerate(x.filter(regex='(code_full|cat[0-9]$|description)').values)
        ),
        axis=1
    )

    data = data.set_index('code_full')

    return data


def _get_obj_name(obj):
    """
    Return the first module-level (global) name bound to `obj`.

    Objects are matched by identity (`is`), not by equality.

    Parameters
    ----------
    obj: object
        Object to look up among the module globals

    Returns
    -------
    name: str
        First global name, in namespace order, that refers to `obj`

    Raises
    ------
    IndexError
        If no global name is bound to `obj`
    """
    for name, value in globals().items():
        if value is obj:
            return name
    # same exception type as the former `[...][0]` lookup, but with context
    raise IndexError(
        f'no global name is bound to the given {type(obj).__name__} object'
    )

Let's save the obtained data as a CSV file
<br>and also check a sample of the category data.

In [4]:
category_url = "https://arxiv.org/category_taxonomy"
category_data = get_category_tree_data(category_url=category_url)

# optional JSON export named after the dataframe variable (disabled)
# cat_df_name = _get_obj_name(category_data)
# with open(f'data/{cat_df_name}.json', 'w') as f:
#     ujson.dump(
#         category_data.to_dict(orient='index'),
#         f
#     )

# make sure the output folder exists before writing into it
Path('data').mkdir(parents=True, exist_ok=True)
category_data.to_csv('data/arxiv_categories.csv')

# check sample data
category_data.sample(5, random_state=42)

Unnamed: 0_level_0,cat1,cat1_code,cat2,cat2_code,cat3,cat3_code,description,html_header
code_full,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
astro-ph.GA,Physics,physics,Astrophysics,astro-ph,Astrophysics of Galaxies,GA,Phenomena pertaining to galaxies or the Milky ...,<h3>Physics</h3><h4>Astrophysics</h4><h5>Astro...
q-fin.GN,Quantitative Finance,q-fin,Quantitative Finance,q-fin,General Finance,GN,Development of general quantitative methodolog...,<h3>Quantitative Finance</h3><h4>Quantitative ...
cs.OS,Computer Science,cs,Computer Science,cs,Operating Systems,OS,Roughly includes material in ACM Subject Class...,<h3>Computer Science</h3><h4>Computer Science<...
cs.NI,Computer Science,cs,Computer Science,cs,Networking and Internet Architecture,NI,Covers all aspects of computer communication n...,<h3>Computer Science</h3><h4>Computer Science<...
physics.flu-dyn,Physics,physics,Physics,physics,Fluid Dynamics,flu-dyn,"Turbulence, instabilities, incompressible/comp...",<h3>Physics</h3><h4>Physics</h4><h5>Fluid Dyna...


## Filter and load the data
---
We can load the full data or just sample its part to reduce RAM used and time spent.
- you can control the amount of loaded data by
  - sample size (`n_rows` arg), e.g. 10000 *random* rows
  - sample fraction (`frac` arg), e.g. a *random* 5% of the data
  - **max(file_size * frac, n_rows) rows are taken as a result**
  - reproducibility is controlled by the `random_state` arg
- also, you can specify what column subset to get (`columns` arg)
- also, you can filter 
    - articles **newer** than `published_after` timestamp
    - articles updated **after** `last_update_after` timestamp

In [4]:
def get_sampled_data(
    data_path: Union[str, Path],
    frac: Optional[float] = None,
    n_rows: Optional[int] = None,
    columns: Optional[List[str]] = None,
    last_update_after: Optional[str] = None,
    published_after: Optional[str] = None,
    random_state: int = 42,
) -> pd.DataFrame:
    """
    Load (a random sample of) a JSON-lines file into a dataframe.

    Every non-blank line of the file is parsed as one JSON record.
    A `created_at` column is always added; it is taken from the
    `created` field of the first entry of the record's `versions`.

    Parameters
    ----------
    data_path: Union[str, Path]
        Path to the JSON-lines file
    frac: Optional[float]
        Fraction of records to sample, 0 < frac <= 1
    n_rows: Optional[int]
        Number of records to sample, 1 <= n_rows <= # of records.
        If both `frac` and `n_rows` are given, the larger sample wins;
        if neither is given, all records are loaded
    columns: Optional[List[str]]
        Record keys to keep (`created_at` is always kept, `versions`
        is always dropped). `None` or an empty list keeps all keys
    last_update_after: Optional[str]
        Keep only records with `update_date` >= this timestamp
    published_after: Optional[str]
        Keep only records with `created_at` >= this timestamp
    random_state: int
        Seed for the row sampling

    Returns
    -------
    df: pd.DataFrame
        Loaded records; `created_at` and `update_date` (when present)
        are converted to UTC datetimes

    Raises
    ------
    AssertionError
        If `n_rows` / `frac` are out of range, or a date filter is
        requested while `columns` lacks the column named in the message
    """
    # get # of records (blank lines are not records)
    with open(data_path, 'r') as fp:
        num_lines = sum(
            1
            for line in tqdm(fp, desc='counting # rows in a file...')
            if line.rstrip()
        )

    # parse the cut-offs once; `columns=None` keeps every column
    update_cutoff = None
    if last_update_after:
        assert columns is None or 'update_date' in columns, \
            'cannot filter on missing column `update_date`'
        update_cutoff = pd.to_datetime(last_update_after, utc=True)
    publish_cutoff = None
    if published_after:
        assert columns is None or 'versions' in columns, \
            'cannot filter on missing column `versions`'
        publish_cutoff = pd.to_datetime(published_after, utc=True)

    # validate each sampling argument only when it was actually passed
    if n_rows is not None:
        assert 1 <= n_rows <= num_lines, 'no more lines than file length'
    if frac is not None:
        assert 0. < frac <= 1., 'no more fraction than 100%'

    if n_rows is None and frac is None:
        n_samples = num_lines
    else:
        n_samples = min(
            num_lines,
            max(1, n_rows or 0, int(frac * num_lines) if frac else 0),
        )

    if n_samples < num_lines:
        sample_indexes = set(
            pd.Series(
                np.arange(num_lines)
            ).sample(n=n_samples, random_state=random_state).values
        )
    else:
        # everything is requested: no need to build a full index set
        sample_indexes = None

    # hoisted out of the loop: keys to keep for every record
    keep_keys = (set(columns) | {'created_at'}) - {'versions'} if columns else None

    arxiv_data = []
    with open(data_path, 'r') as fp:
        # index only non-blank lines so positions agree with `num_lines`
        records = (line for line in fp if line.rstrip())

        for idx, line in tqdm(
            enumerate(records),
            total=num_lines,
            desc=f"loading up to {n_samples}/{num_lines} articles"
        ):
            if sample_indexes is not None and idx not in sample_indexes:
                continue

            chunk = ujson.loads(line)
            chunk['created_at'] = chunk['versions'][0]['created']

            if keep_keys is not None:
                chunk = {k: v for k, v in chunk.items() if k in keep_keys}

            arxiv_data.append(chunk)

    df = pd.DataFrame.from_records(arxiv_data)

    # columns may be absent (empty file or a column subset without them)
    for date_column in ('created_at', 'update_date'):
        if date_column in df.columns:
            df[date_column] = pd.to_datetime(df[date_column], utc=True)

    if publish_cutoff is not None or update_cutoff is not None:

        if len(df):
            keep = pd.Series(True, index=df.index)
            if publish_cutoff is not None:
                keep &= df['created_at'] >= publish_cutoff
            if update_cutoff is not None:
                keep &= df['update_date'] >= update_cutoff
            df = df[keep]

        print(
            f"""
            filtered out {n_samples - len(df)}/{n_samples} records 
            due to update/publish date filters"""
        )

    return df

In [5]:
# Input data obtained from Kaggle: https://www.kaggle.com/datasets/Cornell-University/arxiv
data_path = 'data/arxiv-metadata-oai-snapshot.json'
columns_to_get = [
    'id',
    # 'authors',
    'title',
    'categories',
    'abstract',
    'versions',
    'update_date',
    'authors_parsed',
]

last_update_after = None
published_after = '2010-01-01'
frac = 1. # reduce to speed-up and to lower RAM consumption
n_rows = 1 # lower bound only: the larger of `frac` and `n_rows` wins
seed = 42

df = get_sampled_data(
    data_path=data_path,
    # rows to get = max of two
    frac=frac,
    n_rows=n_rows,
    columns=columns_to_get,
    published_after=published_after,
    last_update_after=last_update_after,
    random_state=seed,
)
# each parsed author is a list of name parts -> join them into one string
df['authors_parsed'] = df['authors_parsed'].apply(lambda x: [' '.join(a).strip() for a in x])
# raw string: '\s' in a plain literal is an invalid escape sequence
df['categories'] = df['categories'].str.split(r'\s+')

print(df.shape)

# check sample data
df.head()

counting # rows in a file...: 0it [00:00, ?it/s]

loading up to 2268252/2268252 articles:   0%|          | 0/2268252 [00:00<?, ?it/s]


            filtered out 579570/2268252 records 
            due to update/publish date filters
(1688682, 7)


Unnamed: 0,id,title,categories,abstract,update_date,authors_parsed,created_at
165291,1001.0003,A landscape of non-supersymmetric AdS vacua on...,[hep-th],We construct new families of non-supersymmet...,2010-05-19 00:00:00+00:00,"[Koerber Paul, Kors Simon]",2010-01-04 13:51:46+00:00
165302,1001.0014,Jet Shapes and Jet Algorithms in SCET,[hep-ph],Jet shapes are weighted sums over the four-m...,2010-12-06 00:00:00+00:00,"[Ellis Stephen D., Hornig Andrew, Lee Christop...",2010-01-04 20:56:57+00:00
165303,1001.0015,A Comprehensive Analysis of Uncertainties Affe...,[astro-ph.CO],We conduct a comprehensive analysis of the r...,2010-06-15 00:00:00+00:00,"[Behroozi Peter S., Conroy Charlie, Wechsler R...",2010-01-03 19:43:29+00:00
165305,1001.0017,"Testing product states, quantum Merlin-Arthur ...",[quant-ph],We give a test that can distinguish efficien...,2013-10-03 00:00:00+00:00,"[Harrow Aram W., Montanaro Ashley]",2010-01-04 18:01:41+00:00
165310,1001.0022,Mu-Tau Production at Hadron Colliders,"[hep-ph, hep-ex]",Motivated by large nu_mu-nu_tau flavor mixin...,2010-04-30 00:00:00+00:00,"[Han Tao, Lewis Ian, Sher Marc]",2010-01-04 04:10:52+00:00


## Save data

Save the filtered dataframe to a CSV file for further analysis.

In [6]:
# write the filtered dataframe (index included) to disk
df.to_csv(path_or_buf='data/arxiv_since_2010.csv')