## Default imports

In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
import ujson
import requests
import re
import networkx as nx
import numpy as np
import plotly.graph_objs as go
import plotly.express as px


from tqdm.auto import tqdm
from pathlib import Path
from typing import List, Optional, Tuple, Union
from joblib import Parallel, delayed
from os import cpu_count
from bs4 import BeautifulSoup
from plotly.offline import plot
from itertools import combinations
from functools import partial
from IPython.display import IFrame

## Get full category tree of ArXiV codes
---
We can utilize meta-information of categories for more convenient analysis, i.e. given the code `astro-ph.EP` we know

```yaml
- category 1: Physics
- category 2: Astrophysics
- category 3: Earth and Planetary Astrophysics
- category description: Interplanetary medium, planetary physics, planetary astrobiology, extrasolar planets, comets, asteroids, meteorites. Structure and formation of the solar system
```
---
Also, HTML output (i.e. for Plotly's labels) can be seen in `html_header` column:

```python
from IPython.core.display import display, HTML

display(HTML(data.loc['astro-ph.EP', 'html_header']))
```
which leads to representation:

`<h3>Physics</h3><h4>Astrophysics</h4><h4>Earth and Planetary Astrophysics</h4><h4>astro-ph.EP</h4><h5>Interplanetary medium, planetary physics, planetary astrobiology, extrasolar planets, comets, asteroids, meteorites. Structure and formation of the solar system</h5>`

or rendered label:

<h3>Physics</h3><h4>Astrophysics</h4><h4>Earth and Planetary Astrophysics</h4><h4>astro-ph.EP</h4><h5>Interplanetary medium, planetary physics, planetary astrobiology, extrasolar planets, comets, asteroids, meteorites. Structure and formation of the solar system</h5>
   
---
**(!)We will use `code_full` index to join category data with the papers**

In [10]:
# parse category tree
def get_category_tree_data(category_url: str) -> pd.DataFrame:
    """
    Get category tree for ArXiv codes 
    in convenient dataframe representation.
    Concrete codes are located in `full_code` index.
    Stores up to 3 category levels, 
    lvl 3 being the node, 
    lvl 1 being the root(s)
    
    Parameters
    ----------
    category_url: str
        What page to parse to get the taxonomy
        
    Returns
    -------
    data: pd.DataFrame
        Dataframe representation of categories, with columns
        [
            'cat1', 'cat1_code', 
            'cat2', 'cat2_code', 
            'cat3', 'cat3_code',
           'description'
        ]
    """
    category_page = requests.get(category_url).text
    soup = BeautifulSoup(category_page, features='html.parser')
    table = soup.find(id='category_taxonomy_list')
    groups = [e.text for e in table.findAll('h2')]
    subgroups = [e for e in table.findAll('h3')]
    categories = table.find_all(class_='columns divided')

    categories = [
        [block for block in re.split('\n+',e.text) if block]
        for e in categories
    ]

    round_brackets_regex = r'\((.*?)\)'

    # create category df
    data = pd.DataFrame(categories, columns=['cat3', 'description'])
    data['code_full'] = data['cat3'].str.split('\s+').str[0]
    data['cat3_code'] = data['code_full'].str.split('.').str[1]
    data['cat3'] = data['cat3'].str.extract(round_brackets_regex)
    data['cat2_code'] = data['code_full'].str.split('.').str[0]

    # add cat2 titles (physics)
    df2 = pd.DataFrame([sg.text for sg in subgroups], columns=['cat2'])
    df2['cat2_code'] = df2['cat2'].str.extract(round_brackets_regex)
    df2['cat2'] = df2['cat2'].str.split('(').str[0]
    data['cat2'] = data['cat2_code'].map(df2.set_index('cat2_code')['cat2'])

    cats2 = data[data.cat2.isnull()]['cat2_code'].unique()
    exclude_cat1 = 'Physics' # the only parent category that has 3 levels
    groups.pop(groups.index(exclude_cat1))

    df1 = pd.DataFrame(
        data={
            'cat1': groups, 
            'cat1_code': data[data.cat2.isnull()]['cat2_code'].unique()
        }
    )
    data = data.merge(df1, how='left', left_on='cat2_code', right_on='cat1_code')
    # fill NA properly
    data.loc[data['cat1'].isnull(), 'cat1'] = exclude_cat1
    cat2_na_idx = data['cat2'].isnull()
    data.loc[cat2_na_idx, 'cat2'] = data.loc[cat2_na_idx, 'cat1']
    cat1_na_idx = data['cat1_code'].isnull()
    data.loc[cat1_na_idx, 'cat1_code'] = \
    data.loc[cat1_na_idx, 'cat1'].str.lower()
    cat3_na_idx = data['cat3_code'].isnull()
    data.loc[cat3_na_idx, 'cat3_code'] = \
    data.loc[cat3_na_idx, 'cat2_code']
    
    data = data.sort_index(axis=1)
    
    data['html_header'] = data.apply(
        lambda x: ''.join(
            f'<h{min(5, i+3)}>{v}</h{min(5, i+3)}>' 
            for (i, v) in enumerate(x.filter(regex='(code_full|cat[0-9]$|description)').values)
        ), 
        axis=1
    )
    
    data = data.set_index('code_full')
    
    return data


def _get_obj_name(obj):
    name =[x for x in globals() if globals()[x] is obj][0]
    return name

Let's save obtained data as JSON dict 
<br>and also check sample category data

In [11]:
category_url = "https://arxiv.org/category_taxonomy"
category_data = get_category_tree_data(category_url=category_url)

cat_df_name = _get_obj_name(category_data)
with open(f'data/{cat_df_name}.json', 'w') as f:
    ujson.dump(
        category_data.to_dict(orient='index'),
        f
    )

# check sample data
category_data.sample(5, random_state=42)

Unnamed: 0_level_0,cat1,cat1_code,cat2,cat2_code,cat3,cat3_code,description,html_header
code_full,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
astro-ph.GA,Physics,physics,Astrophysics,astro-ph,Astrophysics of Galaxies,GA,Phenomena pertaining to galaxies or the Milky ...,<h3>Physics</h3><h4>Astrophysics</h4><h5>Astro...
q-fin.GN,Quantitative Finance,q-fin,Quantitative Finance,q-fin,General Finance,GN,Development of general quantitative methodolog...,<h3>Quantitative Finance</h3><h4>Quantitative ...
cs.OS,Computer Science,cs,Computer Science,cs,Operating Systems,OS,Roughly includes material in ACM Subject Class...,<h3>Computer Science</h3><h4>Computer Science<...
cs.NI,Computer Science,cs,Computer Science,cs,Networking and Internet Architecture,NI,Covers all aspects of computer communication n...,<h3>Computer Science</h3><h4>Computer Science<...
physics.flu-dyn,Physics,physics,Physics,physics,Fluid Dynamics,flu-dyn,"Turbulence, instabilities, incompressible/comp...",<h3>Physics</h3><h4>Physics</h4><h5>Fluid Dyna...


## Load the data
---
We can load the full data or just sample its part to reduce RAM used and time spent.
- you can control it either by
  - sample size (`n_rows` arg), i.e. 10000 *random* rows
  - sample fraction (`frac` arg), i.e. *random* 5% of data
  - **maximum(file_size * frac, n_rows) rows is taken as a result**
  - reproducibility is controlled by `seed` arg
- also, you can specify what column subset to get (`columns` arg)
- also, you can filter 
    - articles **newer** than `published_after` timestamp
    - articles updated **after** `last_update_after` timestamp

In [12]:
def get_sampled_data(
    data_path: Union[str, Path],
    frac: Optional[float] = None,
    n_rows: Optional[int] = None,
    columns: Optional[List[str]] = None,
    last_update_after: Optional[str] = None,
    published_after: Optional[str] = None,
    random_state: int = 42,
) -> pd.DataFrame:
    """TODO: add docstring"""
    
    # get # of records
    with open(data_path, 'r') as fp:
        num_lines = sum(1 for line in tqdm(fp, desc='counting # rows in a file...') if line.rstrip())

    arxiv_data = []
    
    if last_update_after:
        assert 'update_date' in columns, 'cannot filter on missing column `update_date`'
        last_update_after = pd.to_datetime(last_update_after, utc=True)
    if published_after:
        assert 'versions' in columns, 'cannot filter on missing column `versions`'
        published_after = pd.to_datetime(published_after, utc=True)
    
    if n_rows is not None or frac is not None:
        assert 1 <= n_rows <= num_lines, 'no more lines than file length'
        assert 0. < frac <= 1., 'no more fraction than 100%'
        
        n_samples = max( 
            1,
            n_rows,
            (int(frac*num_lines) if frac else 0) 
        )
    
    else:
        n_samples = num_lines
        
    sample_indexes = set(
        pd.Series(
            np.arange(num_lines)
        ).sample(n=n_samples, random_state=random_state).values
    )
    
    with open(data_path, 'r') as fp:
        
        for idx, line in tqdm(
            enumerate(fp),
            total=num_lines, 
            desc=f"loading up to {n_samples}/{num_lines} articles"
        ):
            if idx in sample_indexes:
                
                chunk = ujson.loads(line)
                chunk['created_at'] = chunk['versions'][0]['created']
                    
                if columns:
                    chunk = {k:v for k,v in chunk.items() if k in (columns + ['created_at']) and k != 'versions'}
                    
                arxiv_data.append(chunk)
        
    df = pd.DataFrame.from_records(arxiv_data)
    
    df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
    df['update_date'] = pd.to_datetime(df['update_date'], utc=True)
    
    publish_idx = (
        df['created_at'] >= pd.to_datetime(published_after, utc=True) if published_after 
        else np.ones(len(df)).astype(bool)
    )
    last_update_idx = (
        df['update_date'] >= pd.to_datetime(last_update_after, utc=True) if last_update_after 
        else np.ones(len(df)).astype(bool)
    )
    
    if last_update_after or published_after:
        
        df = df[publish_idx & last_update_idx]
        
        print(
            f"""
            filtered out {n_samples - len(df)}/{n_samples} records 
            due to update/publish date filters"""
        )
    
    return df

In [13]:
data_path = 'data/arxiv-metadata-oai-snapshot.json'
columns_to_get = [
    'id',
    # 'authors',
    'title',
    'categories',
    'abstract',
    'versions',
    'update_date',
    'authors_parsed',
]

last_update_after = None
published_after = '2010-01-01'
frac = .1 # reduce to speed-up and to lower RAM consumption
n_rows = 1
seed = 42

df = get_sampled_data(
    data_path=data_path,
    # rows to get = max of two
    frac=frac,
    n_rows=n_rows,
    columns=columns_to_get,
    published_after=published_after,
    last_update_after=last_update_after,
    random_state=seed,
)
df['authors_parsed'] = df['authors_parsed'].apply(lambda x: [' '.join(a).strip() for a in x])
df['categories'] = df['categories'].str.split('\s+')

print(df.shape)

# check sample data
df.head()

counting # rows in a file...: 0it [00:00, ?it/s]

loading up to 226825/2268252 articles:   0%|          | 0/2268252 [00:00<?, ?it/s]


            filtered out 57908/226825 records 
            due to update/publish date filters
(168917, 7)


Unnamed: 0,id,title,categories,abstract,update_date,authors_parsed,created_at
16569,1001.0043,Strong Constraints to the Putative Planet Cand...,"[astro-ph.EP, astro-ph.IM]",We present new radial velocity measurements ...,2015-05-14 00:00:00+00:00,"[Anglada-Escude Guillem, Shkolnik Evgenya L., ...",2010-01-01 00:07:58+00:00
16579,1001.0158,What is the role of continuity in continuous l...,"[math.GN, math.FA]",The recent extensions of domain theory have ...,2013-01-03 00:00:00+00:00,[Poncet Paul],2010-01-03 11:42:35+00:00
16583,1001.0195,On refined Young inequalities,[math.FA],"In this paper, we study refinements of some ...",2012-01-27 00:00:00+00:00,"[Furuichi Shigeru, Lin Minghua]",2010-01-04 15:26:30+00:00
16584,1001.0203,On the generalized Feynman-Kac transformation ...,[math.PR],Suppose $X$ is a right process which is asso...,2010-01-05 00:00:00+00:00,"[Ma Li, Sun Wei]",2010-01-01 01:59:39+00:00
16585,1001.0211,Optimal control with moderation incentives,"[math.OC, math.DS]",A purely state-dependent cost function can b...,2010-01-05 00:00:00+00:00,[Lewis Debra],2010-01-01 20:46:45+00:00


## Produce Graph visualizations (no-ML)
- authors network
- category interactions
- etc.

Those are simple interactive visualizations built purely on `nx.Graph` and `plotly`, just to get better understanding of objects interaction

---

### Category interactions
Given set of assigned categories to each article, "flatten" them, create pair-wise interaction and calculate the size and the "strength" of category interactions/cooccurences

- node = category (i.e. `cs.AI`)
- node size ~ # of papers having this label assigned
- edge `(u,v)` ~ link between categories `u` and `v` (if they are met within the same article)
- edge `(u,v)` weight ~ number of co-occurences of categories `u` and `v`
- node color ~ category title (1st level - less or 2nd level - more distinct colors/categories)

In [14]:
def produce_interaction_graph(
    data: pd.Series,
    edge_weight_smoothing_fn: np.log1p,
    node_size_smoothing_fn: np.log1p,
    connection_thld: Optional[int] = 10,
    name: Optional[str] = 'category_interaction'
) -> nx.Graph:
    """
    TODO: add docstring
    """
    category_pairs = (
        data
        .apply(lambda x: list(combinations(x, 2)))
        .explode()
    )
    
    category_pairs = category_pairs[category_pairs.notnull()].value_counts()
    # apply threshold
    category_pairs = category_pairs[category_pairs > connection_thld]
    
    category_links = edge_weight_smoothing_fn(
        category_pairs
    ).to_dict()

    category_graph = nx.Graph(name=name)
    # add nodes
    node_size = node_size_smoothing_fn(
        (
            data
            .apply(lambda x: list(combinations(x, 2)) if len(x) > 1 else [x])
            .explode()
        ).explode().value_counts()
    )

    category_graph.add_nodes_from(node_size.index.tolist())
    category_graph_node_sizes = [node_size[n] for n in category_graph.nodes]

    # add edges/connections between categories
    category_graph.add_weighted_edges_from(
        (*k, v) for k,v in category_links.items()
    )

    return category_graph, category_graph_node_sizes


def plot_interaction_graph(
    G: nx.Graph, 
    category_data: pd.DataFrame,
    category_color_lvl: int = 2,
    node_sizes: Optional[List[float]] = None,
    node_colors: Optional[List[int]] = None,
    seed: int = 42, 
    title: str = '<br>Category interactions',
    edge_width: float = 0.1
):
    """TODO: add docstring"""
    # add node positions
    node_pos = nx.spring_layout(G, weight='weight', seed=seed)

    for n, p in node_pos.items():
        G.nodes[n]['pos'] = p
    
    # 1
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=edge_width, color='#888'),
        hoverinfo='none',
        mode='lines')

    node_x = []
    node_y = []
    for node in G.nodes():
        x, y = G.nodes[node]['pos']
        node_x.append(x)
        node_y.append(y)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            # colorscale options
            #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
            #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
            #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
    #         colorscale='YlGnBu',
            colorscale='Inferno',
    #         reversescale=True,
            color=[],
            size=node_sizes if node_sizes is not None else 10,
#             colorbar=dict(
#                 thickness=10,
#                 title='Node Connections',
#                 xanchor='left',
#                 titleside='right'
#             ),
            line_width=2))

    # 2
    node_adjacencies = []
    node_text = []

    # node_trace.marker.color = node_adjacencies
    node_categories = pd.Series(G.nodes).index.map(category_data[f'cat{category_color_lvl}'])
    node_names = pd.Series(G.nodes).index.map(category_data['cat3'])
    
    if node_colors is None:
        node_colors = node_categories.astype('category').codes

    # for node, (cat, adjacencies) in enumerate(zip(G.adjacency(), node_categories.tolist())):
    for adjacencies, cat, name, code in zip(
        G.adjacency(), 
        node_categories.tolist(), 
        node_names.tolist(), 
        G.nodes
    ):
        node_adjacencies.append(len(adjacencies[1]))
        node_text.append(
            f'<b>id: {code}</b><br>name: {name}<br>category: {cat}<br># of connections: {len(adjacencies[1])}'
        )

    node_trace.marker.color = node_colors.tolist()
    node_trace.text = node_text

    # 3 
    fig = go.Figure(data=[edge_trace, node_trace],
                 layout=go.Layout(
                    title=title,
                    titlefont_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20,l=5,r=5,t=40),
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )
    fig.show()

In [15]:
df['categories']

16569              [astro-ph.EP, astro-ph.IM]
16579                      [math.GN, math.FA]
16583                               [math.FA]
16584                               [math.PR]
16585                      [math.OC, math.DS]
                         ...                 
185493                               [hep-th]
185494             [astro-ph.GA, astro-ph.CO]
185495    [cs.CV, cs.AI, cs.GR, cs.HC, cs.RO]
185496                                [cs.CV]
185497                         [cs.CV, cs.HC]
Name: categories, Length: 168917, dtype: object

In [17]:
connection_thld = 1  # if set > 1 - be aware of isolated nodes, it affects scale of a figure

cG, cG_node_sizes = produce_interaction_graph(
    data=df['categories'],
    edge_weight_smoothing_fn=np.log1p,
    node_size_smoothing_fn=np.log1p,
    connection_thld=connection_thld,
    name='category_interaction',
)
# print(nx.info(cG))

# filter out isolated nodes
components = [x for x in list(nx.connected_components(cG)) if len(x) > 1]

subgraph_nodes = set.union(*components)
cG = cG.subgraph(subgraph_nodes)
    
    
plot_interaction_graph(
    G=cG, 
    category_data=category_data,
    category_color_lvl=1,
    node_sizes=cG_node_sizes,
    seed=seed, 
    title='<br>Category interactions',
    edge_width=0.1,
)

It's easy to notice that there are **almost exact categories** with the same names within **different parent categories**
- `cs.IT (Information Theory)` in `Computer Science` and `math.IT (Information Theory)` in `Mathematics`
- `math.MP (Mathematical Physics)` in `Mathematics` and `math-ph (Mathematical Physics)` in `Physics`
- etc.

As well as **different aspects of the same discipline (hardware, software)**:
- `eess.AP (Audio and Speech Processing)` in `Electrical Engineering and Systems Science` (hardware-like) and `cs.SD (Sound)` in `Computer Science` (software-like)

In [18]:
# you can see duplicate names of 3rd level categories 
# that lie in different disciplines yet share the same name :)
category_data['cat3'].value_counts().head(10)

Systems and Control              2
Information Theory               2
Numerical Analysis               2
Mathematical Physics             2
Machine Learning                 2
Statistics Theory                2
Chemical Physics                 1
Biological Physics               1
Atomic Physics                   1
Atomic and Molecular Clusters    1
Name: cat3, dtype: int64

### Produce authors' network and interactions

**Let's visualize some communities of authors, given subset of data**
- we filter articles on some 2nd level category (i.e. `cs (Computer Science)`)
- then, we create all C(n,2) combinations of author pairs for a single article to get connection strength (edges and its weights) 
- we exclude "isolated" (solo) authors from graph
- we observe top-N biggest communities (by size) from remaining [connected components of a graph](https://en.wikipedia.org/wiki/Component_(graph_theory))
- we see what are the most popular topics (3rd level categories) within particular author community

In [22]:
connection_thld = 1  # if set > 1 - be aware of isolated nodes, it affects scale of a figure

# limit subset of data, i.e. first X of `Computer Science - Artificial Intelligence` domain
categories = frozenset(category_data[category_data.index == 'cs.AI'].index)
category_idx = df['categories'].apply(lambda x: len(frozenset(x) & categories) > 0)

aG, aG_node_sizes = produce_interaction_graph(
    data=df.loc[category_idx, 'authors_parsed'],
    edge_weight_smoothing_fn=lambda x: np.power(x+1, 4/10),
    node_size_smoothing_fn=lambda x: np.power(x+1, 3/10),
    connection_thld=connection_thld,
    name='author_interaction',
)
# print(nx.info(aG))

# get subgraph without isolated nodes
components = [
    x for x in list(nx.connected_components(aG)) 
    if len(x) > 1 and len(x) < 250 # constrain up here, otherwise we can find too big communities :)
]

df_cc = pd.DataFrame({'cc': components})
df_cc['len'] = df_cc['cc'].apply(len)
df_cc = df_cc.sort_values(by='len', ascending=False)
offset = 0 # if 0 then it's truly equal to top-X communities
top_n = 5
cc_indexes = slice(offset, offset + top_n)
top_n_cc = df_cc['cc'].iloc[cc_indexes].tolist()

subgraph_nodes = set.union(*top_n_cc)
aSG = aG.subgraph(subgraph_nodes)

# print(nx.info(aSG))
    
plot_interaction_graph(
    G=aSG, 
    category_data=category_data,
    category_color_lvl=1,
    node_sizes=10,
    node_colors=np.array([
        {a:i for i,c in enumerate(top_n_cc) for a in c}[node]
        for node in aSG.nodes
    ]),
    seed=seed, 
    title='<br>Author interactions',
    edge_width=0.7,
)

Let's get the most "influential" author within the community (connected component)

We'll define it based on [PageRank algorithm](https://en.wikipedia.org/wiki/PageRank) which is a measure of "centrality" and "importance"

After we calculate and plot those influencers with their neighbours, we'll see the "central" nodes are those obtained influencer guys

In [23]:
# let's get the most "influential" author within the community (connected component)
top_influencers_in_cc = [
    pd.Series(
        nx.pagerank(aG.subgraph(cc), alpha=0.95, max_iter=1000)
    ).sort_values(ascending=False).index[0]
    for cc in top_n_cc
]

print(f'top co-authors:\n{top_influencers_in_cc}')

influencers_network = set()
# add those influencers and their direct connections (adjacent nodes)
for n in top_influencers_in_cc:
    influencers_network.update(aG.neighbors(n))
    
influencers_network.update(top_influencers_in_cc)

plot_interaction_graph(
    G=aG.subgraph(influencers_network), 
    category_data=category_data,
    category_color_lvl=1,
    node_sizes=10,
#     node_colors=np.array([
#         {a:i for i,c in enumerate(top_n_cc) for a in c}[node]
#         for node in aSG.nodes
#     ]),
    seed=seed, 
    title='<br>Co-author interactions',
    edge_width=0.7,
)

top co-authors:
['Gao Jianfeng', 'Yu Philip S.', 'Levine Sergey', 'Fung Pascale', 'Heess Nicolas']


<h4>Let's check whether topic distribution <b>differs</b> within different communities</h4>

In [24]:
# check what topics are the most popular within top-communities
top_topics = []
for cc in tqdm(top_n_cc, desc='top-X categories for community...'):
    broadest_community_idx = df['authors_parsed'].apply(lambda x: len(frozenset(x) & cc) > 0)
    topics_distribution = df.loc[broadest_community_idx, 'categories'].explode().value_counts() 
    topics_distribution /= topics_distribution.sum()
    top_topics.append(topics_distribution.iloc[:3])
    
df_topics = pd.DataFrame(
    top_topics, 
    index=pd.Series(list(range(1, top_n+1)), name='community#')
)

# top-1 topic for community
print(df_topics.idxmax(axis=1).map(category_data['cat3']))

# see top-x categories for each author community
px.bar(df_topics)

top-X categories for community...:   0%|          | 0/5 [00:00<?, ?it/s]

community#
1    Machine Learning
2    Machine Learning
3    Machine Learning
4    Machine Learning
5    Machine Learning
dtype: object


### Exploring top-authors

Given particular category, let's see who are top "hardworking" writers within particular category, i.e. `Computer Science`
and how they distribute their efforts across 3rd-level categories

- we "flatten" categories as well as authors
- for each author we count
    - how many distinct articles they wrote
    - what % of their work is dedicated to particular level3 category <br>(sum can exceed 100%, as we have multilabels)

In [25]:
cat2_to_explore = 'cs'
categories = frozenset(category_data[category_data['cat2_code'] == cat2_to_explore].index)
category_idx = df['categories'].apply(lambda x: len(frozenset(x) & categories) > 0)

In [26]:
articles_by_author = df.loc[
    category_idx, 
    ['authors_parsed', 'categories']
].explode('authors_parsed')

article_cnt_by_author = \
articles_by_author['authors_parsed'].value_counts().sort_values(ascending=True)

top_n_authors = article_cnt_by_author.iloc[-10:]
px.bar(top_n_authors, orientation='h', title=f'top authors by article cnt in `{cat2_to_explore}`')

Let's explore one of the fathers of AI (Hinton, LeCun, Bengio), in more details
According to data, we can see while some of them prefer to write papers **with almost the same co-authors**, the others (like Bengio) are **open to collaboration with broader scientific audience**.

In [27]:
authors_to_explore = ['Hinton Geoffrey', 'Bengio Yoshua', 'LeCun Yann']


for ate in authors_to_explore:
    author_idx = df['authors_parsed'].apply(lambda x: ate in frozenset(x))

    coauthors = list(aG.neighbors(ate)) + [ate]
    plot_interaction_graph(
        G=aG.subgraph(coauthors), 
        category_data=category_data,
        category_color_lvl=1,
        node_sizes=10,
        seed=seed, 
        title=f'<br>{ate} Co-authors network',
        edge_width=0.7,
    )

---
## (WIP) working with articles

To understand particular article in-depth (i.e. to understand why it was given particular category codes or was author's name extraction accurate enough, given `article_id` and using built-in capabilities of `ipython.display`, we can **download and embed** .pdf file of interest with `IFrame` help:

In [28]:
def preview_pdf(
    article_id: str, 
    size: Tuple[int] = (800, 600), 
    verbose: bool = True
) -> IFrame:
    
    url = f"https://arxiv.org/pdf/{article_id}.pdf"
    response = requests.get(url)
    cont = response.content

    local_path = f"{article_id}.pdf"
    with open(local_path, "wb") as file:
        file.write(cont)
        
    frame = IFrame(f"./{local_path}", width=800, height=600)
    
    if verbose:
        print(f'download link: {url}')
    
    return frame

In [29]:
# article_id = '1501.00223'
article_id = df.sample(1, random_state=seed)['id'].iloc[0]
print(f'article: {article_id}')
preview_pdf(article_id=article_id)

article: 2008.11587
download link: https://arxiv.org/pdf/2008.11587.pdf
