In [26]:
import heapq

import httpx
import networkx as nx
import time
import sqlite3
import pandas as pd
import persistqueue
from heapq import heappush, heappop

In [18]:
# Find the collaborators of a given author_id

def get_collaborators(author_id : str) -> set[str]:
    """Return the OpenAlex IDs of the co-authors of ``author_id``.

    Sends a single GET request to the OpenAlex ``/works`` endpoint filtered by
    ``author.id`` and collects the short author ID (the last path segment of
    the author URL) from every authorship of every work in that response.
    Only the works contained in that one response are considered; no further
    pages are requested.

    Args:
        author_id: Short OpenAlex author ID, e.g. ``"A5035471624"``.

    Returns:
        The set of co-author IDs, never containing ``author_id`` itself.
        Empty when the response lists no works.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    response = httpx.request(
        method="GET",
        url=f"https://api.openalex.org/works?filter=author.id:{author_id}",
    )
    # Fail loudly on an error status instead of tripping over a missing
    # "results" key further down.
    response.raise_for_status()

    collaborators : set[str] = set()
    # `or []` / `or {}` also cover keys that are present but null.
    for result in response.json().get("results") or []:
        for authorship in result.get("authorships") or []:
            author_url = (authorship.get("author") or {}).get("id") or ""
            collaborator_id = author_url.split("/")[-1]
            if collaborator_id:
                collaborators.add(collaborator_id)

    # discard (not remove): the queried author is missing from the set when
    # the response has no works, and remove() would raise KeyError then.
    collaborators.discard(author_id)
    return collaborators


In [22]:
# Short OpenAlex author ID used below as the seed of the crawl.
sample_author: str = "A5035471624"

In [55]:
# A Crawler Builds a Network by Snowball Sampling (a BFS Visit)

class OpenAlexSurfer:
    """Breadth-first (snowball) crawler of the OpenAlex co-authorship network.

    Starting from a seed author, the crawler repeatedly asks the OpenAlex
    ``/works`` endpoint for the works of the closest not-yet-expanded author
    and records a link to every co-author found, until it reaches authors at
    ``max_distance`` hops from the seed.

    Attributes:
        queue: Min-heap of ``(distance, (author_id, author_name))`` entries
            still to be expanded.
        visited: IDs of the authors already discovered (queued or expanded).
        max_distance: Authors at this distance or farther are not expanded.
        base_url: Root URL of the OpenAlex API.
        request_delay: Seconds to sleep after expanding each author.
    """

    def __init__(
            self,
            seed : str,
            max_distance : int = 2,
            base_url : str = "https://api.openalex.org",
            seed_name : str = "Matteo Zignani",
            request_delay : float = 0.3,
    ):
        """Set up a crawl rooted at ``seed``.

        Args:
            seed: Short OpenAlex ID of the author to start from.
            max_distance: Number of hops from the seed at which the crawl stops.
            base_url: Root URL of the OpenAlex API.
            seed_name: Display name attached to the seed node in the links.
            request_delay: Seconds to sleep after expanding each author.
        """
        self.queue : list[tuple[int, tuple[str, str]]] = [(0, (seed, seed_name))]
        # {seed}, not set(seed): the latter is the set of the ID's characters.
        self.visited : set[str] = {seed}
        self.max_distance : int = max_distance
        self.base_url : str = base_url
        self.request_delay : float = request_delay

    def _get_collaborators(
            self,
            author_id : str
    ) -> set[tuple[str, str]]:
        """Fetch the co-authors of ``author_id`` from a single API response.

        Args:
            author_id: Short OpenAlex ID of the author to look up.

        Returns:
            A set of ``(collaborator_id, collaborator_name)`` pairs, excluding
            ``author_id`` itself. Empty when the request fails, the payload is
            not shaped as expected, or no co-author is listed.
        """
        try:
            response = httpx.request(
                method="GET",
                url=f"{self.base_url}/works?filter=author.id:{author_id}",
            )
            response.raise_for_status()
            data = response.json()
        except Exception:
            # Deliberate best-effort: one failed lookup (network error, error
            # status, undecodable body) must not abort the whole crawl.
            return set()

        if not isinstance(data, dict):
            return set()

        results = data.get("results", [])
        if not isinstance(results, list):
            return set()

        collaborators : set[tuple[str, str]] = set()

        for result in results:
            if not isinstance(result, dict):
                continue

            authorships = result.get("authorships", [])
            if not isinstance(authorships, list):
                continue

            for authorship in authorships:
                if not isinstance(authorship, dict):
                    continue

                # `or` fallbacks also cover keys that are present but null.
                author = authorship.get("author") or {}
                collaborator_id = (author.get("id") or "").split("/")[-1]
                # Names are always strings so that heap entries stay comparable.
                collaborator_name = author.get("display_name") or ""
                if collaborator_id and collaborator_id != author_id:
                    collaborators.add((collaborator_id, collaborator_name))

        return collaborators

    def visit(
            self
    ) -> set[tuple[tuple[str, str], tuple[str, str]]]:
        """Run the crawl and return the co-authorship links that were found.

        Authors are expanded in order of increasing distance from the seed.
        The visit stops at the first queue entry whose distance is at least
        ``max_distance``: the heap pops entries in non-decreasing distance, so
        every remaining entry is at least as far.

        Returns:
            A set of ``((author_id, author_name), (collaborator_id,
            collaborator_name))`` pairs, one per collaborator of each expanded
            author.
        """
        links : set[tuple[tuple[str, str], tuple[str, str]]] = set()

        while self.queue:
            (curr_dist, (curr_id, curr_name)) = heappop(self.queue)
            # Idempotent for crawler-queued authors; also covers entries that
            # were placed on the queue directly.
            self.visited.add(curr_id)

            if curr_dist >= self.max_distance:
                return links

            for (next_id, next_name) in self._get_collaborators(curr_id):
                links.add(((curr_id, curr_name), (next_id, next_name)))
                if next_id not in self.visited:
                    # Mark at enqueue time so each author is queued, and
                    # therefore fetched, at most once.
                    self.visited.add(next_id)
                    heappush(self.queue, (curr_dist + 1, (next_id, next_name)))

            # Pause between expansions to space out the API requests.
            time.sleep(self.request_delay)

        return links


In [58]:
# Crawl the co-authorship network up to distance 2 from the sample author.
# NOTE: visit() issues live HTTP requests, so this cell can take a while.
crawler = OpenAlexSurfer(seed=sample_author, max_distance=2)
links = crawler.visit()

In [59]:
# Show how many links were collected, followed by the links themselves.
(len(links), links)

(3834,
 {(('A5072088997', 'F. Auxilia'), ('A5076175449', 'Annalisa Bargellini')),
  (('A5039194707', 'Bruno Alessandro Rivieccio'),
   ('A5083427290', 'Dario Bernacchia')),
  (('A5044649325', 'Mauricio Soto'), ('A5059236519', 'Benoît Piégu')),
  (('A5055844783', 'Silvia Giordano'), ('A5056754015', 'Stefano Cresci')),
  (('A5091097820', 'Silvana Castaldi'), ('A5054062963', 'Beatrice Benatti')),
  (('A5015647058', 'Laura Ricci'), ('A5096817512', 'I Privare')),
  (('A5044649325', 'Mauricio Soto'), ('A5074962821', 'Gino Corsini')),
  (('A5055844783', 'Silvia Giordano'), ('A5058432383', 'Gina R. Kuperberg')),
  (('A5055881236', 'Cheick Tidiane Bâ'),
   ('A5042060179', 'John N. Pougué Biyong')),
  (('A5015647058', 'Laura Ricci'), ('A5096817496', 'P Dfiri')),
  (('A5065440896', 'Alessandro Comunian'),
   ('A5039437349', 'Alessandra Micheletti')),
  (('A5029966654', 'Alessandra Sala'), ('A5069050073', 'Stefan Toggweiler')),
  (('A5072088997', 'F. Auxilia'), ('A5020666494', 'Pietro Contegiacomo

In [62]:
# Split every link into an ID-only edge and an ID -> display-name lookup.
links_ids = []
authors_names = {}
for (source_id, source_name), (target_id, target_name) in links:
    links_ids.append((source_id, target_id))
    authors_names[source_id] = source_name
    authors_names[target_id] = target_name

# Build the graph from the ID-only edge list.
G = nx.from_edgelist(links_ids)

In [63]:
# Number of nodes and number of edges of the graph.
(G.number_of_nodes(), G.number_of_edges())

(2892, 3672)

In [64]:
# Attach each author's display name to its node as the 'name' attribute.
nx.set_node_attributes(G, values=authors_names, name='name')

In [65]:
# Save the graph, including the node names, in GEXF format.
gexf_path = "../data/openalex/openalex.gexf"
nx.write_gexf(G, gexf_path)