__Obiettivo__

Ricavare i metadati mancanti mediante l'impiego di ulteriori _REST API_.

Il notebook ricalca gli stessi obiettivi delineati in __metadata.ipynb__ e __grobid.ipynb__. Tuttavia, in questo caso, l'intento consiste nel completamento dei metadati mancanti, mediante l'impiego di ulteriori infrastrutture digitali dedicate alla memorizzazione degli articoli scientifici prodotti in ambito accademico.

In [4]:
# Root URL of the Zotero web API. NOTE(review): not referenced by any other
# cell visible in this section (the Zotero client is built through pyzotero).
base_url = "https://api.zotero.org"

In [5]:
import json

from typing import List, Dict

def get_json(
    path: str = "../../json/extraction/metadata_completed.json",
) -> List[Dict[str, Dict]]:
    """Load the extracted-metadata JSON document from disk.

    Args:
        path: Location of the JSON file. Defaults to the metadata file
            produced by the previous extraction step.

    Returns:
        The decoded JSON content. The caller passes it on as a list of
        ``{file key: metadata dict}`` mappings.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)

def define_missing_metadata_dict(_json: List[Dict[str, Dict]]) -> Dict[str, Dict]:
    """Collect the records that have no DOI.

    Args:
        _json: List of mappings from a record key to its metadata dict.

    Returns:
        A single flat mapping with every record whose ``"DOI"`` entry is
        absent or ``None``. If the same key occurs in several mappings, the
        last occurrence without a DOI is kept.
    """
    return {
        record_key: metadata
        for entry in _json
        for record_key, metadata in entry.items()
        if metadata.get("DOI") is None
    }

# Load the extracted metadata and keep only the records that still lack a DOI;
# the following cells try to complete these records.
_json = get_json()
dict_missing_metadata = define_missing_metadata_dict(_json)

In [6]:
from difflib import SequenceMatcher

def similar(title_retrieved: str, title_found: str) -> float:
    """Return the difflib similarity ratio of two titles.

    Args:
        title_retrieved: First title of the comparison.
        title_found: Second title of the comparison.

    Returns:
        A float in ``[0, 1]``; ``1.0`` means the two strings are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=title_retrieved, b=title_found)
    return matcher.ratio()

In [14]:
import os

from pyzotero import zotero
from dotenv import load_dotenv

# Load environment variables through python-dotenv before the credentials
# are read with os.getenv below.
load_dotenv()

# Client for a personal ("user") Zotero library; the library id and the API
# key come from the ZOTERO_USER_ID and ZOTERO_KEY environment variables.
zotero_client = zotero.Zotero(
    library_id=os.getenv("ZOTERO_USER_ID"),
    library_type="user",
    api_key=os.getenv("ZOTERO_KEY"),
)

In [12]:
import arxiv 

arxiv_client = arxiv.Client()

# Keep only the records that have both a title and at least one author, since
# both fields are needed to build the arXiv query. `.get` is used so that a
# record lacking either key (or holding None) is skipped instead of raising.
_dict = {
    key: value
    for key, value in dict_missing_metadata.items()
    if value.get("Title") is not None and value.get("Author")
}

for key, value in _dict.items():
    _title = value["Title"]
    # All author names are concatenated into one space-separated search term.
    _author = " ".join(value["Author"])

    # Cap the number of results: with no max_results the client keeps paging
    # through every hit of the query, which is slow for loose queries.
    search = arxiv.Search(query=f"au:{_author} AND ti:{_title}", max_results=10)

    try:
        # The network requests happen while iterating the results, so the
        # whole iteration is guarded: a failed query is reported and the
        # remaining records are still processed.
        for result in arxiv_client.results(search):
            if similar(result.title, _title) > 0.7:
                print(result.pdf_url)
    except Exception as error:
        print(f"arXiv lookup failed for {key!r}: {error}")

http://arxiv.org/pdf/math/0011253v1


__PyAlex__ rappresenta un _wrapper_ della _REST API_ offerta da __OpenAlex__. _OpenAlex_ è un catalogo globale contenente un vastissimo bacino di articoli e paper scientifici.

In [None]:
import pyalex

def split_url(url: str) -> str:
    """Extract the bare DOI from a DOI URL.

    Args:
        url: A DOI expressed as URL, e.g. ``https://doi.org/10.1234/abc``.

    Returns:
        The DOI without the resolver prefix, e.g. ``10.1234/abc``.
    """
    # Everything after the resolver host is the DOI. A DOI suffix may itself
    # contain "/" characters, so keeping only the last two path segments
    # would truncate such identifiers.
    _, resolver, doi = url.partition("doi.org/")
    if resolver:
        return doi

    # Fallback for URLs that do not go through doi.org: keep the last two
    # path segments (or the whole string when it contains no "/").
    return "/".join(url.split("/")[-2:])

# For every record without a DOI, search OpenAlex by title and adopt the DOI
# of the first sufficiently similar work.
for key, value in dict_missing_metadata.items():
    _title = value.get("Title")

    # Without a title there is nothing to search for or to compare against.
    if not _title:
        continue

    try:
        works = pyalex.Works().search_filter(title=_title).get()
    except Exception as error:
        # Best-effort lookup: report the failure and move on to the next record.
        print(f"OpenAlex lookup failed for {key!r}: {error}")
        continue

    for work in works:
        candidate_doi = work.get("doi")
        candidate_title = work.get("title")

        # Skip works lacking a DOI or a title instead of letting the title
        # comparison fail and discard the remaining candidates.
        if candidate_doi is None or candidate_title is None:
            continue

        if similar(candidate_title, _title) > 0.7:
            # Key "doi" in the response contains the paper's DOI as URL
            value["DOI"] = split_url(candidate_doi)
            break

In [13]:
# Records for which a DOI has been filled in. `.get` is used because
# define_missing_metadata_dict also admits records that have no "DOI" key at
# all; indexing with value["DOI"] would raise KeyError on those.
dict_retrieved_metadata = {
    key: value
    for key, value in dict_missing_metadata.items()
    if value.get("DOI") is not None
}

for key, value in dict_retrieved_metadata.items():
    print(key, value)