In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import requests as r

## Dumping DANS

First, we use the pyDataverse library to connect to the DANS repository (it is a Dataverse repository).

https://pydataverse.readthedocs.io/en/latest/user/basic-usage.html#connect-to-native-api

In [2]:
from pyDataverse.api import NativeApi

# Native-API client pointed at the DANS SSH Data Station.
dans_base_url = "https://ssh.datastations.nl/"
api = NativeApi(dans_base_url)

# Sanity check: the server answers with its Dataverse version and build.
api.get_info_version().json()

{'status': 'OK',
 'data': {'version': '6.3', 'build': 'DANS-DataStation-PATCH-9'}}

We dump the entire repository by getting the contents of the special dataverse ":root". Warning: this takes about 15 minutes.

In [5]:
# ":root" is the alias of the top-level dataverse; this request is slow.
root_alias = ":root"
tree = api.get_dataverse_contents(root_alias)

# Show the decoded JSON body of the response.
tree.json()

{'status': 'OK',
 'data': [{'id': 1920,
   'identifier': 'dans-zhr-eswk',
   'persistentUrl': 'https://doi.org/10.17026/dans-zhr-eswk',
   'protocol': 'doi',
   'authority': '10.17026',
   'publisher': 'DANS Data Station Social Sciences and Humanities',
   'publicationDate': '1990-01-01',
   'storageIdentifier': 'surf://10.17026/dans-zhr-eswk',
   'type': 'dataset'},
  {'id': 1921,
   'identifier': 'dans-xeh-f6xm',
   'persistentUrl': 'https://doi.org/10.17026/dans-xeh-f6xm',
   'protocol': 'doi',
   'authority': '10.17026',
   'publisher': 'DANS Data Station Social Sciences and Humanities',
   'publicationDate': '1995-01-01',
   'storageIdentifier': 'surf://10.17026/dans-xeh-f6xm',
   'type': 'dataset'},
  {'id': 1924,
   'identifier': 'dans-zct-en83',
   'persistentUrl': 'https://doi.org/10.17026/dans-zct-en83',
   'protocol': 'doi',
   'authority': '10.17026',
   'publisher': 'DANS Data Station Social Sciences and Humanities',
   'publicationDate': '1999-01-01',
   'storageIdentifie

Save the raw response to `dans.json` so we don't have to repeat the slow download above.

In [9]:
# Persist the raw response body so the slow download does not need repeating.
# `.content` holds the response bytes on both requests- and httpx-based
# pyDataverse releases, whereas `.read()` is only available on httpx responses.
with open("dans.json", "wb") as f:
    f.write(tree.content)

Move the data into pandas to start working with it

In [8]:
# The "data" entry of the response is a list with one dict per item;
# each dict becomes one row of the frame.
records = tree.json()["data"]
df_dans = pd.DataFrame(records)

df_dans

Unnamed: 0,id,identifier,persistentUrl,protocol,authority,publisher,publicationDate,storageIdentifier,type
0,1920,dans-zhr-eswk,https://doi.org/10.17026/dans-zhr-eswk,doi,10.17026,DANS Data Station Social Sciences and Humanities,1990-01-01,surf://10.17026/dans-zhr-eswk,dataset
1,1921,dans-xeh-f6xm,https://doi.org/10.17026/dans-xeh-f6xm,doi,10.17026,DANS Data Station Social Sciences and Humanities,1995-01-01,surf://10.17026/dans-xeh-f6xm,dataset
2,1924,dans-zct-en83,https://doi.org/10.17026/dans-zct-en83,doi,10.17026,DANS Data Station Social Sciences and Humanities,1999-01-01,surf://10.17026/dans-zct-en83,dataset
3,1949,dans-xjk-kqxa,https://doi.org/10.17026/dans-xjk-kqxa,doi,10.17026,DANS Data Station Social Sciences and Humanities,2004-01-01,surf://10.17026/dans-xjk-kqxa,dataset
4,2082,dans-xpa-uek9,https://doi.org/10.17026/dans-xpa-uek9,doi,10.17026,DANS Data Station Social Sciences and Humanities,1993-01-01,surf://10.17026/dans-xpa-uek9,dataset
...,...,...,...,...,...,...,...,...,...
8687,615257,SS/GIJTA1,https://doi.org/10.17026/SS/GIJTA1,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-28,surf://10.17026/SS/GIJTA1,dataset
8688,615280,SS/IHYCRX,https://doi.org/10.17026/SS/IHYCRX,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-13,surf://10.17026/SS/IHYCRX,dataset
8689,615286,SS/ZY6QIP,https://doi.org/10.17026/SS/ZY6QIP,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-12,surf://10.17026/SS/ZY6QIP,dataset
8690,615288,SS/TM4SW5,https://doi.org/10.17026/SS/TM4SW5,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-09,surf://10.17026/SS/TM4SW5,dataset


In [39]:
# Peek at five randomly chosen persistent URLs (no seed, so this varies per run).
df_dans["persistentUrl"].sample(5)

6898    https://doi.org/10.17026/dans-xa6-6gpf
696     https://doi.org/10.17026/dans-2ca-hxmd
1540    https://doi.org/10.17026/dans-zkw-e99f
1431    https://doi.org/10.17026/dans-z4j-yrfe
6605    https://doi.org/10.17026/dans-z8w-wqqs
Name: persistentUrl, dtype: object

In [46]:
def fair_checker(pid: str, timeout: float = 300.0) -> dict:
    """Score one resource with the FAIR-Checker "metrics_all" endpoint.

    Args:
        pid: URL of the resource to evaluate, sent as the ``url`` query
            parameter (the notebook passes persistent DOI URLs).
        timeout: Seconds to wait for the service before giving up. Without a
            timeout a stalled request would block the notebook indefinitely.

    Returns:
        A dict mapping each metric name (the ``"metric"`` field of every
        returned entry) to its ``"score"`` converted to ``int``.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    res = r.get(
        "https://fair-checker.france-bioinformatique.fr/api/check/legacy/metrics_all",
        params={"url": pid},
        timeout=timeout,
    )
    # Fail loudly on an HTTP error instead of tripping over an unexpected
    # (non-JSON or differently shaped) error body in the comprehension below.
    res.raise_for_status()
    return {metric["metric"]: int(metric["score"]) for metric in res.json()}

In [49]:
# Draw two random datasets and keep only their persistent URLs.
sample = df_dans["persistentUrl"].sample(2)

# One FAIR-Checker call per URL; each returned dict is expanded into a row
# with one column per metric.
raw_scores = sample.apply(fair_checker).apply(pd.Series)

# Halve every score; the displayed results take the values 0.5 and 1.0, so this
# presumably normalises a 0-2 scale to 0-1 (TODO confirm against the service).
results: pd.DataFrame = raw_scores / 2

# Show each URL side by side with its scores.
pd.concat([sample, results], axis=1)

Unnamed: 0,persistentUrl,F1A,F1B,F2A,F2B,A1.1,A1.2,I1,I2,I3,R1.1,R1.2,R1.3
232,https://doi.org/10.17026/dans-279-hy72,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.5
1625,https://doi.org/10.17026/dans-xqf-78qs,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.5


In [50]:
results.to_latex()

'\\begin{tabular}{lrrrrrrrrrrrr}\n\\toprule\n & F1A & F1B & F2A & F2B & A1.1 & A1.2 & I1 & I2 & I3 & R1.1 & R1.2 & R1.3 \\\\\n\\midrule\n232 & 1.000000 & 1.000000 & 0.500000 & 0.500000 & 1.000000 & 1.000000 & 0.500000 & 0.500000 & 1.000000 & 1.000000 & 1.000000 & 0.500000 \\\\\n1625 & 1.000000 & 1.000000 & 0.500000 & 0.500000 & 1.000000 & 1.000000 & 0.500000 & 0.500000 & 1.000000 & 1.000000 & 1.000000 & 0.500000 \\\\\n\\bottomrule\n\\end{tabular}\n'