In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import requests as r

Matplotlib is building the font cache; this may take a moment.


## Dumping DANS

First, we use the pyDataverse library to connect to the DANS repository (it's a dataverse repository)

https://pydataverse.readthedocs.io/en/latest/user/basic-usage.html#connect-to-native-api

In [2]:
from pyDataverse.api import NativeApi
api = NativeApi("https://ssh.datastations.nl/")

api.get_info_version().json()

{'status': 'OK',
 'data': {'version': '6.3', 'build': 'DANS-DataStation-PATCH-9'}}

We dump the entire repository by getting the contents of the special dataverse ":root". Warning: this takes like 15 mins.

In [3]:
try:
    with open("dans.json", "rb") as f:
        dans = json.load(f)

except FileNotFoundError:
    tree = api.get_dataverse_contents(":root")

    dans = tree.json()

    with open("dans.json", "wb") as f:
        f.write(tree.read())

Move the data into pandas to start working with it

In [4]:
df_dans = pd.DataFrame.from_dict(dans["data"])

df_dans

Unnamed: 0,id,identifier,persistentUrl,protocol,authority,publisher,publicationDate,storageIdentifier,type
0,1920,dans-zhr-eswk,https://doi.org/10.17026/dans-zhr-eswk,doi,10.17026,DANS Data Station Social Sciences and Humanities,1990-01-01,surf://10.17026/dans-zhr-eswk,dataset
1,1921,dans-xeh-f6xm,https://doi.org/10.17026/dans-xeh-f6xm,doi,10.17026,DANS Data Station Social Sciences and Humanities,1995-01-01,surf://10.17026/dans-xeh-f6xm,dataset
2,1924,dans-zct-en83,https://doi.org/10.17026/dans-zct-en83,doi,10.17026,DANS Data Station Social Sciences and Humanities,1999-01-01,surf://10.17026/dans-zct-en83,dataset
3,1949,dans-xjk-kqxa,https://doi.org/10.17026/dans-xjk-kqxa,doi,10.17026,DANS Data Station Social Sciences and Humanities,2004-01-01,surf://10.17026/dans-xjk-kqxa,dataset
4,2082,dans-xpa-uek9,https://doi.org/10.17026/dans-xpa-uek9,doi,10.17026,DANS Data Station Social Sciences and Humanities,1993-01-01,surf://10.17026/dans-xpa-uek9,dataset
...,...,...,...,...,...,...,...,...,...
8687,615257,SS/GIJTA1,https://doi.org/10.17026/SS/GIJTA1,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-28,surf://10.17026/SS/GIJTA1,dataset
8688,615280,SS/IHYCRX,https://doi.org/10.17026/SS/IHYCRX,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-13,surf://10.17026/SS/IHYCRX,dataset
8689,615286,SS/ZY6QIP,https://doi.org/10.17026/SS/ZY6QIP,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-12,surf://10.17026/SS/ZY6QIP,dataset
8690,615288,SS/TM4SW5,https://doi.org/10.17026/SS/TM4SW5,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-09,surf://10.17026/SS/TM4SW5,dataset


In [26]:
samples = df_dans.sample(10)

## FAIR Checker

In [34]:
def fair_checker_query(pid: str):
    res = r.get("https://fair-checker.france-bioinformatique.fr/api/check/legacy/metrics_all", params={"url": pid})
    return {metric["metric"]: int(metric["score"]) for metric in res.json()}

In [36]:
def fair_checker(df: pd.DataFrame) -> pd.DataFrame:
    return df.apply(fair_checker_query).apply(pd.Series) / 2

In [37]:
results = fair_checker(samples["persistentUrl"])

pd.concat([samples["persistentUrl"], results, results.mean(axis=1)], axis=1)

Unnamed: 0,persistentUrl,F1A,F1B,F2A,F2B,A1.1,A1.2,I1,I2,I3,R1.1,R1.2,R1.3,0
3051,https://doi.org/10.17026/dans-29x-6wne,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5,0.708333
6061,https://doi.org/10.17026/dans-zmv-mncs,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5,0.708333
1818,https://doi.org/10.17026/dans-2b4-35gf,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5,0.708333
8098,https://doi.org/10.17026/dans-x28-ztsk,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.5,0.791667
8631,https://doi.org/10.17026/dans-zx8-52x3,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.5,0.791667
5194,https://doi.org/10.17026/dans-xwz-bsdm,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5,0.708333
4150,https://doi.org/10.17026/dans-z2t-3hy4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667
835,https://doi.org/10.17026/dans-xc3-5c7u,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.5,0.791667
1045,https://doi.org/10.17026/dans-xv2-eexd,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5,0.708333
3125,https://doi.org/10.17026/dans-zev-ph88,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5,0.708333


In [45]:
final_fair_checker = pd.concat([
    results.loc[:, results.columns.str.startswith("F")].mean(axis=1).to_frame("Findability"),
    results.loc[:, results.columns.str.startswith("A")].mean(axis=1).to_frame("Accessibility"),
    results.loc[:, results.columns.str.startswith("I")].mean(axis=1).to_frame("Interoperability"),
    results.loc[:, results.columns.str.startswith("R")].mean(axis=1).to_frame("Reusability"),
    results.mean(axis=1).to_frame("Overall"),
], axis=1)

final_fair_checker

Unnamed: 0,Findability,Accessibility,Interoperability,Reusability,Overall
3051,0.83879,0.333333,0.6875,0.43,0.623494
6061,0.965278,0.666667,0.6875,0.69,0.783824
1818,0.965278,0.666667,0.6875,0.69,0.783824
8098,0.972222,0.666667,0.6875,0.81,0.821569
8631,0.858135,0.666667,0.6875,0.81,0.781303
5194,0.83879,0.333333,0.6875,0.43,0.623494
4150,0.484127,0.333333,0.0,0.31,0.301261
835,0.972222,0.666667,0.6875,0.81,0.821569
1045,0.83879,0.333333,0.6875,0.43,0.623494
3125,0.83879,0.333333,0.6875,0.43,0.623494


## F-UJI

In [8]:
FUJI_URL = "http://localhost:1071/fuji/api/v1/evaluate"

In [39]:
def fuji_query(pid: str):
    headers = {
        "accept": "application/json",
        "Authorization": "Basic bWFydmVsOndvbmRlcndvbWFu",
        "Content-Type": "application/json",
    }

    req = {"object_identifier": pid, "use_datacite": True}
    res = r.post(FUJI_URL, json=req, headers=headers).json()

    return {metric: score / res["summary"]["score_total"][metric] for metric, score in res["summary"]["score_earned"].items()}

In [41]:
def fuji(df: pd.DataFrame) -> pd.DataFrame:
    return df.apply(fuji_query).apply(pd.Series)

In [42]:
results: pd.DataFrame = fuji(samples["persistentUrl"])

results

Unnamed: 0,A,F,I,R,A1,F1,F2,F3,F4,I1,I2,I3,R1,R1.1,R1.2,R1.3,FAIR
3051,0.333333,0.928571,0.75,0.4,0.333333,1.0,1.0,0.5,1.0,1.0,0.0,1.0,0.25,0.5,0.5,0.5,0.604167
6061,0.666667,1.0,0.75,0.7,0.666667,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.75,0.5,0.5,1.0,0.791667
1818,0.666667,1.0,0.75,0.7,0.666667,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.75,0.5,0.5,1.0,0.791667
8098,0.666667,1.0,0.75,0.8,0.666667,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.75,1.0,0.5,1.0,0.833333
8631,0.666667,0.857143,0.75,0.8,0.666667,1.0,1.0,1.0,0.5,1.0,0.0,1.0,0.75,1.0,0.5,1.0,0.791667
5194,0.333333,0.928571,0.75,0.4,0.333333,1.0,1.0,0.5,1.0,1.0,0.0,1.0,0.25,0.5,0.5,0.5,0.604167
4150,0.333333,0.571429,0.0,0.3,0.333333,1.0,0.5,0.0,0.5,0.0,0.0,0.0,0.25,0.0,0.5,0.5,0.333333
835,0.666667,1.0,0.75,0.8,0.666667,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.75,1.0,0.5,1.0,0.833333
1045,0.333333,0.928571,0.75,0.4,0.333333,1.0,1.0,0.5,1.0,1.0,0.0,1.0,0.25,0.5,0.5,0.5,0.604167
3125,0.333333,0.928571,0.75,0.4,0.333333,1.0,1.0,0.5,1.0,1.0,0.0,1.0,0.25,0.5,0.5,0.5,0.604167


In [46]:
final_fuji = results[["F", "A", "I", "R", "FAIR"]].rename(columns={
    "F": "Findability",
    "A": "Accessibility",
    "I": "Interoperability",
    "R": "Reusability",
    "FAIR": "Overall",
})

final_fuji

Unnamed: 0,Findability,Accessibility,Interoperability,Reusability,Overall
3051,0.928571,0.333333,0.75,0.4,0.604167
6061,1.0,0.666667,0.75,0.7,0.791667
1818,1.0,0.666667,0.75,0.7,0.791667
8098,1.0,0.666667,0.75,0.8,0.833333
8631,0.857143,0.666667,0.75,0.8,0.791667
5194,0.928571,0.333333,0.75,0.4,0.604167
4150,0.571429,0.333333,0.0,0.3,0.333333
835,1.0,0.666667,0.75,0.8,0.833333
1045,0.928571,0.333333,0.75,0.4,0.604167
3125,0.928571,0.333333,0.75,0.4,0.604167


# Analysis

In [48]:
def data_crosstab(df: pd.DataFrame) -> pd.DataFrame:
    melted = df.melt(var_name="metric", value_name="value")

    bins = [0, 0.25, 0.5, 0.75, 1.0]
    bin_labels = ['0%-25%', '25%-50%', '50%-75%', '75%-100%']

    melted['bin'] = pd.cut(melted['value'], bins=bins, labels=bin_labels, include_lowest=True)

    return pd.crosstab(melted["metric"], melted["bin"]).loc[df.columns]

In [49]:
data_crosstab(final_fair_checker)

bin,0%-25%,25%-50%,50%-75%,75%-100%
Findability,0,1,0,9
Accessibility,0,5,5,0
Interoperability,1,0,9,0
Reusability,0,5,2,3
Overall,0,1,4,5


In [50]:
data_crosstab(final_fuji)

bin,0%-25%,25%-50%,50%-75%,75%-100%
Findability,0,0,1,9
Accessibility,0,5,5,0
Interoperability,1,0,9,0
Reusability,0,5,2,3
Overall,0,1,4,5
