In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import requests as r
import rdflib
from tqdm import tqdm

tqdm.pandas()

## Dumping DANS

First, we use the `pyDataverse` library to connect to the DANS repository, which is a Dataverse installation.

https://pydataverse.readthedocs.io/en/latest/user/basic-usage.html#connect-to-native-api

In [3]:
# Client class for the Dataverse Native API.
from pyDataverse.api import NativeApi
# Client bound to the DANS data station; reused by the dump cell below.
api = NativeApi("https://ssh.datastations.nl/")

# Ask the server for its version; the displayed result doubles as a connectivity check.
api.get_info_version().json()

{'status': 'OK',
 'data': {'version': '6.5', 'build': 'DANS-DataStation-PATCH-7'}}

We dump the entire repository by getting the contents of the special dataverse `:root`. Warning: this takes roughly 15 minutes.

In [4]:
# Load the cached repository listing if it exists; otherwise download it once
# and store it next to the notebook so later runs skip the slow request.
try:
    with open("dans.json", "rb") as f:
        dans = json.load(f)

except FileNotFoundError:
    tree = api.get_dataverse_contents(":root")
    # Fail loudly on an HTTP error instead of caching an error payload that
    # every subsequent run would then load from disk.
    tree.raise_for_status()

    dans = tree.json()

    with open("dans.json", "wb") as f:
        # Write the raw response bytes so the cache is exactly what the server sent.
        f.write(tree.content)

Move the data into pandas to start working with it:

In [5]:
# The "data" entry of the dump holds one record per item; turn the records into rows.
dans_records = dans["data"]
df_dans = pd.DataFrame(dans_records)

df_dans

Unnamed: 0,id,identifier,persistentUrl,protocol,authority,publisher,publicationDate,storageIdentifier,type
0,1920,dans-zhr-eswk,https://doi.org/10.17026/dans-zhr-eswk,doi,10.17026,DANS Data Station Social Sciences and Humanities,1990-01-01,surf://10.17026/dans-zhr-eswk,dataset
1,1921,dans-xeh-f6xm,https://doi.org/10.17026/dans-xeh-f6xm,doi,10.17026,DANS Data Station Social Sciences and Humanities,1995-01-01,surf://10.17026/dans-xeh-f6xm,dataset
2,1924,dans-zct-en83,https://doi.org/10.17026/dans-zct-en83,doi,10.17026,DANS Data Station Social Sciences and Humanities,1999-01-01,surf://10.17026/dans-zct-en83,dataset
3,1949,dans-xjk-kqxa,https://doi.org/10.17026/dans-xjk-kqxa,doi,10.17026,DANS Data Station Social Sciences and Humanities,2004-01-01,surf://10.17026/dans-xjk-kqxa,dataset
4,2082,dans-xpa-uek9,https://doi.org/10.17026/dans-xpa-uek9,doi,10.17026,DANS Data Station Social Sciences and Humanities,1993-01-01,surf://10.17026/dans-xpa-uek9,dataset
...,...,...,...,...,...,...,...,...,...
8687,615257,SS/GIJTA1,https://doi.org/10.17026/SS/GIJTA1,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-28,surf://10.17026/SS/GIJTA1,dataset
8688,615280,SS/IHYCRX,https://doi.org/10.17026/SS/IHYCRX,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-13,surf://10.17026/SS/IHYCRX,dataset
8689,615286,SS/ZY6QIP,https://doi.org/10.17026/SS/ZY6QIP,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-12,surf://10.17026/SS/ZY6QIP,dataset
8690,615288,SS/TM4SW5,https://doi.org/10.17026/SS/TM4SW5,doi,10.17026,DANS Data Station Social Sciences and Humanities,2025-05-09,surf://10.17026/SS/TM4SW5,dataset


In [6]:
# Fixed seed so that a fresh run of the notebook draws the same 200 datasets.
SAMPLE_SEED = 42
samples_dans = df_dans.sample(200, random_state=SAMPLE_SEED)

## ARCHE

ARCHE does not use Dataverse; instead, it provides an API that returns RDF. We can use `rdflib` to parse the response and extract all top-level datasets (there are only around 50 of them):

In [7]:
# Predicate whose objects are the persistent identifiers we keep.
ARCHE_HAS_PID = "https://vocabs.acdh.oeaw.ac.at/schema#hasPid"

# Reuse the cached identifier list when available; otherwise query ARCHE once.
try:
    arche = pd.read_csv("arche.csv")

except FileNotFoundError:
    res = r.get("https://arche.acdh.oeaw.ac.at/api/search", params={
        "property[]": "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
        "value[]": "https://vocabs.acdh.oeaw.ac.at/schema#TopCollection",
        "format": "application/n-triples",
        "readMode": "resource",
        "resourceProperties[]": ARCHE_HAS_PID
    })
    # Stop here on an HTTP error rather than parsing (and caching) an error page.
    res.raise_for_status()

    g = rdflib.Graph()
    g.parse(data=res.text, format='nt')

    # Keep only the objects of hasPid triples, as plain strings so the freshly
    # built frame holds the same kind of values as one re-read from arche.csv.
    pids = [str(obj) for _, pred, obj in g if str(pred) == ARCHE_HAS_PID]

    arche = pd.DataFrame({"persistentUrl": pids})
    arche.to_csv("arche.csv", index=False)

arche

Unnamed: 0,persistentUrl
0,https://hdl.handle.net/21.11115/0000-000E-558C-5
1,https://hdl.handle.net/21.11115/0000-000D-CA69-A
2,https://hdl.handle.net/21.11115/0000-000C-2093-9
3,https://hdl.handle.net/21.11115/0000-000F-8289-3
4,https://hdl.handle.net/21.11115/0000-000E-8C49-3
5,https://hdl.handle.net/21.11115/0000-0012-20EF-B
6,https://hdl.handle.net/21.11115/0000-0014-44CA-A
7,https://hdl.handle.net/21.11115/0000-0012-210C-A
8,https://hdl.handle.net/21.11115/0000-000C-35DB-2
9,https://hdl.handle.net/21.11115/0000-000B-DCC4-0


We build a final sample list out of the 200 chosen at random from DANS and the 50 from ARCHE:

In [8]:
# Prefer the saved sample list so every run evaluates the same identifiers.
try:
    samples = pd.read_csv("samples.csv").squeeze("columns")

except FileNotFoundError:
    # ARCHE identifiers first, then the DANS draw, renumbered from 0.
    sample_parts = [arche["persistentUrl"], samples_dans["persistentUrl"]]
    samples = pd.concat(sample_parts, ignore_index=True)
    samples.to_csv("samples.csv", index=False)
samples

0      https://hdl.handle.net/21.11115/0000-000E-558C-5
1      https://hdl.handle.net/21.11115/0000-000D-CA69-A
2      https://hdl.handle.net/21.11115/0000-000C-2093-9
3      https://hdl.handle.net/21.11115/0000-000F-8289-3
4      https://hdl.handle.net/21.11115/0000-000E-8C49-3
                             ...                       
245              https://doi.org/10.17026/dans-2a2-77kp
246              https://doi.org/10.17026/dans-2ze-9e6y
247              https://doi.org/10.17026/dans-x58-jgbs
248              https://doi.org/10.17026/dans-zmd-zuj9
249              https://doi.org/10.17026/dans-xpe-2ksa
Name: persistentUrl, Length: 250, dtype: object

## FAIR Checker

FAIR Checker hosts an instance of its API online. We use this to get results for the 250 datasets:

In [9]:
def fair_checker_query(pid: str):
    """Evaluate a single identifier with the hosted FAIR Checker API.

    Parameters
    ----------
    pid : str
        Persistent URL of the dataset, sent as the ``url`` query parameter.

    Returns
    -------
    dict or None
        Mapping from each returned entry's ``"metric"`` to its ``"score"``
        converted with ``int``. ``None`` when the response is not OK; in that
        case the identifier and the response object are printed.

    NOTE(review): the cached results shown in this notebook contain 0.5
    values, which an ``int`` conversion cannot produce -- confirm the raw
    score scale and whether the cache was normalised by other code.
    """
    endpoint = "https://fair-checker.france-bioinformatique.fr/api/check/legacy/metrics_all"
    response = r.get(endpoint, params={"url": pid})

    if response.ok:
        scores = {}
        for entry in response.json():
            scores[entry["metric"]] = int(entry["score"])
        return scores

    # Report which identifier failed and carry on with the remaining ones.
    print(pid)
    print(response)
    return None

In [10]:
def fair_checker(pids: pd.Series) -> pd.DataFrame:
    """Query FAIR Checker for every identifier and tabulate the scores.

    Parameters
    ----------
    pids : pd.Series
        Persistent URLs to evaluate.

    Returns
    -------
    pd.DataFrame
        One row per identifier, one column per metric, indexed by ``pids``.
    """
    # progress_apply shows a progress bar while issuing one request per identifier.
    per_pid_scores = pids.progress_apply(fair_checker_query)
    # Expand each per-identifier result into columns.
    score_table = per_pid_scores.apply(pd.Series)
    score_table.index = pids
    return score_table

In [11]:
# Reuse previously saved FAIR Checker results when the CSV exists.
try:
    results_fair_checker = pd.read_csv("fair_checker.csv", index_col="persistentUrl")
except FileNotFoundError:
    # No cache yet: evaluate every sample and save the table (index included)
    # so that it can be read back with the same index column.
    results_fair_checker = fair_checker(samples)
    results_fair_checker.to_csv("fair_checker.csv", index=True)

results_fair_checker

Unnamed: 0_level_0,F1A,F1B,F2A,F2B,A1.1,A1.2,I1,I2,I3,R1.1,R1.2,R1.3
persistentUrl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
https://hdl.handle.net/21.11115/0000-000E-558C-5,1.0,0.0,0.5,0.5,1.0,0.0,0.5,0.5,1.0,0.0,0.0,0.5
https://hdl.handle.net/21.11115/0000-000D-CA69-A,1.0,0.0,0.5,0.5,1.0,0.0,0.5,0.5,1.0,0.0,0.0,0.5
https://hdl.handle.net/21.11115/0000-000C-2093-9,1.0,0.0,0.5,0.5,1.0,0.0,0.5,0.5,1.0,0.0,0.0,0.5
https://hdl.handle.net/21.11115/0000-000F-8289-3,1.0,0.0,0.5,0.5,1.0,0.0,0.5,0.5,1.0,0.0,0.0,0.5
https://hdl.handle.net/21.11115/0000-000E-8C49-3,1.0,0.0,0.5,0.5,1.0,0.0,0.5,0.5,1.0,0.0,0.0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...
https://doi.org/10.17026/dans-2a2-77kp,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5
https://doi.org/10.17026/dans-2ze-9e6y,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5
https://doi.org/10.17026/dans-x58-jgbs,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,1.0,1.0,1.0,0.5
https://doi.org/10.17026/dans-zmd-zuj9,1.0,1.0,0.5,0.5,1.0,1.0,0.5,0.5,0.0,1.0,1.0,0.5


In [12]:
# Each FAIR dimension is the row-wise mean of the metric columns whose name
# starts with the dimension's letter.
dimension_prefixes = {
    "Findability": "F",
    "Accessibility": "A",
    "Interoperability": "I",
    "Reusability": "R",
}

dimension_frames = [
    results_fair_checker
    .loc[:, results_fair_checker.columns.str.startswith(prefix)]
    .mean(axis=1)
    .to_frame(dimension)
    for dimension, prefix in dimension_prefixes.items()
]

# "Overall" averages all metric columns directly, not the four dimension means.
dimension_frames.append(results_fair_checker.mean(axis=1).to_frame("Overall"))

final_fair_checker = pd.concat(dimension_frames, axis=1)

final_fair_checker

Unnamed: 0_level_0,Findability,Accessibility,Interoperability,Reusability,Overall
persistentUrl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
https://hdl.handle.net/21.11115/0000-000E-558C-5,0.50,0.5,0.666667,0.166667,0.458333
https://hdl.handle.net/21.11115/0000-000D-CA69-A,0.50,0.5,0.666667,0.166667,0.458333
https://hdl.handle.net/21.11115/0000-000C-2093-9,0.50,0.5,0.666667,0.166667,0.458333
https://hdl.handle.net/21.11115/0000-000F-8289-3,0.50,0.5,0.666667,0.166667,0.458333
https://hdl.handle.net/21.11115/0000-000E-8C49-3,0.50,0.5,0.666667,0.166667,0.458333
...,...,...,...,...,...
https://doi.org/10.17026/dans-2a2-77kp,0.75,1.0,0.333333,0.833333,0.708333
https://doi.org/10.17026/dans-2ze-9e6y,0.75,1.0,0.333333,0.833333,0.708333
https://doi.org/10.17026/dans-x58-jgbs,0.75,1.0,0.666667,0.833333,0.791667
https://doi.org/10.17026/dans-zmd-zuj9,0.75,1.0,0.333333,0.833333,0.708333


## F-UJI

F-UJI does not provide a hosted API instance. We thus have to host it locally, and we can do that with the provided `docker-compose.yml` file by running `docker compose up` in the project's directory. We can then use it to process the samples:

In [13]:
# Evaluation endpoint of the locally hosted F-UJI instance.
FUJI_URL = "http://localhost:1071/fuji/api/v1/evaluate"

In [14]:
def fuji_query(pid: str):
    """Evaluate a single identifier with the local F-UJI service.

    Parameters
    ----------
    pid : str
        Persistent URL of the dataset, sent as ``object_identifier``.

    Returns
    -------
    dict
        For every key of the response's ``summary.score_earned``, the earned
        score divided by the matching ``summary.score_total`` entry. A metric
        whose total is zero yields ``nan`` instead of raising.

    Raises
    ------
    requests.HTTPError
        If F-UJI answers with an error status.
    """
    headers = {
        "accept": "application/json",
        # Base64 of "marvel:wonderwoman" -- presumably the default credentials
        # of the local F-UJI container; TODO confirm.
        "Authorization": "Basic bWFydmVsOndvbmRlcndvbWFu",
        "Content-Type": "application/json",
    }

    req = {"object_identifier": pid, "use_datacite": True}
    response = r.post(FUJI_URL, json=req, headers=headers)
    # Surface HTTP failures directly rather than as a later JSON/KeyError.
    response.raise_for_status()

    summary = response.json()["summary"]
    earned = summary["score_earned"]
    total = summary["score_total"]

    # Guard the division: a metric with no attainable points has no ratio.
    return {
        metric: (score / total[metric] if total[metric] else float("nan"))
        for metric, score in earned.items()
    }

In [15]:
def fuji(pids: pd.Series) -> pd.DataFrame:
    """Query F-UJI for every identifier and tabulate the score ratios.

    Parameters
    ----------
    pids : pd.Series
        Persistent URLs to evaluate.

    Returns
    -------
    pd.DataFrame
        One row per identifier, one column per metric, indexed by ``pids``.
    """
    # progress_apply shows a progress bar while issuing one request per identifier.
    per_pid_scores = pids.progress_apply(fuji_query)
    # Expand each per-identifier result into columns.
    score_table = per_pid_scores.apply(pd.Series)
    score_table.index = pids
    return score_table

In [16]:
# Reuse previously saved F-UJI results when the CSV exists.
try:
    results_fuji = pd.read_csv("fuji.csv", index_col="persistentUrl")

except FileNotFoundError:
    results_fuji: pd.DataFrame = fuji(samples)
    # Save the table (index included) so the next run reads it back instead of
    # re-evaluating every sample against the local F-UJI instance.
    results_fuji.to_csv("fuji.csv", index=True)

results_fuji

Unnamed: 0_level_0,A,F,I,R,A1,F1,F2,F3,F4,I1,I2,I3,R1,R1.1,R1.2,R1.3,FAIR
persistentUrl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
https://hdl.handle.net/21.11115/0000-000E-558C-5,0.333333,0.357143,0.25,0.0,0.333333,1.0,0.25,0.0,0.0,0.5,0.0,0.0,0.00,0.0,0.0,0.0,0.187500
https://hdl.handle.net/21.11115/0000-000D-CA69-A,0.333333,0.357143,0.25,0.0,0.333333,1.0,0.25,0.0,0.0,0.5,0.0,0.0,0.00,0.0,0.0,0.0,0.187500
https://hdl.handle.net/21.11115/0000-000C-2093-9,0.333333,0.357143,0.25,0.0,0.333333,1.0,0.25,0.0,0.0,0.5,0.0,0.0,0.00,0.0,0.0,0.0,0.187500
https://hdl.handle.net/21.11115/0000-000F-8289-3,0.333333,0.357143,0.25,0.0,0.333333,1.0,0.25,0.0,0.0,0.5,0.0,0.0,0.00,0.0,0.0,0.0,0.187500
https://hdl.handle.net/21.11115/0000-000E-8C49-3,0.333333,0.357143,0.25,0.0,0.333333,1.0,0.25,0.0,0.0,0.5,0.0,0.0,0.00,0.0,0.0,0.0,0.187500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://doi.org/10.17026/dans-2a2-77kp,0.666667,1.000000,0.75,0.7,0.666667,1.0,1.00,1.0,1.0,1.0,0.0,1.0,0.75,0.5,0.5,1.0,0.791667
https://doi.org/10.17026/dans-2ze-9e6y,0.666667,1.000000,0.75,0.7,0.666667,1.0,1.00,1.0,1.0,1.0,0.0,1.0,0.75,0.5,0.5,1.0,0.791667
https://doi.org/10.17026/dans-x58-jgbs,0.666667,1.000000,0.75,0.8,0.666667,1.0,1.00,1.0,1.0,1.0,0.0,1.0,0.75,1.0,0.5,1.0,0.833333
https://doi.org/10.17026/dans-zmd-zuj9,1.000000,1.000000,0.75,0.7,1.000000,1.0,1.00,1.0,1.0,1.0,0.0,1.0,0.75,0.5,0.5,1.0,0.833333


In [17]:
# F-UJI summary columns mapped to the dimension names used for the FAIR Checker table.
fuji_dimension_names = {
    "F": "Findability",
    "A": "Accessibility",
    "I": "Interoperability",
    "R": "Reusability",
    "FAIR": "Overall",
}

# Select the five summary columns in that order, then relabel them.
fuji_summary_columns = list(fuji_dimension_names)
final_fuji = results_fuji[fuji_summary_columns].rename(columns=fuji_dimension_names)

# Positional relabelling: assumes the rows are in the same order as `samples`.
final_fuji.index = samples
final_fuji

Unnamed: 0_level_0,Findability,Accessibility,Interoperability,Reusability,Overall
persistentUrl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
https://hdl.handle.net/21.11115/0000-000E-558C-5,0.357143,0.333333,0.25,0.0,0.187500
https://hdl.handle.net/21.11115/0000-000D-CA69-A,0.357143,0.333333,0.25,0.0,0.187500
https://hdl.handle.net/21.11115/0000-000C-2093-9,0.357143,0.333333,0.25,0.0,0.187500
https://hdl.handle.net/21.11115/0000-000F-8289-3,0.357143,0.333333,0.25,0.0,0.187500
https://hdl.handle.net/21.11115/0000-000E-8C49-3,0.357143,0.333333,0.25,0.0,0.187500
...,...,...,...,...,...
https://doi.org/10.17026/dans-2a2-77kp,1.000000,0.666667,0.75,0.7,0.791667
https://doi.org/10.17026/dans-2ze-9e6y,1.000000,0.666667,0.75,0.7,0.791667
https://doi.org/10.17026/dans-x58-jgbs,1.000000,0.666667,0.75,0.8,0.833333
https://doi.org/10.17026/dans-zmd-zuj9,1.000000,1.000000,0.75,0.7,0.833333


# Analysis

## Cross-tabulation

In [18]:
def data_crosstab(df: pd.DataFrame) -> pd.DataFrame:
    """Count how many values of each column fall into each 25%-wide score band.

    Parameters
    ----------
    df : pd.DataFrame
        One column per metric, with scores expected in ``[0, 1]``. Missing
        values and values outside that range fall into no band and are not
        counted.

    Returns
    -------
    pd.DataFrame
        Counts with one row per column of ``df`` (in the same order) and
        always all four band columns in ascending order, including bands
        that contain no value.
    """
    bins = [0, 0.25, 0.5, 0.75, 1.0]
    bin_labels = ['0%-25%', '25%-50%', '50%-75%', '75%-100%']

    # Long format: one (metric, value) pair per row.
    melted = df.melt(var_name="metric", value_name="value")

    # include_lowest=True closes the first band on the left so exact zeros count.
    melted['bin'] = pd.cut(melted['value'], bins=bins, labels=bin_labels, include_lowest=True)

    counts = pd.crosstab(melted["metric"], melted["bin"])

    # crosstab drops categories without observations by default; restore them
    # so tables built from different inputs always share the same columns.
    counts = counts.reindex(columns=bin_labels, fill_value=0)

    return counts.loc[df.columns]

In [19]:
# Per FAIR dimension, count the datasets in each score band for FAIR Checker.
crosstab_fair_checker = data_crosstab(final_fair_checker)
crosstab_fair_checker

bin,0%-25%,25%-50%,50%-75%,75%-100%
Findability,6,56,188,0
Accessibility,0,62,0,188
Interoperability,6,150,94,0
Reusability,62,0,0,188
Overall,6,56,144,44


In [20]:
# Per FAIR dimension, count the datasets in each score band for F-UJI.
crosstab_fuji = data_crosstab(final_fuji)
crosstab_fuji

bin,0%-25%,25%-50%,50%-75%,75%-100%
Findability,0,50,6,194
Accessibility,0,62,77,111
Interoperability,49,16,185,0
Reusability,50,15,150,35
Overall,50,12,3,185


## Correlation of results between the two tools

In [21]:
def data_correlate(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Correlate same-named columns of two frames.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Frames to compare; every column of ``df1`` must also exist in ``df2``.

    Returns
    -------
    pd.DataFrame
        A single unnamed column (label ``0``) holding, for each column of
        ``df1``, the result of ``Series.corr`` between the two frames,
        indexed by ``df1.columns``.
    """
    coefficients = []
    for column in df1.columns:
        coefficients.append(df1[column].corr(df2[column]))

    return pd.DataFrame(coefficients, index=df1.columns)


In [22]:
# Per-dimension correlation between the FAIR Checker and F-UJI scores.
correlation = data_correlate(final_fair_checker, final_fuji)
correlation

Unnamed: 0,0
Findability,0.883569
Accessibility,0.849621
Interoperability,-0.341329
Reusability,0.950187
Overall,0.877238


# Export to LaTeX

In [23]:
# Write each result table to its own .tex file, in the same order as before.
latex_exports = [
    (crosstab_fair_checker, "crosstab_fair_checker.tex"),
    (crosstab_fuji, "crosstab_fuji.tex"),
    (correlation, "correlation.tex"),
]

for table, tex_path in latex_exports:
    table.to_latex(tex_path)