# Integrate different data sources

In [1]:
import requests
import json
from urllib.parse import urlencode
import pandas as pd
import orjson
from tqdm import tqdm
import ast
import numpy as np

import warnings
warnings.filterwarnings("ignore")

### BSO

In [3]:
df_oa = []
for year in tqdm(range(2013, 2025)):
    df_oa_tmp = pd.read_csv(f'../data/interim/oa_initial_{year}.csv').drop(columns = 'Unnamed: 0')
    df_oa.append(df_oa_tmp)
df_oa = pd.concat((df_oa), ignore_index = True)
df_oa

100%|██████████| 12/12 [00:21<00:00,  1.76s/it]


Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,id_institution,display_name_author,id_author,oa_status,apc_list,apc_paid,corresponding,countries
0,https://doi.org/10.1016/j.cell.2013.05.039,2013,en,"Biochemistry, Genetics and Molecular Biology",Cell,S110447773,0092-8674,Cell Press,P4310315673,"['Universidad de Oviedo', 'Spanish National Ca...","['I165339363', 'I4210089594', 'I4210153965', '...","['Carlos López‐Otín', 'Marı́a A. Blasco', 'Lin...","['A5087974982', 'A5085977927', 'A5059311261', ...",bronze,"{'value': 10100, 'currency': 'USD', 'value_usd...",,"[False, False, False, True, False]","[['ES'], ['ES'], ['DE', 'GB'], ['ES'], ['FR']]"
1,https://doi.org/10.1051/0004-6361/201322068,2013,en,Physics and Astronomy,Astronomy and Astrophysics,S205231332,0004-6361,EDP Sciences,P4310319748,"['Max Planck Institute for Astronomy', 'Yale U...","['I4210109156', 'I32971472', 'I4210118524', 'I...","['Thomas Robitaille', 'Erik Tollerud', 'P. Gre...","['A5014651763', 'A5083524651', 'A5052341993', ...",bronze,,,"[False, False, False, False, False, False, Fal...","[['DE'], ['US'], ['FR', 'US'], ['FR', 'US'], [..."
2,https://doi.org/10.2458/azu_js_rc.55.16947,2013,en,Earth and Planetary Sciences,Radiocarbon,S35778795,0033-8222,Cambridge University Press,P4310311721,"[""Queen's University Belfast"", 'Institut de Re...","['I126231945', 'I4210166444', 'I1294671590', '...","['Reimer Paula J', 'Bard, Edouard', 'Bayliss A...","['A3095000969', 'A3024548407', 'A3135758542', ...",bronze,,,"[True, False, False, False, False, False, Fals...","[['GB'], ['FR'], ['GB'], ['US'], ['GB'], ['GB'..."
3,https://doi.org/10.1038/nature12477,2013,en,"Biochemistry, Genetics and Molecular Biology",Nature,S137773608,0028-0836,Nature Portfolio,P4310319908,"['Wellcome Sanger Institute', 'Wellcome Sanger...","['I2802476451', 'I2802476451', 'I2802476451', ...","['Ludmil B. Alexandrov', 'Serena Nik‐Zainal', ...","['A5080997789', 'A5072873709', 'A5004776968', ...",bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[['GB'], ['GB'], ['GB'], ['CA'], ['GB'], ['GB'..."
4,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,Medicine,,,,,,"['Cooper University Hospital', 'Phoenix Contac...","['I2800704349', 'I78801874', 'I27804330', 'I42...","['R. Phillip Dellinger', 'Mitchell M. Levy', '...","['A5066795709', 'A5000157972', 'A5083275742', ...",closed,,,"[False, False, False, False, False, False, Fal...","[['US'], ['US'], ['GB'], ['FR'], ['DE'], ['US'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545016,https://doi.org/10.1007/s10499-025-02358-1,2024,en,Agricultural and Biological Sciences,,,,,,"['Thang Long University', 'University of Hong ...","['I4210097659', 'I889458895', 'I70349855', 'I1...","['Phuong Thi Nhu Le', 'Vengatesen Thiyagarajan...","['A2472938279', 'A2527540559', 'A2166034414', ...",closed,"{'value': 2790, 'currency': 'EUR', 'value_usd'...",,"[True, False, False, False, False, False, Fals...","[['VN'], ['HK'], ['VN'], ['CN'], ['TH'], ['VN'..."
1545017,https://doi.org/10.37665/remcajp40761,2024,,Engineering,,,,,,['Valeo (France)'],['I220619192'],['Maurice Dore'],['A5120490549'],closed,,,[True],[['FR']]
1545018,https://doi.org/10.4000/15ara,2024,,Social Sciences,Revue européenne de migrations internationales,S4210238526,0765-0752,University of Poitiers,P4310311531,['École des hautes études en sciences sociales...,"['I90669466', 'I143804889', 'I2802994108']","['Audrey Lenoël', 'Christina Oelgemöller', 'Au...","['A2540735368', 'A292793731', 'A2540735368', '...",diamond,,,"[True, False, False, False]","[['FR'], ['GB', 'US'], [], []]"
1545019,https://doi.org/10.4000/11ndt,2024,fr,Social Sciences,Histoire Politique,S4210214659,1954-3670,,,['Centre d’histoire de Sciences Po'],['I4210100301'],['Cédric Pellen'],['A5079110810'],bronze,,,[True],[['FR']]


In [4]:
df = pd.read_csv('../data/interim/bso.csv').drop(columns='Unnamed: 0')

df_oa['doi_corr'] = df_oa['doi'].apply(lambda x: x[16:])
df_oa_comb = df_oa.merge(df, left_on='doi_corr', right_on='doi', how='left')
df_oa_comb['BSO'] = df_oa_comb['openalex_id'].notna()
df_oa_comb = (df_oa_comb.drop(columns=['doi_y', 'bso_country_corrected', 'genre', 'openalex_id']).rename(columns={'doi_x': 'doi'}))

df_oa_comb

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,id_institution,display_name_author,id_author,oa_status,apc_list,apc_paid,corresponding,countries,doi_corr,BSO
0,https://doi.org/10.1016/j.cell.2013.05.039,2013,en,"Biochemistry, Genetics and Molecular Biology",Cell,S110447773,0092-8674,Cell Press,P4310315673,"['Universidad de Oviedo', 'Spanish National Ca...","['I165339363', 'I4210089594', 'I4210153965', '...","['Carlos López‐Otín', 'Marı́a A. Blasco', 'Lin...","['A5087974982', 'A5085977927', 'A5059311261', ...",bronze,"{'value': 10100, 'currency': 'USD', 'value_usd...",,"[False, False, False, True, False]","[['ES'], ['ES'], ['DE', 'GB'], ['ES'], ['FR']]",10.1016/j.cell.2013.05.039,True
1,https://doi.org/10.1051/0004-6361/201322068,2013,en,Physics and Astronomy,Astronomy and Astrophysics,S205231332,0004-6361,EDP Sciences,P4310319748,"['Max Planck Institute for Astronomy', 'Yale U...","['I4210109156', 'I32971472', 'I4210118524', 'I...","['Thomas Robitaille', 'Erik Tollerud', 'P. Gre...","['A5014651763', 'A5083524651', 'A5052341993', ...",bronze,,,"[False, False, False, False, False, False, Fal...","[['DE'], ['US'], ['FR', 'US'], ['FR', 'US'], [...",10.1051/0004-6361/201322068,True
2,https://doi.org/10.2458/azu_js_rc.55.16947,2013,en,Earth and Planetary Sciences,Radiocarbon,S35778795,0033-8222,Cambridge University Press,P4310311721,"[""Queen's University Belfast"", 'Institut de Re...","['I126231945', 'I4210166444', 'I1294671590', '...","['Reimer Paula J', 'Bard, Edouard', 'Bayliss A...","['A3095000969', 'A3024548407', 'A3135758542', ...",bronze,,,"[True, False, False, False, False, False, Fals...","[['GB'], ['FR'], ['GB'], ['US'], ['GB'], ['GB'...",10.2458/azu_js_rc.55.16947,True
3,https://doi.org/10.1038/nature12477,2013,en,"Biochemistry, Genetics and Molecular Biology",Nature,S137773608,0028-0836,Nature Portfolio,P4310319908,"['Wellcome Sanger Institute', 'Wellcome Sanger...","['I2802476451', 'I2802476451', 'I2802476451', ...","['Ludmil B. Alexandrov', 'Serena Nik‐Zainal', ...","['A5080997789', 'A5072873709', 'A5004776968', ...",bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[['GB'], ['GB'], ['GB'], ['CA'], ['GB'], ['GB'...",10.1038/nature12477,True
4,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,Medicine,,,,,,"['Cooper University Hospital', 'Phoenix Contac...","['I2800704349', 'I78801874', 'I27804330', 'I42...","['R. Phillip Dellinger', 'Mitchell M. Levy', '...","['A5066795709', 'A5000157972', 'A5083275742', ...",closed,,,"[False, False, False, False, False, False, Fal...","[['US'], ['US'], ['GB'], ['FR'], ['DE'], ['US'...",10.1097/ccm.0b013e31827e83af,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545077,https://doi.org/10.1007/s10499-025-02358-1,2024,en,Agricultural and Biological Sciences,,,,,,"['Thang Long University', 'University of Hong ...","['I4210097659', 'I889458895', 'I70349855', 'I1...","['Phuong Thi Nhu Le', 'Vengatesen Thiyagarajan...","['A2472938279', 'A2527540559', 'A2166034414', ...",closed,"{'value': 2790, 'currency': 'EUR', 'value_usd'...",,"[True, False, False, False, False, False, Fals...","[['VN'], ['HK'], ['VN'], ['CN'], ['TH'], ['VN'...",10.1007/s10499-025-02358-1,False
1545078,https://doi.org/10.37665/remcajp40761,2024,,Engineering,,,,,,['Valeo (France)'],['I220619192'],['Maurice Dore'],['A5120490549'],closed,,,[True],[['FR']],10.37665/remcajp40761,False
1545079,https://doi.org/10.4000/15ara,2024,,Social Sciences,Revue européenne de migrations internationales,S4210238526,0765-0752,University of Poitiers,P4310311531,['École des hautes études en sciences sociales...,"['I90669466', 'I143804889', 'I2802994108']","['Audrey Lenoël', 'Christina Oelgemöller', 'Au...","['A2540735368', 'A292793731', 'A2540735368', '...",diamond,,,"[True, False, False, False]","[['FR'], ['GB', 'US'], [], []]",10.4000/15ara,False
1545080,https://doi.org/10.4000/11ndt,2024,fr,Social Sciences,Histoire Politique,S4210214659,1954-3670,,,['Centre d’histoire de Sciences Po'],['I4210100301'],['Cédric Pellen'],['A5079110810'],bronze,,,[True],[['FR']],10.4000/11ndt,False


### Download missing

In [5]:
df_bso_oa = df[~df.openalex_id.isna()]
missing = df_bso_oa[~df_bso_oa.doi.isin(df_oa_comb.doi_corr)].doi.unique()

BASE_URL = "https://api.openalex.org/works"

selected_fields = ["doi", "publication_year", "language", "indexed_in", "primary_location", "best_oa_location", "open_access", "authorships", 
                   "corresponding_author_ids", "corresponding_institution_ids", "apc_list", "apc_paid", "cited_by_count", "primary_topic", "awards", "funders"]


def fetch_by_ids(id_list):
    # Normalize IDs to a string separated by |
    doi_filter = "doi:" + "|".join(id_list)

    filters = [
        doi_filter,
        "indexed_in:crossref",
        "type:article|review",
    ]

    params = {
        "filter": ",".join(filters),
        "select": ",".join(selected_fields),
        "per_page": 200
    }

    url = f"{BASE_URL}?{urlencode(params)}"
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        data = r.json()
        return data.get("results", [])
    except requests.exceptions.JSONDecodeError:
        print("JSON decode error for URL:", url[:200])
        pass
    except requests.exceptions.RequestException as e:
        print("Request error:", e)
        pass

batch_size = 50  # try increasing, test carefully
all_fetched = []
for i in tqdm(range(0, len(missing), batch_size)):
    batch = missing[i:i+batch_size]
    results = fetch_by_ids(batch)
    all_fetched.extend(results)

    
output_file = "../data/interim/FranceInitialAPI/openalex_missing.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for w in all_fetched:
        f.write(json.dumps(w) + "\n")

print(f"Saved {len(all_fetched)} records to {output_file}")

100%|██████████| 2151/2151 [13:40<00:00,  2.62it/s]


Saved 11661 records to ../data/interim/FranceInitialAPI/openalex_missing.jsonl


In [6]:
interest = ['doi', 'publication_year', 'language', 'authorships', 'best_oa_location', 'primary_topic', 'open_access', 'apc_list', 'apc_paid']
keys = ['doi', 'publication_year', 'language', 'field_name_top_topic', 'journal', 'journal_id','publisher', 'publisher_id', 'display_name_institution', 'id_institution', 'display_name_author', 'id_author',
        'oa_status', 'apc_list', 'apc_paid','corresponding', 'countries']

records = []
with open(f"../data/interim/FranceInitialAPI/openalex_missing.jsonl", "rb") as f:
    for line in f:
        rec = orjson.loads(line)
        if not rec.get("doi"): # Skip records without DOI
            continue
        filtered = {k: rec.get(k) for k in interest} # Keep only the fields we care about

        # Extract field_names from topics
        filtered["field_name_top_topic"] =  filtered.get("primary_topic") .get("field", {}).get("display_name") if isinstance(filtered.get("primary_topic") , dict) else None

        # Extract journal and publisher from best_oa_location
        pl = filtered.get("best_oa_location") or {}
        source = pl.get("source") or {}
        filtered["journal"] = source.get("display_name")
        filtered["journal_id"] = source.get("id").split("/")[-1] if source.get("id") else None
        filtered["publisher"] = source.get("host_organization_name")
        filtered["publisher_id"] = source.get("host_organization").split("/")[-1] if source.get("host_organization") else None

        # Extract institution types and number of authors
        authorships = filtered.get("authorships") or []
        filtered["display_name_institution"] = [inst.get("display_name") for auth in authorships for inst in auth.get("institutions", []) if inst.get("display_name")]
        filtered["id_institution"] = [inst.get("id").split('/')[-1] for auth in authorships for inst in auth.get("institutions", []) if inst.get("id")]
        filtered["corresponding"] = [auth.get("is_corresponding") for auth in authorships]
        filtered["countries"] = [auth.get("countries") for auth in authorships]
        filtered["display_name_author"] = [auth["author"].get("display_name") for auth in authorships if auth.get("author", {}).get("display_name")]
        filtered["id_author"] = [auth["author"].get("id").split('/')[-1] for auth in authorships if auth.get("author", {}).get("id")]
            
        o_a = filtered.get("open_access") or {}
        filtered["oa_status"] = o_a.get("oa_status")

        records.append({k: filtered.get(k) for k in keys})

df_oa_missing = pd.DataFrame(records)
df_oa_missing

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,publisher,publisher_id,display_name_institution,id_institution,display_name_author,id_author,oa_status,apc_list,apc_paid,corresponding,countries
0,https://doi.org/10.1088/0004-637x/772/1/7,2013,en,Physics and Astronomy,The Astrophysical Journal,S1980519,IOP Publishing,P4310320083,[],[],"[Sebastien Guillot, Mathieu Servillat, Natalie...",[],bronze,"{'value': 4499, 'currency': 'USD', 'value_usd'...",,"[True, False, False, False]","[[], [], [], []]"
1,https://doi.org/10.1016/j.datak.2013.06.002,2013,en,Computer Science,HAL (Le Centre pour la Communication Scientifi...,S4306402512,Centre National de la Recherche Scientifique,I1294671590,[],[],"[Camille Kurtz, Pierre Gançarski, Nicolas Pass...","[A5090167453, A5031872978, A5037096002, A50097...",green,"{'value': 2590, 'currency': 'USD', 'value_usd'...",,"[True, False, False, False]","[[], [], [], []]"
2,https://doi.org/10.1209/0295-5075/101/30006,2013,en,Physics and Astronomy,arXiv (Cornell University),S4306400194,Cornell University,I205783295,[],[],"[C. Trefzger, Yvan Castin]","[A5062989403, A5031838262]",green,,,"[True, False]","[[], []]"
3,https://doi.org/10.3917/rtm.213.0071,2013,fr,Social Sciences,Tiers-Monde,S91700283,Publications de la Sorbonne,P4322697890,[],[],[Marc-Antoine Pérouse de Montclos],[A5017575945],bronze,,,[True],[[]]
4,https://doi.org/10.4000/africanistes.3563,2013,fr,Arts and Humanities,,,,,[Institut National des Langues et Civilisation...,"[I162346809, I4210116128]",[Mélanie Bourlet],[A5033776478],closed,,,[True],[[FR]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11656,https://doi.org/10.1007/s00028-024-00946-x,2024,en,Mathematics,arXiv (Cornell University),S4306400194,Cornell University,I205783295,"[Centre National de la Recherche Scientifique,...","[I1294671590, I4210160189, I21491767]",[Anatole Gaudin],[A5034110472],green,"{'value': 2290, 'currency': 'EUR', 'value_usd'...",,[True],[[FR]]
11657,https://doi.org/10.1016/j.ymssp.2024.111486,2024,en,Engineering,Mechanical Systems and Signal Processing,S128368299,Elsevier BV,P4310320990,"[Engie (France), Institut National des Science...","[I4210124897, I48430043, I86767153, I421011597...","[Adrien Marsick, Hugo André, Ilyes Khelf, Quen...","[A5051979735, A5022853612, A5087236139, A50695...",hybrid,"{'value': 4830, 'currency': 'USD', 'value_usd'...","{'value': 4830, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False]","[[FR], [FR], [FR], [FR], [FR]]"
11658,https://doi.org/10.1016/j.finel.2024.104163,2024,en,Engineering,,,,,[Laboratoire de génie civil et génie mécanique...,"[I4387155956, I4210115072, I4210115072, I43871...","[Yassir Wardi, Pisey Keo, Mohammed Hjiaj]","[A5095881675, A5090408324, A5107447778]",closed,"{'value': 3080, 'currency': 'USD', 'value_usd'...",,"[True, False, False]","[[FR], [FR], [FR]]"
11659,https://doi.org/10.54563/cahiers-duras.519,2024,fr,Psychology,Cahiers Marguerite Duras.,S4404673873,,,[],[],[Marie-Hélène Boblet],[A5107548769],hybrid,,,[True],[[]]


In [7]:
df_oa_missing['doi_corr'] = df_oa_missing['doi'].apply(lambda x: x[16::])
df_oa_comb_2 = df_oa_missing.merge(df, left_on = 'doi_corr', right_on = 'doi', how = 'left')
df_oa_comb_2['BSO'] = df_oa_comb_2['openalex_id'].notna()
df_oa_comb_2 = df_oa_comb_2.drop(columns = ['doi_y', 'bso_country_corrected', 'genre', 'openalex_id']).rename(columns = {'doi_x' : 'doi'})

df_tmp = pd.concat((df_oa_comb, df_oa_comb_2)).reset_index(drop = True)
df_tmp

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,id_institution,display_name_author,id_author,oa_status,apc_list,apc_paid,corresponding,countries,doi_corr,BSO
0,https://doi.org/10.1016/j.cell.2013.05.039,2013,en,"Biochemistry, Genetics and Molecular Biology",Cell,S110447773,0092-8674,Cell Press,P4310315673,"['Universidad de Oviedo', 'Spanish National Ca...","['I165339363', 'I4210089594', 'I4210153965', '...","['Carlos López‐Otín', 'Marı́a A. Blasco', 'Lin...","['A5087974982', 'A5085977927', 'A5059311261', ...",bronze,"{'value': 10100, 'currency': 'USD', 'value_usd...",,"[False, False, False, True, False]","[['ES'], ['ES'], ['DE', 'GB'], ['ES'], ['FR']]",10.1016/j.cell.2013.05.039,True
1,https://doi.org/10.1051/0004-6361/201322068,2013,en,Physics and Astronomy,Astronomy and Astrophysics,S205231332,0004-6361,EDP Sciences,P4310319748,"['Max Planck Institute for Astronomy', 'Yale U...","['I4210109156', 'I32971472', 'I4210118524', 'I...","['Thomas Robitaille', 'Erik Tollerud', 'P. Gre...","['A5014651763', 'A5083524651', 'A5052341993', ...",bronze,,,"[False, False, False, False, False, False, Fal...","[['DE'], ['US'], ['FR', 'US'], ['FR', 'US'], [...",10.1051/0004-6361/201322068,True
2,https://doi.org/10.2458/azu_js_rc.55.16947,2013,en,Earth and Planetary Sciences,Radiocarbon,S35778795,0033-8222,Cambridge University Press,P4310311721,"[""Queen's University Belfast"", 'Institut de Re...","['I126231945', 'I4210166444', 'I1294671590', '...","['Reimer Paula J', 'Bard, Edouard', 'Bayliss A...","['A3095000969', 'A3024548407', 'A3135758542', ...",bronze,,,"[True, False, False, False, False, False, Fals...","[['GB'], ['FR'], ['GB'], ['US'], ['GB'], ['GB'...",10.2458/azu_js_rc.55.16947,True
3,https://doi.org/10.1038/nature12477,2013,en,"Biochemistry, Genetics and Molecular Biology",Nature,S137773608,0028-0836,Nature Portfolio,P4310319908,"['Wellcome Sanger Institute', 'Wellcome Sanger...","['I2802476451', 'I2802476451', 'I2802476451', ...","['Ludmil B. Alexandrov', 'Serena Nik‐Zainal', ...","['A5080997789', 'A5072873709', 'A5004776968', ...",bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[['GB'], ['GB'], ['GB'], ['CA'], ['GB'], ['GB'...",10.1038/nature12477,True
4,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,Medicine,,,,,,"['Cooper University Hospital', 'Phoenix Contac...","['I2800704349', 'I78801874', 'I27804330', 'I42...","['R. Phillip Dellinger', 'Mitchell M. Levy', '...","['A5066795709', 'A5000157972', 'A5083275742', ...",closed,,,"[False, False, False, False, False, False, Fal...","[['US'], ['US'], ['GB'], ['FR'], ['DE'], ['US'...",10.1097/ccm.0b013e31827e83af,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1556738,https://doi.org/10.1007/s00028-024-00946-x,2024,en,Mathematics,arXiv (Cornell University),S4306400194,,Cornell University,I205783295,"[Centre National de la Recherche Scientifique,...","[I1294671590, I4210160189, I21491767]",[Anatole Gaudin],[A5034110472],green,"{'value': 2290, 'currency': 'EUR', 'value_usd'...",,[True],[[FR]],10.1007/s00028-024-00946-x,True
1556739,https://doi.org/10.1016/j.ymssp.2024.111486,2024,en,Engineering,Mechanical Systems and Signal Processing,S128368299,,Elsevier BV,P4310320990,"[Engie (France), Institut National des Science...","[I4210124897, I48430043, I86767153, I421011597...","[Adrien Marsick, Hugo André, Ilyes Khelf, Quen...","[A5051979735, A5022853612, A5087236139, A50695...",hybrid,"{'value': 4830, 'currency': 'USD', 'value_usd'...","{'value': 4830, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False]","[[FR], [FR], [FR], [FR], [FR]]",10.1016/j.ymssp.2024.111486,True
1556740,https://doi.org/10.1016/j.finel.2024.104163,2024,en,Engineering,,,,,,[Laboratoire de génie civil et génie mécanique...,"[I4387155956, I4210115072, I4210115072, I43871...","[Yassir Wardi, Pisey Keo, Mohammed Hjiaj]","[A5095881675, A5090408324, A5107447778]",closed,"{'value': 3080, 'currency': 'USD', 'value_usd'...",,"[True, False, False]","[[FR], [FR], [FR]]",10.1016/j.finel.2024.104163,True
1556741,https://doi.org/10.54563/cahiers-duras.519,2024,fr,Psychology,Cahiers Marguerite Duras.,S4404673873,,,,[],[],[Marie-Hélène Boblet],[A5107548769],hybrid,,,[True],[[]],10.54563/cahiers-duras.519,True


### Check if not in Oa is not in OA

In [None]:
missing = df[df.openalex_id.isna()].doi.unique()

BASE_URL = "https://api.openalex.org/works"

selected_fields = ["doi", "publication_year", "language", "indexed_in", "primary_location", "best_oa_location", "open_access", "authorships", 
                   "corresponding_author_ids", "corresponding_institution_ids", "apc_list", "apc_paid", "cited_by_count", "primary_topic", "awards", "funders"]

def fetch_by_ids(id_list):
    doi_filter = "doi:" + "|".join(id_list)
    filters = [
        doi_filter,
        "indexed_in:crossref",
        "type:article|review",
    ]
    params = {
        "filter": ",".join(filters),
        "select": ",".join(selected_fields),
        "per_page": 200
    }
    url = f"{BASE_URL}?{urlencode(params)}"
    
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        data = r.json()
        return data.get("results", [])
    except requests.exceptions.JSONDecodeError:
        print("JSON decode error for URL:", url[:200])
        return []  # return empty list instead of None
    except requests.exceptions.RequestException as e:
        print("Request error:", e)
        return []  # return empty list instead of None

batch_size = 50  # try increasing, test carefully
all_fetched = []
for i in tqdm(range(0, len(missing), batch_size)):
    batch = missing[i:i+batch_size]
    results = fetch_by_ids(batch)
    all_fetched.extend(results)

    
output_file = "../data/interim/FranceInitialAPI/not_openalex.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for w in all_fetched:
        f.write(json.dumps(w) + "\n")

print(f"Saved {len(all_fetched)} records to {output_file}")

In [8]:
interest = ['doi', 'publication_year', 'language', 'authorships', 'best_oa_location', 'primary_topic', 'open_access', 'apc_list', 'apc_paid']
keys = ['doi', 'publication_year', 'language', 'field_name_top_topic', 'journal', 'journal_id','publisher', 'publisher_id', 'display_name_institution', 'id_institution', 'display_name_author', 'id_author',
        'oa_status', 'apc_list', 'apc_paid','corresponding', 'countries']
records = []
with open(f"../data/interim/FranceInitialAPI/not_openalex.jsonl", "rb") as f:
    for line in f:
        rec = orjson.loads(line)
        if not rec.get("doi"): # Skip records without DOI
            continue
        filtered = {k: rec.get(k) for k in interest} # Keep only the fields we care about

        # Extract field_names from topics
        filtered["field_name_top_topic"] =  filtered.get("primary_topic") .get("field", {}).get("display_name") if isinstance(filtered.get("primary_topic") , dict) else None

        # Extract journal and publisher from best_oa_location
        pl = filtered.get("best_oa_location") or {}
        source = pl.get("source") or {}
        filtered["journal"] = source.get("display_name")
        filtered["journal_id"] = source.get("id").split("/")[-1] if source.get("id") else None
        filtered["publisher"] = source.get("host_organization_name")
        filtered["publisher_id"] = source.get("host_organization").split("/")[-1] if source.get("host_organization") else None

        # Extract institution types and number of authors
        authorships = filtered.get("authorships") or []
        filtered["display_name_institution"] = [inst.get("display_name") for auth in authorships for inst in auth.get("institutions", []) if inst.get("display_name")]
        filtered["id_institution"] = [inst.get("id").split('/')[-1] for auth in authorships for inst in auth.get("institutions", []) if inst.get("id")]
        filtered["corresponding"] = [auth.get("is_corresponding") for auth in authorships]
        filtered["countries"] = [auth.get("countries") for auth in authorships]
        filtered["display_name_author"] = [auth["author"].get("display_name") for auth in authorships if auth.get("author", {}).get("display_name")]
        filtered["id_author"] = [auth["author"].get("id").split('/')[-1] for auth in authorships if auth.get("author", {}).get("id")]
            
        o_a = filtered.get("open_access") or {}
        filtered["oa_status"] = o_a.get("oa_status")

        records.append({k: filtered.get(k) for k in keys})

df_not_oa = pd.DataFrame(records)
df_not_oa

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,publisher,publisher_id,display_name_institution,id_institution,display_name_author,id_author,oa_status,apc_list,apc_paid,corresponding,countries
0,https://doi.org/10.1016/s0140-6736(12)62191-6,2013,en,Psychology,,,,,"[University of Zurich, University College Lond...","[I202697423, I45129253, I31746571, I4210099336...","[And﻿re﻿as Maercker, Chris R. Brewin, Richard ...","[A5055502193, A5074178806, A5048808978, A50152...",closed,"{'value': 6830, 'currency': 'USD', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[[CH], [GB], [AU], [US], [CH], [CH], [PK], [US..."
1,https://doi.org/10.1038/nature11913,2013,en,"Biochemistry, Genetics and Molecular Biology",,,,,"[ETH Zurich, ETH Zurich, ETH Zurich, ETH Zuric...","[I35440088, I35440088, I35440088, I35440088, I...","[Médéric Diard, Víctor García, Lisa Maier, Mit...","[A5048213818, A5018865175, A5073721360, A50679...",closed,"{'value': 9750, 'currency': 'EUR', 'value_usd'...",,"[False, False, False, False, False, False, False]","[[CH], [CH], [CH], [CH], [CH], [CH], [CH]]"
2,https://doi.org/10.3389/fimmu.2013.00297,2013,en,Agricultural and Biological Sciences,Frontiers in Immunology,S2595292759,Frontiers Media,P4310320527,[],[],"[Florence Jacob, Saskia Vernaldi, Takaki Maekawa]","[A5086360259, A5090760688, A5080440564]",gold,"{'value': 2950, 'currency': 'USD', 'value_usd'...","{'value': 1105, 'currency': 'EUR', 'value_usd'...","[False, False, False]","[[], [], []]"
3,https://doi.org/10.1162/coli_a_00178,2013,en,Computer Science,Computational Linguistics,S155526855,Association for Computational Linguistics,P4310320244,[University of Edinburgh],[I98677209],[Khaled Shaalan],[A5066945309],bronze,,,[True],[[GB]]
4,https://doi.org/10.1016/j.rse.2013.09.016,2013,en,Earth and Planetary Sciences,Open Research Exeter (University of Exeter),S4306401998,University of Exeter,I23923803,"[Plymouth Marine Laboratory, Plymouth Marine L...","[I2802566253, I2802566253, I149899117, I421014...","[Robert J. W. Brewin, Shubha Sathyendranath, D...","[A5074931237, A5050444907, A5058877813, A50800...",green,"{'value': 4070, 'currency': 'USD', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[[GB], [GB], [DE], [DE], [], [CA], [DE], [DE],..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278545,https://doi.org/10.4000/11z8h,2024,fr,Arts and Humanities,,,,,[],[],[Jingjing Han],[A5101103010],gold,,,[True],[[]]
278546,https://doi.org/10.3917/regar.063.0105,2024,fr,Social Sciences,,,,,[],[],[Patrick Savidan],[A5104824994],closed,,,[True],[[]]
278547,https://doi.org/10.1016/j.optcom.2024.130895,2024,en,Physics and Astronomy,,,,,[],[],"[Jacob Szeftel, Jean-Claude Lévy]","[A5030357905, A5113997816]",closed,"{'value': 2320, 'currency': 'USD', 'value_usd'...",,"[False, False]","[[], []]"
278548,https://doi.org/10.62229/aubllrlxxi/22/2,2024,en,"Business, Management and Accounting",,,,,[],[],[ALEXANDRU MARDALE],[A5106111675],closed,,,[True],[[]]


In [9]:
df_not_oa['doi_corr'] = df_not_oa['doi'].apply(lambda x: x[16::])
df_not_oa['BSO'] = True

df_final = pd.concat((df_tmp, df_not_oa)).drop_duplicates('doi').reset_index(drop = True)
df_final.to_csv('../data/interim/initial_dataset.csv')
df_final

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,id_institution,display_name_author,id_author,oa_status,apc_list,apc_paid,corresponding,countries,doi_corr,BSO
0,https://doi.org/10.1016/j.cell.2013.05.039,2013,en,"Biochemistry, Genetics and Molecular Biology",Cell,S110447773,0092-8674,Cell Press,P4310315673,"['Universidad de Oviedo', 'Spanish National Ca...","['I165339363', 'I4210089594', 'I4210153965', '...","['Carlos López‐Otín', 'Marı́a A. Blasco', 'Lin...","['A5087974982', 'A5085977927', 'A5059311261', ...",bronze,"{'value': 10100, 'currency': 'USD', 'value_usd...",,"[False, False, False, True, False]","[['ES'], ['ES'], ['DE', 'GB'], ['ES'], ['FR']]",10.1016/j.cell.2013.05.039,True
1,https://doi.org/10.1051/0004-6361/201322068,2013,en,Physics and Astronomy,Astronomy and Astrophysics,S205231332,0004-6361,EDP Sciences,P4310319748,"['Max Planck Institute for Astronomy', 'Yale U...","['I4210109156', 'I32971472', 'I4210118524', 'I...","['Thomas Robitaille', 'Erik Tollerud', 'P. Gre...","['A5014651763', 'A5083524651', 'A5052341993', ...",bronze,,,"[False, False, False, False, False, False, Fal...","[['DE'], ['US'], ['FR', 'US'], ['FR', 'US'], [...",10.1051/0004-6361/201322068,True
2,https://doi.org/10.2458/azu_js_rc.55.16947,2013,en,Earth and Planetary Sciences,Radiocarbon,S35778795,0033-8222,Cambridge University Press,P4310311721,"[""Queen's University Belfast"", 'Institut de Re...","['I126231945', 'I4210166444', 'I1294671590', '...","['Reimer Paula J', 'Bard, Edouard', 'Bayliss A...","['A3095000969', 'A3024548407', 'A3135758542', ...",bronze,,,"[True, False, False, False, False, False, Fals...","[['GB'], ['FR'], ['GB'], ['US'], ['GB'], ['GB'...",10.2458/azu_js_rc.55.16947,True
3,https://doi.org/10.1038/nature12477,2013,en,"Biochemistry, Genetics and Molecular Biology",Nature,S137773608,0028-0836,Nature Portfolio,P4310319908,"['Wellcome Sanger Institute', 'Wellcome Sanger...","['I2802476451', 'I2802476451', 'I2802476451', ...","['Ludmil B. Alexandrov', 'Serena Nik‐Zainal', ...","['A5080997789', 'A5072873709', 'A5004776968', ...",bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[['GB'], ['GB'], ['GB'], ['CA'], ['GB'], ['GB'...",10.1038/nature12477,True
4,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,Medicine,,,,,,"['Cooper University Hospital', 'Phoenix Contac...","['I2800704349', 'I78801874', 'I27804330', 'I42...","['R. Phillip Dellinger', 'Mitchell M. Levy', '...","['A5066795709', 'A5000157972', 'A5083275742', ...",closed,,,"[False, False, False, False, False, False, Fal...","[['US'], ['US'], ['GB'], ['FR'], ['DE'], ['US'...",10.1097/ccm.0b013e31827e83af,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1831121,https://doi.org/10.4000/11z8h,2024,fr,Arts and Humanities,,,,,,[],[],[Jingjing Han],[A5101103010],gold,,,[True],[[]],10.4000/11z8h,True
1831122,https://doi.org/10.3917/regar.063.0105,2024,fr,Social Sciences,,,,,,[],[],[Patrick Savidan],[A5104824994],closed,,,[True],[[]],10.3917/regar.063.0105,True
1831123,https://doi.org/10.1016/j.optcom.2024.130895,2024,en,Physics and Astronomy,,,,,,[],[],"[Jacob Szeftel, Jean-Claude Lévy]","[A5030357905, A5113997816]",closed,"{'value': 2320, 'currency': 'USD', 'value_usd'...",,"[False, False]","[[], []]",10.1016/j.optcom.2024.130895,True
1831124,https://doi.org/10.62229/aubllrlxxi/22/2,2024,en,"Business, Management and Accounting",,,,,,[],[],[ALEXANDRU MARDALE],[A5106111675],closed,,,[True],[[]],10.62229/aubllrlxxi/22/2,True


## Construct Inclusive option (any corresponding french and non-corresponding last or first french [less 50 authors])

In [2]:
def safe_literal_eval(x):
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except Exception:
            return []
    return []

chunks = pd.read_csv('../data/interim/initial_dataset.csv', chunksize=100000)


df_interest_parts = []
for chunk in tqdm(chunks):
    chunk = chunk.drop(columns='Unnamed: 0')

    chunk['corresponding'] = chunk['corresponding'].apply(safe_literal_eval)
    chunk['countries'] = chunk['countries'].apply(safe_literal_eval)

    # Positions of French authors
    chunk['french_position'] = chunk['countries'].apply(lambda lst: [i for i, inner in enumerate(lst) if 'FR' in inner])

    # No-corresponding vs with-corresponding
    no_corr_mask = chunk['corresponding'].apply(lambda x: all(v is False for v in x))
    chunk_nocorresponding = chunk[no_corr_mask]
    chunk_corresponding = chunk[~no_corr_mask]

    # -------------------------
    # CASE 1: ANY corresponding French author
    # -------------------------
    chunk_corresponding['CA_positions'] = chunk_corresponding['corresponding'].apply(lambda lst: [i for i, v in enumerate(lst) if v])
    chunk_corresponding_anyfrench = chunk_corresponding[chunk_corresponding.apply(lambda row: bool(set(row['CA_positions']) & set(row['french_position'])), axis = 1)]

    # -------------------------
    # CASE 2: No corresponding author First or last author French nb authors < 50
    # -------------------------
    chunk_nocorresponding['n_authors'] = chunk_nocorresponding['corresponding'].str.len()
    chunk_nocorresponding_50 = chunk_nocorresponding[(chunk_nocorresponding['n_authors'] > 0) & (chunk_nocorresponding['n_authors'] < 50)]
    chunk_nocorresponding_50_ca_fr = chunk_nocorresponding_50[chunk_nocorresponding_50.apply(lambda row: ((0 in row['french_position']) or (len(row['french_position']) > 0 and
                                                              max(row['french_position']) == row['n_authors'] - 1)), axis = 1)]

    df_interest_parts.append(pd.concat([chunk_corresponding_anyfrench, chunk_nocorresponding_50_ca_fr], ignore_index = False))

df_interest = (pd.concat(df_interest_parts, ignore_index = True).drop(columns=['doi_corr', 'french_position', 'CA_positions', 'n_authors']))
df_interest = df_interest[df_interest['publication_year'].between(2013, 2024)].reset_index(drop=True)
df_interest.to_csv('../data/interim/inclusive_branch.csv', index = False)
df_interest

0it [00:00, ?it/s]

19it [01:53,  5.96s/it]


Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,id_institution,display_name_author,id_author,oa_status,apc_list,apc_paid,corresponding,countries,BSO
0,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,Medicine,European Journal of Cancer,S16731738,0959-8049,Elsevier BV,P4310320990,['Centre international de recherche sur le can...,"['I42237331', 'I42237331', 'I42237331', 'I4210...","['Jacques Ferlay', 'Eva Steliarova‐Foucher', '...","['A5051830072', 'A5102839836', 'A5003542814', ...",bronze,"{'value': 3800, 'currency': 'USD', 'value_usd'...",,"[True, False, False, False, False, False, Fals...","[[FR], [FR], [FR], [IT], [NL], [IE], [FR], [FR]]",True
1,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,Medicine,Immunity,S199671312,1074-7613,Cell Press,P4310315673,"['Centre de Recherche des Cordeliers', 'Déléga...","['I4210092322', 'I110736937', 'I154526488', 'I...","['Gabriela Bindea', 'Bernhard Mlecnik', 'Marie...","['A5062069569', 'A5078219830', 'A5066060316', ...",bronze,"{'value': 9080, 'currency': 'USD', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[[FR], [FR], [FR], [FR], [DE, FR], [AT], [FR],...",True
2,https://doi.org/10.1016/j.jacc.2013.10.029,2013,en,Medicine,,,,,,"[""Laboratoire d'Excellence en Recherche sur le...","['I4391768231', 'I4210162942', 'I154526488', '...","['Gérald Simonneau', 'Michael Α. Gatzoulis', '...","['A5106748942', 'A5054951910', 'A5050405747', ...",closed,"{'value': 6500, 'currency': 'USD', 'value_usd'...",,"[True, False, False, False, False, False, Fals...","[[FR], [GB], [CA], [AU], [GB], [DE], [ES], [IN...",True
3,https://doi.org/10.1093/eurheartj/eht273,2013,en,Medicine,European Heart Journal,S181568219,0195-668X,Oxford University Press,P4310311648,"['Copenhagen University Hospital', 'Herlev Hos...","['I2802567020', 'I2801827564', 'I124055696', '...","['Børge G. Nordestgaard', 'M. John Chapman', '...","['A5079728437', 'A5103427365', 'A5064235722', ...",bronze,"{'value': 4238, 'currency': 'EUR', 'value_usd'...",,"[True, True, False, False, False, False, False...","[[DK], [FR], [GB], [US], [ES], [BE], [SE], [CA...",True
4,https://doi.org/10.1016/j.jbusvent.2013.07.003,2013,en,"Business, Management and Accounting",Digital Access to Libraries,S4306400166,,Harris County Public Library,I2800384882,"['UCLouvain', 'UCLouvain', 'Université Lille N...","['I95674353', 'I95674353', 'I59807433', 'I5980...","['Paul Belleflamme', 'Thomas Lambert', 'Armin ...","['A5027223667', 'A5083569548', 'A5029048208']",green,"{'value': 4080, 'currency': 'USD', 'value_usd'...",,"[False, False, True]","[[BE], [BE, FR], [FR]]",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162860,https://doi.org/10.26532/jh.v40i1.37175,2024,fr,,Jurnal Hukum,S4210234340,1412-2723,,,"['Universitas Kristen Indonesia Paulus', 'Univ...","['I4210130201', 'I51101395']","['Bernard Nainggolan', 'Anatoliy Kostruba']","['A5059110458', 'A5090710975']",diamond,,,"[False, False]","[[ID], [FR]]",False
1162861,https://doi.org/10.1109/icacite60783.2024.1126...,2024,,,,,,,,"['School of Business and Management', 'Christ ...","['I4210107574', 'I48018076', 'I3132702812', 'I...","['A. Madeswaran', 'Awadhesh Chandramauli', 'V ...","['A5098676093', 'A5017279275', 'A5108826541', ...",closed,,,"[False, False, False, False, False]","[[FR, IN], [IN], [IN], [IQ], [IN]]",False
1162862,https://doi.org/10.1055/s-0044-1791345,2024,en,,,,,,,"[""École Nationale Vétérinaire d'Alfort"", ""Écol...","['I17606148', 'I17606148']","['Marc Romain', 'Duron Loïc']","['A5033250907', 'A5107010455']",closed,,,"[False, False]","[[FR], [FR]]",False
1162863,https://doi.org/10.3390/quantum1020022,2019,en,Physics and Astronomy,Quantum Reports,S4210211996,,Multidisciplinary Digital Publishing Institute,P4310310987,"['Fudan University', 'Fudan University']","['I-1', 'I24943067', 'I-1', 'I24943067']","['Chris Fields', 'Antonino Marciano', 'Chris F...","['A2461076406', 'A2209647467', 'A2461076406', ...",gold,"{'value': 1200, 'currency': 'CHF', 'value_usd'...","{'value': 1200, 'currency': 'CHF', 'value_usd'...","[True, False, False, False]","[[FR], [CN], [FR], [CN]]",True


## Integrate National Agreements

In [3]:
df_interest = pd.read_csv('../data/interim/inclusive_branch.csv')
df_interest


Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,id_institution,display_name_author,id_author,oa_status,apc_list,apc_paid,corresponding,countries,BSO
0,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,Medicine,European Journal of Cancer,S16731738,0959-8049,Elsevier BV,P4310320990,['Centre international de recherche sur le can...,"['I42237331', 'I42237331', 'I42237331', 'I4210...","['Jacques Ferlay', 'Eva Steliarova‐Foucher', '...","['A5051830072', 'A5102839836', 'A5003542814', ...",bronze,"{'value': 3800, 'currency': 'USD', 'value_usd'...",,"[True, False, False, False, False, False, Fals...","[['FR'], ['FR'], ['FR'], ['IT'], ['NL'], ['IE'...",True
1,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,Medicine,Immunity,S199671312,1074-7613,Cell Press,P4310315673,"['Centre de Recherche des Cordeliers', 'Déléga...","['I4210092322', 'I110736937', 'I154526488', 'I...","['Gabriela Bindea', 'Bernhard Mlecnik', 'Marie...","['A5062069569', 'A5078219830', 'A5066060316', ...",bronze,"{'value': 9080, 'currency': 'USD', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[['FR'], ['FR'], ['FR'], ['FR'], ['DE', 'FR'],...",True
2,https://doi.org/10.1016/j.jacc.2013.10.029,2013,en,Medicine,,,,,,"[""Laboratoire d'Excellence en Recherche sur le...","['I4391768231', 'I4210162942', 'I154526488', '...","['Gérald Simonneau', 'Michael Α. Gatzoulis', '...","['A5106748942', 'A5054951910', 'A5050405747', ...",closed,"{'value': 6500, 'currency': 'USD', 'value_usd'...",,"[True, False, False, False, False, False, Fals...","[['FR'], ['GB'], ['CA'], ['AU'], ['GB'], ['DE'...",True
3,https://doi.org/10.1093/eurheartj/eht273,2013,en,Medicine,European Heart Journal,S181568219,0195-668X,Oxford University Press,P4310311648,"['Copenhagen University Hospital', 'Herlev Hos...","['I2802567020', 'I2801827564', 'I124055696', '...","['Børge G. Nordestgaard', 'M. John Chapman', '...","['A5079728437', 'A5103427365', 'A5064235722', ...",bronze,"{'value': 4238, 'currency': 'EUR', 'value_usd'...",,"[True, True, False, False, False, False, False...","[['DK'], ['FR'], ['GB'], ['US'], ['ES'], ['BE'...",True
4,https://doi.org/10.1016/j.jbusvent.2013.07.003,2013,en,"Business, Management and Accounting",Digital Access to Libraries,S4306400166,,Harris County Public Library,I2800384882,"['UCLouvain', 'UCLouvain', 'Université Lille N...","['I95674353', 'I95674353', 'I59807433', 'I5980...","['Paul Belleflamme', 'Thomas Lambert', 'Armin ...","['A5027223667', 'A5083569548', 'A5029048208']",green,"{'value': 4080, 'currency': 'USD', 'value_usd'...",,"[False, False, True]","[['BE'], ['BE', 'FR'], ['FR']]",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162860,https://doi.org/10.26532/jh.v40i1.37175,2024,fr,,Jurnal Hukum,S4210234340,1412-2723,,,"['Universitas Kristen Indonesia Paulus', 'Univ...","['I4210130201', 'I51101395']","['Bernard Nainggolan', 'Anatoliy Kostruba']","['A5059110458', 'A5090710975']",diamond,,,"[False, False]","[['ID'], ['FR']]",False
1162861,https://doi.org/10.1109/icacite60783.2024.1126...,2024,,,,,,,,"['School of Business and Management', 'Christ ...","['I4210107574', 'I48018076', 'I3132702812', 'I...","['A. Madeswaran', 'Awadhesh Chandramauli', 'V ...","['A5098676093', 'A5017279275', 'A5108826541', ...",closed,,,"[False, False, False, False, False]","[['FR', 'IN'], ['IN'], ['IN'], ['IQ'], ['IN']]",False
1162862,https://doi.org/10.1055/s-0044-1791345,2024,en,,,,,,,"[""École Nationale Vétérinaire d'Alfort"", ""Écol...","['I17606148', 'I17606148']","['Marc Romain', 'Duron Loïc']","['A5033250907', 'A5107010455']",closed,,,"[False, False]","[['FR'], ['FR']]",False
1162863,https://doi.org/10.3390/quantum1020022,2019,en,Physics and Astronomy,Quantum Reports,S4210211996,,Multidisciplinary Digital Publishing Institute,P4310310987,"['Fudan University', 'Fudan University']","['I-1', 'I24943067', 'I-1', 'I24943067']","['Chris Fields', 'Antonino Marciano', 'Chris F...","['A2461076406', 'A2209647467', 'A2461076406', ...",gold,"{'value': 1200, 'currency': 'CHF', 'value_usd'...","{'value': 1200, 'currency': 'CHF', 'value_usd'...","[True, False, False, False]","[['FR'], ['CN'], ['FR'], ['CN']]",True


### Elsevier

In [4]:
df_interest['doi_corr'] = df_interest['doi'].str[16:]

col = 'Publication VoR date'
interest_status = ['Accepted', 'Not yet in EOAP', 'Pending']

df_elsevier = (pd.read_csv('../data/external/Elsevier_EOAP_2225_Décembre.csv').dropna(subset = [col]))
df_elsevier['year'] = pd.to_datetime(df_elsevier[col], dayfirst = True, errors = 'coerce').dt.year
df_elsevier = df_elsevier[(df_elsevier['year'] < 2025) &(df_elsevier['Status'].isin(interest_status))]

matched_mask = df_elsevier['Article DOI'].isin(df_interest['doi_corr'])
print(df_elsevier.loc[matched_mask, 'Article DOI'].nunique() / df_elsevier['Article DOI'].nunique())
df_elsevier_filtered = df_elsevier.loc[matched_mask].reset_index(drop=True)

df_elsevier_filtered

0.8651732294420481


Unnamed: 0,Article PII,Corresponding author name,Corresponding author email,Corresponding author institute name,Corresponding author institute ECR,Article Payer Institute Name,Article Payer Institute ECR,Article publishing model,User license,Journal ISSN,...,Acceptance date,Author journey completion date,Publication VoR date,Research funder,Status,SD Link,Agreement currency,APC list price,APC price after discount,year
0,S0732889324000312,"de Villiers de la Noue, Valentin",vde-villiers-de-la-noue@chu-reims.fr,University of Reims Champagne-Ardenne,ECR-25656,University of Reims Champagne-Ardenne,ECR-25656,Hybrid open access,CC BY-NC-ND,0732-8893,...,30/01/2024,30/01/2024,02/02/2024,University of Reims Champagne-Ardenne,Accepted,https://www.sciencedirect.com/science/article/...,EUR,2 830,,2024
1,S1879981724002997,"Garot, Elsa",elsa.garot@u-bordeaux.fr,University of Bordeaux College of Health Sciences,ECR-10250482,University of Bordeaux,ECR-25642,Hybrid open access,CC BY,1879-9817,...,23/06/2024,01/07/2024,16/07/2024,University of Bordeaux,Accepted,https://www.sciencedirect.com/science/article/...,EUR,1 740,,2024
2,S2214250922000403,"Eloy, Philippine",philippine.eloy@aphp.fr,Public Assistance Hospitals Paris,ECR-213123,Public Assistance Hospitals Paris,ECR-213123,Full open access,CC BY-NC-ND,2214-2509,...,20/01/2022,08/02/2022,02/02/2022,REACTing,Accepted,https://www.sciencedirect.com/science/article/...,EUR,610,406,2022
3,S2214250923000859,"Manchon, Romain",romain.manchon@aphp.fr,Beaujon Hospital,ECR-10353746,Public Assistance Hospitals Paris,ECR-213123,Full open access,CC BY,2214-2509,...,01/04/2023,10/04/2023,03/04/2023,Public Assistance Hospitals Paris,Accepted,https://www.sciencedirect.com/science/article/...,EUR,820,484,2023
4,S1201971224003965,"Buyck, Julien",julien.buyck@univ-poitiers.fr,University of Poitiers,ECR-25646,University of Poitiers,ECR-25646,Full open access,CC BY,1201-9712,...,26/11/2024,29/11/2024,16/12/2024,Nouvelle-Aquitaine Regional Council,Accepted,https://www.sciencedirect.com/science/article/...,EUR,620,,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6213,S0925231224006325,"Zeghina, Assaad Oussama",assaad-oussama.zeghina@etu.unistra.fr,University of Strasbourg,ECR-25630,University of Strasbourg,ECR-25630,Hybrid open access,CC BY,0925-2312,...,13/05/2024,15/05/2024,23/05/2024,French National Research Agency,Accepted,https://www.sciencedirect.com/science/article/...,EUR,2 250,,2024
6214,S1542356524006062,"Cariou, Bertrand",bertrand.cariou@univ-nantes.fr,Thorax Institute,ECR-33534174,INSERM,ECR-26853,Hybrid open access,CC BY,1542-3565,...,21/06/2024,06/07/2024,20/12/2024,French National Research Agency,Accepted,https://www.sciencedirect.com/science/article/...,EUR,3 690,,2024
6215,S0300908424001408,"Coux, Olivier",olivier.coux@cnrs.fr,Institute of Molecular Genetics of Montpellier,ECR-10214327,University of Montpellier,ECR-330768,Hybrid open access,CC BY,0300-9084,...,14/06/2024,18/06/2024,06/11/2024,European Cooperation in Science and Technology,Accepted,https://www.sciencedirect.com/science/article/...,EUR,2 630,,2024
6216,S1542356522005547,"Shimakawa, Yusuke",yusuke.shimakawa@pasteur.fr,Institut Pasteur,ECR-26852,Institut Pasteur,ECR-26852,Hybrid open access,CC BY,1542-3565,...,13/05/2022,09/06/2022,21/06/2023,European Commission,Accepted,https://www.sciencedirect.com/science/article/...,EUR,2 630,1 698,2023


**Reconcile mails using Open Refine**

In [None]:
df_mails = (df_elsevier_filtered[['Corresponding author email']].drop_duplicates().copy())
df_mails = df_mails.assign(email_exploded = df_mails['Corresponding author email'].str.split(r'[;,]')).explode('email_exploded')
df_mails['email_exploded'] = df_mails['email_exploded'].str.strip()
df_mails = df_mails[df_mails['email_exploded'] != '']
df_mails = df_mails.drop_duplicates(subset = 'email_exploded').reset_index(drop = True)
df_mails[['email_exploded']].to_csv('elsevier_mails.csv', index = False)
df_mails

In [5]:
interest_cols = ['Article DOI', 'Corresponding author email', 'APC list price', 'APC price after discount']

df_elsevier_filtered = df_elsevier_filtered.assign(email_exploded = df_elsevier_filtered['Corresponding author email'].str.split(r'[;,]')).explode('email_exploded')
df_elsevier_filtered['email_exploded'] = df_elsevier_filtered['email_exploded'].str.strip()

df_mails_recon = pd.read_csv('elsevier_mails_recon.csv').drop(columns=['Column'], errors='ignore')

df_merged = df_elsevier_filtered.merge(df_mails_recon[['Corresponding author email_exp_2', 'Corresponding author email_exp_2_recon']], left_on = 'email_exploded',
                                       right_on = 'Corresponding author email_exp_2', how = 'left')
df_agg = (df_merged.groupby('Article DOI')['Corresponding author email_exp_2_recon'].apply(lambda x: ' | '.join(x.dropna().astype(str))).to_frame(name='Corresponding author email_recon'))

df_tmp = df_elsevier_filtered.drop(columns = ['Corresponding author email']).drop_duplicates(subset=['Article DOI'])
df_tmp = df_tmp.merge(df_agg, on = 'Article DOI', how = 'left')

df_tmp = df_tmp.assign(NationalAgreement = 'Elsevier')[['Article DOI', 'Corresponding author email_recon', 'APC list price', 'APC price after discount', 'NationalAgreement']]

df_interest_1 = df_interest.merge(df_tmp, left_on = 'doi_corr', right_on = 'Article DOI', how = 'left').drop(columns = ['Article DOI'])
df_interest_1


Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,...,apc_list,apc_paid,corresponding,countries,BSO,doi_corr,Corresponding author email_recon,APC list price,APC price after discount,NationalAgreement
0,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,Medicine,European Journal of Cancer,S16731738,0959-8049,Elsevier BV,P4310320990,['Centre international de recherche sur le can...,...,"{'value': 3800, 'currency': 'USD', 'value_usd'...",,"[True, False, False, False, False, False, Fals...","[['FR'], ['FR'], ['FR'], ['IT'], ['NL'], ['IE'...",True,10.1016/j.ejca.2012.12.027,,,,
1,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,Medicine,Immunity,S199671312,1074-7613,Cell Press,P4310315673,"['Centre de Recherche des Cordeliers', 'Déléga...",...,"{'value': 9080, 'currency': 'USD', 'value_usd'...",,"[False, False, False, False, False, False, Fal...","[['FR'], ['FR'], ['FR'], ['FR'], ['DE', 'FR'],...",True,10.1016/j.immuni.2013.10.003,,,,
2,https://doi.org/10.1016/j.jacc.2013.10.029,2013,en,Medicine,,,,,,"[""Laboratoire d'Excellence en Recherche sur le...",...,"{'value': 6500, 'currency': 'USD', 'value_usd'...",,"[True, False, False, False, False, False, Fals...","[['FR'], ['GB'], ['CA'], ['AU'], ['GB'], ['DE'...",True,10.1016/j.jacc.2013.10.029,,,,
3,https://doi.org/10.1093/eurheartj/eht273,2013,en,Medicine,European Heart Journal,S181568219,0195-668X,Oxford University Press,P4310311648,"['Copenhagen University Hospital', 'Herlev Hos...",...,"{'value': 4238, 'currency': 'EUR', 'value_usd'...",,"[True, True, False, False, False, False, False...","[['DK'], ['FR'], ['GB'], ['US'], ['ES'], ['BE'...",True,10.1093/eurheartj/eht273,,,,
4,https://doi.org/10.1016/j.jbusvent.2013.07.003,2013,en,"Business, Management and Accounting",Digital Access to Libraries,S4306400166,,Harris County Public Library,I2800384882,"['UCLouvain', 'UCLouvain', 'Université Lille N...",...,"{'value': 4080, 'currency': 'USD', 'value_usd'...",,"[False, False, True]","[['BE'], ['BE', 'FR'], ['FR']]",True,10.1016/j.jbusvent.2013.07.003,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162860,https://doi.org/10.26532/jh.v40i1.37175,2024,fr,,Jurnal Hukum,S4210234340,1412-2723,,,"['Universitas Kristen Indonesia Paulus', 'Univ...",...,,,"[False, False]","[['ID'], ['FR']]",False,10.26532/jh.v40i1.37175,,,,
1162861,https://doi.org/10.1109/icacite60783.2024.1126...,2024,,,,,,,,"['School of Business and Management', 'Christ ...",...,,,"[False, False, False, False, False]","[['FR', 'IN'], ['IN'], ['IN'], ['IQ'], ['IN']]",False,10.1109/icacite60783.2024.11269084,,,,
1162862,https://doi.org/10.1055/s-0044-1791345,2024,en,,,,,,,"[""École Nationale Vétérinaire d'Alfort"", ""Écol...",...,,,"[False, False]","[['FR'], ['FR']]",False,10.1055/s-0044-1791345,,,,
1162863,https://doi.org/10.3390/quantum1020022,2019,en,Physics and Astronomy,Quantum Reports,S4210211996,,Multidisciplinary Digital Publishing Institute,P4310310987,"['Fudan University', 'Fudan University']",...,"{'value': 1200, 'currency': 'CHF', 'value_usd'...","{'value': 1200, 'currency': 'CHF', 'value_usd'...","[True, False, False, False]","[['FR'], ['CN'], ['FR'], ['CN']]",True,10.3390/quantum1020022,,,,


### Wiley

In [6]:
df_interest['doi_corr'] = df_interest['doi'].str[16:]

col = 'Request Date'

df_willey = (pd.read_csv('../data/external/Wiley_2225_Novembre.csv').dropna(subset = [col]))
df_willey['year'] = df_willey[col].apply(lambda x: int(x.split('-')[-1]))
df_willey = df_willey[(df_willey['year'] < 2025) & (df_willey['Request Status'] != 'Denied')]

matched_mask = df_willey['DOI'].isin(df_interest['doi_corr'])
print(df_willey.loc[matched_mask, 'DOI'].nunique() / df_willey['DOI'].nunique())
df_willey_filtered = df_willey.loc[matched_mask].reset_index(drop=True)

df_willey_filtered

0.8114821970742478


Unnamed: 0,WOA ID,WOA Name,eCore WOAC ID,eCore WOAC Name,Account Revenue model,WOA Advance Cash?,Request Date,Request Status,Request Status (internal),Journal Group Code,...,Production Received Date,EV Published Date,Published in Issue Date,Publication Status,Institutions,Admin Notes,User Notes,Flip Status,Revenue Model,year
0,U080,COUPERIN CY23 - Sorbonne Universite,,,BOTH,N,31-déc.-2024,Funding Confirmed,Funding Confirmed,ANIE,...,4-déc.-2024,27-janv.-2025,17-févr.-2025,Published in Issue,Sorbonne Universite (27063),,,,Online Open,2024
1,U042,COUPERIN CY23 - Commissariat a l'energie atomi...,,,BOTH,N,31-déc.-2024,Funding Confirmed,Funding Confirmed,CHEM,...,21-oct.-2024,9-janv.-2025,20-févr.-2025,Published in Issue,Commissariat a l'energie atomique et aux energ...,,,,Online Open,2024
2,FR08,COUPERIN CY23 - Universite de Strasbourg,,,BOTH,N,31-déc.-2024,Funding Confirmed,Funding Confirmed,SMLL,...,7-nov.-2024,29-déc.-2024,25-févr.-2025,Published in Issue,Universite de Strasbourg (27083),,,,Online Open,2024
3,U083,COUPERIN CY23 - Universite Cote d'Azur,,,BOTH,N,30-déc.-2024,Funding Confirmed,Funding Confirmed,VNL,...,12-nov.-2024,22-déc.-2024,1-mai-2025,Published in Issue,Institut de Chimie de Nice (131888); \nUnivers...,,,,Online Open,2024
4,U084,COUPERIN CY23 - Universite d'Angers,,,BOTH,N,29-déc.-2024,Funding Confirmed,Funding Confirmed,HED,...,30-juin-2024,26-déc.-2024,12-févr.-2025,Published in Issue,Centre Hospitalier Universitaire d'Angers Pole...,,,,Online Open,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6282,P026,DEFUNCT - COUPERIN hybrid - Institut Curie,,,,N,20-avr.-2022,Funding Confirmed,Funding Confirmed,2111,...,,11-mai-2022,22-juin-2022,Published in Issue,Institut Curie Centre de Recherche Orsay (1647...,,,,Online Open,2022
6283,P016,DEFUNCT - COUPERIN hybrid - Hospices Civils de...,,,,N,20-avr.-2022,Funding Confirmed,Funding Confirmed,EJH,...,10-avr.-2022,29-avr.-2022,12-juin-2022,Published in Issue,Hospices Civils de Lyon (26900),,,,Online Open,2022
6284,P113,DEFUNCT - COUPERIN hybrid - Universite de Tours,,,,N,19-avr.-2022,Funding Confirmed,Funding Confirmed,2328,...,13-avr.-2022,3-mai-2022,13-juin-2022,Published in Issue,"Universite de Tours (27092); \nINSERM, TOURS, ...",,,,Online Open,2022
6285,P132,DEFUNCT - COUPERIN hybrid - Centre hospitalier...,,,,N,19-avr.-2022,Funding Confirmed,Funding Confirmed,IJLH,...,29-mars-2022,21-avr.-2022,15-nov.-2022,Published in Issue,Centre Hospitalier Regional Universitaire de N...,,,,Online Open,2022


**Reconcile mails using Open Refine**

In [None]:
df_mails = (df_willey_filtered[['Responsible Corresponding author email']].drop_duplicates().copy())
df_mails = df_mails.assign(email_exploded = df_mails['Responsible Corresponding author email'].str.split(r'[;,]')).explode('email_exploded')
df_mails['email_exploded'] = df_mails['email_exploded'].str.strip()
df_mails = df_mails[df_mails['email_exploded'] != '']
df_mails = df_mails.drop_duplicates(subset = 'email_exploded').reset_index(drop = True)
df_mails[['email_exploded']].to_csv('willey_mails.csv', index = False)
df_mails

In [7]:
interest_cols = ['DOI', 'Responsible Corresponding author email', 'Full APC', 'Discount', 'Amount Charged']

df_willey_filtered = df_willey_filtered.assign(email_exploded = df_willey_filtered['Responsible Corresponding author email'].str.split(r'[;,]')).explode('email_exploded')
df_willey_filtered['email_exploded'] = df_willey_filtered['email_exploded'].str.strip()

df_mails_recon = pd.read_csv('willey_mails_recon.csv').drop(columns=['Column'], errors='ignore')

df_merged = df_willey_filtered.merge(df_mails_recon[['Responsible Corresponding author email_exp_2', 'Responsible Corresponding author email_exp_2_recon']], left_on = 'email_exploded',
                                       right_on = 'Responsible Corresponding author email_exp_2', how = 'left')
df_agg = (df_merged.groupby('DOI')['Responsible Corresponding author email_exp_2_recon'].apply(lambda x: ' | '.join(x.dropna().astype(str))).to_frame(name='Corresponding author email_recon'))

df_tmp = df_willey_filtered.drop(columns = ['Responsible Corresponding author email']).drop_duplicates(subset=['DOI'])
df_tmp = df_tmp.merge(df_agg, on = 'DOI', how = 'left')

df_tmp = df_tmp.assign(NationalAgreement = 'Wiley')[['DOI', 'Corresponding author email_recon', 'Full APC', 'Discount', 'Amount Charged', 'NationalAgreement']]

df_interest_2 = df_interest_1.merge(df_tmp, left_on = 'doi_corr', right_on = 'DOI', how = 'left').drop(columns = ['DOI'])

df_interest_2

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,...,doi_corr,Corresponding author email_recon_x,APC list price,APC price after discount,NationalAgreement_x,Corresponding author email_recon_y,Full APC,Discount,Amount Charged,NationalAgreement_y
0,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,Medicine,European Journal of Cancer,S16731738,0959-8049,Elsevier BV,P4310320990,['Centre international de recherche sur le can...,...,10.1016/j.ejca.2012.12.027,,,,,,,,,
1,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,Medicine,Immunity,S199671312,1074-7613,Cell Press,P4310315673,"['Centre de Recherche des Cordeliers', 'Déléga...",...,10.1016/j.immuni.2013.10.003,,,,,,,,,
2,https://doi.org/10.1016/j.jacc.2013.10.029,2013,en,Medicine,,,,,,"[""Laboratoire d'Excellence en Recherche sur le...",...,10.1016/j.jacc.2013.10.029,,,,,,,,,
3,https://doi.org/10.1093/eurheartj/eht273,2013,en,Medicine,European Heart Journal,S181568219,0195-668X,Oxford University Press,P4310311648,"['Copenhagen University Hospital', 'Herlev Hos...",...,10.1093/eurheartj/eht273,,,,,,,,,
4,https://doi.org/10.1016/j.jbusvent.2013.07.003,2013,en,"Business, Management and Accounting",Digital Access to Libraries,S4306400166,,Harris County Public Library,I2800384882,"['UCLouvain', 'UCLouvain', 'Université Lille N...",...,10.1016/j.jbusvent.2013.07.003,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162860,https://doi.org/10.26532/jh.v40i1.37175,2024,fr,,Jurnal Hukum,S4210234340,1412-2723,,,"['Universitas Kristen Indonesia Paulus', 'Univ...",...,10.26532/jh.v40i1.37175,,,,,,,,,
1162861,https://doi.org/10.1109/icacite60783.2024.1126...,2024,,,,,,,,"['School of Business and Management', 'Christ ...",...,10.1109/icacite60783.2024.11269084,,,,,,,,,
1162862,https://doi.org/10.1055/s-0044-1791345,2024,en,,,,,,,"[""École Nationale Vétérinaire d'Alfort"", ""Écol...",...,10.1055/s-0044-1791345,,,,,,,,,
1162863,https://doi.org/10.3390/quantum1020022,2019,en,Physics and Astronomy,Quantum Reports,S4210211996,,Multidisciplinary Digital Publishing Institute,P4310310987,"['Fudan University', 'Fudan University']",...,10.3390/quantum1020022,,,,,,,,,


In [8]:
df_interest_2 = df_interest_2.assign(NationalAgreement = df_interest_2['NationalAgreement_x'].combine_first(df_interest_2['NationalAgreement_y']),
                                    Corresponding_author_email_recon = df_interest_2['Corresponding author email_recon_x'].combine_first(df_interest_2['Corresponding author email_recon_y']))

cols_to_drop = ['NationalAgreement_x', 'NationalAgreement_y', 'Corresponding author email_recon_x', 'Corresponding author email_recon_y']
df_interest_2 = df_interest_2.drop(columns = cols_to_drop)
df_interest_2

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,...,countries,BSO,doi_corr,APC list price,APC price after discount,Full APC,Discount,Amount Charged,NationalAgreement,Corresponding_author_email_recon
0,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,Medicine,European Journal of Cancer,S16731738,0959-8049,Elsevier BV,P4310320990,['Centre international de recherche sur le can...,...,"[['FR'], ['FR'], ['FR'], ['IT'], ['NL'], ['IE'...",True,10.1016/j.ejca.2012.12.027,,,,,,,
1,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,Medicine,Immunity,S199671312,1074-7613,Cell Press,P4310315673,"['Centre de Recherche des Cordeliers', 'Déléga...",...,"[['FR'], ['FR'], ['FR'], ['FR'], ['DE', 'FR'],...",True,10.1016/j.immuni.2013.10.003,,,,,,,
2,https://doi.org/10.1016/j.jacc.2013.10.029,2013,en,Medicine,,,,,,"[""Laboratoire d'Excellence en Recherche sur le...",...,"[['FR'], ['GB'], ['CA'], ['AU'], ['GB'], ['DE'...",True,10.1016/j.jacc.2013.10.029,,,,,,,
3,https://doi.org/10.1093/eurheartj/eht273,2013,en,Medicine,European Heart Journal,S181568219,0195-668X,Oxford University Press,P4310311648,"['Copenhagen University Hospital', 'Herlev Hos...",...,"[['DK'], ['FR'], ['GB'], ['US'], ['ES'], ['BE'...",True,10.1093/eurheartj/eht273,,,,,,,
4,https://doi.org/10.1016/j.jbusvent.2013.07.003,2013,en,"Business, Management and Accounting",Digital Access to Libraries,S4306400166,,Harris County Public Library,I2800384882,"['UCLouvain', 'UCLouvain', 'Université Lille N...",...,"[['BE'], ['BE', 'FR'], ['FR']]",True,10.1016/j.jbusvent.2013.07.003,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162860,https://doi.org/10.26532/jh.v40i1.37175,2024,fr,,Jurnal Hukum,S4210234340,1412-2723,,,"['Universitas Kristen Indonesia Paulus', 'Univ...",...,"[['ID'], ['FR']]",False,10.26532/jh.v40i1.37175,,,,,,,
1162861,https://doi.org/10.1109/icacite60783.2024.1126...,2024,,,,,,,,"['School of Business and Management', 'Christ ...",...,"[['FR', 'IN'], ['IN'], ['IN'], ['IQ'], ['IN']]",False,10.1109/icacite60783.2024.11269084,,,,,,,
1162862,https://doi.org/10.1055/s-0044-1791345,2024,en,,,,,,,"[""École Nationale Vétérinaire d'Alfort"", ""Écol...",...,"[['FR'], ['FR']]",False,10.1055/s-0044-1791345,,,,,,,
1162863,https://doi.org/10.3390/quantum1020022,2019,en,Physics and Astronomy,Quantum Reports,S4210211996,,Multidisciplinary Digital Publishing Institute,P4310310987,"['Fudan University', 'Fudan University']",...,"[['FR'], ['CN'], ['FR'], ['CN']]",True,10.3390/quantum1020022,,,,,,,


### Add cost hypothesis

**fixed amount of 1000€**

In [9]:
df_interest_2['apc_hyp_1'] = df_interest_2.NationalAgreement.fillna(0).apply(lambda x: 1000 if x != 0 else np.nan)
df_interest_2

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,...,BSO,doi_corr,APC list price,APC price after discount,Full APC,Discount,Amount Charged,NationalAgreement,Corresponding_author_email_recon,apc_hyp_1
0,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,Medicine,European Journal of Cancer,S16731738,0959-8049,Elsevier BV,P4310320990,['Centre international de recherche sur le can...,...,True,10.1016/j.ejca.2012.12.027,,,,,,,,
1,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,Medicine,Immunity,S199671312,1074-7613,Cell Press,P4310315673,"['Centre de Recherche des Cordeliers', 'Déléga...",...,True,10.1016/j.immuni.2013.10.003,,,,,,,,
2,https://doi.org/10.1016/j.jacc.2013.10.029,2013,en,Medicine,,,,,,"[""Laboratoire d'Excellence en Recherche sur le...",...,True,10.1016/j.jacc.2013.10.029,,,,,,,,
3,https://doi.org/10.1093/eurheartj/eht273,2013,en,Medicine,European Heart Journal,S181568219,0195-668X,Oxford University Press,P4310311648,"['Copenhagen University Hospital', 'Herlev Hos...",...,True,10.1093/eurheartj/eht273,,,,,,,,
4,https://doi.org/10.1016/j.jbusvent.2013.07.003,2013,en,"Business, Management and Accounting",Digital Access to Libraries,S4306400166,,Harris County Public Library,I2800384882,"['UCLouvain', 'UCLouvain', 'Université Lille N...",...,True,10.1016/j.jbusvent.2013.07.003,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162860,https://doi.org/10.26532/jh.v40i1.37175,2024,fr,,Jurnal Hukum,S4210234340,1412-2723,,,"['Universitas Kristen Indonesia Paulus', 'Univ...",...,False,10.26532/jh.v40i1.37175,,,,,,,,
1162861,https://doi.org/10.1109/icacite60783.2024.1126...,2024,,,,,,,,"['School of Business and Management', 'Christ ...",...,False,10.1109/icacite60783.2024.11269084,,,,,,,,
1162862,https://doi.org/10.1055/s-0044-1791345,2024,en,,,,,,,"[""École Nationale Vétérinaire d'Alfort"", ""Écol...",...,False,10.1055/s-0044-1791345,,,,,,,,
1162863,https://doi.org/10.3390/quantum1020022,2019,en,Physics and Astronomy,Quantum Reports,S4210211996,,Multidisciplinary Digital Publishing Institute,P4310310987,"['Fudan University', 'Fudan University']",...,True,10.3390/quantum1020022,,,,,,,,


**Divided the cost by number of papers**

In [10]:
def clean_apc(value):
    if pd.isna(value):
        return np.nan
    cleaned = ''.join(filter(str.isdigit, str(value)))
    return int(cleaned) if cleaned else np.nan 

df_filtered = df_interest_2[df_interest_2.NationalAgreement == 'Elsevier']
df_filtered['APC_list_price_clean'] = df_filtered['APC list price'].apply(clean_apc)

total_apc = df_filtered['APC_list_price_clean'].sum()
total_apc_paper  = total_apc / df_filtered.shape[0]
print('Elsevier', total_apc, total_apc_paper)
df_interest_2['apc_hyp_2'] = df_interest_2.NationalAgreement.apply(lambda x: total_apc_paper if x == 'Elsevier' else np.nan)

df_filtered = df_interest_2[df_interest_2.NationalAgreement == 'Wiley']
df_filtered['Full_APC_clean'] = df_filtered['Full APC'].apply(clean_apc)

total_apc = df_filtered['Full_APC_clean'].sum()
total_apc_paper  = total_apc / df_filtered.shape[0]
print('Wiley', total_apc, total_apc_paper)
df_interest_2['apc_hyp_2'] = df_interest_2.NationalAgreement.apply(lambda x: total_apc_paper if x == 'Wiley' else np.nan)
df_interest_2.to_csv('../data/interim/initial_dataset_elsevierwiliey.csv', index = False)
df_interest_2

Elsevier 17284753.0 2779.7930202637503
Wiley 8188145.0 1392.5416666666667


Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,...,doi_corr,APC list price,APC price after discount,Full APC,Discount,Amount Charged,NationalAgreement,Corresponding_author_email_recon,apc_hyp_1,apc_hyp_2
0,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,Medicine,European Journal of Cancer,S16731738,0959-8049,Elsevier BV,P4310320990,['Centre international de recherche sur le can...,...,10.1016/j.ejca.2012.12.027,,,,,,,,,
1,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,Medicine,Immunity,S199671312,1074-7613,Cell Press,P4310315673,"['Centre de Recherche des Cordeliers', 'Déléga...",...,10.1016/j.immuni.2013.10.003,,,,,,,,,
2,https://doi.org/10.1016/j.jacc.2013.10.029,2013,en,Medicine,,,,,,"[""Laboratoire d'Excellence en Recherche sur le...",...,10.1016/j.jacc.2013.10.029,,,,,,,,,
3,https://doi.org/10.1093/eurheartj/eht273,2013,en,Medicine,European Heart Journal,S181568219,0195-668X,Oxford University Press,P4310311648,"['Copenhagen University Hospital', 'Herlev Hos...",...,10.1093/eurheartj/eht273,,,,,,,,,
4,https://doi.org/10.1016/j.jbusvent.2013.07.003,2013,en,"Business, Management and Accounting",Digital Access to Libraries,S4306400166,,Harris County Public Library,I2800384882,"['UCLouvain', 'UCLouvain', 'Université Lille N...",...,10.1016/j.jbusvent.2013.07.003,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162860,https://doi.org/10.26532/jh.v40i1.37175,2024,fr,,Jurnal Hukum,S4210234340,1412-2723,,,"['Universitas Kristen Indonesia Paulus', 'Univ...",...,10.26532/jh.v40i1.37175,,,,,,,,,
1162861,https://doi.org/10.1109/icacite60783.2024.1126...,2024,,,,,,,,"['School of Business and Management', 'Christ ...",...,10.1109/icacite60783.2024.11269084,,,,,,,,,
1162862,https://doi.org/10.1055/s-0044-1791345,2024,en,,,,,,,"[""École Nationale Vétérinaire d'Alfort"", ""Écol...",...,10.1055/s-0044-1791345,,,,,,,,,
1162863,https://doi.org/10.3390/quantum1020022,2019,en,Physics and Astronomy,Quantum Reports,S4210211996,,Multidisciplinary Digital Publishing Institute,P4310310987,"['Fudan University', 'Fudan University']",...,10.3390/quantum1020022,,,,,,,,,


## Integrate Open Apc

In [12]:
interest_cols = ['doi', 'euro', 'issn_l', 'period']
df_open_apc = pd.read_csv('../data/external/apc_de.csv')[interest_cols]

df_interest = pd.read_csv('../data/interim/initial_dataset_elsevierwiliey.csv')

df_tmp = df_interest.merge(df_open_apc.rename(columns={'doi': 'doi_apc'}), left_on = 'doi_corr', right_on = 'doi_apc', how = 'left')
df_tmp = df_tmp.drop(columns=['doi_apc', 'period']).rename(columns={'doi_x': 'doi'})

valid_issns = df_tmp['issn_l_y'].dropna().unique()
median_apc = (df_open_apc[df_open_apc['issn_l'].isin(valid_issns)].groupby(['issn_l', 'period'])['euro'].median())
counts = (df_open_apc[df_open_apc['issn_l'].isin(df_tmp['issn_l_y'].dropna().unique())].groupby(['issn_l', 'period'])['euro'].count())

df_tmp['median'] = pd.Series(list(zip(df_tmp['issn_l_x'], df_tmp['publication_year']))).map(median_apc)
df_tmp = df_tmp.drop(columns=['issn_l_y']).rename(columns={'issn_l_x': 'issn_l'})

df_tmp.to_csv('../data/processed/initial_dataset_elsevierwiliey_openapc.csv', index = False)
df_tmp

Unnamed: 0,doi,publication_year,language,field_name_top_topic,journal,journal_id,issn_l,publisher,publisher_id,display_name_institution,...,APC price after discount,Full APC,Discount,Amount Charged,NationalAgreement,Corresponding_author_email_recon,apc_hyp_1,apc_hyp_2,euro,median
0,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,Medicine,European Journal of Cancer,S16731738,0959-8049,Elsevier BV,P4310320990,['Centre international de recherche sur le can...,...,,,,,,,,,,2676.26
1,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,Medicine,Immunity,S199671312,1074-7613,Cell Press,P4310315673,"['Centre de Recherche des Cordeliers', 'Déléga...",...,,,,,,,,,,4487.84
2,https://doi.org/10.1016/j.jacc.2013.10.029,2013,en,Medicine,,,,,,"[""Laboratoire d'Excellence en Recherche sur le...",...,,,,,,,,,,
3,https://doi.org/10.1093/eurheartj/eht273,2013,en,Medicine,European Heart Journal,S181568219,0195-668X,Oxford University Press,P4310311648,"['Copenhagen University Hospital', 'Herlev Hos...",...,,,,,,,,,,
4,https://doi.org/10.1016/j.jbusvent.2013.07.003,2013,en,"Business, Management and Accounting",Digital Access to Libraries,S4306400166,,Harris County Public Library,I2800384882,"['UCLouvain', 'UCLouvain', 'Université Lille N...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162860,https://doi.org/10.26532/jh.v40i1.37175,2024,fr,,Jurnal Hukum,S4210234340,1412-2723,,,"['Universitas Kristen Indonesia Paulus', 'Univ...",...,,,,,,,,,,
1162861,https://doi.org/10.1109/icacite60783.2024.1126...,2024,,,,,,,,"['School of Business and Management', 'Christ ...",...,,,,,,,,,,
1162862,https://doi.org/10.1055/s-0044-1791345,2024,en,,,,,,,"[""École Nationale Vétérinaire d'Alfort"", ""Écol...",...,,,,,,,,,,
1162863,https://doi.org/10.3390/quantum1020022,2019,en,Physics and Astronomy,Quantum Reports,S4210211996,,Multidisciplinary Digital Publishing Institute,P4310310987,"['Fudan University', 'Fudan University']",...,,,,,,,,,,


In [3]:
threshold = 5 
counts_filtered = counts[counts > threshold]

len(counts_filtered) / len(counts)

0.3026795185168104