# Preliminar Analysis

In [1]:
import requests
import time
import json
from urllib.parse import urlencode
import pandas as pd
import orjson
from tqdm import tqdm

### Download all articles where there are one or more french authors

In [25]:
BASE_URL = "https://api.openalex.org/works"

selected_fields = ["doi", "is_xpac", "publication_year", "language", "indexed_in", "primary_location", "open_access", "authorships", "institutions", 
                   "corresponding_author_ids", "corresponding_institution_ids", "apc_list", "apc_paid", "cited_by_count", "primary_topic", "topics", 
                   "concepts", "grants", "awards", "funders"]

def fetch_page(cursor, filters):
    params = filters.copy()
    params["cursor"] = cursor
    params["per_page"] = 200
    url = f"{BASE_URL}?{urlencode(params)}"
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.json()

for year in range(2013, 2014): # anar modificant l'initial year
    print(f"\n=== YEAR {year} ===")

    filters = {
        "filter": ",".join([
            "type:article",
            "primary_location.source.type:journal",
            "authorships.institutions.country_code:FR",
            f"from_publication_date:{year}-01-01",
            f"to_publication_date:{year}-12-31"
        ]),
        "select": ",".join(selected_fields)
    }

    cursor = "*"
    count = 0
    output_file = f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v2.jsonl"

    with open(output_file, "w", encoding="utf-8") as f:
        pbar = tqdm(unit="works", dynamic_ncols=True)
        while True:
            data = fetch_page(cursor, filters)

            works = data.get("results", [])
            next_cursor = data.get("meta", {}).get("next_cursor", None)

            for w in works:
                f.write(json.dumps(w) + "\n")
                count += 1

            pbar.update(len(works))
            
            if not next_cursor:
                break
            cursor = next_cursor
            time.sleep(1)

        pbar.close()
    print(f"\nSaved: {output_file}  (total {count})")

print("\n=== COMPLETED ===")


=== YEAR 2013 ===


107221works [23:40, 75.46works/s]


Saved: ../data/interim/FranceInitialAPI/openalex_french_authors_2013_v2.jsonl  (total 107221)

=== COMPLETED ===





### Phase 1: Whole France

**Number of unique DOI in France (NON Datacite)**

In [5]:
unique_doi_france = 0
for year in tqdm(range(2013, 2025)):
    doi_set_france = set()
    with open(f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v2.jsonl", "rb") as f:
        for line in f:
            rec = orjson.loads(line)
            doi = rec.get("doi")
            if doi:
                doi_set_france.add(doi)
    unique_doi_france += len(doi_set_france)

print("Total unique DOIs:", unique_doi_france)

100%|██████████| 12/12 [01:06<00:00,  5.54s/it]

Total unique DOIs: 1243928





**Number of unique DOI in France (ONLY Datacite) + dataset construction**

In [1]:
### I SHOULD REDO THE DOWNLOAD; ASK IF NEEDED GIVEN THAT WE WON'T USE IT

### Phase 2: Ingest, or not, BSO

In [2]:
df_bso = pd.read_parquet("../data/external/open-access-monitor-france.parquet", engine = "pyarrow").dropna(subset = ["doi"])
df_bso

Unnamed: 0,observation_date,id,doi,pmid,hal_id,year,title,journal_issns,journal_issn_l,journal_name,...,software_used,software_created,software_shared,data_used,data_created,data_shared,missing_doi_in_hal,has_doi_in_hal,doi_in_hal,bso_country
0,2024Q4,doi10.1080/10408398.2022.2033684,10.1080/10408398.2022.2033684,35152807,hal-03777046,2022,The impact of nano/micro-plastics toxicity on ...,"1040-8398,1549-7852",1040-8398,Critical Reviews in Food Science and Nutrition,...,,,,,,,,1,10.1080/10408398.2022.2033684,fr
1,2024Q4,doi10.1016/j.gie.2021.12.048,10.1016/j.gie.2021.12.048,,,2022,Real-time use of artificial intelligence at co...,"0016-5107,1085-8741,1097-6779",0016-5107,Gastrointestinal Endoscopy,...,,,,,,,,,,
2,2024Q4,doi10.1016/j.compositesa.2022.107165,10.1016/j.compositesa.2022.107165,,Preprint-Carpier-et-al-CompPartA-2022.pdf,2022,Meso-structure-based thermomechanical modellin...,"1359-835X,1878-5840",1359-835X,Composites Part A Applied Science and Manufact...,...,True,False,False,True,True,False,,1,10.1016/j.compositesa.2022.107165,"fr,other"
3,2024Q4,doi10.1002/ejoc.202200123,10.1002/ejoc.202200123,,TEXT%20FINAL.pdf,2022,Electrochemical Trifluoromethylselenolation of...,"1099-0690,1434-193X",1099-0690,European Journal of Organic Chemistry,...,False,False,False,True,False,False,,1,10.1002/ejoc.202200123,"fr,other"
4,2024Q4,doi10.3410/f.721705664.793594159,10.3410/f.721705664.793594159,,,2022,Faculty Opinions recommendation of On the gene...,,,Faculty Opinions – Post-Publication Peer Revie...,...,,,,,,,,,,fr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937570,2024Q4,doi10.1021/acs.jpcc.9b10824,10.1021/acs.jpcc.9b10824,,hal-03017908,2020,The Dissolution Anisotropy of Pyroxenes: Exper...,"1932-7447,1932-7455",1932-7447,The Journal of Physical Chemistry C,...,False,False,False,True,True,False,,1,10.1021/acs.jpcc.9b10824,"fr,other"
937571,2024Q4,doi10.1017/s007543582000012x,10.1017/s007543582000012x,,,2020,"LAUREL FULKERSON, A LITERARY COMMENTARY ON THE...","0075-4358,1753-528X",0075-4358,The Journal of Roman Studies,...,,,,,,,,,,fr
937575,2024Q4,doi10.1075/aila.00030.haa,10.1075/aila.00030.haa,,,2020,Recycling a genre for news automation,"1461-0213,1570-5595",1461-0213,AILA Review,...,,,,,,,,,,europe
937576,2024Q4,doi10.1007/s13592-020-00743-8,10.1007/s13592-020-00743-8,,hal-03161695,2020,Short-term hyperthermia at larval age reduces ...,"0044-8435,1297-9678",0044-8435,Apidologie,...,,,,,,,,1,10.1007/s13592-020-00743-8,other


In [11]:
df['doi_clean'] = df['doi'].apply(lambda x: x[16::])
df_bso_article = df_bso[(df_bso.genre == "journal-article") | (df_bso.genre == "proceedings")]
df_bso_not_in_oa = df_bso_article[~df_bso_article["doi"].isin(df.doi_clean.unique())].reset_index(drop = True)
df_bso_not_in_oa.to_parquet("../data/interim/bso_article_notoa.parquet", engine = "pyarrow")
df_bso_not_in_oa

Unnamed: 0,observation_date,id,doi,pmid,hal_id,year,title,journal_issns,journal_issn_l,journal_name,...,software_used,software_created,software_shared,data_used,data_created,data_shared,missing_doi_in_hal,has_doi_in_hal,doi_in_hal,bso_country
0,2024Q4,doi10.1080/10408398.2022.2033684,10.1080/10408398.2022.2033684,35152807,hal-03777046,2022,The impact of nano/micro-plastics toxicity on ...,"1040-8398,1549-7852",1040-8398,Critical Reviews in Food Science and Nutrition,...,,,,,,,,1,10.1080/10408398.2022.2033684,fr
1,2024Q4,doi10.1016/j.gie.2021.12.048,10.1016/j.gie.2021.12.048,,,2022,Real-time use of artificial intelligence at co...,"0016-5107,1085-8741,1097-6779",0016-5107,Gastrointestinal Endoscopy,...,,,,,,,,,,
2,2024Q4,doi10.1016/j.jscai.2022.100218,10.1016/j.jscai.2022.100218,,,2022,"D-10 , Procedural Characteristics of Patients ...",2772-9303,2772-9303,Journal of the Society for Cardiovascular Angi...,...,,,,,,,,,,
3,2024Q4,doi10.1136/bmjebm-2022-podabstracts.81,10.1136/bmjebm-2022-podabstracts.81,,jrsm.1413.pdf,2022,160 Visualizing the evolution of evidence: cum...,,,DS scholar abstracts,...,True,True,True,True,True,True,1.0,,,"fr,other"
4,2024Q4,doi10.1145/3490148.3538552,10.1145/3490148.3538552,,hal-03950351,2022,Brief Announcement,,,Proceedings of the 34th ACM Symposium on Paral...,...,False,False,False,True,False,False,,1,10.1145/3490148.3538552,"fr,other"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206370,2024Q4,doi10.1016/j.scitotenv.2020.137673,10.1016/j.scitotenv.2020.137673,32208236,hal-03143548,2020,Towards a spatiotemporally explicit toxicokine...,"0048-9697,1879-1026",0048-9697,The Science of The Total Environment,...,False,False,False,True,True,False,,1,10.1016/j.scitotenv.2020.137673,"fr,other"
206371,2024Q4,doi10.1164/ajrccm-conference.2020.201.1_meetin...,10.1164/ajrccm-conference.2020.201.1_meetingab...,,,2020,The Polyvalent Role of Mandibular Movement Sig...,,,C109. SRN: INNOVATIVE WAYS TO ASSESS SDB AND P...,...,False,False,False,False,False,False,,,,"fr,other"
206372,2024Q4,doi10.1111/ajco.13497,10.1111/ajco.13497,,,2020,Oral Abstracts,"1743-7555,1743-7563",1743-7555,Asia-Pacific Journal of Clinical Oncology,...,False,False,False,True,True,False,,,,"other,fr"
206373,2024Q4,doi10.1075/aila.00030.haa,10.1075/aila.00030.haa,,,2020,Recycling a genre for news automation,"1461-0213,1570-5595",1461-0213,AILA Review,...,,,,,,,,,,europe


In [None]:
### WAITING FOR THE RESPONSE OF THE TECHNICAL TEAM

### Phase 3: Corresponding

In [3]:
interest = ['doi', 'publication_year', 'language', 'authorships', 'primary_location', 'topics', 'open_access', 'apc_list']
keys = ['doi', 'publication_year', 'language', 'field_names', 'journal', 'publisher', 'ins_type', '#_authors', 'oa_status', 'apc_list', 'corresponding', 'countries']
records = []
for year in tqdm(range(2013, 2025)):
    with open(f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v2.jsonl", "rb") as f:
        for line in f:
            rec = orjson.loads(line)
            if not rec.get("doi"): # Skip records without DOI
                continue
            filtered = {k: rec.get(k) for k in interest} # Keep only the fields we care about

            # Extract field_names from topics
            topics = filtered.get("topics") or []
            filtered["field_names"] = {c.get("field", {}).get("display_name") for c in topics if c.get("field")}

            # Extract journal and publisher from primary_location
            pl = filtered.get("primary_location") or {}
            source = pl.get("source") or {}
            filtered["journal"] = source.get("display_name")
            filtered["publisher"] = source.get("host_organization_name")

            # Extract institution types and number of authors
            authorships = filtered.get("authorships") or []
            filtered["ins_type"] = {inst.get("type") 
                                    for auth in authorships 
                                    for inst in auth.get("institutions", []) if inst.get("type")}
            filtered["corresponding"] = [auth.get("is_corresponding") for auth in authorships]
            filtered["countries"] = [auth.get("countries") for auth in authorships]
            filtered["#_authors"] = len(authorships)
            
            o_a = filtered.get("open_access") or {}
            filtered["oa_status"] = o_a.get("oa_status")

            records.append({k: filtered.get(k) for k in keys})

df = pd.DataFrame(records)
df

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [01:48<00:00,  9.02s/it]


Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries
0,https://doi.org/10.1051/0004-6361/201322068,2013,en,"{Physics and Astronomy, Computer Science}",Astronomy and Astrophysics,EDP Sciences,"{facility, government, education, nonprofit}",44,bronze,,"[False, False, False, False, False, False, Fal...","[[DE], [US], [FR, US], [FR, US], [FR, US], [US..."
1,https://doi.org/10.1038/nature12477,2013,en,"{Biochemistry, Genetics and Molecular Biology,...",Nature,Nature Portfolio,"{facility, nonprofit, other, healthcare, gover...",70,bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[GB], [GB], [GB], [CA], [GB], [GB], [GB], [GB..."
2,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,{Medicine},Critical Care Medicine,Lippincott Williams & Wilkins,"{healthcare, company, education}",23,closed,,"[False, False, False, False, False, False, Fal...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
3,https://doi.org/10.1007/s00134-012-2769-8,2013,en,{Medicine},Intensive Care Medicine,Springer Science+Business Media,"{healthcare, education}",23,bronze,"{'value': 3690, 'currency': 'EUR', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
4,https://doi.org/10.1038/nnano.2013.46,2013,en,{Materials Science},Nature Nanotechnology,Nature Portfolio,{facility},2,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False]","[[], [FR]]"
...,...,...,...,...,...,...,...,...,...,...,...,...
1349313,https://doi.org/10.20914/2310-1202-2024-4-178-184,2024,,"{Environmental Science, Agricultural and Biolo...",Proceedings of the Voronezh State University o...,Voronezh State University of Engineering Techn...,"{facility, education}",5,diamond,,"[True, False, False, False, False]","[[RU], [RU], [FR], [RU], [RU]]"
1349314,https://doi.org/10.7202/1121520ar,2024,fr,"{Arts and Humanities, Social Sciences}",Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
1349315,https://doi.org/10.7202/1121518ar,2024,fr,{Social Sciences},Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
1349316,https://doi.org/10.4000/11ndt,2024,fr,{Social Sciences},Histoire Politique,,{facility},1,bronze,,[True],[[FR]]


**No corresponding**

In [3]:
df_nocorresponding = df[df['corresponding'].apply(lambda x: all(v is False for v in x))].reset_index(drop = True)
df_nocorresponding

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries
0,https://doi.org/10.1051/0004-6361/201322068,2013,en,"{Physics and Astronomy, Computer Science}",Astronomy and Astrophysics,EDP Sciences,"{education, facility, nonprofit, government}",44,bronze,,"[False, False, False, False, False, False, Fal...","[[DE], [US], [FR, US], [FR, US], [FR, US], [US..."
1,https://doi.org/10.1038/nature12477,2013,en,"{Biochemistry, Genetics and Molecular Biology,...",Nature,Nature Portfolio,"{education, facility, healthcare, government, ...",70,bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[GB], [GB], [GB], [CA], [GB], [GB], [GB], [GB..."
2,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,{Medicine},Critical Care Medicine,Lippincott Williams & Wilkins,"{company, education, healthcare}",23,closed,,"[False, False, False, False, False, False, Fal...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
3,https://doi.org/10.1038/nnano.2013.46,2013,en,{Materials Science},Nature Nanotechnology,Nature Portfolio,{facility},2,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False]","[[], [FR]]"
4,https://doi.org/10.1093/nar/gkt1178,2013,en,"{Engineering, Biochemistry, Genetics and Molec...",Nucleic Acids Research,Oxford University Press,{government},5,gold,"{'value': 3630, 'currency': 'USD', 'value_usd'...","[False, False, False, False, False]","[[FR], [FR], [FR], [FR], [FR]]"
...,...,...,...,...,...,...,...,...,...,...,...,...
683493,https://doi.org/10.7202/1120388ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",2,diamond,,"[False, False]","[[], [FR]]"
683494,https://doi.org/10.7202/1120383ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]"
683495,https://doi.org/10.7202/1120375ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]"
683496,https://doi.org/10.7202/1120374ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]"


Single author

In [4]:
df_nocorresponding_one = df_nocorresponding[df_nocorresponding['#_authors'] == 1].reset_index(drop = True)
df_nocorresponding_one

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries


French language

In [5]:
df_nocorresponding_french = df_nocorresponding[df_nocorresponding.language == 'fr'].reset_index(drop = True)
df_nocorresponding_french.to_parquet("../data/interim/preliminary/fr_p4B_french_language.parquet", index = False)
df_nocorresponding_french.groupby('publication_year').count()[['doi']]

Unnamed: 0_level_0,doi
publication_year,Unnamed: 1_level_1
2013,10172
2014,11715
2015,7002
2016,5159
2017,5181
2018,6029
2019,5645
2020,6021
2021,5866
2022,6540


All missing

In [6]:
df_nocorresponding['missing'] = df_nocorresponding.countries.apply(lambda lst: all(len(x) == 0 for x in lst))
df_nocorresponding_missing = df_nocorresponding[df_nocorresponding['missing'] == True].reset_index(drop = True)
df_nocorresponding_missing

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries,missing


**Corresponding**

In [7]:
df_corresponding = df[~df.doi.isin(df_nocorresponding.doi)].reset_index(drop = True)
df_corresponding

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries
0,https://doi.org/10.1007/s00134-012-2769-8,2013,en,{Medicine},Intensive Care Medicine,Springer Science+Business Media,"{education, healthcare}",23,bronze,"{'value': 3690, 'currency': 'EUR', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
1,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,{Medicine},European Journal of Cancer,Elsevier BV,"{education, other, government, healthcare}",8,bronze,"{'value': 3800, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[FR], [FR], [FR], [IT], [NL], [IE], [FR], [FR]]"
2,https://doi.org/10.1016/s1474-4422(13)70124-8,2013,en,{Medicine},The Lancet Neurology,Elsevier BV,"{education, facility, government, healthcare}",36,green,"{'value': 6300, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[GB], [CA], [NL], [FR], [AT], [CA], [AU], [GB..."
3,https://doi.org/10.1038/nature12506,2013,en,"{Biochemistry, Genetics and Molecular Biology,...",Nature,Nature Portfolio,"{education, facility, healthcare, government, ...",82,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[MA], [DK], [CN], [MA], [BE], [BE], [MA], [CN..."
4,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,"{Immunology and Microbiology, Medicine}",Immunity,Cell Press,"{education, facility, government, healthcare}",17,bronze,"{'value': 9080, 'currency': 'USD', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[FR], [FR], [FR], [FR], [DE, FR], [AT], [FR],..."
...,...,...,...,...,...,...,...,...,...,...,...,...
665813,https://doi.org/10.20914/2310-1202-2024-4-178-184,2024,,"{Environmental Science, Agricultural and Biolo...",Proceedings of the Voronezh State University o...,Voronezh State University of Engineering Techn...,"{education, facility}",5,diamond,,"[True, False, False, False, False]","[[RU], [RU], [FR], [RU], [RU]]"
665814,https://doi.org/10.7202/1121520ar,2024,fr,"{Arts and Humanities, Social Sciences}",Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
665815,https://doi.org/10.7202/1121518ar,2024,fr,{Social Sciences},Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
665816,https://doi.org/10.4000/11ndt,2024,fr,{Social Sciences},Histoire Politique,,{facility},1,bronze,,[True],[[FR]]


All missing

In [8]:
df_corresponding['missing'] = df_corresponding.countries.apply(lambda lst: all(len(x) == 0 for x in lst))
df_corresponding_missing = df_corresponding[df_corresponding['missing'] == True].reset_index(drop = True)
df_corresponding_missing

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries,missing


### Phase 4

**Corresponding**

In [9]:
df_corresponding

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries,missing
0,https://doi.org/10.1007/s00134-012-2769-8,2013,en,{Medicine},Intensive Care Medicine,Springer Science+Business Media,"{education, healthcare}",23,bronze,"{'value': 3690, 'currency': 'EUR', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL...",False
1,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,{Medicine},European Journal of Cancer,Elsevier BV,"{education, other, government, healthcare}",8,bronze,"{'value': 3800, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[FR], [FR], [FR], [IT], [NL], [IE], [FR], [FR]]",False
2,https://doi.org/10.1016/s1474-4422(13)70124-8,2013,en,{Medicine},The Lancet Neurology,Elsevier BV,"{education, facility, government, healthcare}",36,green,"{'value': 6300, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[GB], [CA], [NL], [FR], [AT], [CA], [AU], [GB...",False
3,https://doi.org/10.1038/nature12506,2013,en,"{Biochemistry, Genetics and Molecular Biology,...",Nature,Nature Portfolio,"{education, facility, healthcare, government, ...",82,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[MA], [DK], [CN], [MA], [BE], [BE], [MA], [CN...",False
4,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,"{Immunology and Microbiology, Medicine}",Immunity,Cell Press,"{education, facility, government, healthcare}",17,bronze,"{'value': 9080, 'currency': 'USD', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[FR], [FR], [FR], [FR], [DE, FR], [AT], [FR],...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
665813,https://doi.org/10.20914/2310-1202-2024-4-178-184,2024,,"{Environmental Science, Agricultural and Biolo...",Proceedings of the Voronezh State University o...,Voronezh State University of Engineering Techn...,"{education, facility}",5,diamond,,"[True, False, False, False, False]","[[RU], [RU], [FR], [RU], [RU]]",False
665814,https://doi.org/10.7202/1121520ar,2024,fr,"{Arts and Humanities, Social Sciences}",Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]],False
665815,https://doi.org/10.7202/1121518ar,2024,fr,{Social Sciences},Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]],False
665816,https://doi.org/10.4000/11ndt,2024,fr,{Social Sciences},Histoire Politique,,{facility},1,bronze,,[True],[[FR]],False


All CA french (strict)

In [10]:
df_corresponding['french_position'] = df_corresponding.countries.apply(lambda lst:[i for i, inner in enumerate(lst) if 'FR' in inner])
df_corresponding['CA_positions'] = df_corresponding.corresponding.apply(lambda lst:[i for i, val in enumerate(lst) if val is True])

df_corresponding_allfrench = df_corresponding[df_corresponding.apply(lambda row: all(pos in row['french_position'] for pos in row['CA_positions']), axis=1)].reset_index(drop=True)

total_corresponding_allfrench = df_corresponding_allfrench.groupby('publication_year')['doi'].nunique()

total_corresponding_allfrench_oa = df_corresponding_allfrench[df_corresponding_allfrench.oa_status != 'closed'].groupby('publication_year')['doi'].nunique()
total_corresponding_allfrench_apc = df_corresponding_allfrench[~df_corresponding_allfrench.apc_list.isna()].groupby('publication_year')['doi'].nunique()

grouped = pd.concat([total_corresponding_allfrench.rename('# pubs'), total_corresponding_allfrench_oa.rename('# openaccess'), total_corresponding_allfrench_apc.rename('# apc_list')], axis = 1)
grouped

Unnamed: 0_level_0,# pubs,# openaccess,# apc_list
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,29681,14320,12129
2014,28579,13210,12417
2015,33822,14943,15711
2016,43453,17376,22526
2017,41506,18000,21172
2018,43227,19860,20950
2019,40651,22065,19447
2020,42381,24566,21408
2021,43465,24735,21386
2022,40607,22936,20136


At least CA french author and others missing (medium)

In [11]:
df_corresponding['french_or_missing_position'] = df_corresponding.countries.apply(lambda lst:[i for i, inner in enumerate(lst) if ('FR' in inner or len(inner) == 0)])

df_corresponding_french_or_missing_position = df_corresponding[df_corresponding.apply(lambda row: all(pos in row['french_or_missing_position'] for pos in row['CA_positions']), axis=1)].reset_index(drop=True)

total_corresponding_french_or_missing_position = df_corresponding_french_or_missing_position.groupby('publication_year')['doi'].nunique()

total_corresponding_french_or_missing_position_oa = df_corresponding_french_or_missing_position[df_corresponding_french_or_missing_position.oa_status != 'closed'].groupby('publication_year')['doi'].nunique()
total_corresponding_french_or_missing_position_apc = df_corresponding_french_or_missing_position[~df_corresponding_french_or_missing_position.apc_list.isna()].groupby('publication_year')['doi'].nunique()

grouped = pd.concat([total_corresponding_french_or_missing_position.rename('# pubs'), total_corresponding_french_or_missing_position_oa.rename('# openaccess'), total_corresponding_french_or_missing_position_apc.rename('# apc_list')], axis = 1)
grouped

Unnamed: 0_level_0,# pubs,# openaccess,# apc_list
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,30466,14595,12610
2014,29418,13533,12930
2015,35025,15308,16546
2016,45217,17888,23827
2017,43157,18523,22329
2018,45049,20423,22258
2019,42245,22724,20620
2020,44041,25230,22638
2021,45250,25547,22545
2022,42026,23691,21171


At least CA french  (inclusive)

In [12]:
df_corresponding['french_position'] = df_corresponding.countries.apply(lambda lst:[i for i, inner in enumerate(lst) if 'FR' in inner])
df_corresponding['CA_positions'] = df_corresponding.corresponding.apply(lambda lst:[i for i, val in enumerate(lst) if val is True])

df_corresponding_anyfrench = df_corresponding[df_corresponding.apply(lambda row: any(pos in row['french_position'] for pos in row['CA_positions']), axis=1)].reset_index(drop=True)

total_corresponding_anyfrench = df_corresponding_anyfrench.groupby('publication_year')['doi'].nunique()

total_corresponding_anyfrench_oa = df_corresponding_anyfrench[df_corresponding_anyfrench.oa_status != 'closed'].groupby('publication_year')['doi'].nunique()
total_corresponding_anyfrench_apc = df_corresponding_anyfrench[~df_corresponding_anyfrench.apc_list.isna()].groupby('publication_year')['doi'].nunique()

grouped = pd.concat([total_corresponding_anyfrench.rename('# pubs'), total_corresponding_anyfrench_oa.rename('# openaccess'), total_corresponding_anyfrench_apc.rename('# apc_list')], axis = 1)
grouped

Unnamed: 0_level_0,# pubs,# openaccess,# apc_list
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,31091,15212,13249
2014,30142,14189,13627
2015,35806,16182,17181
2016,46405,19154,24724
2017,44453,19873,23289
2018,46589,21859,23352
2019,43699,24198,21524
2020,45868,27167,23805
2021,46839,27229,23800
2022,43828,25358,22392


**No Corresponding**

In [13]:
df_nocorresponding

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries,missing
0,https://doi.org/10.1051/0004-6361/201322068,2013,en,"{Physics and Astronomy, Computer Science}",Astronomy and Astrophysics,EDP Sciences,"{education, facility, nonprofit, government}",44,bronze,,"[False, False, False, False, False, False, Fal...","[[DE], [US], [FR, US], [FR, US], [FR, US], [US...",False
1,https://doi.org/10.1038/nature12477,2013,en,"{Biochemistry, Genetics and Molecular Biology,...",Nature,Nature Portfolio,"{education, facility, healthcare, government, ...",70,bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[GB], [GB], [GB], [CA], [GB], [GB], [GB], [GB...",False
2,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,{Medicine},Critical Care Medicine,Lippincott Williams & Wilkins,"{company, education, healthcare}",23,closed,,"[False, False, False, False, False, False, Fal...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL...",False
3,https://doi.org/10.1038/nnano.2013.46,2013,en,{Materials Science},Nature Nanotechnology,Nature Portfolio,{facility},2,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False]","[[], [FR]]",False
4,https://doi.org/10.1093/nar/gkt1178,2013,en,"{Engineering, Biochemistry, Genetics and Molec...",Nucleic Acids Research,Oxford University Press,{government},5,gold,"{'value': 3630, 'currency': 'USD', 'value_usd'...","[False, False, False, False, False]","[[FR], [FR], [FR], [FR], [FR]]",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
683493,https://doi.org/10.7202/1120388ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",2,diamond,,"[False, False]","[[], [FR]]",False
683494,https://doi.org/10.7202/1120383ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]",False
683495,https://doi.org/10.7202/1120375ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]",False
683496,https://doi.org/10.7202/1120374ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]",False


All authors french (strict)

In [14]:
df_nocorresponding['whole_french'] = df_nocorresponding.countries.apply(lambda lst: all('FR' in x for x in lst))
df_nocorresponding_wholefrench = df_nocorresponding[df_nocorresponding['whole_french'] == True].reset_index(drop = True)

total_wholefrench= df_nocorresponding_wholefrench.groupby('publication_year')['doi'].nunique()

total_wholefrench_oa = df_nocorresponding_wholefrench[df_nocorresponding_wholefrench.oa_status != 'closed'].groupby('publication_year')['doi'].nunique()
total_wholefrench_apc = df_nocorresponding_wholefrench[~df_nocorresponding_wholefrench.apc_list.isna()].groupby('publication_year')['doi'].nunique()

grouped = pd.concat([total_wholefrench.rename('# pubs'), total_wholefrench_oa.rename('openaccess'), total_wholefrench_apc.rename('apc_list')], axis = 1)
grouped

Unnamed: 0_level_0,# pubs,openaccess,apc_list
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,27843,10238,14884
2014,28549,10036,15172
2015,23937,10295,11914
2016,18787,9723,7801
2017,18499,10436,8300
2018,18067,10988,8249
2019,18126,12395,8834
2020,19242,14031,9879
2021,18866,13843,9901
2022,18700,13297,9600


At least french author and others missing (medium)

In [16]:
df_nocorresponding['any_french_or_missing'] = df_nocorresponding['countries'].apply(lambda lst: all(all(c == 'FR' for c in inner) or len(inner) == 0 for inner in lst))
df_nocorresponding_anyfrenchormissing = df_nocorresponding[df_nocorresponding['any_french_or_missing'] == True].reset_index(drop = True)

total_anyfrenchormissing = df_nocorresponding_anyfrenchormissing.groupby('publication_year')['doi'].nunique()

total_anyfrenchormissing_oa = df_nocorresponding_anyfrenchormissing[df_nocorresponding_anyfrenchormissing.oa_status != 'closed'].groupby('publication_year')['doi'].nunique()
total_anyfrenchormissing_apc = df_nocorresponding_anyfrenchormissing[~df_nocorresponding_anyfrenchormissing.apc_list.isna()].groupby('publication_year')['doi'].nunique()

grouped = pd.concat([total_anyfrenchormissing.rename('# pubs'), total_anyfrenchormissing_oa.rename('openaccess'), total_anyfrenchormissing_apc.rename('apc_list')], axis = 1)
grouped

Unnamed: 0_level_0,# pubs,openaccess,apc_list
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,33263,11462,17609
2014,34070,11426,17871
2015,28229,11663,14066
2016,23168,11528,10076
2017,22992,12358,10639
2018,22572,13178,10380
2019,22565,14721,11116
2020,23825,16511,12223
2021,23353,16320,12136
2022,23552,15722,12141


At least french (inclusive)

In [19]:
df_nocorresponding['any_french'] = df_nocorresponding.countries.apply(lambda lst: any('FR' in x for x in lst))
df_nocorresponding_anyfrench = df_nocorresponding[df_nocorresponding['any_french'] == True].reset_index(drop = True)

total_anyfrench= df_nocorresponding_anyfrench.groupby('publication_year')['doi'].nunique()

total_anyfrench_oa = df_nocorresponding_anyfrench[df_nocorresponding_anyfrench.oa_status != 'closed'].groupby('publication_year')['doi'].nunique()
total_anyfrench_apc = df_nocorresponding_anyfrench[~df_nocorresponding_anyfrench.apc_list.isna()].groupby('publication_year')['doi'].nunique()

grouped = pd.concat([total_anyfrench.rename('# pubs'), total_anyfrench_oa.rename('openaccess'), total_anyfrench_apc.rename('apc_list')], axis = 1)
grouped

Unnamed: 0_level_0,# pubs,openaccess,apc_list
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,69063,29860,36872
2014,72644,31214,38555
2015,64419,31505,33000
2016,50922,28395,22333
2017,51752,30739,23533
2018,51049,32131,23491
2019,52346,35829,25881
2020,55644,39992,28335
2021,54475,39244,28363
2022,54081,37997,27748


### Most inclusive

In [22]:
inclusive = pd.concat([df_corresponding_anyfrench, df_nocorresponding_anyfrench], ignore_index = True)
total_inclusive= inclusive.groupby('publication_year')['doi'].nunique()

total_inclusive_oa = inclusive[inclusive.oa_status != 'closed'].groupby('publication_year')['doi'].nunique()
total_inclusive_apc = inclusive[~inclusive.apc_list.isna()].groupby('publication_year')['doi'].nunique()
grouped = pd.concat([total_inclusive.rename('# pubs'), total_inclusive_oa.rename('openaccess'), total_inclusive_apc.rename('apc_list')], axis = 1)
grouped

Unnamed: 0_level_0,# pubs,openaccess,apc_list
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,100154,45072,50121
2014,102786,45403,52182
2015,100225,47687,50181
2016,97327,47549,47057
2017,96205,50612,46822
2018,97638,53990,46843
2019,96045,60027,47405
2020,101512,67159,52140
2021,101314,66473,52163
2022,97909,63355,50140


### Most restrictive

In [21]:
exclusive = pd.concat([df_corresponding_allfrench, df_nocorresponding_wholefrench], ignore_index = True)
total_exclusive= exclusive.groupby('publication_year')['doi'].nunique()

total_exclusive_oa = exclusive[exclusive.oa_status != 'closed'].groupby('publication_year')['doi'].nunique()
total_exclusive_apc = exclusive[~exclusive.apc_list.isna()].groupby('publication_year')['doi'].nunique()

grouped = pd.concat([total_exclusive.rename('# pubs'), total_exclusive_oa.rename('openaccess'), total_exclusive_apc.rename('apc_list')], axis = 1)
grouped

Unnamed: 0_level_0,# pubs,openaccess,apc_list
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,57524,24558,27013
2014,57128,23246,27589
2015,57759,25238,27625
2016,62240,27099,30327
2017,60005,28436,29472
2018,61294,30848,29199
2019,58777,34460,28281
2020,61623,38597,31287
2021,62331,38578,31287
2022,59307,36233,29736
