# Preliminar Analysis

In [1]:
import requests
import time
import json
from urllib.parse import urlencode
import pandas as pd
import orjson
from tqdm import tqdm

### Download all articles where there are one or more french authors

In [None]:
BASE_URL = "https://api.openalex.org/works"

selected_fields = ["doi", "is_xpac", "publication_year", "language", "indexed_in", "primary_location", "open_access", "authorships", "institutions", 
                   "corresponding_author_ids", "corresponding_institution_ids", "apc_list", "apc_paid", "cited_by_count", "primary_topic", "topics", 
                   "concepts", "grants", "awards", "funders"]

def fetch_page(cursor, filters):
    params = filters.copy()
    params["cursor"] = cursor
    params["per_page"] = 200
    url = f"{BASE_URL}?{urlencode(params)}"
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.json()

for year in range(2018, 2025): # anar modificant l'initial year
    print(f"\n=== YEAR {year} ===")

    filters = {
        "filter": ",".join([
            "type:article",
            "primary_location.source.type:journal",
            "authorships.institutions.country_code:FR",
            f"from_publication_date:{year}-01-01",
            f"to_publication_date:{year}-12-31"
        ]),
        "select": ",".join(selected_fields)
    }

    cursor = "*"
    count = 0
    output_file = f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v2.jsonl"

    with open(output_file, "w", encoding="utf-8") as f:
        pbar = tqdm(unit="works", dynamic_ncols=True)
        while True:
            data = fetch_page(cursor, filters)

            works = data.get("results", [])
            next_cursor = data.get("meta", {}).get("next_cursor", None)

            for w in works:
                f.write(json.dumps(w) + "\n")
                count += 1

            pbar.update(len(works))
            
            if not next_cursor:
                break
            cursor = next_cursor
            time.sleep(1)

        pbar.close()
    print(f"\nSaved: {output_file}  (total {count})")

print("\n=== COMPLETED ===")


=== YEAR 2013 ===


200works [00:03, 65.37works/s]

KeyboardInterrupt: 

200works [00:19, 65.37works/s]

### Phase 1: Whole France

**Number of unique DOI in France (NON Datacite)**

In [5]:
unique_doi_france = 0
for year in tqdm(range(2013, 2025)):
    doi_set_france = set()
    with open(f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v2.jsonl", "rb") as f:
        for line in f:
            rec = orjson.loads(line)
            doi = rec.get("doi")
            if doi:
                doi_set_france.add(doi)
    unique_doi_france += len(doi_set_france)

print("Total unique DOIs:", unique_doi_france)

100%|██████████| 12/12 [01:06<00:00,  5.54s/it]

Total unique DOIs: 1243928





**Number of unique DOI in France (ONLY Datacite) + dataset construction**

In [1]:
### I SHOULD REDO THE DOWNLOAD; ASK IF NEEDED GIVEN THAT WE WON'T USE IT

### Phase 2: Ingest, or not, BSO

In [2]:
df_bso = pd.read_parquet("../data/external/open-access-monitor-france.parquet", engine = "pyarrow").dropna(subset = ["doi"])
df_bso

Unnamed: 0,observation_date,id,doi,pmid,hal_id,year,title,journal_issns,journal_issn_l,journal_name,...,software_used,software_created,software_shared,data_used,data_created,data_shared,missing_doi_in_hal,has_doi_in_hal,doi_in_hal,bso_country
0,2024Q4,doi10.1080/10408398.2022.2033684,10.1080/10408398.2022.2033684,35152807,hal-03777046,2022,The impact of nano/micro-plastics toxicity on ...,"1040-8398,1549-7852",1040-8398,Critical Reviews in Food Science and Nutrition,...,,,,,,,,1,10.1080/10408398.2022.2033684,fr
1,2024Q4,doi10.1016/j.gie.2021.12.048,10.1016/j.gie.2021.12.048,,,2022,Real-time use of artificial intelligence at co...,"0016-5107,1085-8741,1097-6779",0016-5107,Gastrointestinal Endoscopy,...,,,,,,,,,,
2,2024Q4,doi10.1016/j.compositesa.2022.107165,10.1016/j.compositesa.2022.107165,,Preprint-Carpier-et-al-CompPartA-2022.pdf,2022,Meso-structure-based thermomechanical modellin...,"1359-835X,1878-5840",1359-835X,Composites Part A Applied Science and Manufact...,...,True,False,False,True,True,False,,1,10.1016/j.compositesa.2022.107165,"fr,other"
3,2024Q4,doi10.1002/ejoc.202200123,10.1002/ejoc.202200123,,TEXT%20FINAL.pdf,2022,Electrochemical Trifluoromethylselenolation of...,"1099-0690,1434-193X",1099-0690,European Journal of Organic Chemistry,...,False,False,False,True,False,False,,1,10.1002/ejoc.202200123,"fr,other"
4,2024Q4,doi10.3410/f.721705664.793594159,10.3410/f.721705664.793594159,,,2022,Faculty Opinions recommendation of On the gene...,,,Faculty Opinions – Post-Publication Peer Revie...,...,,,,,,,,,,fr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937570,2024Q4,doi10.1021/acs.jpcc.9b10824,10.1021/acs.jpcc.9b10824,,hal-03017908,2020,The Dissolution Anisotropy of Pyroxenes: Exper...,"1932-7447,1932-7455",1932-7447,The Journal of Physical Chemistry C,...,False,False,False,True,True,False,,1,10.1021/acs.jpcc.9b10824,"fr,other"
937571,2024Q4,doi10.1017/s007543582000012x,10.1017/s007543582000012x,,,2020,"LAUREL FULKERSON, A LITERARY COMMENTARY ON THE...","0075-4358,1753-528X",0075-4358,The Journal of Roman Studies,...,,,,,,,,,,fr
937575,2024Q4,doi10.1075/aila.00030.haa,10.1075/aila.00030.haa,,,2020,Recycling a genre for news automation,"1461-0213,1570-5595",1461-0213,AILA Review,...,,,,,,,,,,europe
937576,2024Q4,doi10.1007/s13592-020-00743-8,10.1007/s13592-020-00743-8,,hal-03161695,2020,Short-term hyperthermia at larval age reduces ...,"0044-8435,1297-9678",0044-8435,Apidologie,...,,,,,,,,1,10.1007/s13592-020-00743-8,other


In [5]:
### WAITING FOR THE RESPONSE OF THE TECNICAL TEAM

### Phase 3: journals analysis

In [2]:
interest = ['doi', 'publication_year', 'authorships', 'primary_location', 'topics', 'open_access', 'apc_list']
keys = ['doi', 'publication_year', 'field_names', 'journal', 'publisher', 'ins_type', '#_authors', 'oa_status', 'apc_list']
records = []
for year in tqdm(range(2013, 2025)):
    with open(f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v2.jsonl", "rb") as f:
        for line in f:
            rec = orjson.loads(line)
            if not rec.get("doi"): # Skip records without DOI
                continue
            filtered = {k: rec.get(k) for k in interest} # Keep only the fields we care about

            # Extract field_names from topics
            topics = filtered.get("topics") or []
            filtered["field_names"] = {c.get("field", {}).get("display_name") for c in topics if c.get("field")}

            # Extract journal and publisher from primary_location
            pl = filtered.get("primary_location") or {}
            source = pl.get("source") or {}
            filtered["journal"] = source.get("display_name")
            filtered["publisher"] = source.get("host_organization_name")

            # Extract institution types and number of authors
            authorships = filtered.get("authorships") or []
            filtered["ins_type"] = {inst.get("type") 
                                    for auth in authorships 
                                    for inst in auth.get("institutions", []) if inst.get("type")}
            filtered["#_authors"] = len(authorships)
            
            o_a = filtered.get("open_access") or {}
            filtered["oa_status"] = o_a.get("oa_status")

            records.append({k: filtered.get(k) for k in keys})

df = pd.DataFrame(records)
df.to_parquet("../data/interim/preliminary/fr_p2_fr_core.parquet", index=False)
df

100%|██████████| 12/12 [01:21<00:00,  6.78s/it]


Unnamed: 0,doi,publication_year,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list
0,https://doi.org/10.1051/0004-6361/201322068,2013,"{Physics and Astronomy, Computer Science}",Astronomy and Astrophysics,EDP Sciences,"{education, facility, nonprofit, government}",44,bronze,
1,https://doi.org/10.1038/nature12477,2013,"{Medicine, Biochemistry, Genetics and Molecula...",Nature,Nature Portfolio,"{education, government, facility, other, healt...",70,bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'..."
2,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,{Medicine},Critical Care Medicine,Lippincott Williams & Wilkins,"{education, company, healthcare}",23,closed,
3,https://doi.org/10.1007/s00134-012-2769-8,2013,{Medicine},Intensive Care Medicine,Springer Science+Business Media,"{education, healthcare}",23,bronze,"{'value': 3690, 'currency': 'EUR', 'value_usd'..."
4,https://doi.org/10.1038/nnano.2013.46,2013,{Materials Science},Nature Nanotechnology,Nature Portfolio,{facility},2,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'..."
...,...,...,...,...,...,...,...,...,...
1243923,https://doi.org/10.20914/2310-1202-2024-4-178-184,2024,"{Environmental Science, Agricultural and Biolo...",Proceedings of the Voronezh State University o...,Voronezh State University of Engineering Techn...,"{education, facility}",5,diamond,
1243924,https://doi.org/10.7202/1121520ar,2024,"{Social Sciences, Arts and Humanities}",Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,
1243925,https://doi.org/10.7202/1121518ar,2024,{Social Sciences},Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,
1243926,https://doi.org/10.4000/11ndt,2024,{Social Sciences},Histoire Politique,,{facility},1,bronze,


**Industry concentration and indicators**

In [3]:
interest = 'journal'

grouped = df.groupby(interest).count()[['doi']].sort_values('doi', ascending = False).rename(columns = {'doi' : '# publications'})
grouped['% publications'] = (grouped['# publications'] / df.doi.nunique())
print('The HHF is: ', sum(grouped['% publications']**2))

for year in range(2013, 2025):
    df_year = df[df['publication_year'] == year]
    grouped_year = df_year.groupby(interest).count()[['doi']].sort_values('doi', ascending = False).rename(columns = {'doi' : f'# publications {year}'})
    grouped_year[f'% publications {year}'] = (grouped_year[f'# publications {year}'] / df_year.doi.nunique())
    grouped = grouped.merge(grouped_year, left_index = True, right_index = True, how = 'left').fillna(0)
    print(f'The HHF in {year} is: ', sum(grouped_year[f'% publications {year}']**2))
grouped

The HHF is:  0.0006792579494307851
The HHF in 2013 is:  0.01995000000000005
The HHF in 2014 is:  0.0011864172963031473
The HHF in 2015 is:  0.0009941372296594685
The HHF in 2016 is:  0.0009819659181103551
The HHF in 2017 is:  0.0009303393912335911
The HHF in 2018 is:  0.000820114139809399
The HHF in 2019 is:  0.000760996965802461
The HHF in 2020 is:  0.0007493847616581456
The HHF in 2021 is:  0.00073012000581234
The HHF in 2022 is:  0.0006970888257855055
The HHF in 2023 is:  0.0007099792074962256
The HHF in 2024 is:  0.0007695706597819862


Unnamed: 0_level_0,# publications,% publications,# publications 2013,% publications 2013,# publications 2014,% publications 2014,# publications 2015,% publications 2015,# publications 2016,% publications 2016,...,# publications 2020,% publications 2020,# publications 2021,% publications 2021,# publications 2022,% publications 2022,# publications 2023,% publications 2023,# publications 2024,% publications 2024
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Scientific Reports,9692,7.791448e-03,0.0,0.000,146.0,0.001341,452.0,0.004153,964.0,0.008595,...,1129.0,0.009310,1118.0,0.009224,869.0,0.007429,761.0,0.006591,829.0,0.008367
PLoS ONE,8852,7.116167e-03,0.0,0.000,1518.0,0.013938,1358.0,0.012477,1066.0,0.009504,...,649.0,0.005352,514.0,0.004241,433.0,0.003702,416.0,0.003603,452.0,0.004562
Astronomy and Astrophysics,7555,6.073503e-03,3.0,0.015,663.0,0.006088,690.0,0.006339,705.0,0.006286,...,738.0,0.006085,712.0,0.005875,722.0,0.006173,710.0,0.006149,620.0,0.006258
Monthly Notices of the Royal Astronomical Society,5772,4.640140e-03,0.0,0.000,388.0,0.003563,437.0,0.004015,563.0,0.005020,...,616.0,0.005079,555.0,0.004579,498.0,0.004258,596.0,0.005162,401.0,0.004047
Journal of Clinical Oncology,5754,4.625670e-03,2.0,0.010,551.0,0.005059,532.0,0.004888,530.0,0.004725,...,536.0,0.004420,480.0,0.003960,520.0,0.004446,539.0,0.004668,507.0,0.005117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
日本物理学会講演概要集 70.2,1,8.039050e-07,0.0,0.000,0.0,0.000000,1.0,0.000009,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
日本物理学会講演概要集 70.1,1,8.039050e-07,0.0,0.000,0.0,0.000000,1.0,0.000009,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
日本消化器がん検診学会雑誌,1,8.039050e-07,0.0,0.000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,1.0,0.000008,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
مجلة کلیة الآداب القاهرة,1,8.039050e-07,0.0,0.000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.0,0.000010


In [4]:
interest = 'publisher'

grouped = df.groupby(interest).count()[['doi']].sort_values('doi', ascending = False).rename(columns = {'doi' : '# publications'})
grouped['% publications'] = (grouped['# publications'] / df.doi.nunique())
print('The HHF is: ', sum(grouped['% publications']**2))

for year in range(2013, 2025):
    df_year = df[df['publication_year'] == year]
    grouped_year = df_year.groupby(interest).count()[['doi']].sort_values('doi', ascending = False).rename(columns = {'doi' : f'# publications {year}'})
    grouped_year[f'% publications {year}'] = (grouped_year[f'# publications {year}'] / df_year.doi.nunique())
    grouped = grouped.merge(grouped_year, left_index = True, right_index = True, how = 'left').fillna(0)
    print(f'The HHF in {year} is: ', sum(grouped_year[f'% publications {year}']**2))
grouped

The HHF is:  0.09497703629448288
The HHF in 2013 is:  0.10260000000000001
The HHF in 2014 is:  0.1176033106108357
The HHF in 2015 is:  0.10751485031125216
The HHF in 2016 is:  0.11040617764655404
The HHF in 2017 is:  0.1034693339590879
The HHF in 2018 is:  0.10005418838990834
The HHF in 2019 is:  0.0894483992017019
The HHF in 2020 is:  0.09036613952109504
The HHF in 2021 is:  0.084227597113723
The HHF in 2022 is:  0.08195356901864419
The HHF in 2023 is:  0.09026168956903324
The HHF in 2024 is:  0.08409693451695317


Unnamed: 0_level_0,# publications,% publications,# publications 2013,% publications 2013,# publications 2014,% publications 2014,# publications 2015,% publications 2015,# publications 2016,% publications 2016,...,# publications 2020,% publications 2020,# publications 2021,% publications 2021,# publications 2022,% publications 2022,# publications 2023,% publications 2023,# publications 2024,% publications 2024
publisher,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Elsevier BV,354662,2.851146e-01,44.0,0.220,35119.0,0.322459,33321.0,0.306135,34980.0,0.311879,...,33373.0,0.275189,31824.0,0.262572,30077.0,0.257134,31750.0,0.274985,25348.0,0.255849
Wiley,86170,6.927250e-02,8.0,0.040,7680.0,0.070517,7576.0,0.069604,7777.0,0.069339,...,8794.0,0.072514,8058.0,0.066485,7462.0,0.063794,7565.0,0.065520,8124.0,0.081999
Springer Science+Business Media,71552,5.752101e-02,11.0,0.055,6686.0,0.061390,6899.0,0.063384,6659.0,0.059371,...,6534.0,0.053878,6753.0,0.055717,6620.0,0.056596,6196.0,0.053663,6172.0,0.062297
Multidisciplinary Digital Publishing Institute,38107,3.063441e-02,0.0,0.000,371.0,0.003406,518.0,0.004759,678.0,0.006045,...,5056.0,0.041691,7065.0,0.058292,7521.0,0.064299,6390.0,0.055343,4568.0,0.046107
Oxford University Press,37935,3.049614e-02,9.0,0.045,2988.0,0.027435,2995.0,0.027516,3244.0,0.028923,...,3727.0,0.030732,3587.0,0.029595,3411.0,0.029161,3647.0,0.031586,3668.0,0.037023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Academia Brasileira de Audiologia,1,8.039050e-07,0.0,0.000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,1.0,0.000009,0.0,0.000000,0.0,0.000000
Institución Universitaria Americana,1,8.039050e-07,0.0,0.000,1.0,0.000009,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
Innovative Journal Solutions,1,8.039050e-07,0.0,0.000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,1.0,0.000009,0.0,0.000000,0.0,0.000000
Institut Seni Indonesia (ISI) Surakarta,1,8.039050e-07,0.0,0.000,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.0,0.000009,0.0,0.000000


**APC vs non-APC**

In [3]:
df_apc = df[~df.apc_list.isna()].reset_index(drop = True)
df_apc.to_parquet("../data/interim/preliminary/fr_p3_apc.parquet", index = False)
df_apc.groupby('publication_year').count()[['doi']] / df.groupby('publication_year').count()[['doi']]

Unnamed: 0_level_0,doi
publication_year,Unnamed: 1_level_1
2013,0.605
2014,0.517207
2015,0.516005
2016,0.517836
2017,0.521101
2018,0.517852
2019,0.53177
2020,0.553116
2021,0.554517
2022,0.558023


**Author sector indicators**

In [6]:
df_filtered = df[df['ins_type'].apply(lambda x: bool(x & {'company', 'non-profit'}))]
df_filtered.groupby('publication_year').count()[['doi']] / df.groupby('publication_year').count()[['doi']]

Unnamed: 0_level_0,doi
publication_year,Unnamed: 1_level_1
2013,0.255
2014,0.093986
2015,0.098407
2016,0.097781
2017,0.10046
2018,0.101256
2019,0.104353
2020,0.10351
2021,0.105799
2022,0.105925


**Articles with more than 100 authors**

In [7]:
df_100 = df[df['#_authors'] == 100].reset_index(drop = True)
df_100.to_parquet("../data/interim/preliminary/fr_p3_fr_gt100_authors.parquet", index = False)
df_100.groupby('publication_year').count()[['doi']] / df.groupby('publication_year').count()[['doi']]

Unnamed: 0_level_0,doi
publication_year,Unnamed: 1_level_1
2013,0.035
2014,0.00471
2015,0.004291
2016,0.004939
2017,0.005014
2018,0.004938
2019,0.005047
2020,0.005261
2021,0.005363
2022,0.005737


### Phase 4: Corresponding authors

In [4]:
interest = ['doi', 'publication_year', 'language', 'authorships', 'primary_location', 'topics', 'open_access', 'apc_list']
keys = ['doi', 'publication_year', 'language', 'field_names', 'journal', 'publisher', 'ins_type', '#_authors', 'oa_status', 'apc_list', 'corresponding', 'countries']
records = []
for year in tqdm(range(2013, 2025)):
    with open(f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v2.jsonl", "rb") as f:
        for line in f:
            rec = orjson.loads(line)
            if not rec.get("doi"): # Skip records without DOI
                continue
            filtered = {k: rec.get(k) for k in interest} # Keep only the fields we care about

            # Extract field_names from topics
            topics = filtered.get("topics") or []
            filtered["field_names"] = {c.get("field", {}).get("display_name") for c in topics if c.get("field")}

            # Extract journal and publisher from primary_location
            pl = filtered.get("primary_location") or {}
            source = pl.get("source") or {}
            filtered["journal"] = source.get("display_name")
            filtered["publisher"] = source.get("host_organization_name")

            # Extract institution types and number of authors
            authorships = filtered.get("authorships") or []
            filtered["ins_type"] = {inst.get("type") 
                                    for auth in authorships 
                                    for inst in auth.get("institutions", []) if inst.get("type")}
            filtered["corresponding"] = [auth.get("is_corresponding") for auth in authorships]
            filtered["countries"] = [auth.get("countries") for auth in authorships]
            filtered["#_authors"] = len(authorships)
            
            o_a = filtered.get("open_access") or {}
            filtered["oa_status"] = o_a.get("oa_status")

            records.append({k: filtered.get(k) for k in keys})

df = pd.DataFrame(records)
df

100%|██████████| 12/12 [01:41<00:00,  8.46s/it]


Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries
0,https://doi.org/10.1051/0004-6361/201322068,2013,en,"{Physics and Astronomy, Computer Science}",Astronomy and Astrophysics,EDP Sciences,"{education, facility, nonprofit, government}",44,bronze,,"[False, False, False, False, False, False, Fal...","[[DE], [US], [FR, US], [FR, US], [FR, US], [US..."
1,https://doi.org/10.1038/nature12477,2013,en,"{Medicine, Biochemistry, Genetics and Molecula...",Nature,Nature Portfolio,"{education, government, facility, other, healt...",70,bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[GB], [GB], [GB], [CA], [GB], [GB], [GB], [GB..."
2,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,{Medicine},Critical Care Medicine,Lippincott Williams & Wilkins,"{education, company, healthcare}",23,closed,,"[False, False, False, False, False, False, Fal...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
3,https://doi.org/10.1007/s00134-012-2769-8,2013,en,{Medicine},Intensive Care Medicine,Springer Science+Business Media,"{education, healthcare}",23,bronze,"{'value': 3690, 'currency': 'EUR', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
4,https://doi.org/10.1038/nnano.2013.46,2013,en,{Materials Science},Nature Nanotechnology,Nature Portfolio,{facility},2,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False]","[[], [FR]]"
...,...,...,...,...,...,...,...,...,...,...,...,...
1243923,https://doi.org/10.20914/2310-1202-2024-4-178-184,2024,,"{Environmental Science, Agricultural and Biolo...",Proceedings of the Voronezh State University o...,Voronezh State University of Engineering Techn...,"{education, facility}",5,diamond,,"[True, False, False, False, False]","[[RU], [RU], [FR], [RU], [RU]]"
1243924,https://doi.org/10.7202/1121520ar,2024,fr,"{Social Sciences, Arts and Humanities}",Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
1243925,https://doi.org/10.7202/1121518ar,2024,fr,{Social Sciences},Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
1243926,https://doi.org/10.4000/11ndt,2024,fr,{Social Sciences},Histoire Politique,,{facility},1,bronze,,[True],[[FR]]


**No corresponding**

In [5]:
df_nocorresponding = df[df['corresponding'].apply(lambda x: all(v is False for v in x))].reset_index(drop = True)
df_nocorresponding

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries
0,https://doi.org/10.1051/0004-6361/201322068,2013,en,"{Physics and Astronomy, Computer Science}",Astronomy and Astrophysics,EDP Sciences,"{education, facility, nonprofit, government}",44,bronze,,"[False, False, False, False, False, False, Fal...","[[DE], [US], [FR, US], [FR, US], [FR, US], [US..."
1,https://doi.org/10.1038/nature12477,2013,en,"{Medicine, Biochemistry, Genetics and Molecula...",Nature,Nature Portfolio,"{education, government, facility, other, healt...",70,bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[GB], [GB], [GB], [CA], [GB], [GB], [GB], [GB..."
2,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,{Medicine},Critical Care Medicine,Lippincott Williams & Wilkins,"{education, company, healthcare}",23,closed,,"[False, False, False, False, False, False, Fal...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
3,https://doi.org/10.1038/nnano.2013.46,2013,en,{Materials Science},Nature Nanotechnology,Nature Portfolio,{facility},2,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False]","[[], [FR]]"
4,https://doi.org/10.1093/nar/gkt1178,2013,en,"{Engineering, Biochemistry, Genetics and Molec...",Nucleic Acids Research,Oxford University Press,{government},5,gold,"{'value': 3630, 'currency': 'USD', 'value_usd'...","[False, False, False, False, False]","[[FR], [FR], [FR], [FR], [FR]]"
...,...,...,...,...,...,...,...,...,...,...,...,...
614558,https://doi.org/10.7202/1120388ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",2,diamond,,"[False, False]","[[], [FR]]"
614559,https://doi.org/10.7202/1120383ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]"
614560,https://doi.org/10.7202/1120375ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]"
614561,https://doi.org/10.7202/1120374ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]"


One author

In [10]:
df_nocorresponding_one = df_nocorresponding[df_nocorresponding['#_authors'] == 1].reset_index(drop = True)
df_nocorresponding_one

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries


French language

In [11]:
df_nocorresponding_french = df_nocorresponding[df_nocorresponding.language == 'fr'].reset_index(drop = True)
df_nocorresponding_french.to_parquet("../data/interim/preliminary/fr_p4B_french_language.parquet", index = False)
df_nocorresponding_french.groupby('publication_year').count()[['doi']] / df.groupby('publication_year').count()[['doi']]

Unnamed: 0_level_0,doi
publication_year,Unnamed: 1_level_1
2013,
2014,0.107566
2015,0.064331
2016,0.045997
2017,0.046472
2018,0.052604
2019,0.049631
2020,0.049648
2021,0.048399
2022,0.055912


All authors missing country

In [12]:
df_nocorresponding['missing'] = df_nocorresponding.countries.apply(lambda lst: all(len(x) == 0 for x in lst))
df_nocorresponding_missing = df_nocorresponding[df_nocorresponding['missing'] == True].reset_index(drop = True)
df_nocorresponding_missing

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries,missing


**Corresponding**

In [6]:
df_corresponding = df[~df.doi.isin(df_nocorresponding.doi)].reset_index(drop = True)
df_corresponding

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries
0,https://doi.org/10.1007/s00134-012-2769-8,2013,en,{Medicine},Intensive Care Medicine,Springer Science+Business Media,"{education, healthcare}",23,bronze,"{'value': 3690, 'currency': 'EUR', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
1,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,{Medicine},European Journal of Cancer,Elsevier BV,"{government, other, education, healthcare}",8,bronze,"{'value': 3800, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[FR], [FR], [FR], [IT], [NL], [IE], [FR], [FR]]"
2,https://doi.org/10.1016/s1474-4422(13)70124-8,2013,en,{Medicine},The Lancet Neurology,Elsevier BV,"{education, facility, government, healthcare}",36,green,"{'value': 6300, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[GB], [CA], [NL], [FR], [AT], [CA], [AU], [GB..."
3,https://doi.org/10.1038/nature12506,2013,en,"{Medicine, Biochemistry, Genetics and Molecula...",Nature,Nature Portfolio,"{education, company, government, facility, hea...",82,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[MA], [DK], [CN], [MA], [BE], [BE], [MA], [CN..."
4,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,"{Medicine, Immunology and Microbiology}",Immunity,Cell Press,"{government, facility, education, healthcare}",17,bronze,"{'value': 9080, 'currency': 'USD', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[FR], [FR], [FR], [FR], [DE, FR], [AT], [FR],..."
...,...,...,...,...,...,...,...,...,...,...,...,...
629360,https://doi.org/10.20914/2310-1202-2024-4-178-184,2024,,"{Environmental Science, Agricultural and Biolo...",Proceedings of the Voronezh State University o...,Voronezh State University of Engineering Techn...,"{education, facility}",5,diamond,,"[True, False, False, False, False]","[[RU], [RU], [FR], [RU], [RU]]"
629361,https://doi.org/10.7202/1121520ar,2024,fr,"{Social Sciences, Arts and Humanities}",Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
629362,https://doi.org/10.7202/1121518ar,2024,fr,{Social Sciences},Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
629363,https://doi.org/10.4000/11ndt,2024,fr,{Social Sciences},Histoire Politique,,{facility},1,bronze,,[True],[[FR]]


One Corresponding

In [30]:
df_corresponding_one = df_corresponding[df_corresponding['#_authors'] == 1].reset_index(drop = True)
df_corresponding_one.to_parquet("../data/interim/preliminary/fr_p4A_single_CA.parquet", index = False)
df_corresponding_one.groupby('publication_year').count()[['doi']] / df.groupby('publication_year').count()[['doi']]

Unnamed: 0_level_0,doi
publication_year,Unnamed: 1_level_1
2013,0.015
2014,0.138013
2015,0.143251
2016,0.144384
2017,0.144591
2018,0.15243
2019,0.147988
2020,0.13839
2021,0.143192
2022,0.145148


All authors missing country

In [29]:
df_corresponding['missing'] = df_corresponding.countries.apply(lambda lst: all(len(x) == 0 for x in lst))
df_corresponding_missing = df_corresponding[df_corresponding['missing'] == True].reset_index(drop = True)
df_corresponding_missing

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries,missing,whole_french,any_french


### Phase 5: Corresponding with data

**Corresponding**

In [7]:
df_corresponding

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries
0,https://doi.org/10.1007/s00134-012-2769-8,2013,en,{Medicine},Intensive Care Medicine,Springer Science+Business Media,"{education, healthcare}",23,bronze,"{'value': 3690, 'currency': 'EUR', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL..."
1,https://doi.org/10.1016/j.ejca.2012.12.027,2013,en,{Medicine},European Journal of Cancer,Elsevier BV,"{government, other, education, healthcare}",8,bronze,"{'value': 3800, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[FR], [FR], [FR], [IT], [NL], [IE], [FR], [FR]]"
2,https://doi.org/10.1016/s1474-4422(13)70124-8,2013,en,{Medicine},The Lancet Neurology,Elsevier BV,"{education, facility, government, healthcare}",36,green,"{'value': 6300, 'currency': 'USD', 'value_usd'...","[True, False, False, False, False, False, Fals...","[[GB], [CA], [NL], [FR], [AT], [CA], [AU], [GB..."
3,https://doi.org/10.1038/nature12506,2013,en,"{Medicine, Biochemistry, Genetics and Molecula...",Nature,Nature Portfolio,"{education, company, government, facility, hea...",82,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[MA], [DK], [CN], [MA], [BE], [BE], [MA], [CN..."
4,https://doi.org/10.1016/j.immuni.2013.10.003,2013,en,"{Medicine, Immunology and Microbiology}",Immunity,Cell Press,"{government, facility, education, healthcare}",17,bronze,"{'value': 9080, 'currency': 'USD', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[FR], [FR], [FR], [FR], [DE, FR], [AT], [FR],..."
...,...,...,...,...,...,...,...,...,...,...,...,...
629360,https://doi.org/10.20914/2310-1202-2024-4-178-184,2024,,"{Environmental Science, Agricultural and Biolo...",Proceedings of the Voronezh State University o...,Voronezh State University of Engineering Techn...,"{education, facility}",5,diamond,,"[True, False, False, False, False]","[[RU], [RU], [FR], [RU], [RU]]"
629361,https://doi.org/10.7202/1121520ar,2024,fr,"{Social Sciences, Arts and Humanities}",Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
629362,https://doi.org/10.7202/1121518ar,2024,fr,{Social Sciences},Revue d histoire de l Amérique française,Institut d'histoire de l'Amérique française,{education},1,diamond,,[True],[[FR]]
629363,https://doi.org/10.4000/11ndt,2024,fr,{Social Sciences},Histoire Politique,,{facility},1,bronze,,[True],[[FR]]


All french

In [8]:
df_corresponding['whole_french'] = df_corresponding.countries.apply(lambda lst: all('FR' in x for x in lst))
df_corresponding_wholefrench = df_corresponding[df_corresponding['whole_french'] == True].reset_index(drop = True)
df_corresponding_wholefrench.to_parquet("../data/interim/preliminary/fr_p5_CA_yes_allFR.parquet", index = False)

total_per_year = df.groupby('publication_year')['doi'].nunique()
total_wholefrench= df_corresponding_wholefrench.groupby('publication_year')['doi'].nunique()

grouped_total = total_wholefrench / total_per_year
grouped_oa = df_corresponding_wholefrench[df_corresponding_wholefrench.oa_status != 'closed'].groupby('publication_year')['doi'].nunique() / total_per_year
grouped_apc = df_corresponding_wholefrench[~df_corresponding_wholefrench.apc_list.isna()].groupby('publication_year')['doi'].nunique() / total_per_year

grouped = pd.concat([grouped_oa.rename('openaccess'), grouped_apc.rename('apc_list'), grouped_total.rename('total')], axis = 1)
grouped

Unnamed: 0_level_0,openaccess,apc_list,total
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,0.025,0.04,0.04
2014,0.096566,0.078294,0.209843
2015,0.107346,0.09668,0.242678
2016,0.113018,0.127391,0.288671
2017,0.116776,0.118893,0.27623
2018,0.124614,0.110915,0.276965
2019,0.132101,0.100757,0.260052
2020,0.136865,0.103189,0.250649
2021,0.140337,0.102813,0.257803
2022,0.136445,0.099761,0.2528


One or more french

In [10]:
df_corresponding['any_french_or_missing'] = df_corresponding['countries'].apply(lambda lst: all(all(c == 'FR' for c in inner) or len(inner) == 0 for inner in lst))
df_corresponding_anyfrenchormissing = df_corresponding[df_corresponding['any_french_or_missing'] == True].reset_index(drop = True)
df_corresponding_anyfrenchormissing.to_parquet("../data/interim/preliminary/fr_p5_CA_yes_FR_plus_missing.parquet", index = False)

total_per_year = df.groupby('publication_year')['doi'].nunique()
total_anyfrenchormissing= df_corresponding_anyfrenchormissing.groupby('publication_year')['doi'].nunique()

grouped_total = total_anyfrenchormissing / total_per_year
grouped_oa = df_corresponding_anyfrenchormissing[df_corresponding_anyfrenchormissing.oa_status != 'closed'].groupby('publication_year')['doi'].nunique() / total_per_year
grouped_apc = df_corresponding_anyfrenchormissing[~df_corresponding_anyfrenchormissing.apc_list.isna()].groupby('publication_year')['doi'].nunique() / total_per_year

grouped = pd.concat([grouped_oa.rename('openaccess'), grouped_apc.rename('apc_list'), grouped_total.rename('total')], axis = 1)
grouped

Unnamed: 0_level_0,openaccess,apc_list,total
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,0.03,0.04,0.04
2014,0.089927,0.082958,0.207437
2015,0.098269,0.106207,0.243192
2016,0.10363,0.143885,0.294992
2017,0.109403,0.133756,0.282481
2018,0.115958,0.126184,0.28309
2019,0.126984,0.113892,0.263278
2020,0.131538,0.117421,0.254212
2021,0.135378,0.115832,0.262844
2022,0.129939,0.110781,0.253287


In [41]:
df_corresponding['any_french'] = df_corresponding.countries.apply(lambda lst: any('FR' in x for x in lst))
df_corresponding_anyfrench = df_corresponding[df_corresponding['any_french'] == True].reset_index(drop = True)
df_corresponding_anyfrench.to_parquet("../data/interim/preliminary/fr_p5_CA_yes_FR_inclussive.parquet", index = False)

total_per_year = df.groupby('publication_year')['doi'].nunique()
total_anyfrench= df_corresponding_anyfrench.groupby('publication_year')['doi'].nunique()

grouped_total = total_anyfrench / total_per_year
grouped_oa = df_corresponding_anyfrench[df_corresponding_anyfrench.oa_status != 'closed'].groupby('publication_year')['doi'].nunique() / total_per_year
grouped_apc = df_corresponding_anyfrench[~df_corresponding_anyfrench.apc_list.isna()].groupby('publication_year')['doi'].nunique() / total_per_year

grouped = pd.concat([grouped_oa.rename('openaccess'), grouped_apc.rename('apc_list'), grouped_total.rename('total')], axis = 1)
grouped

Unnamed: 0_level_0,openaccess,apc_list,total
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,0.245,0.27,0.36
2014,0.161179,0.163199,0.332991
2015,0.190741,0.212818,0.408153
2016,0.239036,0.318717,0.545984
2017,0.254532,0.310018,0.535802
2018,0.275691,0.312887,0.554585
2019,0.309568,0.304223,0.539771
2020,0.333157,0.319469,0.541167
2021,0.340179,0.320501,0.55054
2022,0.336804,0.3208,0.537651


**No corresponding**

In [42]:
df_nocorresponding

Unnamed: 0,doi,publication_year,language,field_names,journal,publisher,ins_type,#_authors,oa_status,apc_list,corresponding,countries,missing,whole_french
0,https://doi.org/10.1051/0004-6361/201322068,2013,en,"{Physics and Astronomy, Computer Science}",Astronomy and Astrophysics,EDP Sciences,"{education, nonprofit, facility, government}",44,bronze,,"[False, False, False, False, False, False, Fal...","[[DE], [US], [FR, US], [FR, US], [FR, US], [US...",False,False
1,https://doi.org/10.1038/nature12477,2013,en,"{Medicine, Biochemistry, Genetics and Molecula...",Nature,Nature Portfolio,"{healthcare, nonprofit, other, education, gove...",70,bronze,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False, False, False, False, False, Fal...","[[GB], [GB], [GB], [CA], [GB], [GB], [GB], [GB...",False,False
2,https://doi.org/10.1097/ccm.0b013e31827e83af,2013,en,{Medicine},Critical Care Medicine,Lippincott Williams & Wilkins,"{education, healthcare, company}",23,closed,,"[False, False, False, False, False, False, Fal...","[[US], [US], [GB], [FR], [DE], [US], [US], [IL...",False,False
3,https://doi.org/10.1038/nnano.2013.46,2013,en,{Materials Science},Nature Nanotechnology,Nature Portfolio,{facility},2,green,"{'value': 9750, 'currency': 'EUR', 'value_usd'...","[False, False]","[[], [FR]]",False,False
4,https://doi.org/10.1093/nar/gkt1178,2013,en,"{Biochemistry, Genetics and Molecular Biology,...",Nucleic Acids Research,Oxford University Press,{government},5,gold,"{'value': 3630, 'currency': 'USD', 'value_usd'...","[False, False, False, False, False]","[[FR], [FR], [FR], [FR], [FR]]",False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614558,https://doi.org/10.7202/1120388ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",2,diamond,,"[False, False]","[[], [FR]]",False,False
614559,https://doi.org/10.7202/1120383ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]",False,False
614560,https://doi.org/10.7202/1120375ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]",False,False
614561,https://doi.org/10.7202/1120374ar,2024,fr,{},Études/Inuit/Studies,Q114618608,"{education, facility}",3,diamond,,"[False, False, False]","[[], [CA, FR], [CA]]",False,False


All french

In [43]:
df_nocorresponding['whole_french'] = df_nocorresponding.countries.apply(lambda lst: all('FR' in x for x in lst))
df_nocorresponding_wholefrench = df_nocorresponding[df_nocorresponding['whole_french'] == True].reset_index(drop = True)
df_nocorresponding_wholefrench.to_parquet("../data/interim/preliminary/fr_p5_CA_no_allFR.parquet", index = False)

total_per_year = df.groupby('publication_year')['doi'].nunique()
total_wholefrench= df_nocorresponding_wholefrench.groupby('publication_year')['doi'].nunique()

grouped_total = total_wholefrench / total_per_year
grouped_oa = df_nocorresponding_wholefrench[df_nocorresponding_wholefrench.oa_status != 'closed'].groupby('publication_year')['doi'].nunique() / total_per_year
grouped_apc = df_nocorresponding_wholefrench[~df_nocorresponding_wholefrench.apc_list.isna()].groupby('publication_year')['doi'].nunique() / total_per_year

grouped = pd.concat([grouped_oa.rename('openaccess'), grouped_apc.rename('apc_list'), grouped_total.rename('total')], axis = 1)
grouped

Unnamed: 0_level_0,openaccess,apc_list,total
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,0.04,0.06,0.085
2014,0.092149,0.139308,0.262134
2015,0.094585,0.109459,0.21992
2016,0.086689,0.069553,0.167503
2017,0.093607,0.074448,0.16593
2018,0.095873,0.071975,0.157639
2019,0.108978,0.077669,0.159365
2020,0.115698,0.081461,0.158667
2021,0.114215,0.081691,0.155659
2022,0.113679,0.082072,0.15987


One or more french

In [44]:
df_nocorresponding['any_french_or_missing'] = df_nocorresponding['countries'].apply(lambda lst: all(all(c == 'FR' for c in inner) or len(inner) == 0 for inner in lst))
df_nocorresponding_anyfrenchormissing = df_nocorresponding[df_nocorresponding['any_french_or_missing'] == True].reset_index(drop = True)
df_nocorresponding_anyfrenchormissing.to_parquet("../data/interim/preliminary/fr_p5_CA_no_FR_plus_missing.parquet", index = False)

total_per_year = df.groupby('publication_year')['doi'].nunique()
total_anyfrenchormissing= df_nocorresponding_anyfrenchormissing.groupby('publication_year')['doi'].nunique()

grouped_total = total_anyfrenchormissing / total_per_year
grouped_oa = df_nocorresponding_anyfrenchormissing[df_nocorresponding_anyfrenchormissing.oa_status != 'closed'].groupby('publication_year')['doi'].nunique() / total_per_year
grouped_apc = df_nocorresponding_anyfrenchormissing[~df_nocorresponding_anyfrenchormissing.apc_list.isna()].groupby('publication_year')['doi'].nunique() / total_per_year

grouped = pd.concat([grouped_oa.rename('openaccess'), grouped_apc.rename('apc_list'), grouped_total.rename('total')], axis = 1)
grouped

Unnamed: 0_level_0,openaccess,apc_list,total
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,0.04,0.065,0.08
2014,0.104912,0.16409,0.312827
2015,0.107153,0.129231,0.259353
2016,0.102783,0.089837,0.206564
2017,0.110847,0.095428,0.20623
2018,0.114981,0.090568,0.196946
2019,0.129428,0.097733,0.198393
2020,0.136147,0.100789,0.196458
2021,0.134652,0.100131,0.19268
2022,0.134411,0.103796,0.201351


In [45]:
df_nocorresponding['any_french'] = df_nocorresponding.countries.apply(lambda lst: any('FR' in x for x in lst))
df_nocorresponding_anyfrench = df_nocorresponding[df_nocorresponding['any_french'] == True].reset_index(drop = True)
df_nocorresponding_anyfrench.to_parquet("../data/interim/preliminary/fr_p5_CA_no_FR_inclussive.parquet", index = False)

total_per_year = df.groupby('publication_year')['doi'].nunique()
total_anyfrench= df_nocorresponding_anyfrench.groupby('publication_year')['doi'].nunique()

grouped_total = total_anyfrench / total_per_year
grouped_oa = df_nocorresponding_anyfrench[df_nocorresponding_anyfrench.oa_status != 'closed'].groupby('publication_year')['doi'].nunique() / total_per_year
grouped_apc = df_nocorresponding_anyfrench[~df_nocorresponding_anyfrench.apc_list.isna()].groupby('publication_year')['doi'].nunique() / total_per_year

grouped = pd.concat([grouped_oa.rename('openaccess'), grouped_apc.rename('apc_list'), grouped_total.rename('total')], axis = 1)
grouped

Unnamed: 0_level_0,openaccess,apc_list,total
publication_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,0.44,0.335,0.64
2014,0.286604,0.354008,0.667009
2015,0.289451,0.303186,0.591847
2016,0.253167,0.199119,0.454016
2017,0.275718,0.211083,0.464198
2018,0.280351,0.204965,0.445415
2019,0.315011,0.227547,0.460229
2020,0.329768,0.233646,0.458833
2021,0.323793,0.234016,0.44946
2022,0.324844,0.237223,0.462349
