# Preliminary Analysis

In [1]:
import json
import time
from urllib.parse import urlencode

import orjson
import pandas as pd
import requests
from tqdm import tqdm

### Download all articles that have one or more French authors

In [None]:
# Endpoint that every request in this notebook is sent to.
BASE_URL = "https://api.openalex.org/works"

# Work attributes requested from the API; joined with "," into the "select"
# query parameter when the per-year filters are built.
selected_fields = [
    "doi",
    "publication_year",
    "language",
    "indexed_in",
    "primary_location",
    "best_oa_location",
    "open_access",
    "authorships",
    "corresponding_author_ids",
    "corresponding_institution_ids",
    "apc_list",
    "apc_paid",
    "cited_by_count",
    "primary_topic",
    "awards",
    "funders",
]

def fetch_page(cursor, filters, max_retries=5, backoff=2.0):
    """Fetch one page of results from ``BASE_URL`` and return the parsed JSON.

    The query string is ``filters`` plus ``cursor`` and ``per_page=200``;
    ``filters`` itself is not modified.

    Transient failures (HTTP 429, HTTP 5xx, connection errors, timeouts) are
    retried with exponential backoff so that one hiccup does not abort a
    long-running download.

    Args:
        cursor: Cursor value sent as the ``cursor`` query parameter.
        filters: Mapping of base query parameters (e.g. ``filter``, ``select``).
        max_retries: Extra attempts allowed after the first one. ``0`` means a
            single attempt with no retry.
        backoff: Base delay in seconds; the wait before retry ``k`` (0-based)
            is ``backoff * 2 ** k``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: On a non-retryable error status, or on a retryable
            one once all attempts are used up.
        requests.ConnectionError, requests.Timeout: If the final attempt fails
            at the transport level.
    """
    params = {**filters, "cursor": cursor, "per_page": 200}
    url = f"{BASE_URL}?{urlencode(params)}"

    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        last_attempt = attempt == attempts - 1
        try:
            r = requests.get(url, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            if last_attempt:
                raise
        else:
            transient = r.status_code == 429 or r.status_code >= 500
            if not transient or last_attempt:
                # Raises for any remaining 4xx/5xx; otherwise the page is good.
                r.raise_for_status()
                return r.json()
        # Only reached when the attempt failed transiently and retries remain.
        time.sleep(backoff * 2 ** attempt)

# Download every matching work, one JSONL file per publication year.
for year in range(2013, 2025):  # keep adjusting the initial year
    print(f"\n=== YEAR {year} ===")

    # Query parameters shared by every page request of this year.
    filters = {
        "filter": ",".join([
            "indexed_in:crossref",
            "type:article|review",
            "authorships.institutions.country_code:FR",
            f"from_publication_date:{year}-01-01",
            f"to_publication_date:{year}-12-31",
        ]),
        "select": ",".join(selected_fields),
    }

    cursor = "*"  # cursor value used for the first page
    count = 0
    output_file = f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v3.jsonl"

    with open(output_file, "w", encoding="utf-8") as f:
        # Context manager so the progress bar is closed even if a request raises.
        with tqdm(unit="works", dynamic_ncols=True) as pbar:
            while True:
                data = fetch_page(cursor, filters)

                works = data.get("results", [])
                next_cursor = data.get("meta", {}).get("next_cursor", None)

                # One JSON document per line (JSONL).
                for w in works:
                    f.write(json.dumps(w) + "\n")
                count += len(works)

                pbar.update(len(works))

                # A missing/empty next_cursor ends the pagination for this year.
                if not next_cursor:
                    break
                cursor = next_cursor
                time.sleep(1)  # pause between consecutive page requests

    print(f"\nSaved: {output_file}  (total {count})")

print("\n=== COMPLETED ===")


=== YEAR 2013 ===





4800works [01:22, 58.04works/s]
118321works [23:56, 82.37works/s]


Saved: ../data/interim/FranceInitialAPI/openalex_french_authors_2013_v3.jsonl  (total 118321)

=== COMPLETED ===





### Dataset construction

In [5]:
# Raw work fields read from each downloaded record.
interest = ['doi', 'publication_year', 'language', 'authorships', 'best_oa_location', 'primary_topic', 'open_access', 'apc_list', 'apc_paid']
# Columns of the per-year CSV, in output order.
keys = ['doi', 'publication_year', 'language', 'field_name_top_topic', 'journal', 'journal_id', 'issn_l', 'publisher', 'publisher_id', 'display_name_institution', 'id_institution',
        'display_name_author', 'id_author', 'oa_status', 'apc_list', 'apc_paid','corresponding', 'countries']


def _last_segment(url):
    """Return the text after the final "/" of ``url``, or ``{}`` if it is missing."""
    # NOTE(review): {} (not None) is the missing-value marker the original code
    # used for these columns; kept so the CSV output does not change. Consider
    # switching to None if nothing downstream depends on the "{}" string.
    return url.split("/")[-1] if url else {}


def _flatten_work(rec):
    """Turn one raw work record (dict) into a flat dict with the columns in ``keys``."""
    filtered = {k: rec.get(k) for k in interest}  # keep only the fields we care about

    # Field name of the primary topic; a null "field" is treated as missing.
    topic = filtered.get("primary_topic")
    if isinstance(topic, dict):
        filtered["field_name_top_topic"] = (topic.get("field") or {}).get("display_name")
    else:
        filtered["field_name_top_topic"] = {}  # same marker as _last_segment

    # Journal and publisher come from best_oa_location's source.
    # NOTE(review): records without a best_oa_location get empty journal and
    # publisher columns; confirm primary_location was not the intended source.
    source = (filtered.get("best_oa_location") or {}).get("source") or {}
    filtered["journal"] = source.get("display_name")
    filtered["journal_id"] = _last_segment(source.get("id"))
    filtered["issn_l"] = source.get("issn_l")
    filtered["publisher"] = source.get("host_organization_name")
    filtered["publisher_id"] = _last_segment(source.get("host_organization"))

    # Institutions and authors are de-duplicated into sets; the corresponding
    # flags and countries stay as one entry per authorship.
    authorships = filtered.get("authorships") or []
    institutions = [inst for auth in authorships for inst in (auth.get("institutions") or [])]
    authors = [auth.get("author") or {} for auth in authorships]
    filtered["display_name_institution"] = {inst.get("display_name") for inst in institutions if inst.get("display_name")}
    filtered["id_institution"] = {inst.get("id").split('/')[-1] for inst in institutions if inst.get("id")}
    filtered["corresponding"] = [auth.get("is_corresponding") for auth in authorships]
    filtered["countries"] = [auth.get("countries") for auth in authorships]
    filtered["display_name_author"] = {a.get("display_name") for a in authors if a.get("display_name")}
    filtered["id_author"] = {a.get("id").split('/')[-1] for a in authors if a.get("id")}

    o_a = filtered.get("open_access") or {}
    filtered["oa_status"] = o_a.get("oa_status")

    return {k: filtered.get(k) for k in keys}


# Build one flat CSV per year from the downloaded JSONL files.
for year in tqdm(range(2013, 2025)):
    records = []
    with open(f"../data/interim/FranceInitialAPI/openalex_french_authors_{year}_v3.jsonl", "rb") as f:
        for line in f:
            if not line.strip():  # tolerate blank lines in the JSONL file
                continue
            rec = orjson.loads(line)
            if not rec.get("doi"):  # skip records without DOI
                continue
            records.append(_flatten_work(rec))

    # columns=keys keeps the header stable even when a year has no records.
    df_oa = pd.DataFrame(records, columns=keys)
    df_oa.to_csv(f'../data/interim/oa_initial_{year}.csv')

100%|██████████| 12/12 [03:14<00:00, 16.18s/it]
