In [2]:
# Cell 1: Import Libraries
import pandas as pd
import os

## Download the PMID → PMCID mapping from the PMC FTP site. This file is assigned to the `pmc_ids_file` variable in the next cell.
[Website link to PMC-ids.csv.gz](https://ftp.ncbi.nlm.nih.gov/pub/pmc/) for the PMID-to-PMCID mapping

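[Website link to oa_comm_use_file_list.csv](https://ftp.ncbi.nlm.nih.gov/pub/pmc/) for the list of PMCIDs that are open access with commercial use allowed; it is used to filter the mapped PMCIDs and is assigned to the `oa_comm_use_file_list` variable in the next cell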


In [3]:
# Cell 2: Define File Paths
# Every location can be overridden with an environment variable so the notebook
# can run on another machine without editing this cell. When a variable is not
# set, the original local path is used, so existing behaviour is unchanged.
#   GOLLM_SUBSETS_DIR     -> subsets_dir
#   GOLLM_PMC_IDS_FILE    -> pmc_ids_file
#   GOLLM_OA_COMM_USE_CSV -> oa_comm_use_file_list
#   GOLLM_OUTPUT_DIR      -> output_dir

# Define input and output directories
subsets_dir = os.environ.get(
    "GOLLM_SUBSETS_DIR",
    r"C:\Users\aivan\Desktop\BIOIN 401\GOLLM\data\matching_pmids_subsets",
)  # Directory with subset files
pmc_ids_file = os.environ.get(
    "GOLLM_PMC_IDS_FILE",
    r"C:\Users\aivan\Downloads\PMC-ids.csv.gz",
)  # .gz file
oa_comm_use_file_list = os.environ.get(
    "GOLLM_OA_COMM_USE_CSV",
    r"C:\Users\aivan\Downloads\oa_comm_use_file_list.csv",
)
output_dir = os.environ.get(
    "GOLLM_OUTPUT_DIR",
    r"C:\Users\aivan\Desktop\BIOIN 401\GOLLM\data\matching_pmcids_subsets",
)  # Output directory

# Ensure output directory exists (exist_ok=True: no error when it is already there)
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Cell 3: Load PMC-ids.csv.gz
# Read the gzip-compressed mapping file, keeping only the PMCID and PMID
# columns; dtype=str reads both columns as text rather than numbers.
print("Loading PMC-ids.csv.gz...")
pmc_ids = pd.read_csv(pmc_ids_file, compression="gzip", dtype=str, usecols=["PMCID", "PMID"])
print("PMC-ids.csv.gz loaded successfully!")

Loading PMC-ids.csv.gz...
PMC-ids.csv.gz loaded successfully!


In [5]:
# Cell 4: Load Open Access PMCIDs
# Read the "Accession ID" column of the open-access (commercial use) file list
# as text and turn it into a set for fast membership tests.
print("Loading Open Access PMCIDs from oa_comm_use_file_list.csv...")
oa_pmc_ids = pd.read_csv(oa_comm_use_file_list, usecols=["Accession ID"], dtype=str)
# Drop missing IDs before building the set: pandas `isin` treats NaN as a match
# for NaN, so a NaN member would make rows with a missing PMCID pass the
# `matched["PMCID"].isin(oa_pmc_set)` open-access filter.
oa_pmc_set = set(oa_pmc_ids["Accession ID"].dropna())  # Convert to set for fast lookup
print("Open Access PMCIDs loaded successfully!")


Loading Open Access PMCIDs from oa_file_list.csv...
Open Access PMCIDs loaded successfully!


In [6]:
# Cell 5: Process Each Subset File
# For every .txt subset of PMIDs in subsets_dir: map PMID -> PMCID using pmc_ids,
# keep only rows whose PMCID is in oa_pmc_set, and write one "PMID<TAB>PMCID"
# line per kept row to subset_<number>.txt in output_dir.
# NOTE(review): the output name uses only the text after the last "_" in the
# input name, so two inputs sharing that suffix would overwrite each other.

# sorted(): os.listdir() returns entries in arbitrary order; sorting makes the
# processing (and log) order deterministic.
for subset_file in sorted(os.listdir(subsets_dir)):
    if not subset_file.endswith(".txt"):  # Only .txt files are subsets
        continue

    subset_path = os.path.join(subsets_dir, subset_file)

    # Extract the subset number from the file name (text after the last "_")
    subset_number = os.path.splitext(subset_file)[0].split('_')[-1]
    output_file = os.path.join(output_dir, f"subset_{subset_number}.txt")

    # Step 1: Load the subset of PMIDs (one PMID per line, no header, read as text)
    print(f"Loading subset file: {subset_file}...")
    subset_pmids = pd.read_csv(subset_path, header=None, names=["PMID"], dtype=str)

    # Step 2: Perform the matching (PMID → PMCID)
    # A left merge keeps every PMID; PMIDs without a mapping get PMCID = NaN.
    print(f"Matching PMIDs with PMCIDs for {subset_file}...")
    matched = pd.merge(subset_pmids, pmc_ids, on="PMID", how="left")

    # Step 3: Keep only Open Access PMCIDs
    # The explicit notna() guard excludes unmapped PMIDs even if oa_pmc_set
    # happens to contain NaN (isin would otherwise match NaN against NaN).
    is_open_access = matched["PMCID"].notna() & matched["PMCID"].isin(oa_pmc_set)
    matched_oa = matched.loc[is_open_access, ["PMID", "PMCID"]]

    # Step 4: Save the result to a .txt file
    # Every kept row has a real PMCID, so no "NA" placeholder is needed; lines
    # are streamed from the two columns instead of iterating with iterrows().
    print(f"Saving Open Access results to: subset_{subset_number}.txt...")
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            f"{pmid}\t{pmcid}\n"  # Tab-separated values
            for pmid, pmcid in zip(matched_oa["PMID"], matched_oa["PMCID"])
        )

print("Done! All subsets processed. Only Open Access PMCIDs (commercial use) are saved.")

Loading subset file: subset_1.txt...
Matching PMIDs with PMCIDs for subset_1.txt...
Saving Open Access results to: subset_1.txt...
Loading subset file: subset_10.txt...
Matching PMIDs with PMCIDs for subset_10.txt...
Saving Open Access results to: subset_10.txt...
Loading subset file: subset_11.txt...
Matching PMIDs with PMCIDs for subset_11.txt...
Saving Open Access results to: subset_11.txt...
Loading subset file: subset_12.txt...
Matching PMIDs with PMCIDs for subset_12.txt...
Saving Open Access results to: subset_12.txt...
Loading subset file: subset_13.txt...
Matching PMIDs with PMCIDs for subset_13.txt...
Saving Open Access results to: subset_13.txt...
Loading subset file: subset_14.txt...
Matching PMIDs with PMCIDs for subset_14.txt...
Saving Open Access results to: subset_14.txt...
Loading subset file: subset_15.txt...
Matching PMIDs with PMCIDs for subset_15.txt...
Saving Open Access results to: subset_15.txt...
Loading subset file: subset_16.txt...
Matching PMIDs with PMCIDs 