In [None]:
# Cell 1: Import Libraries
import pandas as pd
import os

## Download the PMID → PMCID mapping file (`PMC-ids.csv.gz`) from the PMC FTP site. Its local path is assigned to the `pmc_ids_file` variable in the next cell.
[Website link to the .gz file](https://pmc.ncbi.nlm.nih.gov/tools/id-converter-api/#:~:text=Another%20option%20for%20bulk%2Dconversion%20of%20identifiers%20in%20the%20PMC%20system%20is%20to%20download%20the%20PMC%2Dids.csv.gz%20file%2C%20as%20described%20on%20the%20FTP%20Service%20page.)

In [None]:
# Cell 2: Define File Paths
# Define input and output locations.
# Each location defaults to the original hard-coded local path and can be
# overridden, without editing the notebook, by setting the named environment
# variable before this cell runs.

# Directory with subset files (override: GOLLM_PMID_SUBSETS_DIR)
subsets_dir = os.environ.get(
    "GOLLM_PMID_SUBSETS_DIR",
    r"C:\Users\aivan\Desktop\BIOIN 401\GOLLM\data\matching_pmids_subsets",
)

# Gzipped PMC-ids CSV file (override: GOLLM_PMC_IDS_FILE)
pmc_ids_file = os.environ.get(
    "GOLLM_PMC_IDS_FILE",
    r"C:\Users\aivan\Downloads\PMC-ids.csv.gz",
)

# Output directory (override: GOLLM_PMCID_OUTPUT_DIR)
output_dir = os.environ.get(
    "GOLLM_PMCID_OUTPUT_DIR",
    r"C:\Users\aivan\Desktop\BIOIN 401\GOLLM\data\matching_pmcids_subsets",
)

# Ensure output directory exists (no error if it is already there)
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Cell 3: Load PMC-ids.csv.gz
# Build the PMID -> PMCID lookup table used by the merge in the next cell.
print("Loading PMC-ids.csv.gz...")
pmc_ids = pd.read_csv(
    pmc_ids_file,
    usecols=["PMCID", "PMID"],  # Only the two columns the lookup needs
    dtype=str,                  # Keep identifiers as text, exactly as in the file
    compression="gzip",
).dropna(subset=["PMID"])
# Rows without a PMID can never be the target of a PMID lookup. Dropping them
# shrinks the table and stops pandas from pairing a null PMID on the left side
# of a merge with every PMID-less row here (null keys match each other).
print("PMC-ids.csv.gz loaded successfully!")

In [None]:
# Cell 4: Process Each Subset File
# For every ".txt" file in subsets_dir (read as one PMID per line), look up the
# PMCID of each PMID in pmc_ids and write one "<PMID>\t<PMCID>" line per merged
# row to output_dir/subset_<N>.txt. <N> is the part of the input file name
# (extension removed) after its last underscore, or the whole name if it has no
# underscore. PMIDs without a PMCID are written with "NA".
for subset_file in sorted(os.listdir(subsets_dir)):  # sorted: reproducible order
    if not subset_file.endswith(".txt"):  # Only .txt files are subset files
        continue

    subset_path = os.path.join(subsets_dir, subset_file)

    # Extract the subset number from the file name
    subset_number = os.path.splitext(subset_file)[0].split('_')[-1]
    output_file = os.path.join(output_dir, f"subset_{subset_number}.txt")

    # Step 1: Load the subset of PMIDs
    print(f"Loading subset file: {subset_file}...")
    try:
        subset_pmids = pd.read_csv(subset_path, header=None, names=["PMID"], dtype=str)
    except pd.errors.EmptyDataError:
        # read_csv raises on a completely empty file; treat it as "no PMIDs"
        # so that one empty subset does not abort the whole run.
        subset_pmids = pd.DataFrame({"PMID": pd.Series(dtype=str)})
    # Surrounding whitespace would make an otherwise valid PMID fail to match.
    subset_pmids = subset_pmids.assign(PMID=subset_pmids["PMID"].str.strip())

    # Step 2: Perform the matching (left join keeps every PMID of the subset)
    print(f"Matching PMIDs with PMCIDs for {subset_file}...")
    matched = pd.merge(subset_pmids, pmc_ids, on="PMID", how="left")

    # Step 3: Save the result to a .txt file
    print(f"Saving results to output file: subset_{subset_number}.txt...")
    pmcids = matched["PMCID"].fillna("NA")  # Handle missing PMCIDs
    with open(output_file, "w", encoding="utf-8") as f:
        # Column-wise iteration instead of DataFrame.iterrows(): same lines,
        # without building a Series object per row.
        f.writelines(
            f"{pmid}\t{pmcid}\n"  # Tab-separated values
            for pmid, pmcid in zip(matched["PMID"], pmcids)
        )

print("Done! All subsets processed.")
