In [16]:
# Run this cell: 
# The lines below will instruct jupyter to reload imported modules before 
# executing code cells. This enables you to quickly iterate and test revisions
# to your code without having to restart the kernel and reload all of your 
# modules each time you make a code change in a separate python file.

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import pandas as pd
import os
from tqdm import tqdm
import pickle
from loguru import logger
import json

In [18]:
# When the kernel was started inside a directory whose path ends in
# "notebooks", step up one level so the relative "data/..." paths used
# below resolve from the project root.
current_dir = os.getcwd()
if current_dir.endswith("notebooks"):
    os.chdir(os.path.dirname(current_dir))
print(os.getcwd())

/Users/shloknatarajan/stanford/research/daneshjou/AutoGKB


In [19]:
# Load the four tab-separated tables stored under data/variantAnnotations.
def _read_annotation_table(file_name):
    """Read one tab-separated file from data/variantAnnotations into a DataFrame."""
    return pd.read_csv(os.path.join("data", "variantAnnotations", file_name), sep="\t")


study_parameters = _read_annotation_table("study_parameters.tsv")
var_drug_ann = _read_annotation_table("var_drug_ann.tsv")
var_fa_ann = _read_annotation_table("var_fa_ann.tsv")
var_pheno_ann = _read_annotation_table("var_pheno_ann.tsv")

In [20]:
# Distinct Variant Annotation IDs present in study_parameters
unique_variant_annotations = study_parameters['Variant Annotation ID'].unique()

# Rows whose Variant Annotation ID was already seen in an earlier row.
# duplicated() defaults to keep='first', so the first row of each repeated
# ID is NOT part of this selection -- only the second and later rows are.
is_repeat_row = study_parameters.duplicated(subset=['Variant Annotation ID'])
duplicate_variant_annotations = study_parameters[is_repeat_row]

# Number of such repeat rows (not the number of distinct repeated IDs)
num_duplicate_variant_annotations = len(duplicate_variant_annotations)




In [21]:
# Distinct values of the "Study Type" column; missing values show up as nan
# in the displayed array.
study_parameters["Study Type"].unique()

array(['case/control', 'meta-analysis', 'cohort', nan, 'retrospective',
       'case series', 'clinical trial', 'cohort, retrospective',
       'cohort, prospective', 'cohort, clinical trial', 'GWAS',
       'case/control, GWAS', 'case/control, meta-analysis',
       'cohort, GWAS', 'meta-analysis, retrospective', 'prospective',
       'case series, retrospective',
       'case/control, clinical trial, retrospective',
       'cohort, clinical trial, prospective', 'cohort, replication',
       'case/control, retrospective', 'clinical trial, GWAS',
       'case/control, prospective', 'cohort, prospective, retrospective',
       'replication', 'cohort, case/control',
       'clinical trial, prospective', 'prospective, retrospective',
       'GWAS, replication', 'meta-analysis, GWAS',
       'case/control, clinical trial, prospective',
       'case series, prospective', 'cohort, meta-analysis',
       'case/control, GWAS, retrospective', 'cross sectional',
       'cohort, case series', 'ca

In [22]:
# Stack the (PMID, Variant Annotation ID) pairs from the three annotation
# tables -- drug, functional-analysis and phenotype -- and keep each distinct
# pair once.
pair_columns = ['PMID', 'Variant Annotation ID']
pmid_table = pd.concat(
    [
        var_drug_ann[pair_columns],
        var_fa_ann[pair_columns],
        var_pheno_ann[pair_columns],
    ]
).drop_duplicates()
len(pmid_table)

28525

In [23]:
# Attach the PMID of each Variant Annotation ID to study_parameters with a
# left join, so rows without a matching annotation are kept (PMID = NaN).
#
# The result is assigned back to `study_parameters`, so this cell is guarded:
# running it a second time would otherwise merge onto a frame that already
# has a PMID column and produce PMID_x / PMID_y instead, which breaks the
# later row['PMID'] lookups.
if 'PMID' not in study_parameters.columns:
    study_parameters = study_parameters.merge(pmid_table, on='Variant Annotation ID', how='left')
len(study_parameters)

34836

In [24]:
# Copy of study_parameters ordered by Variant Annotation ID; sort_values
# returns a new frame by default, so study_parameters itself is untouched.
sorted_params = study_parameters.sort_values("Variant Annotation ID")

In [25]:
# Empty template showing the shape of one per-PMID record: the PMID itself
# plus one list per annotation table.
base_json = dict(
    pmid="",
    var_drug_ann=[],
    var_fa_ann=[],
    var_pheno_ann=[],
)

In [27]:
# Build (or load from a pickle cache) one record per study_parameters row.
# Each record holds that row's PMID plus every row of the drug, functional
# analysis and phenotype annotation tables that shares the PMID. Because the
# loop runs over study_parameters rows, a PMID that occurs on several rows
# yields several records.
#
# NOTE: `pmid_table` is rebound here from the DataFrame built earlier in the
# notebook to a plain list of dicts.
# NOTE(review): the cache file is only ever read in this cell; nothing here
# writes it after a fresh build -- confirm where it is meant to be saved.
annotations_pickle_path = "data/variantAnnotations/annotations_by_pmid.pkl"
pmid_table = []
if os.path.exists(annotations_pickle_path):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache
    # file that was produced locally by this project.
    with open(annotations_pickle_path, "rb") as f:
        pmid_table = pickle.load(f)
    logger.info(f"Loaded {len(pmid_table)} pmid annotations from pickle")
else:
    # Group every annotation table by PMID once, up front. Filtering the full
    # tables again for each study_parameters row is loop-invariant work that
    # makes the build quadratic. Rows with a missing PMID are left out of the
    # groups, which matches the `== pmid` filter (NaN never compares equal).
    records_by_table = {
        table_name: {
            pmid: group.to_dict(orient='records')
            for pmid, group in table.groupby('PMID', sort=False)
        }
        for table_name, table in (
            ("var_drug_ann", var_drug_ann),
            ("var_fa_ann", var_fa_ann),
            ("var_pheno_ann", var_pheno_ann),
        )
    }

    for _, row in tqdm(study_parameters.iterrows(), total=len(study_parameters)):
        pmid = row['PMID']

        pmid_addition = {"pmid": pmid}
        for table_name, records_by_pmid in records_by_table.items():
            # Copy each record so entries that share a PMID do not share
            # (and cannot accidentally mutate) the same dict objects.
            pmid_addition[table_name] = [
                dict(record) for record in records_by_pmid.get(pmid, ())
            ]
        pmid_table.append(pmid_addition)

[32m2025-06-09 11:57:58.637[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mLoaded 34836 pmid annotations from pickle[0m


In [35]:
# save pmid_table to jsonl
# import json
# with open("data/annotations_by_pmid.jsonl", "w") as f:
#     for pmid_addition in pmid_table:
#         f.write(json.dumps(pmid_addition) + "\n")

In [38]:
# Load the PMID -> PMCID lookup. It is indexed below with str(pmid), and a
# value of None is treated as "no PMCID known".
# A `with` block is used so the file handle is closed right after parsing
# instead of being left open for the lifetime of the kernel.
with open("data/pmcid_mapping_updated.json") as f:
    pmcid_mapping = json.load(f)

In [43]:
def get_title_from_markdown(markdown_path: str) -> str:
    """Return the title of a markdown article, taken from its first line.

    Only the first line of the file is read. If that line is a markdown
    heading (one or more leading ``#`` followed by whitespace), the heading
    marker is removed; any ``#`` characters inside the title text are kept.

    Args:
        markdown_path: Path to the markdown file.

    Returns:
        The first line without its heading marker, or an empty string when
        the file is empty.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(markdown_path, "r", encoding="utf-8") as f:
        # The title lives on the first line; no need to read the whole article.
        first_line = f.readline().rstrip("\r\n")

    without_hashes = first_line.lstrip("#")
    is_heading = without_hashes != first_line and (
        not without_hashes or without_hashes[0].isspace()
    )
    if is_heading:
        # Drop only the leading marker; a blanket replace("# ", "") would also
        # eat "# " sequences that belong to the title itself (e.g. "C# 7").
        return without_hashes.lstrip()
    return first_line


In [63]:
# Annotate every record in pmid_table with its PMCID and article title.
# A record whose PMID maps to None gets pmcid=None and title=None. Otherwise
# the title is read from data/articles/PMC<pmcid>.md, or left as "" when that
# file is missing. A PMID absent from pmcid_mapping raises KeyError.
found_pmcids = 0
for record in tqdm(pmid_table):
    pmcid = pmcid_mapping[str(record["pmid"])]
    title = None
    if pmcid is not None:
        found_pmcids += 1
        article_path = os.path.join("data", "articles", f"PMC{pmcid}.md")
        title = get_title_from_markdown(article_path) if os.path.exists(article_path) else ""
    record["pmcid"] = pmcid
    record["title"] = title
print(f"Found {found_pmcids} pmcids")

  0%|          | 0/34836 [00:00<?, ?it/s]

100%|██████████| 34836/34836 [00:00<00:00, 172077.15it/s]

Found 13523 pmcids





In [64]:
# Keep only the records for which a PMCID was found.
pmid_table_filtered = [
    record for record in pmid_table if record["pmcid"] is not None
]
len(pmid_table_filtered)


13523