In [70]:
# Run this cell: 
# The lines below will instruct jupyter to reload imported modules before 
# executing code cells. This enables you to quickly iterate and test revisions
# to your code without having to restart the kernel and reload all of your 
# modules each time you make a code change in a separate python file.

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
import pandas as pd
import os
from tqdm import tqdm
import pickle
from loguru import logger
import json

In [72]:
# Every data path in this notebook is written relative to the project root.
# When the kernel was started inside the notebooks directory, step up one
# level; otherwise leave the working directory alone.
working_dir = os.getcwd()
if working_dir.endswith("notebooks"):
    os.chdir(os.path.dirname(working_dir))
print(os.getcwd())

/Users/shloknatarajan/stanford/research/daneshjou/AutoGKB


In [73]:
# Read the four tab-separated tables under data/variantAnnotations into
# DataFrames.
def _read_annotation_tsv(filename):
    """Load one tab-separated file from data/variantAnnotations."""
    return pd.read_csv(os.path.join("data", "variantAnnotations", filename), sep="\t")


study_parameters = _read_annotation_tsv("study_parameters.tsv")
var_drug_ann = _read_annotation_tsv("var_drug_ann.tsv")
var_fa_ann = _read_annotation_tsv("var_fa_ann.tsv")
var_pheno_ann = _read_annotation_tsv("var_pheno_ann.tsv")

In [74]:
# Distinct Variant Annotation IDs present in study_parameters.
annotation_ids = study_parameters["Variant Annotation ID"]
unique_variant_annotations = annotation_ids.unique()

# Rows whose Variant Annotation ID already appeared in an earlier row.
# duplicated() keeps the first occurrence by default, so the first row of
# each repeated ID is NOT part of this selection.
is_repeat = study_parameters.duplicated(subset=["Variant Annotation ID"])
duplicate_variant_annotations = study_parameters[is_repeat]

# Number of such repeated rows (a row count, not a count of distinct IDs).
num_duplicate_variant_annotations = len(duplicate_variant_annotations)




In [75]:
# Show the distinct values of the "Study Type" column (NaN appears as its own
# value for rows where the study type is missing).
study_parameters["Study Type"].unique()

array(['case/control', 'meta-analysis', 'cohort', nan, 'retrospective',
       'case series', 'clinical trial', 'cohort, retrospective',
       'cohort, prospective', 'cohort, clinical trial', 'GWAS',
       'case/control, GWAS', 'case/control, meta-analysis',
       'cohort, GWAS', 'meta-analysis, retrospective', 'prospective',
       'case series, retrospective',
       'case/control, clinical trial, retrospective',
       'cohort, clinical trial, prospective', 'cohort, replication',
       'case/control, retrospective', 'clinical trial, GWAS',
       'case/control, prospective', 'cohort, prospective, retrospective',
       'replication', 'cohort, case/control',
       'clinical trial, prospective', 'prospective, retrospective',
       'GWAS, replication', 'meta-analysis, GWAS',
       'case/control, clinical trial, prospective',
       'case series, prospective', 'cohort, meta-analysis',
       'case/control, GWAS, retrospective', 'cross sectional',
       'cohort, case series', 'ca

In [76]:
# Stack the (PMID, Variant Annotation ID) columns of var_drug_ann, var_fa_ann
# and var_pheno_ann into one table and drop repeated pairs.
id_columns = ["PMID", "Variant Annotation ID"]
pmid_table = pd.concat(
    [annotations[id_columns] for annotations in (var_drug_ann, var_fa_ann, var_pheno_ann)]
).drop_duplicates()
len(pmid_table)

28525

In [77]:
# Attach a PMID to every study-parameter row with a left join on
# Variant Annotation ID; rows without a match in pmid_table get a missing PMID.
#
# A PMID column left behind by a previous run of this cell is dropped first.
# Without that, re-running the cell would merge in a second PMID column,
# pandas would rename the pair to PMID_x / PMID_y, and every later cell that
# reads row['PMID'] would fail. On a first run the drop is a no-op.
study_parameters = study_parameters.drop(columns=["PMID"], errors="ignore").merge(
    pmid_table, on="Variant Annotation ID", how="left"
)
len(study_parameters)

34836

In [78]:
# Copy of study_parameters ordered by Variant Annotation ID; the original
# frame is left untouched.
sorted_params = study_parameters.sort_values("Variant Annotation ID")

In [88]:
# Build one record per study_parameters row, bundling that row with every
# var_drug_ann / var_fa_ann / var_pheno_ann row that shares its PMID. When a
# previously pickled copy exists it is loaded instead of being rebuilt.
#
# NOTE: from this cell on `pmid_table` is a list of dicts; it replaces the
# (PMID, Variant Annotation ID) DataFrame of the same name built earlier.
# NOTE(review): pickle.load can execute arbitrary code, so the cache file
# must only ever be one that this project wrote itself.
annotations_pickle_path = "data/variantAnnotations/annotations_by_pmid.pkl"


def _records_by_pmid(annotations):
    """Map each PMID in `annotations` to its rows as a list of record dicts."""
    return {
        pmid: rows.to_dict(orient="records")
        for pmid, rows in annotations.groupby("PMID", sort=False)
    }


pmid_table = []
if os.path.exists(annotations_pickle_path):
    with open(annotations_pickle_path, "rb") as f:
        pmid_table = pickle.load(f)
    logger.info(f"Loaded {len(pmid_table)} pmid annotations from pickle")
else:
    # Group each annotation table by PMID once up front. Filtering all three
    # frames again for every study-parameter row repeats the same full-table
    # scan tens of thousands of times.
    drug_by_pmid = _records_by_pmid(var_drug_ann)
    fa_by_pmid = _records_by_pmid(var_fa_ann)
    pheno_by_pmid = _records_by_pmid(var_pheno_ann)

    for index, row in tqdm(study_parameters.iterrows(), total=len(study_parameters)):
        pmid = row['PMID']

        # PMIDs with no annotation in a table get an empty list. The record
        # dicts are copied so entries sharing a PMID never alias each other.
        pmid_addition = {
            "pmid": pmid,
            "pmcid": None,  # filled in by a later cell
            "title": None,  # filled in by a later cell
            "study_parameters": row.to_dict(),
            "var_drug_ann": [dict(record) for record in drug_by_pmid.get(pmid, ())],
            "var_fa_ann": [dict(record) for record in fa_by_pmid.get(pmid, ())],
            "var_pheno_ann": [dict(record) for record in pheno_by_pmid.get(pmid, ())],
        }
        pmid_table.append(pmid_addition)

100%|██████████| 34836/34836 [00:48<00:00, 713.75it/s]


In [90]:
# save pmid_table to pickle
# pickle.dump(pmid_table, open("data/variantAnnotations/annotations_by_pmid.pkl", "wb"))
# save pmid_table to json
# json.dump(pmid_table, open("data/variantAnnotations/annotations_by_pmid.json", "w"))

In [91]:
# Load the PMID -> PMCID mapping. It is looked up below with str(pmid) keys,
# and a null value is treated as "no PMCID for this PMID".
# The file is opened in a `with` block so the handle is closed right after
# parsing instead of being left for the garbage collector.
with open("data/pmcid_mapping_updated.json", encoding="utf-8") as f:
    pmcid_mapping = json.load(f)

In [96]:
def get_title_from_markdown(markdown_path: str) -> str:
    """Return the title stored on the first line of a markdown file.

    Only the leading heading marker (one to six ``#`` characters followed by
    spaces or tabs) is stripped. Any later ``# `` sequence inside the title,
    as in "F# and C# tools", is kept; a plain string replace would delete it.

    Args:
        markdown_path: Path of the markdown file to read.

    Returns:
        The first line without its heading marker and line ending, or an
        empty string when the file is empty.

    Raises:
        OSError: If the file cannot be opened.
    """
    import re

    # Only the first line is needed, so avoid reading the whole article.
    # The encoding is pinned so titles with non-ASCII characters decode the
    # same way on every platform.
    with open(markdown_path, "r", encoding="utf-8") as f:
        first_line = f.readline()
    return re.sub(r"^#{1,6}[ \t]+", "", first_line.rstrip("\r\n"))


In [108]:
# Fill in the "pmcid" and "title" fields of every record in pmid_table.
# found_pmcids counts records that received a PMCID, not distinct PMCIDs.
found_pmcids = 0
for pmid_addition in tqdm(pmid_table):
    # .get() treats a PMID that is absent from the mapping the same way as
    # one mapped to null, instead of aborting the whole loop with a KeyError.
    pmcid = pmcid_mapping.get(str(pmid_addition["pmid"]))
    if pmcid is None:
        pmid_addition["pmcid"] = None
        pmid_addition["title"] = None
        continue
    found_pmcids += 1
    markdown_path = os.path.join("data", "articles", f"{pmcid}.md")
    # Records whose article markdown is not on disk keep an empty title.
    if os.path.exists(markdown_path):
        title = get_title_from_markdown(markdown_path)
    else:
        title = ""
    pmid_addition["pmcid"] = pmcid
    pmid_addition["title"] = title
print(f"Found {found_pmcids} pmcids")

100%|██████████| 34836/34836 [00:01<00:00, 24847.06it/s]

Found 13523 pmcids





In [109]:
# Keep only the records that were matched to a PMCID.
pmid_table_filtered = [record for record in pmid_table if record["pmcid"] is not None]
len(pmid_table_filtered)


13523

In [111]:
# Display the first PMCID-matched record as a quick check of its structure.
pmid_table_filtered[0]

{'pmid': 29238301,
 'pmcid': 'PMC5712579',
 'title': 'Association of HLA-A and HLA-B Alleles with Lamotrigine-Induced Cutaneous Adverse Drug Reactions in the Thai Population',
 'study_parameters': {'Study Parameters ID': 1449169927,
  'Variant Annotation ID': 1449169911,
  'Study Type': 'case/control',
  'Study Cases': 15.0,
  'Study Controls': 50.0,
  'Characteristics': 'SCAR, MPE, SJS (cases and controls)',
  'Characteristics Type': 'Disease',
  'Frequency In Cases': 0.06,
  'Allele Of Frequency In Cases': '*35:08',
  'Frequency In Controls': 0.001,
  'Allele Of Frequency In Controls': '*35:08',
  'P Value': '= 0.231',
  'Ratio Stat Type': 'OR',
  'Ratio Stat': 10.45,
  'Confidence Interval Start': 0.4,
  'Confidence Interval Stop': 270.41,
  'Biogeographical Groups': 'East Asian',
  'PMID': 29238301},
 'var_drug_ann': [],
 'var_fa_ann': [],
 'var_pheno_ann': [{'Variant Annotation ID': 1449169911,
   'Variant/Haplotypes': 'HLA-B*35:08',
   'Gene': 'HLA-B',
   'Drug(s)': 'lamotrigine'

In [112]:
# Persist the PMCID-filtered records as both a pickle and a JSON file.
# Each file is written inside a `with` block so it is flushed and closed
# deterministically, even if serialization raises part-way through.
with open("data/variantAnnotations/annotations_by_pmcid.pkl", "wb") as f:
    pickle.dump(pmid_table_filtered, f)

# NOTE(review): json.dump writes float NaN as the non-standard token `NaN` by
# default; confirm that consumers of this file accept it.
with open("data/variantAnnotations/annotations_by_pmcid.json", "w") as f:
    json.dump(pmid_table_filtered, f)
