In [1]:
# Run this cell first.
# The magics below instruct Jupyter to reload imported modules before
# executing code cells, so revisions made in separate Python files are
# picked up without restarting the kernel and re-importing every module.

%load_ext autoreload
%autoreload 2

In [3]:
import os

# When the kernel starts inside a directory whose path ends in "notebooks",
# step up one level so the relative "data/..." paths used below resolve.
current_dir = os.getcwd()
if current_dir.endswith("notebooks"):
    os.chdir(os.path.dirname(current_dir))
print(os.getcwd())

/Users/shloknatarajan/stanford/research/daneshjou/AutoGKB


In [2]:
import pandas as pd
import os
from tqdm import tqdm
import pickle
from loguru import logger
import json

## Articles to get
- 3: Articles with only one annotation type
- 3: Articles with two annotation types
- 1: Article with all three annotation types

## Setup

In [4]:
# Load annotations_by_pmcid from the project-local pickle.
# Later cells index and iterate this object as a sequence of dicts; the keys
# seen in this notebook are pmid, pmcid, title, study_parameters,
# var_drug_ann, var_fa_ann and var_pheno_ann.
# NOTE(security): pickle.load can execute arbitrary code, so only load a file
# you produced yourself or otherwise trust.
# The context manager closes the file handle once loading finishes.
with open("data/variantAnnotations/annotations_by_pmcid.pkl", "rb") as f:
    annotations_by_pmcid = pickle.load(f)


## Only One Annotation

In [24]:
# Records whose only non-empty annotation list is var_drug_ann.
only_drug = [
    record
    for record in annotations_by_pmcid
    if len(record["var_drug_ann"]) > 0
    and len(record["var_fa_ann"]) == 0
    and len(record["var_pheno_ann"]) == 0
]
len(only_drug)

5021

In [25]:
# Records whose only non-empty annotation list is var_fa_ann.
only_fa = list(
    filter(
        lambda record: (
            len(record["var_drug_ann"]) == 0
            and len(record["var_fa_ann"]) > 0
            and len(record["var_pheno_ann"]) == 0
        ),
        annotations_by_pmcid,
    )
)
len(only_fa)

368

In [26]:
# Records whose only non-empty annotation list is var_pheno_ann.
only_pheno = [
    record
    for record in annotations_by_pmcid
    if not len(record["var_drug_ann"])
    and not len(record["var_fa_ann"])
    and len(record["var_pheno_ann"]) > 0
]
len(only_pheno)

5857

## Two Annotations and All Three

In [27]:
# Bucket records by how many of the three annotation lists are non-empty:
# exactly two go to two_annotations, all three to all_three_annotations.
two_annotations = []
all_three_annotations = []
for record in annotations_by_pmcid:
    populated_types = sum(
        1
        for ann_key in ("var_drug_ann", "var_fa_ann", "var_pheno_ann")
        if len(record[ann_key]) > 0
    )
    if populated_types == 3:
        all_three_annotations.append(record)
    elif populated_types == 2:
        two_annotations.append(record)
print(f"Two annotations: {len(two_annotations)}")
print(f"All three annotations: {len(all_three_annotations)}")


Two annotations: 2249
All three annotations: 28


In [28]:
# Save every subset to its own JSON file in the exploration directory.
# The directory is created first so the cell also runs on a checkout where it
# does not exist yet.
# NOTE: with default settings json.dump writes float NaN values as the bare
# token NaN, which strict JSON parsers reject; the study_parameters shown in
# this notebook do contain NaN entries.
exploration_dir = "data/variantAnnotations/exploration"
os.makedirs(exploration_dir, exist_ok=True)

subsets_to_save = {
    "two_annotations": two_annotations,
    "all_three_annotations": all_three_annotations,
    "only_drug": only_drug,
    "only_fa": only_fa,
    "only_pheno": only_pheno,
}
for subset_name, subset_records in subsets_to_save.items():
    with open(f"{exploration_dir}/{subset_name}.json", "w") as f:
        json.dump(subset_records, f)

## One of each study type

In [5]:
# One empty bucket per study-type label; each label gets its own fresh list.
study_types = {
    label: []
    for label in (
        "cohort",
        "case_control",
        "case_series",
        "cross_sectional",
        "clinical_trial",
        "meta_analysis",
        "GWAS",
        "replication",
        "prospective",
        "retrospective",
        "linkage",
        "other",
    )
}

In [6]:
# Inspect the first record to see the per-record schema (pmid, pmcid, title,
# study_parameters, and the three var_*_ann lists).
annotations_by_pmcid[0]

{'pmid': 29238301,
 'pmcid': 'PMC5712579',
 'title': 'Association of HLA-A and HLA-B Alleles with Lamotrigine-Induced Cutaneous Adverse Drug Reactions in the Thai Population',
 'study_parameters': {'Study Parameters ID': 1449169927,
  'Variant Annotation ID': 1449169911,
  'Study Type': 'case/control',
  'Study Cases': 15.0,
  'Study Controls': 50.0,
  'Characteristics': 'SCAR, MPE, SJS (cases and controls)',
  'Characteristics Type': 'Disease',
  'Frequency In Cases': 0.06,
  'Allele Of Frequency In Cases': '*35:08',
  'Frequency In Controls': 0.001,
  'Allele Of Frequency In Controls': '*35:08',
  'P Value': '= 0.231',
  'Ratio Stat Type': 'OR',
  'Ratio Stat': 10.45,
  'Confidence Interval Start': 0.4,
  'Confidence Interval Stop': 270.41,
  'Biogeographical Groups': 'East Asian',
  'PMID': 29238301},
 'var_drug_ann': [],
 'var_fa_ann': [],
 'var_pheno_ann': [{'Variant Annotation ID': 1449169911,
   'Variant/Haplotypes': 'HLA-B*35:08',
   'Gene': 'HLA-B',
   'Drug(s)': 'lamotrigine'

In [7]:
# Gather the distinct "Study Type" values across all records, then keep the
# non-missing ones as a sorted list.
raw_study_types = {
    record["study_parameters"]["Study Type"] for record in annotations_by_pmcid
}
all_study_types = sorted(label for label in raw_study_types if pd.notna(label))



In [8]:
# List every distinct study-type label, one per line.
for study_type_label in all_study_types:
    print(study_type_label)

GWAS
GWAS, prospective
GWAS, replication
GWAS, retrospective
case series
case series, clinical trial
case series, linkage, trios
case series, trios
case/control
case/control, GWAS
case/control, GWAS, retrospective
case/control, case series
case/control, clinical trial
case/control, clinical trial, GWAS
case/control, clinical trial, meta-analysis
case/control, clinical trial, prospective
case/control, clinical trial, retrospective
case/control, meta-analysis
case/control, meta-analysis, GWAS
case/control, prospective
case/control, replication
case/control, replication, retrospective
case/control, retrospective
clinical trial
clinical trial, GWAS
clinical trial, meta-analysis
clinical trial, meta-analysis, GWAS
clinical trial, meta-analysis, replication
clinical trial, prospective
clinical trial, replication
clinical trial, replication, prospective
clinical trial, retrospective
cohort
cohort, GWAS
cohort, GWAS, prospective
cohort, GWAS, replication
cohort, GWAS, retrospective
cohort, cas

In [9]:
# Group records by their "Study Type" string; records with a missing study
# type are collected under the literal key "nan".
annotations_by_study_type = {}
for record in tqdm(annotations_by_pmcid, desc="Analyzing annotation study types"):
    group_key = record["study_parameters"]["Study Type"]
    if pd.isna(group_key):
        group_key = "nan"
    annotations_by_study_type.setdefault(group_key, []).append(record)


Analyzing annotation study types: 100%|██████████| 13523/13523 [00:00<00:00, 1573882.37it/s]


In [10]:
# Rebuild the mapping with its keys in sorted order so the listing is stable.
annotations_by_study_type = {
    study_type: annotations_by_study_type[study_type]
    for study_type in sorted(annotations_by_study_type)
}

# Report how many records fall under each study type.
for study_type, annotations in annotations_by_study_type.items():
    print(f"{study_type}: {len(annotations)}")


GWAS: 400
GWAS, prospective: 7
GWAS, replication: 3
GWAS, retrospective: 9
case series: 299
case series, clinical trial: 3
case series, linkage, trios: 1
case series, trios: 1
case/control: 1473
case/control, GWAS: 97
case/control, GWAS, retrospective: 7
case/control, case series: 1
case/control, clinical trial: 26
case/control, clinical trial, GWAS: 3
case/control, clinical trial, meta-analysis: 2
case/control, clinical trial, prospective: 27
case/control, clinical trial, retrospective: 9
case/control, meta-analysis: 58
case/control, meta-analysis, GWAS: 17
case/control, prospective: 8
case/control, replication: 30
case/control, replication, retrospective: 2
case/control, retrospective: 78
clinical trial: 502
clinical trial, GWAS: 51
clinical trial, meta-analysis: 6
clinical trial, meta-analysis, GWAS: 2
clinical trial, meta-analysis, replication: 1
clinical trial, prospective: 64
clinical trial, replication: 34
clinical trial, replication, prospective: 13
clinical trial, retrospectiv

In [11]:
# Save annotations_by_study_type to json
# with open("data/variantAnnotations/exploration/annotations_by_study_type.json", "w") as f:
#     json.dump(annotations_by_study_type, f)

In [64]:
# Empty shape of a per-PMID grouped entry.
# NOTE(review): no other visible cell reads this name; the grouping cell
# builds the same literal inline. Confirm whether it is still needed.
annotation_template = dict(
    pmid=None,
    pmcid=None,
    study_parameters=[],
    var_drug_ann=[],
    var_fa_ann=[],
    var_pheno_ann=[],
)

In [61]:
# Total number of records loaded from the pickle, before any grouping by PMID.
len(annotations_by_pmcid)

13523

In [70]:
# Merge records that share a PMID into a single entry: study_parameters are
# collected into a list and the three annotation lists are concatenated.
# Records with a missing PMID are all merged under the literal key "nan".
annotations_pmid_grouped = {}
for record in tqdm(annotations_by_pmcid, desc="Grouping annotations by PMID"):
    pmid_key = record["pmid"]
    if pd.isna(pmid_key):
        pmid_key = "nan"

    grouped_entry = annotations_pmid_grouped.get(pmid_key)
    if grouped_entry is None:
        grouped_entry = {
            "pmid": None,
            "pmcid": None,
            "study_parameters": [],
            "var_drug_ann": [],
            "var_fa_ann": [],
            "var_pheno_ann": [],
        }
        annotations_pmid_grouped[pmid_key] = grouped_entry

    # The identifiers are overwritten on every record, so the last record
    # seen for a PMID decides the stored pmid/pmcid.
    grouped_entry["pmid"] = record.get("pmid", None)
    grouped_entry["pmcid"] = record.get("pmcid", None)
    grouped_entry["study_parameters"].append(record.get("study_parameters", None))
    for ann_key in ("var_drug_ann", "var_fa_ann", "var_pheno_ann"):
        grouped_entry[ann_key].extend(record.get(ann_key, []))


Grouping annotations by PMID: 100%|██████████| 13523/13523 [00:00<00:00, 592464.33it/s]


In [76]:
# Spot-check the grouped entry for a single PMID; its study_parameters is a
# list holding one item per source record that carried this PMID.
annotations_pmid_grouped[29238301]

{'pmid': 29238301,
 'pmcid': 'PMC5712579',
 'study_parameters': [{'Study Parameters ID': 1449169927,
   'Variant Annotation ID': 1449169911,
   'Study Type': 'case/control',
   'Study Cases': 15.0,
   'Study Controls': 50.0,
   'Characteristics': 'SCAR, MPE, SJS (cases and controls)',
   'Characteristics Type': 'Disease',
   'Frequency In Cases': 0.06,
   'Allele Of Frequency In Cases': '*35:08',
   'Frequency In Controls': 0.001,
   'Allele Of Frequency In Controls': '*35:08',
   'P Value': '= 0.231',
   'Ratio Stat Type': 'OR',
   'Ratio Stat': 10.45,
   'Confidence Interval Start': 0.4,
   'Confidence Interval Stop': 270.41,
   'Biogeographical Groups': 'East Asian',
   'PMID': 29238301},
  {'Study Parameters ID': 1449170027,
   'Variant Annotation ID': 1449169930,
   'Study Type': 'case/control',
   'Study Cases': 15.0,
   'Study Controls': 50.0,
   'Characteristics': 'SCAR, MPE or SJS (cases and controls)',
   'Characteristics Type': 'Disease',
   'Frequency In Cases': 0.13,
   'A

In [77]:
# Save annotations_pmid_grouped to json.
# The target directory is created first so the write does not fail when it
# is missing.
# NOTE: json.dump converts the integer PMID keys to strings, so a reloaded
# file must be indexed with str keys.
grouped_output_path = "data/variantAnnotations/exploration/annotations_pmid_grouped_2.json"
os.makedirs(os.path.dirname(grouped_output_path), exist_ok=True)
with open(grouped_output_path, "w") as f:
    json.dump(annotations_pmid_grouped, f, indent=2)


In [18]:
# Compare sizes before and after grouping: annotations_pmid_grouped has one
# entry per distinct PMID key, while annotations_by_pmcid has one entry per
# source record (several records can share a PMID).
print(f"Number of pmids in annotations_pmid_grouped: {len(annotations_pmid_grouped)}")
print(f"Number of records in annotations_by_pmcid: {len(annotations_by_pmcid)}")


Number of pmids in annotations_pmid_grouped: 2864
Number of pmids in annotations_by_pmcid: 13523
Number of pmids in annotations_by_pmcid: 13523


In [22]:
# Look at the first record again.
# NOTE(review): the saved output of this cell shows a different record than
# the earlier identical cell, and the execution counts are out of order, so
# the outputs presumably come from different runs. Re-run top to bottom to
# confirm.
annotations_by_pmcid[0]

{'pmid': 39528547,
 'pmcid': 'PMC11554802',
 'title': 'Efficacy and safety of sacituzumab govitecan Trop-2-targeted antibody-drug conjugate in solid tumors and UGT1A1*28 polymorphism: a systematic review and meta-analysis',
 'study_parameters': {'Study Parameters ID': 1453086511,
  'Variant Annotation ID': 1453086503,
  'Study Type': 'meta-analysis',
  'Study Cases': 1138.0,
  'Study Controls': nan,
  'Characteristics': 'Grade 3/4 Neutropenia, 5 studies',
  'Characteristics Type': 'Study Cohort',
  'Frequency In Cases': nan,
  'Allele Of Frequency In Cases': nan,
  'Frequency In Controls': nan,
  'Allele Of Frequency In Controls': nan,
  'P Value': '= 0.05',
  'Ratio Stat Type': nan,
  'Ratio Stat': nan,
  'Confidence Interval Start': nan,
  'Confidence Interval Stop': nan,
  'Biogeographical Groups': 'Multiple groups',
  'PMID': 39528547},
 'var_drug_ann': [],
 'var_fa_ann': [],
 'var_pheno_ann': [{'Variant Annotation ID': 1453086503,
   'Variant/Haplotypes': 'UGT1A1*1, UGT1A1*28',
  

In [23]:
# Copy of the records ordered by ascending "pmid" (despite the variable name,
# the sort key is the PMID, not the PMCID).
sorted_annotations_by_pmcid = list(annotations_by_pmcid)
sorted_annotations_by_pmcid.sort(key=lambda record: record["pmid"])

In [31]:
# Inspect a single record from the PMID-sorted list.
# NOTE(review): index 5 looks like an arbitrary choice; confirm.
sorted_annotations_by_pmcid[5]

{'pmid': 990860,
 'pmcid': 'PMC1689719',
 'title': 'Drug-induced haemolysis in glucose-6-phosphate dehydrogenase deficiency',
 'study_parameters': {'Study Parameters ID': 1184521221,
  'Variant Annotation ID': 1184521219,
  'Study Type': 'cohort',
  'Study Cases': 8.0,
  'Study Controls': nan,
  'Characteristics': 'Red blood cells from G6PD deficient individuals classified as having the Canton variant or Hong Kong-Pokfulam variant.',
  'Characteristics Type': 'Study Cohort',
  'Frequency In Cases': nan,
  'Allele Of Frequency In Cases': nan,
  'Frequency In Controls': nan,
  'Allele Of Frequency In Controls': nan,
  'P Value': nan,
  'Ratio Stat Type': 'OR',
  'Ratio Stat': nan,
  'Confidence Interval Start': nan,
  'Confidence Interval Stop': nan,
  'Biogeographical Groups': 'East Asian',
  'PMID': 990860},
 'var_drug_ann': [],
 'var_fa_ann': [],
 'var_pheno_ann': [{'Variant Annotation ID': 1184521237,
   'Variant/Haplotypes': 'G6PD deficiency',
   'Gene': 'G6PD',
   'Drug(s)': 'pheny