In [1]:
# Run this cell first.
# The magics below tell Jupyter to reload imported modules before executing
# code cells, so that edits made in separate Python files are picked up
# without restarting the kernel and re-importing every module after each
# code change.

%load_ext autoreload
%autoreload 2

In [24]:
from pathlib import Path

from src.annotation_extraction.simple_inference import SimpleLLM, PromptGenerator
from src.annotation_extraction.models import ArticleParser

In [25]:
# Build the parser and parse the article once, then read both fields from the
# same result. The previous version constructed a second ArticleParser just to
# get the text, which repeated the "Getting article text" step (visible as two
# identical log lines per run of this cell).
# NOTE(review): assumes parse() yields the same data on every call, so sharing
# one result is equivalent to parsing twice — confirm.
# NOTE(review): a later cell in this notebook calls
# ArticleParser.get_article_text() instead of parse(); confirm which API is
# current before relying on this cell after a kernel restart.
parsed_article = ArticleParser(pmcid="PMC11730665").parse()
article_title = parsed_article.title
article_text = parsed_article.article_text

[32m2025-06-11 15:12:58.134[0m | [1mINFO    [0m | [36msrc.annotation_extraction.models[0m:[36m__init__[0m:[36m73[0m - [1mGetting article text from PMCID: PMC11730665[0m
[32m2025-06-11 15:12:58.137[0m | [1mINFO    [0m | [36msrc.annotation_extraction.models[0m:[36m__init__[0m:[36m73[0m - [1mGetting article text from PMCID: PMC11730665[0m


In [79]:
# Prompt template for extracting drug-related variant annotation fields from
# an article. The only placeholder is {article_text}; every other brace-free
# line is sent to the model verbatim. The template lists each term to extract
# (content, manual process, example) and ends by requesting, per term, an
# "Extracted Output / Reason / Quote" triple.
DRUG_EXTRACTION_PROMPT = """
You are an expert pharmacogenomics researcher reading and extracting annotations from the following article

\n\n{article_text}\n\n

These are the following terms for which we need to extract values:

Term: Variant/Haplotypes
- Content: The specific genetic variant mentioned in the study
- Manual Process: Look for SNP IDs (rs numbers), star alleles (CYP2D6*4), or genotype combinations
- Example: rs2909451, CYP2C19*1, CYP2C19*2, *1/*18

Term: Gene
- Content: Gene symbol associated with the variant
- Manual Process: Find the gene name near the variant mention, use standard HUGO symbols
- Example: DPP4, CYP2C19, KCNJ11

Term: Drug(s)
- Content: Generic drug name(s) studied
- Manual Process: Extract drug names from methods/results, use generic names, separate multiple drugs with commas
- Example: sitagliptin, clopidogrel, aspirin

Term: Phenotype Category
- Content: Type of clinical outcome studied
- Manual Process: Categorize based on what was measured:
    - Efficacy: Treatment response, clinical improvement
    - Metabolism/PK: Drug levels, clearance, half-life
    - Toxicity: Adverse events, side effects
    - Dosage: Dose requirements, dose adjustments
    - Other: Everything else
- Example: Efficacy (for HbA1c improvement study)

Term: Significance
- Content: Whether the association was statistically significant
- Manual Process: Look for p-values, confidence intervals:
    - yes: p < 0.05 or explicitly stated as significant
    - no: p ≥ 0.05 or stated as non-significant
    - not stated: No statistical testing mentioned
- Example: yes (P < .001 in sitagliptin study)

Term: Notes
- Content: Key study details, methodology, or important context
- Manual Process: Extract relevant quotes showing statistical results, study design, or important caveats
- Example: "Patients with the rs2909451 TT genotype in the study group exhibited a median HbA1c improvement of 0.57..."

Term: Standardized Sentence

- Content: Standardized description of the genetic association
- Manual Process: Write in format: "[Genotype/Allele] is [associated with/not associated with] [increased/decreased]
[outcome] [drug context] [population context]"
- Example: "Genotype TT is associated with decreased response to sitagliptin in people with Diabetes Mellitus, Type 2."

Term: Alleles

- Content: Specific allele or genotype if different from Variant/Haplotypes field
- Manual Process: Extract the exact genotype mentioned (AA, TT, CC, del/del, etc.)
- Example: TT, *1/*18, del/del

Term: Metabolizer types

- Content: CYP enzyme phenotype categories
- Manual Process: Look for metabolizer classifications in CYP studies:
    - poor metabolizer, intermediate metabolizer, extensive metabolizer, ultrarapid metabolizer
- Example: intermediate metabolizer

Term: Comparison Allele(s) or Genotype(s)

- Content: Reference genotype used for comparison
- Manual Process: Find what the study variant was compared against
- Example: *1/*1, C (for wild-type comparisons)

Term: Comparison Metabolizer types

- Content: Reference metabolizer status for comparison
- Manual Process: Extract the comparison metabolizer phenotype
- Example: normal metabolizer

Term: Specialty Population

- Content: Age-specific populations
- Manual Process: Check if study specifically focused on:
    - Pediatric: Children/adolescents
    - Geriatric: Elderly patients
    - Leave empty for general adult populations

Term: Population types
- Content: Descriptor of study population
- Manual Process: Look for population descriptors, usually "in people with" or ethnicity information
- Example: in people with

Term: Population Phenotypes or diseases
- Content: Disease/condition context with standardized prefix
- Manual Process: Find the medical condition studied, add appropriate prefix:
    - Disease: for established diseases
    - Other: for conditions/traits
    - Side Effect: for adverse events
- Example: Other:Diabetes Mellitus, Type 2

Term: isPlural
- Content: Grammar helper for sentence construction
- Manual Process: Use Is for singular subjects, Are for plural
- Example: Is

Term: Is/Is Not associated
- Content: Direction of association
- Manual Process: Determine if association was:
    - Associated with: Positive association found
    - Not associated with: No association found
- Example: Associated with

Term: Direction of effect

- Content: Whether the effect increases or decreases the outcome
- Manual Process: Look for directional language:
    - increased: Higher levels, better response, more effect
    - decreased: Lower levels, worse response, less effect
    - Leave empty if no clear direction
- Example: decreased

Term: PD/PK terms

- Content: Pharmacological outcome descriptor
- Manual Process: Extract the specific outcome measured:
    - response to, concentrations of, metabolism of, clearance of, dose of
- Example: response to

Term: Multiple drugs And/or

- Content: Logical connector for multiple drugs
- Manual Process: If multiple drugs mentioned:
    - and: All drugs together
    - or: Any of the drugs
    - Leave empty for single drug

Term: Multiple phenotypes or diseases And/or

- Content: Logical connector for multiple conditions
- Manual Process: Similar to drugs, use and/or for multiple conditions
- Leave empty for single condition

General recommended strategies

1. Scan for genetic variants: Look for "rs" numbers, gene names with asterisks, or phrases like "genotype," "allele,"
"polymorphism"
2. Identify drug context: Find drug names in methods, results, or discussion sections
3. Locate outcome measures: Look for clinical endpoints, lab values, response rates, adverse events
4. Find statistical associations: Search for p-values, odds ratios, significant differences between genotype groups
5. Extract population details: Note the study population, disease context, and inclusion criteria
6. Standardize the relationship: Convert the finding into the standardized sentence format following the association pattern

For each term, the output should be of the format:

Extracted Output: (output)
Reason: (one sentence justification)
Quote: (quote from the article that demonstrates why)
"""

In [95]:
# Prompt template for extracting phenotype-related variant annotation fields
# (how variants affect phenotypes) from an article. The only placeholder is
# {article_text}. Terms are laid out as markdown sections (content, manual
# process, example), and the template ends by requesting, per term, an
# "Extracted Output / Reason / Quote" triple.
PHENO_EXTRACTION_PROMPT = """
You are an expert pharmacogenomics researcher reading and extracting annotations related to how variants affect phenotypes from the following article

\n\n{article_text}\n\n

These are the following terms for which we need to extract values:

## Terms for Extraction

### Variant/Haplotypes
- **Content**: The specific genetic variant studied
- **Manual Process**: Extract SNP IDs (rs numbers), HLA alleles, star alleles, or genotype combinations
- **Example**: HLA-B*35:08, rs1801272, UGT1A1*1, UGT1A1*28

### Gene
- **Content**: Gene symbol associated with the variant
- **Manual Process**: Find the gene name near the variant mention
- **Example**: HLA-B, CYP2A6, UGT1A1

### Drug(s)
- **Content**: Drug(s) that caused or were involved in the phenotype
- **Manual Process**: 
  - Extract drug names that triggered the adverse event or phenotype
  - Leave empty for disease susceptibility studies without drug involvement
- **Example**: lamotrigine, sacituzumab govitecan, empty for disease predisposition

### Phenotype Category
- **Content**: Type of phenotype or outcome studied
- **Manual Process**: Categorize based on primary outcome:
  - Toxicity: Adverse drug reactions, side effects, drug-induced toxicity
  - Efficacy: Treatment response, therapeutic outcomes
  - Metabolism/PK: Pharmacokinetic parameters, drug levels
  - Dosage: Dose requirements, dose-response relationships
  - Other: Disease susceptibility, traits not directly drug-related
- **Example**: 
  - Toxicity (for Stevens-Johnson Syndrome)
  - Other (for alcoholism risk)

### Significance
- **Content**: Statistical significance of the association
- **Manual Process**: Look for p-values and statistical tests:
  - yes: p < 0.05 or stated as significant
  - no: p ≥ 0.05 or explicitly non-significant
  - not stated: No statistical testing reported
- **Example**: no (for non-significant HLA associations)

### Notes
- **Content**: Key study details, statistics, methodology
- **Manual Process**: Extract relevant quotes showing statistical results, case descriptions, or important context
- **Example**: "The allele was not significant when comparing allele frequency in cases..."

### Standardized Sentence
- **Content**: Standardized description of the genetic-phenotype association
- **Manual Process**: Write in format: "[Variant] is [associated with/not associated with] [increased/decreased] [phenotype outcome] [drug context] [population context]"
- **Example**: "HLA-B *35:08 is not associated with likelihood of Maculopapular Exanthema, severe cutaneous adverse reactions or Stevens-Johnson Syndrome when treated with lamotrigine in people with Epilepsy."

### Alleles
- **Content**: Specific allele or genotype if different from main variant field
- **Manual Process**: Extract the exact genotype mentioned
- **Example**: *35:08, AA + AT, *1/*28 + *28/*28

### Specialty Population
- **Content**: Age-specific populations
- **Manual Process**: Identify if study focused on specific age groups:
  - Pediatric: Children/adolescents
  - Geriatric: Elderly patients
  - Leave empty for general adult populations
- **Example**: Pediatric (for children with Fanconi Anemia)

### Metabolizer Types
- **Content**: CYP enzyme phenotype when applicable
- **Manual Process**: Look for metabolizer classifications in CYP studies:
  - poor metabolizer
  - intermediate metabolizer
  - extensive metabolizer
  - ultrarapid metabolizer
  - deficiency
- **Example**: ultrarapid metabolizer, intermediate activity

### isPlural
- **Content**: Grammar helper for sentence construction
- **Manual Process**: Use Is for singular subjects, Are for plural
- **Example**: Is (for single allele), Are (for combined genotypes)

### Is/Is Not Associated
- **Content**: Direction of statistical association
- **Manual Process**: Determine association type:
  - Associated with: Positive association found
  - Not associated with: No association found
- **Example**: Not associated with, Associated with

### Direction of Effect
- **Content**: Whether the variant increases or decreases the phenotype
- **Manual Process**: Look for directional language:
  - increased: Higher risk, more severe, greater likelihood
  - decreased: Lower risk, less severe, reduced likelihood
  - Leave empty if no clear direction
- **Example**: 
  - increased (for higher toxicity risk)
  - decreased (for lower disease risk)

### Side Effect/Efficacy/Other
- **Content**: Specific phenotype outcome with standardized prefix
- **Manual Process**: Categorize the phenotype and add appropriate prefix:
  - Side Effect: for adverse drug reactions
  - Efficacy: for therapeutic outcomes
  - Disease: for disease conditions
  - Other: for other traits/conditions
  - PK: for pharmacokinetic measures
- **Example**: 
  - Side Effect:Stevens-Johnson Syndrome
  - Disease:Alcohol abuse
  - Other:Medication adherence

### When Treated With/Exposed To/When Assayed With
- **Content**: Drug administration context
- **Manual Process**: Use standard phrases:
  - when treated with: For therapeutic drug administration
  - when exposed to: For environmental or non-therapeutic exposure
  - due to: For substance-related disorders
  - Leave empty for non-drug phenotypes
- **Example**: when treated with, due to (for substance abuse)

### Multiple Drugs And/Or
- **Content**: Logical connector for multiple drugs
- **Manual Process**: If multiple drugs involved:
  - and: Combination therapy
  - or: Any of the drugs
  - Leave empty for single drug
- **Example**: or (for any of several drugs)

### Population Types
- **Content**: Description of study population
- **Manual Process**: Look for population descriptors:
  - in people with: General population with condition
  - in children with: Pediatric population
  - in women with: Gender-specific population
- **Example**: in people with, in children with

### Population Phenotypes or Diseases
- **Content**: Disease/condition context with prefix
- **Manual Process**: Find the medical condition and add prefix:
  - Disease: for established diseases
  - Other: for conditions/traits
- **Example**: 
  - Disease:Epilepsy
  - Other:Diabetes Mellitus, Type 2

### Multiple Phenotypes or Diseases And/Or
- **Content**: Logical connector for multiple conditions
- **Manual Process**: Use and/or for multiple disease contexts
- **Example**: and (for multiple comorbidities)

### Comparison Allele(s) or Genotype(s)
- **Content**: Reference genotype for comparison
- **Manual Process**: Find what the variant was compared against
- **Example**: TT (wild-type), *1/*1 (normal function allele)

### Comparison Metabolizer Types
- **Content**: Reference metabolizer phenotype
- **Manual Process**: Extract comparison metabolizer status
- **Example**: normal metabolizer

## General Strategy Recommendations

1. **Identify Phenotype Outcomes**: Look for adverse events, toxicities, disease conditions, clinical traits
2. **Find Genetic Associations**: Search for variants linked to the phenotype (may or may not involve drugs)
3. **Determine Drug Involvement**: Check if phenotype is drug-induced or related to disease susceptibility
4. **Extract Statistical Evidence**: Look for odds ratios, p-values, case reports, frequency differences
5. **Categorize Phenotype Type**: Classify as toxicity, efficacy, disease susceptibility, or other trait
6. **Note Population Context**: Identify specific patient populations, age groups, disease conditions
7. **Standardize the Relationship**: Convert findings into standardized sentence format describing the genetic-phenotype association

For each term, the output should be of the format:

Extracted Output: (output)
Reason: (one sentence justification)
Quote: (quote from the article that demonstrates why)
"""

In [105]:
# Prompt template for extracting functional-annotation fields (how variants
# affect in-vitro / lab-measured mechanisms) from an article. The only
# placeholder is {article_text}; the inference cell later in this notebook
# passes this template to PromptGenerator together with an "article_text"
# value. The template ends by requesting, per term, an
# "Extracted Output / Reason / Quote" triple.
FA_EXTRACTION_PROMPT = """
You are an expert pharmacogenomics researcher reading and extracting annotations related to how variants affect in-vitro or lab measured mechanisms from the following article

\n\n{article_text}\n\n

These are the following terms for which we need to extract values:

## Terms for Extraction

### Variant/Haplotypes
- **Content**: The specific genetic variant studied
- **Manual Process**: Extract variant names, star alleles, SNP IDs, or protein constructs tested
- **Example**: CYP2C19*1, CYP2C19*17, rs72552763, CYP2B6*1, CYP2B6*6

### Gene
- **Content**: Gene symbol associated with the variant
- **Manual Process**: Identify the gene being studied functionally
- **Example**: CYP2C19, CYP2B6, SLC22A1

### Drug(s)
- **Content**: Substrate or compound used in the functional assay
- **Manual Process**: Extract the drug/substrate used to test enzyme activity or transport
- **Example**: normeperidine, bupropion, warfarin, voriconazole

### Phenotype Category
- **Content**: Type of functional outcome measured
- **Manual Process**: Categorize based on what was measured:
  - Metabolism/PK: Enzyme activity, clearance, transport, binding affinity
  - Efficacy: Functional response in cellular systems
  - Leave empty for basic biochemical studies
- **Example**: 
  - Metabolism/PK (for enzyme kinetics)
  - Efficacy (for cellular response)

### Significance
- **Content**: Statistical significance of functional differences
- **Manual Process**: Look for statistical comparisons:
  - yes: Significant differences in activity/function
  - no: No significant differences
  - not stated: No statistical testing reported
- **Example**: 
  - yes (for significant activity differences)
  - not stated (for descriptive studies)

### Notes
- **Content**: Key experimental details, methodology, quantitative results
- **Manual Process**: Extract relevant quotes showing experimental conditions, numerical results, or important technical details
- **Example**: "Clearance was 26.57% of wild-type. CYP2C19 variants expressed in Sf21 insect cells..."

### Standardized Sentence
- **Content**: Standardized description of the functional relationship
- **Manual Process**: Write in format: "[Variant] is associated with [increased/decreased] [functional outcome] [experimental context] as compared to [reference variant]"
- **Example**: "CYP2C19 *17/*17 is associated with increased formation of normeperidine as compared to CYP2C19 *1/*1 + *1/*17."

### Alleles
- **Content**: Specific allele or genotype tested
- **Manual Process**: Extract the exact variant designation
- **Example**: *17/*17, *1/*1, del, A

### Metabolizer Types
- **Content**: Phenotype classification if applicable
- **Manual Process**: Rarely used in functional studies; mainly for CYP phenotyping
- **Example**: Usually empty

### Comparison Allele(s) or Genotype(s)
- **Content**: Reference variant for comparison
- **Manual Process**: Find the control/wild-type variant used for comparison
- **Example**: *1/*1 + *1/*17, *1, GAT

### Comparison Metabolizer Types
- **Content**: Reference metabolizer status
- **Manual Process**: Usually empty for functional studies
- **Example**: Usually empty

### Assay Type
- **Content**: Laboratory method or experimental system used
- **Manual Process**: Extract the specific assay methodology:
  - in human liver microsomes: Microsomal enzyme assays
  - hydroxylation assay: Specific metabolic pathway assays
  - crystal structure prediction: Computational modeling
  - Leave empty if not specified
- **Example**: 
  - in human liver microsomes
  - hydroxylation assay
  - crystal structure prediction

### Cell Type
- **Content**: Cell line or tissue system used for the assay
- **Manual Process**: Extract the specific cellular context:
  - 293FT cells: Human embryonic kidney cells
  - COS-7 cells: Monkey kidney cells
  - Sf21 insect cells: Insect cells for baculovirus expression
  - in insect microsomes: Microsomal preparations
  - expressed in [cell type]: Heterologous expression systems
- **Example**: 
  - in 293FT cells
  - expressed in COS-7 cells

### Specialty Population
- **Content**: Age-specific populations (rarely applicable to functional studies)
- **Manual Process**: Usually leave empty for in vitro studies
- **Example**: Usually empty

### isPlural
- **Content**: Grammar helper for sentence construction
- **Manual Process**: Use Is for singular subjects, Are for plural
- **Example**: Is

### Is/Is Not Associated
- **Content**: Direction of functional association
- **Manual Process**: Determine association type:
  - Associated with: Functional difference observed
  - Not associated with: No functional difference
- **Example**: Associated with

### Direction of Effect
- **Content**: Whether the variant increases or decreases function
- **Manual Process**: Look for directional language:
  - increased: Higher activity, better function, enhanced capability
  - decreased: Lower activity, reduced function, impaired capability
- **Example**: 
  - increased (for enhanced activity)
  - decreased (for reduced activity)

### Functional Terms
- **Content**: Specific functional outcome measured
- **Manual Process**: Extract the precise functional parameter:
  - activity of: Enzyme activity measurements
  - clearance of: Drug clearance kinetics
  - formation of: Metabolite formation
  - transport of: Transporter function
  - affinity to: Binding affinity
  - catalytic activity of: Catalytic efficiency
- **Example**: 
  - formation of
  - activity of
  - clearance of

### Gene/Gene Product
- **Content**: Specific gene or protein being functionally assessed
- **Manual Process**: Extract the gene symbol when the functional term relates to gene product activity
- **Example**: CYP2C19, CYP2B6, CYP2C9

### When Treated With/Exposed To/When Assayed With
- **Content**: Experimental substrate context
- **Manual Process**: Use standard phrases for functional assays:
  - when assayed with: For enzyme activity assays
  - of: For direct metabolite measurements
  - Leave empty for non-substrate specific functions
- **Example**: 
  - when assayed with
  - of

### Multiple Drugs And/Or
- **Content**: Logical connector for multiple substrates
- **Manual Process**: If multiple substrates tested:
  - and: Combination substrate assays
  - or: Alternative substrate assays
  - Leave empty for single substrate
- **Example**: or (for alternative substrates)

## Manual Reading Strategy for Functional Annotations

1. **Identify Experimental System**: Look for cell lines, microsomes, expression systems, computational models
2. **Find Functional Readouts**: Search for enzyme activity, kinetic parameters, binding affinity, transport rates
3. **Extract Substrate Information**: Identify the drug/compound used to test function
4. **Locate Comparison Data**: Find reference variants (usually wild-type or *1 alleles) for comparison
5. **Quantify Functional Changes**: Look for fold-changes, percentages, kinetic parameters (Km, Vmax, clearance)
6. **Note Experimental Conditions**: Extract assay conditions, expression systems, substrate concentrations
7. **Standardize the Relationship**: Convert findings into standardized sentence format describing the functional difference

## Key Differences from Clinical Annotations

- **Laboratory-based**: In vitro studies rather than patient studies
- **Mechanistic Focus**: How variants affect protein function rather than clinical outcomes
- **Quantitative Measures**: Enzyme kinetics, binding constants, activity percentages
- **Controlled Conditions**: Defined experimental systems rather than clinical populations
- **Substrate-specific**: Effects measured with specific drugs/compounds as substrates

**Purpose**: Functional annotations provide the mechanistic basis for understanding why certain variants affect drug response in patients - they show how genetic changes alter protein function at the molecular level.

For each term, the output should be of the format:

Extracted Output: (output)
Reason: (one sentence justification)
Quote: (sentence from the article that demonstrates why)
"""

In [104]:
# Load environment variables from a local .env file into the process
# environment; the cell output shows whether load_dotenv() reported success.
# NOTE(review): presumably SimpleLLM reads its API credentials from the
# environment, so this cell must run before the model is created — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [106]:
# End-to-end run of the functional-annotation prompt against one article.
model = SimpleLLM()

# Fetch the article body for the PMCID under test. Note that this rebinds
# `article_text`, replacing the value loaded by the earlier PMC11730665 cell.
fa_parser = ArticleParser(pmcid="PMC5728534")
article_text = fa_parser.get_article_text()

# Hand the template and its substitution values to PromptGenerator
# (presumably it fills the {article_text} placeholder — confirm).
prompt_values = {"article_text": article_text}
prompt_builder = PromptGenerator(FA_EXTRACTION_PROMPT, prompt_values)
prompt = prompt_builder.get_prompt()

# Ask the model for the extraction; the raw response is kept in `summary`.
summary = model.generate(prompt)

[32m2025-06-11 21:58:11.800[0m | [1mINFO    [0m | [36msrc.annotation_extraction.models[0m:[36m__init__[0m:[36m73[0m - [1mGetting article text from PMCID: PMC5728534[0m


In [107]:
# Save the model response to disk so it can be inspected outside the notebook.
summary_path = Path("output_testing/fa_2.txt")

# Create the output folder on demand; without this the write fails with
# FileNotFoundError on a checkout where output_testing/ does not exist yet.
summary_path.parent.mkdir(parents=True, exist_ok=True)

# Write as UTF-8 explicitly: the default is the platform locale encoding,
# which can raise UnicodeEncodeError on non-ASCII characters in the response
# and makes the file's encoding differ between machines.
with summary_path.open("w", encoding="utf-8") as f:
    f.write(summary)