In [1]:
import pandas as pd
import requests

from opentargets_pharmgkb.pandas_utils import read_tsv_to_df

In [2]:
# Local directory that the PharmGKB clinical annotation TSVs are extracted into and read from.
# NOTE(review): hard-coded absolute path - change it to a directory on your own machine before running.
work_dir = '/home/april/projects/opentargets/pharmgkb/star-alleles'

In [None]:
# Rerun to refresh data
!cd {work_dir}
!wget -q https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
!unzip -qj clinicalAnnotations.zip "*.tsv" -d {work_dir}
!rm clinicalAnnotations.zip

In [3]:
# Load the two PharmGKB TSVs from work_dir.
# Both frames carry a 'Clinical Annotation ID' column, which is used to join them further down.
annotations_df = read_tsv_to_df(f'{work_dir}/clinical_annotations.tsv')
alleles_df = read_tsv_to_df(f'{work_dir}/clinical_ann_alleles.tsv')

In [4]:
# Total number of clinical annotation records loaded
len(annotations_df)

5101

In [5]:
# Keep only the annotations whose variant/haplotype listing mentions no rsID at all.
# Match a real rsID token (`rs` followed by digits) rather than the bare substring 'rs', so that
# a named allele which merely contains those two letters is not mistaken for an rsID.
has_rsid = annotations_df['Variant/Haplotypes'].str.contains(r'\brs\d+', regex=True)
no_rs_annotations = annotations_df[~has_rsid]

In [6]:
# Number of clinical annotations that do not reference any rsID
len(no_rs_annotations)

596

In [7]:
# Check names to see if there's anything truly bizarre:
# collect the distinct 'Variant/Haplotypes' strings among the no-rsID annotations
names = no_rs_annotations['Variant/Haplotypes'].unique()

In [8]:
# Note that the "variant/haplotype name" is a listing of which alleles are annotated in the specific record,
# it will not immediately tell you whether the records are referring to the "same haplotype"
names[:50]

array(['HLA-B*15:02',
       'CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41',
       'CYP2D6*1, CYP2D6*1xN, CYP2D6*2xN, CYP2D6*4, CYP2D6*5',
       'CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*2xN',
       'CYP2D6*1, CYP2D6*3, CYP2D6*4, CYP2D6*4xN, CYP2D6*5, CYP2D6*6',
       'CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D6*10x2, CYP2D6*11, CYP2D6*17, CYP2D6*21, CYP2D6*36, CYP2D6*41',
       'UGT1A3*1, UGT1A3*2, UGT1A3*3', 'HLA-B*55:01',
       'CYP2C19*1, CYP2C19*17',
       'NAT2*4, NAT2*5, NAT2*6, NAT2*7, NAT2*12, NAT2*13',
       'CYP3A5*1, CYP3A5*3', 'CYP2C9*1, CYP2C9*3',
       'CYP2C19*1, CYP2C19*2, CYP2C19*3', 'UGT1A1*1, UGT1A1*28',
       'CYP2B6*1, CYP2B6*6', 'NUDT15*1, NUDT15*4, NUDT15*5, NUDT15*6',
       'NUDT15*1, NUDT15*6', 'CYP2D6*1, CYP2D6*10', 'UGT1A1*1, UGT1A1*6',
       'CYP2C9*1, CYP2C9*2, CYP2C9*3', 'HLA-B*48:01',
       'CYP2C19*1, CYP2C19*2, CYP2C19*17', 'HLA-B*15:12',
  

In [9]:
# Partition the distinct names by whether they use star notation ('*' anywhere in the string)
star_allele_names, no_star_names = [], []
for name in names:
    bucket = star_allele_names if '*' in name else no_star_names
    bucket.append(name)

In [10]:
no_star_names

['G6PD A- 202A_376G, G6PD B (reference)',
 'GSTT1 non-null, GSTT1 null',
 'GSTM1 non-null, GSTM1 null',
 'G6PD A- 202A_376G, G6PD B (reference), G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham',
 'SLC6A4 HTTLPR long form (L allele), SLC6A4 HTTLPR short form (S allele)',
 'G6PD B (reference), G6PD Mediterranean Haplotype',
 'G6PD B (reference), G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham',
 'G6PD B (reference), G6PD Canton, Taiwan-Hakka, Gifu-like, Agrigento-like',
 'G6PD B (reference), G6PD Mediterranean Haplotype, G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham',
 'G6PD A- 202A_376G, G6PD B (reference), G6PD Mediterranean Haplotype, G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham',
 'G6PD A- 202A_376G']

No star allele observations:
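* [G6PD](https://www.pharmgkb.org/gene/PA28469/haplotype) seems well-defined, though the naming is idiosyncratic: some names (e.g. "G6PD Mediterranean, Dallas, Panama, Sassari, Cagliari, Birmingham") appear to contain commas themselves, so it is probably not safe to just comma-split these strings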
  * may be a bit clearer in the alleles tables, e.g. [here](https://www.pharmgkb.org/clinicalAnnotation/1183621000)
* [GSTT1](https://www.pharmgkb.org/gene/PA183/haplotype), [GSTM1](https://www.pharmgkb.org/gene/PA182/haplotype) null/non-null are just absence or presence of the entire gene; if this naming convention is standard, we can work with it
* [SLC6A4](https://www.pharmgkb.org/gene/PA312/haplotype) seems to be just... special

Note we can clearly get affected genes for all of these alleles though, from PGKB directly.

In [None]:
# Not actually sure whether the star/no star named allele distinction is relevant
# star_annotations = no_rs_annotations[no_rs_annotations['Variant/Haplotypes'].str.contains(r'\*')]
# no_star_annotations = no_rs_annotations[~no_rs_annotations['Variant/Haplotypes'].str.contains(r'\*')]

In [11]:
# Attach the annotation-level columns to each allele row by joining on the shared annotation ID
joined_df = pd.merge(alleles_df, no_rs_annotations, on='Clinical Annotation ID')

In [12]:
# Keep only the columns needed for reading the joined table; everything else is dropped
readable_columns = [
    'Clinical Annotation ID',
    'Genotype/Allele',
    'Annotation Text',
    'Allele Function',
    'Variant/Haplotypes',
    'Gene',
    'Level of Evidence',
    'Phenotype Category',
    'Drug(s)',
    'Phenotype(s)',
]
joined_df = joined_df[readable_columns]

In [13]:
# Show every allele row of one example annotation (the ID is compared as a string)
is_example_annotation = joined_df['Clinical Annotation ID'] == '1451259580'
joined_df[is_example_annotation]

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant/Haplotypes,Gene,Level of Evidence,Phenotype Category,Drug(s),Phenotype(s)
1,1451259580,*1,The CYP2D6*1 allele is assigned as a normal fu...,Normal function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder
2,1451259580,*1xN,The CYP2D6*1xN alleles (*1x2 and *1x≥3) have b...,Increased function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder
3,1451259580,*2,The CYP2D6*2 allele is assigned as a normal fu...,Normal function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder
4,1451259580,*3,The CYP2D6*3 allele is assigned as a no functi...,No function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder
5,1451259580,*4,The CYP2D6*4 allele is assigned as a no functi...,No function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder
6,1451259580,*5,The CYP2D6*5 allele is assigned as a no functi...,No function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder
7,1451259580,*6,The CYP2D6*6 allele is assigned as a no functi...,No function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder
8,1451259580,*10,The CYP2D6*10 allele is assigned as a decrease...,Decreased function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder
9,1451259580,*41,The CYP2D6*41 allele is assigned as a decrease...,Decreased function,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",CYP2D6,1A,Toxicity,amitriptyline,Depressive Disorder


Questions:

* Can we automatically get the PGKB spreadsheet definitions of these? Do we need to?
* Once we have those definitions what should we do with them?

In [14]:
# URL template for a gene's allele definition spreadsheet; `{gene}` is filled in with str.format
allele_definition_url = 'https://api.pharmgkb.org/v1/download/file/attachment/{gene}_allele_definition_table.xlsx'

In [15]:
# Distinct values of the 'Gene' column in the joined no-rsID table
genes = joined_df['Gene'].unique()

In [16]:
genes

array(['HLA-B', 'CYP2D6', 'UGT1A3', 'CYP2C19', 'NAT2', 'CYP3A5', 'CYP2C9',
       'UGT1A1', 'CYP2B6', 'NUDT15', 'CYP2C8', 'CYP3A4', 'HLA-A', 'G6PD',
       'UGT2B15', 'SLCO1B1', 'GSTT1', 'GSTM1', 'TPMT', 'SLC6A4', 'HLA-C',
       'HLA-DRB1', 'HLA-DQB1', 'HLA-DPB1', 'CYP3A7', 'CYP2A6', 'HLA-DRB3',
       'CYP1A2', 'UGT1A6', 'CYP2E1', 'UGT1A7', 'HLA-DQA1', 'UGT1A4',
       'CYP1A1', 'CYP4F2'], dtype=object)

In [17]:
# Try to load the allele definition spreadsheet for every gene.
# A gene whose download or parsing fails is reported and left out of the dict.
allele_def_tables = {}
for gene in genes:
    try:
        table = pd.read_excel(
            allele_definition_url.format(gene=gene),
            storage_options={'User-Agent': 'Mozilla/5.0'},
        )
    except Exception as e:
        print(f'Error for {gene}: {e}')
    else:
        allele_def_tables[gene] = table

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Error for UGT1A3: HTTP Error 404: 


  warn("Workbook contains no default style, apply openpyxl's default")


Error for NAT2: HTTP Error 404: 


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Error for UGT2B15: HTTP Error 404: 


  warn("Workbook contains no default style, apply openpyxl's default")


Error for GSTT1: HTTP Error 404: 
Error for GSTM1: HTTP Error 404: 


  warn("Workbook contains no default style, apply openpyxl's default")


Error for SLC6A4: HTTP Error 404: 


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Error for CYP3A7: HTTP Error 404: 


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Error for CYP1A2: HTTP Error 404: 
Error for UGT1A6: HTTP Error 404: 
Error for CYP2E1: HTTP Error 404: 
Error for UGT1A7: HTTP Error 404: 


  warn("Workbook contains no default style, apply openpyxl's default")


Error for UGT1A4: HTTP Error 404: 
Error for CYP1A1: HTTP Error 404: 


  warn("Workbook contains no default style, apply openpyxl's default")


In [18]:
# Genes for which no allele definition table was loaded
no_allele_def_table_genes = set(genes).difference(allele_def_tables)

In [19]:
no_allele_def_table_genes

{'CYP1A1',
 'CYP1A2',
 'CYP2E1',
 'CYP3A7',
 'GSTM1',
 'GSTT1',
 'NAT2',
 'SLC6A4',
 'UGT1A3',
 'UGT1A4',
 'UGT1A6',
 'UGT1A7',
 'UGT2B15'}

I checked a few genes from this list and they indeed don't have definition tables in PharmGKB. The categories I see:
* Refer to another resource: [some (but not all) CYP](https://www.pharmgkb.org/gene/PA129), [NAT](https://www.pharmgkb.org/gene/PA18/haplotype), [UGT](https://www.pharmgkb.org/gene/PA37179/haplotype)
* Null/non-null: [GSTT1](https://www.pharmgkb.org/gene/PA183/haplotype), [GSTM1](https://www.pharmgkb.org/gene/PA182/haplotype)
* Special: [SLC6A4](https://www.pharmgkb.org/gene/PA312/haplotype)

For now we'll skip these and look at those with allele definition tables (this covers 543 of the 596 no-rsID records, i.e. about 91%; see the count of skipped records below).

In [20]:
# What do we lose if we skip these?
# Count the no-rsID annotations whose gene has no allele definition table.
in_skipped_gene = no_rs_annotations['Gene'].isin(no_allele_def_table_genes)
len(no_rs_annotations[in_skipped_gene])

53

In [21]:
# Let pandas display up to 100 rows, so the 50-row table previews below are not truncated
pd.set_option('display.max_rows', 100)

In [22]:
# Preview the first rows of the CYP2D6 allele definition table
allele_def_tables['CYP2D6'].head(50)

Unnamed: 0,GENE: CYP2D6,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 141,Unnamed: 142,Unnamed: 143,Unnamed: 144,Unnamed: 145,Unnamed: 146,Unnamed: 147,Unnamed: 148,Unnamed: 149,Unnamed: 150
0,NG_008376.4 (ATG start),14C>T,19G>A,31G>A,64delC,73C>T,77G>A,82C>T,100C>T,122C>T,...,4165T>G,4167T>C,4168G>A,4169C>G,4170T>C,4173C>T,4181G>C,4187C>T,4214G>A,Structural Variation
1,Effect on protein (NP_000097.3),p.A5V,p.V7M,p.V11M,p.L22X,p.R25W,p.R26H,p.R28C,p.P34S,p.P41L,...,p.F481V,,p.A482T,p.A482G,,,p.S486T,p.S488F,p.R497H,
2,Position at NC_000022.11 (Homo sapiens chromos...,g.42130778G>A,g.42130773C>T,g.42130761C>T,g.42130729del,g.42130719G>A,g.42130715C>T,g.42130710G>A,g.42130692G>A,g.42130670G>A,...,g.42126627A>C,g.42126625A>G,g.42126624C>T,g.42126623G>C,g.42126622A>G,g.42126619G>A,g.42126611C>G,g.42126605G>A,g.42126578C>T,
3,Position at NG_008376.4 (CYP2D6 RefSeqGene; re...,g.5033C>T,g.5038G>A,g.5050G>A,g.5083del,g.5092C>T,g.5096G>A,g.5101C>T,g.5119C>T,g.5141C>T,...,g.9184T>G,g.9186T>C,g.9187G>A,g.9188C>G,g.9189T>C,g.9192C>T,g.9200G>C,g.9206C>T,g.9233G>A,
4,rsID,rs773790593,rs72549358,rs769258,,rs267608313,rs28371696,rs138100349,rs1065852,rs373243894,...,,,rs74478221,rs75467367,rs747998333,rs28371736,rs1135840,rs568495591,rs1440526469,
5,CYP2D6 Allele,,,,,,,,,,...,,,,,,,,,,
6,*1,G,C,C,G,G,C,G,G,G,...,A,A,C,G,A,G,C,G,C,
7,*2,,,,,,,,,,...,,,,,,,G,,,
8,*3,,,,,,,,,,...,,,,,,,,,,
9,*4,,,,,,,,R,,...,M,R,Y,S,R,R,S,,,


In [23]:
# Just because an allele definition table is present doesn't mean it's at all useful!
allele_def_tables['HLA-DRB1'].head(50)

Unnamed: 0,GENE: HLA-DRB1,Unnamed: 1
0,,
1,Effect on protein,
2,Position on chromosomal sequence,
3,Position on gene sequence,
4,rsID,
5,HLA-DRB1 Allele,
6,*01:01,Not Callable
7,*01:02,Not Callable
8,*01:03,Not Callable
9,*01:04,Not Callable


Understanding the allele definition table:
* First few rows give various definitions of variants: protein/chromosome/gene-level HGVS, and rsID if present
* Each subsequent row gives what alleles are present for each of these variants for a particular named allele
    * In theory should be able to use the "Genotype/Allele" column from the clinical allele annotations to index into this table
* The final column is "structural variation" and contains text describing the nature of the variant, e.g. `CYP2D7::CYP2D6 hybrid gene`
* Missing values = reference? Or is e.g. `*1` (the first allele row) the reference? If so, what does a missing value mean?

### Summary

* Non-rsID containing records represent 596 / 5101 ≈ 12% of the clinical annotations
* Affected gene is easy to get for all named alleles - we can rely on the "Gene" column in PGKB data
* Most records have an allele definition table we can download - 53 do not
    * Not all of the downloadable tables are actually usable (e.g. the HLA-DRB1 rows shown above are all marked "Not Callable")

### Questions

* Identifier for these? PGKB basically uses a list of haplotypes being annotated as the identifier in their annotations table
* Do we want to resolve named alleles of haplotypes to variants, and if so how to convey this information?
* Are we interested in functional consequences or is affected gene enough? What does "functional consequence" even mean here?