# Exploring EVA scores + bechmarks against pharmaproject

### Conclusions:

#### clinical significance:

* 1% of the evidence strings have more than 1 clinical significance annotation
* These rows will be excluded from the anaysis for now (for simplicity)


### Questions:

* [ ] loading pharmaproject
* [ ] establish overlap between eva evidence set and pharmaproject
    * [ ] evidence level
    * [ ] association level
* [ ] Split overlap across review status and clicical significance

In [48]:
import pandas as pd 
import json

# Read data:
eva_df = pd.read_csv('updated_eva.tsv.gz', compression='infer', sep='\t')

# Load json object:
eva_df.clinical_significance = eva_df.clinical_significance_json.apply(lambda x: json.loads(x))


# Print out basic information:
print(f'Number of evidence: {len(eva_df)}')
print(f'Number of unique associations: {len(eva_df[["disease","target"]].drop_duplicates())}')

# Print out unmapped diseases and unmapped gene names:
print(f'Number of unmapped diseases: {len(eva_df.loc[eva_df.label.isna()])}')
print(f'Number of unmapped genes: {len(eva_df.loc[eva_df.symbol.isna()])}')



Number of evidence: 574097
Number of unique associations: 21548
Number of unmapped diseases: 0
Number of unmapped genes: 0


### Distribution of clinical significance

1. Get all clinical significance
2. Get a count across all clinical significance

In [52]:
def get_clinical_significance(x):
    
    for y in x:
        try:
            clins[y] += 1
        except:
            clins[y] = 1
    
    
clins = dict()

eva_df.clinical_significance.apply(get_clinical_significance)
clins

{'pathogenic': 90980,
 'not provided': 6025,
 'association': 174,
 'likely pathogenic': 42015,
 'risk factor': 377,
 'uncertain significance': 263668,
 'affects': 126,
 'drug response': 284,
 'benign': 75540,
 'conflicting interpretations of pathogenicity': 7718,
 'likely benign': 91390,
 'other': 2146,
 'protective': 29,
 'association not found': 3}

In [60]:
eva_df.clinical_significance.apply(len).value_counts()

1    567724
2      6368
3         5
Name: clinical_significance, dtype: int64

## Characterize eva associations

In [62]:
print(f'Number of evidence: {len(eva_df)}')
print(f'Number of associations: {len(eva_df[["target","disease"]].drop_duplicates())}')

Number of evidence: 574097
Number of associations: 21548


## Reading pharmaproject data

In [67]:
pharma_project_file = '/Users/dsuveges/project/issue-1166_benchmarking_clingen_evidences/abbvie_pharmaprojects_2018_mapped.csv'

# Reading pharmaproject file as dataframe:
pharma_df = pd.read_csv(pharma_project_file)

# Renaming columns for easy handling:
pharma_df.rename(columns={
    'ensembl_id': 'target',
    'id': 'disease',
    'lApprovedUS.EU': 'approved'
}, inplace=True)

print(f'Number of disease/target associations pair: {len(pharma_df)}')

Number of disease/target associations pair: 22947


In [69]:
# Merging data
pharma_merged = eva_df.merge(pharma_df, on=['disease', 'target'], how='left')
pharma_merged.head()

Unnamed: 0,type,target,disease,rsid,clinvar_acces,score,star,review_status,clinical_significance,so_code,consequence,efo_id,label,gene_id,symbol,clinical_significance_json,approved
0,genetic_association,ENSG00000154803,Orphanet_2903,rs886037610,RCV000003542,1e-07,0,no assertion criteria provided,[pathogenic],SO_0001589,frameshift_variant,Orphanet_2903,Familial spontaneous pneumothorax,ENSG00000154803,FLCN,"[""pathogenic""]",
1,genetic_association,ENSG00000186832,Orphanet_2337,rs1555573633,RCV000015714,1e-07,0,no assertion criteria provided,[pathogenic],SO_0001818,protein_altering_variant,Orphanet_2337,Non-epidermolytic palmoplantar keratoderma,ENSG00000186832,KRT16,"[""pathogenic""]",
2,genetic_association,ENSG00000103197,Orphanet_805,rs137854012,RCV000042664,1e-07,0,no assertion provided,[not provided],SO_0001589,frameshift_variant,Orphanet_805,Tuberous sclerosis,ENSG00000103197,TSC2,"[""not provided""]",
3,genetic_association,ENSG00000103197,Orphanet_805,rs137854420,RCV000042684,1e-07,0,no assertion provided,[not provided],SO_0001589,frameshift_variant,Orphanet_805,Tuberous sclerosis,ENSG00000103197,TSC2,"[""not provided""]",
4,genetic_association,ENSG00000103197,Orphanet_805,rs137854007,RCV000042927,1e-07,0,no assertion provided,[not provided],SO_0001589,frameshift_variant,Orphanet_805,Tuberous sclerosis,ENSG00000103197,TSC2,"[""not provided""]",


In [73]:
# Ok, what do we have here:
overlap = pharma_merged.loc[~pharma_merged.approved.isna()]
print(f'Number of poscon evidence: {len(overlap)}')
print(f'Number of poscon association: {len(overlap[["disease","target"]].drop_duplicates())}')


print(f'Number of approved evidence: {len(overlap.loc[overlap.approved == True])}')
print(f'Number of poscon association: {len(overlap.loc[overlap.approved == True, ["disease","target"]].drop_duplicates())}')

Number of poscon evidence: 8783
Number of poscon association: 294
Number of approved evidence: 3553
Number of poscon association: 49


In [92]:
# Summarizing clinical significane counts:
clinical_summary_dict = {}
for clin_list in eva_df.clinical_significance:
    for significance in clin_list:
        try:
            clinical_summary_dict[significance] += 1
        except:
            clinical_summary_dict[significance] = 1
            
# Generating dataframe from the bare eva counts:        
overlap_counts_df = pd.DataFrame({'EVA_evidence_counts':list(clinical_summary_dict.values()),
                                 'poscon_counts': 0, 'approved_counts': 0}, index = list(clinical_summary_dict.keys()))

for index, row in overlap.iterrows():
    for sign in row['clinical_significance']:
        overlap_counts_df.loc[sign,'poscon_counts'] += 1
        
        if row['approved']:
            overlap_counts_df.loc[sign,'approved_counts'] += 1
overlap_counts_df

Unnamed: 0,EVA_evidence_counts,poscon_counts,approved_counts
pathogenic,90980,951,176
not provided,6025,81,47
association,174,1,0
likely pathogenic,42015,2016,190
risk factor,377,23,1
uncertain significance,263668,3218,1523
affects,126,0,0
drug response,284,0,0
benign,75540,757,473
conflicting interpretations of pathogenicity,7718,101,63


In [94]:
overlap_counts_df.sort_values('EVA_evidence_counts', ascending=False)

Unnamed: 0,EVA_evidence_counts,poscon_counts,approved_counts
uncertain significance,263668,3218,1523
likely benign,91390,1700,1113
pathogenic,90980,951,176
benign,75540,757,473
likely pathogenic,42015,2016,190
conflicting interpretations of pathogenicity,7718,101,63
not provided,6025,81,47
other,2146,2,0
risk factor,377,23,1
drug response,284,0,0


In [101]:
overlap_counts_df['poscon_expected'] = overlap_counts_df.EVA_evidence_counts / overlap_counts_df.EVA_evidence_counts.sum() * overlap_counts_df.poscon_counts.sum()
overlap_counts_df['poscon_enrichment'] = overlap_counts_df.poscon_counts /overlap_counts_df.poscon_expected 

overlap_counts_df['approved_expected'] = overlap_counts_df.EVA_evidence_counts / overlap_counts_df.EVA_evidence_counts.sum() * overlap_counts_df.approved_counts.sum()
overlap_counts_df['approved_enrichment'] = overlap_counts_df.approved_counts /overlap_counts_df.approved_expected 

overlap_counts_df.sort_values('EVA_evidence_counts', ascending=False)

Unnamed: 0,EVA_evidence_counts,poscon_counts,approved_counts,poscon_expected,poscon_enrichment,approved_expected,approved_enrichment
uncertain significance,263668,3218,1523,4020.372054,0.800423,1628.861619,0.935009
likely benign,91390,1700,1113,1393.501684,1.219948,564.579939,1.971377
pathogenic,90980,951,176,1387.250062,0.685529,562.047082,0.313141
benign,75540,757,473,1151.823145,0.657219,466.663405,1.013579
likely pathogenic,42015,2016,190,640.638727,3.146859,259.556036,0.732019
conflicting interpretations of pathogenicity,7718,101,63,117.682963,0.858238,47.679483,1.321323
not provided,6025,81,47,91.868341,0.881697,37.220638,1.26274
other,2146,2,0,32.721902,0.061121,13.257343,0.0
risk factor,377,23,1,5.748442,4.001084,2.328993,0.42937
drug response,284,0,0,4.330391,0.0,1.754467,0.0


In [107]:
from scipy import stats
from scipy import statsmodels.stats.proportion.proportions_ztest

SyntaxError: invalid syntax (<ipython-input-107-f774532f054b>, line 2)

In [110]:
from statsmodels.stats.proportion import proportions_ztest

In [113]:
proportions_ztest(1393, overlap_counts_df.poscon_counts.sum(), value=91390/overlap_counts_df.EVA_evidence_counts.sum(), alternative='larger')
# proportions_ztest(count, nobs, value=None, alternative='two-sided', prop_var=False)[source]


(-0.014643323046165113, 0.5058416319207657)