# Benchmarking ClinGen vs Literature evidence


1. Fetch the most recent OT evidence set.
2. Parse evidence: get all literature-based evidence.
3. Apply score filter.

```bash
cd /Users/dsuveges/project/evidences
wget https://storage.googleapis.com/open-targets-data-releases/20.06/output/20.06_evidence_data.json.gz
```

## Parsing evidence file

In [26]:
import pandas as pd
import json

# Input locations.
literature_data = '/Users/dsuveges/project/evidences/20.06_evidence_data.filtered.parquet'
# NOTE(review): `clingen_Data` is not read anywhere in this cell -- the ClinGen
# table is loaded from the pre-parsed TSV further down. Left defined in case
# another cell refers to it.
clingen_Data = '/Users/dsuveges/project/evidences/clingen_2020-08-04.json.gz'

# Reading and filtering literature data:
literature_df = pd.read_parquet(literature_data)

# Minimum association score a literature evidence needs to be kept.
# Sizes observed for the thresholds tried so far:
#   0.6 -> 32570 evidence / 20289 unique
#   0.5 -> 47300 evidence / 27402 unique
score_threshold = 0.5

filtered_lit = literature_df.loc[literature_df.assoc_score >= score_threshold]

# Opening filtered and processed clingen data:
clingen_df = pd.read_csv('parsed_clingen_data.tsv.gz', sep='\t', compression='infer')

# Distinct disease/target combinations that survive the score filter.
unique_pairs = filtered_lit[['efo_code', 'gene_id']].drop_duplicates()

print(f"Number of literature evidence: {len(filtered_lit):,}")
print(f"Number of unique target/disease evidence: {len(unique_pairs):,}")
      
      

Number of literature evidence: 47,300
Number of unique target/disease evidence: 27,402


### Find out overlap between clingen and the literature evidence set

In [2]:
# Quick look at which (confidence, score) combinations occur:
# clingen_df.apply(lambda row: (row['confidence'], row['score']), axis=1).value_counts()

# One summary row per ClinGen confidence class: the number of evidence rows in
# the class and the score found on the first row of that class.
#
# Group by the bare column name rather than a one-element list: with
# `groupby(['confidence'])` pandas >= 2.0 yields 1-tuple keys, which would put
# tuples such as ('Definitive',) into the `confidence` column and break later
# equality comparisons against `clingen_df.confidence`.
summary_list = [
    {
        'confidence': conf,
        'evidence_count': len(group),
        'score': group['score'].iloc[0],
    }
    for conf, group in clingen_df.groupby('confidence')
]

# Column order is pinned so the frame has the expected columns (and can be
# sorted) even when there are no groups at all.
summary_df = pd.DataFrame(summary_list, columns=['confidence', 'evidence_count', 'score'])
summary_df = summary_df.sort_values('score', ascending=False)


summary_df

Unnamed: 0,confidence,evidence_count,score
0,Definitive,528,1.0
6,Strong,19,0.75
3,Moderate,100,0.5
2,Limited,162,0.25
1,Disputed,69,0.1
5,Refuted,10,0.05
4,No Reported Evidence,92,0.01


In [3]:
# Module-level value; get_overlap() takes its own `confidence` argument and
# does not read this variable.
confidence = 'Definitive'


def get_overlap(confidence):
    """Measure how much of one ClinGen confidence class is backed by literature evidence.

    Reads the module-level ``clingen_df`` and ``filtered_lit`` data frames.

    Args:
        confidence: value of ``clingen_df.confidence`` selecting the rows to test.

    Returns:
        tuple: ``(hits_found, score_mean)``. ``hits_found`` is the number of
        selected ClinGen rows whose (disease, target) pair has at least one
        scored row in ``filtered_lit``. ``score_mean`` is the mean, over those
        rows, of the pair's mean ``assoc_score``; it is NaN when nothing matches.
    """
    # Filter data for a specific confidence; only the join keys are needed.
    pairs = clingen_df.loc[clingen_df.confidence == confidence, ['disease', 'target']]

    # Mean literature score per disease/target pair, computed in one pass
    # instead of re-scanning `filtered_lit` once for every ClinGen row.
    lit_means = (
        filtered_lit
        .groupby(['efo_code', 'gene_id'])['assoc_score']
        .mean()
        .reset_index()
    )

    # Left join keeps every ClinGen row; rows without literature support get NaN.
    matched = pairs.merge(
        lit_means,
        how='left',
        left_on=['disease', 'target'],
        right_on=['efo_code', 'gene_id'],
    )

    # Report overlap (the found scores could also be shown as a boxplot):
    found_scores = matched['assoc_score'].dropna()
    hits_found = len(found_scores)
    score_mean = found_scores.mean()

    return (hits_found, score_mean)

# Evaluate every confidence class; each element is a (hit count, mean score) pair.
overlaps = summary_df['confidence'].map(get_overlap)

# Split the pairs into one column each.
summary_df['lit_overlap_cnt'] = overlaps.map(lambda pair: pair[0])
summary_df['lit_overlap_mean_score'] = overlaps.map(lambda pair: pair[1])



In [44]:
# Share of each confidence class that also has literature support.
# Plain column arithmetic gives the same values as a row-wise apply, without a
# Python call per row, and also works when the frame has no rows.
summary_df['overlap_prop'] = summary_df['lit_overlap_cnt'] / summary_df['evidence_count']
summary_df

Unnamed: 0,confidence,evidence_count,score,lit_overlap_cnt,lit_overlap_mean_score,overlap_prop
0,Definitive,528,1.0,32,0.659018,0.060606
6,Strong,19,0.75,1,0.504,0.052632
3,Moderate,100,0.5,1,0.69,0.01
2,Limited,162,0.25,0,,0.0
1,Disputed,69,0.1,1,0.612,0.014493
5,Refuted,10,0.05,1,0.614,0.1
4,No Reported Evidence,92,0.01,0,,0.0


## Test against all literature data


Instead of applying a score threshold, I'll use all literature evidence as follows:

1. Calculate the harmonic sum for all unique target-disease pairs.
2. The performance of the ClinGen scores is tested against this table.

In [27]:
def harmonic_sum(data, scale_factor=1, cap=None):
    """
    Returns the harmonic sum of the data passed.

    The values are ranked from largest to smallest and the value at rank ``r``
    (starting at 1) contributes ``value / r ** scale_factor``.

    Args:
        data (iterable): floats to compute the harmonic sum from. The input is
            not modified.
        scale_factor (float): exponent applied to the rank in the divisor. Defaults to 1
        cap (float): if not None, never return an harmonic sum higher than the cap value.
    Returns:
        harmonic_sum (float): the harmonic sum of the data passed (0 for empty input)
    """

    # sorted() works on a copy: the caller's list keeps its order, and any
    # iterable (tuple, generator, Series) is accepted, not only lists.
    ranked = sorted(data, reverse=True)
    total = sum(score / (rank ** scale_factor) for rank, score in enumerate(ranked, start=1))

    # Applying cap:
    if cap is not None and total > cap:
        return cap

    return total

In [None]:
# Harmonic-sum score for every unique target/disease pair, using all
# literature evidence (no score threshold).
# The rows are collected in a list and the frame is built once: growing a
# DataFrame one `.loc` assignment at a time copies the data on every step and
# leaves every column with object dtype.
summed_rows = [
    {
        'target': target,
        'disease': disease,
        'score': harmonic_sum(group['assoc_score'].tolist()),
    }
    for (target, disease), group in literature_df.groupby(['gene_id', 'efo_code'])
]
summed_literature = pd.DataFrame(summed_rows, columns=['target', 'disease', 'score'])

summed_literature.head()

In [38]:
len(summed_literature)

507