In [1]:
import pandas as pd
import numpy as np
from scipy import stats

## 1. Helper functions

In [2]:
# Columns kept from each benchmark summary CSV (passed to `pd.read_csv` as `usecols`).
HAPPY_COLUMNS = [
    'Type',
    'Filter',
    'TRUTH.TP',
    'QUERY.FP',
    'TRUTH.FN',
    'METRIC.Recall',
    'METRIC.Precision',
    'METRIC.F1_Score',
]


## 2. Input files

In [3]:
# Tab-separated table of sample annotations (read into `df_samples`).
SAMPLE_ANNOTATIONS_FILE = 'sample_annotations.txt'

# Root directory of the per-sample benchmark summary CSVs.
ARRAY_BENCHMARK_DIR = '1KG_HGDP_array_imputation_analysis'


## 3. Load data

In [4]:
# Sample annotations: tab-separated, column names taken from the first row.
df_samples = pd.read_csv(SAMPLE_ANNOTATIONS_FILE, sep='\t', header=0)

# HG-prefixed sample ID -> NA-prefixed sample ID.
hgid2naid = dict(
    HG002='NA24385',
    HG003='NA24149',
    HG004='NA24143',
)

# HG-prefixed sample ID -> relationship within the family.
hgid2rel = dict(
    HG002='Son',
    HG003='Father',
    HG004='Mother',
)

### 3.1 Load 1KG+HGDP array imputed variants

In [5]:
# Benchmark summary column -> short column name used in the rest of the notebook.
happy_renames = {
    'METRIC.Recall': 'Recall',
    'METRIC.Precision': 'Precision',
    'METRIC.F1_Score': 'F1_Score',
    'TRUTH.TP': 'TP',
    'QUERY.FP': 'FP',
    'TRUTH.FN': 'FN',
}

# One summary CSV is read for every distinct (HGID, Family) pair in the sample table;
# only the HGID is needed to locate the file.
per_sample_summaries = []
for sample_hgid in df_samples[['HGID', 'Family']].drop_duplicates()['HGID']:
    summary_csv = f'{ARRAY_BENCHMARK_DIR}/Genomewide/{sample_hgid}_GRCh37_1_22_v4.2.1_GSAv3_positions.impuded.dose.happy_benchmark.summary.csv'
    summary = pd.read_csv(summary_csv, usecols=HAPPY_COLUMNS).rename(columns=happy_renames)
    # Total_called = TP + FP; HGID tags each row with the sample it came from.
    summary = summary.assign(
        Total_called=summary['TP'] + summary['FP'],
        HGID=sample_hgid,
    )
    per_sample_summaries.append(summary)

# Stack all samples into one frame with a fresh 0..n-1 index.
df_array_imputed_variants = pd.concat(per_sample_summaries).reset_index(drop=True)


## 4. Figures and tables

### TABLE: Precision and recall rates of variants imputed using local reference panel genome-wide.

In [6]:
# Build the per-sample summary table: one row per sample in `hgid2naid`, with
# TP / FN / FP counts and precision / recall for SNVs and InDels, restricted to
# rows whose `Filter` is 'ALL'. All cells are stored as pre-formatted strings.

# (label used in the output column names, value matched against the `Type` column)
variant_types = [('SNVs', 'SNP'), ('InDels', 'INDEL')]

# (output column suffix, source column, format spec):
# counts get thousands separators, rates get four decimals.
metric_specs = [
    ('TP', 'TP', ','),
    ('FN', 'FN', ','),
    ('FP', 'FP', ','),
    ('Precision', 'Precision', '.4f'),
    ('Recall', 'Recall', '.4f'),
]

# Column order: Sample, then all SNV metrics, then all InDel metrics.
data = {'Sample': []}
for label, _ in variant_types:
    for suffix, _, _ in metric_specs:
        data[f'Imputed {label} {suffix}'] = []

# The Filter == 'ALL' restriction does not depend on the sample or variant type,
# so it is applied once up front.
df_all_filter = df_array_imputed_variants[df_array_imputed_variants.Filter == 'ALL']

for hgid in hgid2naid:
    data['Sample'].append(hgid)
    for label, variant_type in variant_types:
        df_temp = df_all_filter[(df_all_filter.HGID == hgid) & (df_all_filter.Type == variant_type)]
        if df_temp.empty:
            # Fail with a message naming the missing row instead of the bare
            # "index 0 is out of bounds" that `.values[0]` would produce.
            raise IndexError(
                f"No benchmark row found for HGID={hgid!r}, Type={variant_type!r}, Filter='ALL'"
            )
        for suffix, column, spec in metric_specs:
            # Read per column (not per row) so integer counts keep their integer
            # dtype; if several rows match, the first one is used.
            data[f'Imputed {label} {suffix}'].append(format(df_temp[column].iloc[0], spec))

pd.DataFrame(data)


Unnamed: 0,Sample,Imputed SNVs TP,Imputed SNVs FN,Imputed SNVs FP,Imputed SNVs Precision,Imputed SNVs Recall,Imputed InDels TP,Imputed InDels FN,Imputed InDels FP,Imputed InDels Precision,Imputed InDels Recall
0,HG002,2942195,410484,316718,0.9028,0.8776,326360,196029,47144,0.8738,0.6247
1,HG003,2908634,405509,317430,0.9016,0.8776,317001,183792,46494,0.8721,0.633
2,HG004,2934901,405570,313069,0.9036,0.8786,320526,187748,45098,0.8767,0.6306
