In [1]:
import pandas as pd
import numpy as np
from scipy import stats

## 1. Helper functions

In [2]:
# Columns read from each benchmark summary CSV (passed to `usecols` when loading).
HAPPY_COLUMNS = [
    'Type',
    'Filter',
    'TRUTH.TP',
    'QUERY.FP',
    'TRUTH.FN',
    'METRIC.Recall',
    'METRIC.Precision',
    'METRIC.F1_Score',
]


## 2. Input files

In [3]:
# Tab-separated table with one row per sequenced sample.
SAMPLE_ANNOTATIONS_FILE = "sample_annotations.txt"

# Root directory holding one sub-directory of benchmark summaries per
# plexing/depth configuration.
GLIMPSE_BENCHMARK_DIR = "1KG_HGDP_WEGS_imputation_analysis"


## 3. Load data

In [4]:
# Read the sample annotation table: tab-separated, column names on the first line.
df_samples = pd.read_csv(
    SAMPLE_ANNOTATIONS_FILE,
    sep='\t',
    header=0,
)

# HG sample ID -> matching NA-prefixed sample ID.
hgid2naid = dict(
    HG002='NA24385',
    HG003='NA24149',
    HG004='NA24143',
)

# HG sample ID -> family relationship label.
hgid2rel = dict(
    HG002='Son',
    HG003='Father',
    HG004='Mother',
)

### 3.1 Load GLIMPSE imputed variants

In [5]:
# Each supported plexing level is paired with exactly one WGS depth; samples whose
# plexing is not listed here are skipped.
wgs_depth_by_plexing = {4: 2, 8: 5}

# Short names for the benchmark summary columns.
metric_names = {
    'TRUTH.TP': 'TP',
    'QUERY.FP': 'FP',
    'TRUTH.FN': 'FN',
    'METRIC.Recall': 'Recall',
    'METRIC.Precision': 'Precision',
    'METRIC.F1_Score': 'F1_Score',
}

# Collect one annotated summary frame per sample, then concatenate once.
sample_frames = []
for _, sample in df_samples.iterrows():
    depth = wgs_depth_by_plexing.get(sample.Plexing)
    if depth is None:
        continue
    summary_csv = (
        f'{GLIMPSE_BENCHMARK_DIR}/{sample.Plexing}plex_{depth}X/'
        f'{sample.Filename}.imputed.merged_with_WEGS.happy_benchmark.summary.csv'
    )
    sample_frames.append(
        pd.read_csv(summary_csv, usecols=HAPPY_COLUMNS)
        .rename(columns=metric_names)
        .assign(
            # Everything the query called: true positives plus false positives.
            Total_called=lambda frame: frame['TP'] + frame['FP'],
            ID=sample.Filename,
            HGID=sample.HGID,
            Batch=sample.Batch,
            Plexing=sample.Plexing,
            WGS_depth=depth,
        )
    )
df_glimpse_variants = pd.concat(sample_frames).reset_index(drop=True)


## 4. Figures and tables

### TABLE: Precision and recall rates of variants imputed using the GLIMPSE method genome-wide (PART 1 - SNVs)

In [6]:
# Build the SNV summary table: one row per sample in `hgid2naid`, one group of
# columns per sequencing configuration. Every cell is formatted as
# "mean (standard error of the mean)", computed over all benchmark rows loaded
# for that sample and configuration.
configurations = {'4P2X': (4, 2), '8P5X': (8, 5)}  # column label -> (Plexing, WGS_depth)
count_metrics = ['TP', 'FN', 'FP']                 # rendered as integers with thousands separators
rate_metrics = ['Precision', 'Recall']             # rendered with four decimals

# Restrict once to SNP rows with Filter == 'ALL' instead of re-filtering per sample.
is_snv = (df_glimpse_variants.Type == 'SNP') & (df_glimpse_variants.Filter == 'ALL')
df_snv = df_glimpse_variants[is_snv]

# Pre-create the columns so the table keeps a fixed column order:
# Sample, then TP/FN/FP/Precision/Recall for each configuration.
data = {'Sample': []}
for label in configurations:
    for metric in count_metrics + rate_metrics:
        data[f'Imputed {label} {metric}'] = []

for hgid in hgid2naid:
    data['Sample'].append(hgid)
    for label, (plex, depth) in configurations.items():
        df_subset = df_snv[(df_snv.HGID == hgid) & (df_snv.Plexing == plex) & (df_snv.WGS_depth == depth)]
        for metric in count_metrics:
            values = df_subset[metric]
            data[f'Imputed {label} {metric}'].append(f'{np.mean(values):,.0f} ({stats.sem(values):,.0f})')
        for metric in rate_metrics:
            values = df_subset[metric]
            data[f'Imputed {label} {metric}'].append(f'{np.mean(values):.4f} ({stats.sem(values):,.4f})')

pd.DataFrame(data)

Unnamed: 0,Sample,Imputed 4P2X TP,Imputed 4P2X FN,Imputed 4P2X FP,Imputed 4P2X Precision,Imputed 4P2X Recall,Imputed 8P5X TP,Imputed 8P5X FN,Imputed 8P5X FP,Imputed 8P5X Precision,Imputed 8P5X Recall
0,HG002,"2,736,662 (2,636)","616,017 (2,636)","372,221 (1,713)",0.8803 (0.0006),0.8163 (0.0008),"2,922,557 (375)","430,122 (375)","301,471 (328)",0.9065 (0.0001),0.8717 (0.0001)
1,HG003,"2,709,032 (1,998)","605,111 (1,998)","388,043 (965)",0.8747 (0.0004),0.8174 (0.0006),"2,919,299 (567)","394,844 (567)","267,597 (277)",0.9160 (0.0001),0.8809 (0.0002)
2,HG004,"2,706,645 (1,212)","633,826 (1,212)","381,866 (504)",0.8764 (0.0002),0.8103 (0.0004),"2,879,156 (815)","461,315 (815)","323,954 (586)",0.8989 (0.0002),0.8619 (0.0002)


### TABLE: Precision and recall rates of variants imputed using the GLIMPSE method genome-wide (PART 2 - InDels)

In [7]:
# Build the InDel summary table: one row per sample in `hgid2naid`, one group of
# columns per sequencing configuration. Every cell is formatted as
# "mean (standard error of the mean)", computed over all benchmark rows loaded
# for that sample and configuration.
configurations = {'4P2X': (4, 2), '8P5X': (8, 5)}  # column label -> (Plexing, WGS_depth)
count_metrics = ['TP', 'FN', 'FP']                 # rendered as integers with thousands separators
rate_metrics = ['Precision', 'Recall']             # rendered with four decimals

# Restrict once to INDEL rows with Filter == 'ALL' instead of re-filtering per sample.
is_indel = (df_glimpse_variants.Type == 'INDEL') & (df_glimpse_variants.Filter == 'ALL')
df_indel = df_glimpse_variants[is_indel]

# Pre-create the columns so the table keeps a fixed column order:
# Sample, then TP/FN/FP/Precision/Recall for each configuration.
data = {'Sample': []}
for label in configurations:
    for metric in count_metrics + rate_metrics:
        data[f'Imputed {label} {metric}'] = []

for hgid in hgid2naid:
    data['Sample'].append(hgid)
    for label, (plex, depth) in configurations.items():
        df_subset = df_indel[(df_indel.HGID == hgid) & (df_indel.Plexing == plex) & (df_indel.WGS_depth == depth)]
        for metric in count_metrics:
            values = df_subset[metric]
            data[f'Imputed {label} {metric}'].append(f'{np.mean(values):,.0f} ({stats.sem(values):,.0f})')
        for metric in rate_metrics:
            values = df_subset[metric]
            data[f'Imputed {label} {metric}'].append(f'{np.mean(values):.4f} ({stats.sem(values):,.4f})')

pd.DataFrame(data)

Unnamed: 0,Sample,Imputed 4P2X TP,Imputed 4P2X FN,Imputed 4P2X FP,Imputed 4P2X Precision,Imputed 4P2X Recall,Imputed 8P5X TP,Imputed 8P5X FN,Imputed 8P5X FP,Imputed 8P5X Precision,Imputed 8P5X Recall
0,HG002,"251,862 (1,203)","270,527 (1,203)","65,080 (614)",0.7947 (0.0008),0.4821 (0.0023),"294,850 (254)","227,539 (254)","76,268 (112)",0.7945 (0.0002),0.5644 (0.0005)
1,HG003,"256,262 (716)","244,530 (716)","70,962 (416)",0.7832 (0.0005),0.5117 (0.0014),"305,742 (233)","195,052 (233)","78,524 (145)",0.7957 (0.0002),0.6105 (0.0005)
2,HG004,"237,951 (490)","270,323 (490)","60,461 (365)",0.7974 (0.0007),0.4682 (0.0010),"273,779 (416)","234,495 (416)","71,535 (257)",0.7929 (0.0004),0.5386 (0.0008)
