In [1]:
import sys
!{sys.executable} -m pip install pysam --no-index
!{sys.executable} -m pip install intervaltree --no-index

Ignoring pip: markers 'python_version < "3"' don't match your environment
Looking in links: /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo/avx2, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic
Ignoring pip: markers 'python_version < "3"' don't match your environment
Looking in links: /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo/avx2, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic


In [2]:
import pandas as pd
from intervaltree import Interval, IntervalTree
import numpy as np
from scipy import stats

pd.set_option('display.max_columns', None)

## 1. Helper functions

In [3]:
# Columns read from every benchmark summary CSV loaded in this notebook.
HAPPY_COLUMNS = ['Type', 'Filter', 'TRUTH.TP', 'QUERY.FP', 'TRUTH.FN', 'METRIC.Recall', 'METRIC.Precision', 'METRIC.F1_Score']

def load_targets_bed(filename):
    """Load target regions from a BED file into one IntervalTree per chromosome.

    BED coordinates are 0-based and half-open. Both ends are shifted by one so that
    the stored half-open interval [start + 1, stop + 1) contains exactly the 1-based
    positions start + 1 .. stop.

    Only the first three whitespace-separated fields (chrom, start, stop) are used, so
    BED files with optional extra columns are accepted. Blank lines, '#' comment lines
    and 'track' / 'browser' header lines are skipped.

    Parameters:
        filename: path to the BED file.

    Returns:
        dict mapping chromosome name to an IntervalTree of its target intervals.

    Raises:
        ValueError: if a data line has fewer than three fields or its coordinates
            are not integers.
    """
    targets = dict()
    with open(filename, 'rt') as ifile:
        for line_number, line in enumerate(ifile, start = 1):
            fields = line.split()
            # Skip everything that is not a data record.
            if not fields or fields[0].startswith('#') or fields[0] in ('track', 'browser'):
                continue
            if len(fields) < 3:
                raise ValueError(f'{filename}:{line_number}: expected at least 3 BED fields, found {len(fields)}')
            chrom, start, stop = fields[:3]
            targets.setdefault(chrom, IntervalTree()).addi(int(start) + 1, int(stop) + 1) # because BED is 0-based and IntervalTree stores half-open intervals.
    return targets

## 2. Input files

In [4]:
# Tab-separated sample sheet with a header row; later cells read its
# Filename, HGID, Family, Batch and Plexing columns.
SAMPLE_ANNOTATIONS_FILE = 'sample_annotations.txt'
# BED file of target regions, parsed with load_targets_bed().
# NOTE(review): the filename suggests the SureSelect Human All Exon V7 design on b37 - confirm.
WEGS_TARGETS_BED_FILE = 'WES_targets/SureSelectHumanAllExonV7.b37.Target.bed'
# Directory of per-sample benchmark summaries for samples with Plexing == 1.
NOPLEXWES_BENCHMARK_DIR = 'Variant_calling_analysis/WES/1plex'
# Parent directory of the '{Plexing}plex_{depth}X' (in-target) and
# '{Plexing}plex_{depth}X_genomewide' benchmark folders.
WEGS_BENCHMARK_DIR = 'Variant_calling_analysis/WEGS'
# Parent directory of the 'Targets' and 'Genomewide' benchmark folders, each read for
# replicate files numbered S1..S5 per individual.
WGS30X_BENCHMARK_DIR = 'WEGS_vs_30XWGS_comparison'


## 3. Load data

In [5]:
# Sample annotation table: tab-separated, first line holds the column names.
df_samples = pd.read_csv(SAMPLE_ANNOTATIONS_FILE, sep = '\t', header = 0)

# HGID -> NA identifier; both appear in the 30X WGS benchmark file names.
hgid2naid = dict(HG002 = 'NA24385', HG003 = 'NA24149', HG004 = 'NA24143')

# HGID -> relationship label.
hgid2rel = dict(HG002 = 'Son', HG003 = 'Father', HG004 = 'Mother')

# Per-chromosome interval trees built from the targets BED file.
WEGS_TARGETS = load_targets_bed(WEGS_TARGETS_BED_FILE)

### 3.1 Load variant calls inside targets

In [6]:
# Benchmark summary column names -> the short names used in the rest of the notebook.
HAPPY_RENAMES = {
    'METRIC.Recall': 'Recall',
    'METRIC.Precision': 'Precision',
    'METRIC.F1_Score': 'F1_Score',
    'TRUTH.TP': 'TP',
    'QUERY.FP': 'FP',
    'TRUTH.FN': 'FN'
}

# File name suffix shared by every benchmark summary CSV read in this cell.
HAPPY_SUMMARY_SUFFIX = '.norm.hard_filter.happy_benchmark.summary.csv'


def load_happy_summary(filepath, **annotations):
    """Read one benchmark summary CSV and tag every row with the given annotations.

    Parameters:
        filepath: path to a '*.happy_benchmark.summary.csv' file.
        **annotations: column name -> scalar value; each is added as a constant
            column, in the order given, after 'Total_called'.

    Returns:
        DataFrame restricted to HAPPY_COLUMNS, with the metric columns renamed via
        HAPPY_RENAMES, plus 'Total_called' (TP + FP) and the annotation columns.
    """
    df = pd.read_csv(filepath, usecols = HAPPY_COLUMNS).rename(columns = HAPPY_RENAMES)
    df['Total_called'] = df['TP'] + df['FP']
    for column, value in annotations.items():
        df[column] = value
    return df


# WES without multiplexing: one summary per sample with Plexing == 1.
noplexwes_frames = []
for row_index, row in df_samples.iterrows():
    if row.Plexing != 1:
        continue
    noplexwes_frames.append(load_happy_summary(
        f'{NOPLEXWES_BENCHMARK_DIR}/{row.Filename}{HAPPY_SUMMARY_SUFFIX}',
        ID = row.Filename, HGID = row.HGID, Batch = row.Batch, Plexing = row.Plexing
    ))
df_noplexwes_target_variants = pd.concat(noplexwes_frames).reset_index(drop = True)

# WEGS: every multiplexed sample is read once per WGS depth (2X and 5X).
wegs_frames = []
for row_index, row in df_samples.iterrows():
    if row.Plexing == 1:
        continue
    for wgs_depth in [2, 5]:
        wegs_frames.append(load_happy_summary(
            f'{WEGS_BENCHMARK_DIR}/{row.Plexing}plex_{wgs_depth}X/{row.Filename}{HAPPY_SUMMARY_SUFFIX}',
            ID = row.Filename, HGID = row.HGID, Batch = row.Batch, Plexing = row.Plexing, WGS_depth = wgs_depth
        ))
df_wegs_target_variants = pd.concat(wegs_frames).reset_index(drop = True)

# 30X WGS: files S1..S5 for every distinct (HGID, Family) pair.
wgs30x_frames = []
for row_index, row in df_samples[['HGID', 'Family']].drop_duplicates().iterrows():
    for i in range(1, 6):
        wgs30x_frames.append(load_happy_summary(
            f'{WGS30X_BENCHMARK_DIR}/Targets/{row.HGID}_{hgid2naid[row.HGID]}_{row.Family}_S{i}{HAPPY_SUMMARY_SUFFIX}',
            HGID = row.HGID, Downsample = i
        ))
df_wgs30x_target_variants = pd.concat(wgs30x_frames).reset_index(drop = True)




### 3.2 Load variant calls genome-wide

In [7]:
# Short names for the benchmark metric columns, shared by both loaders in this cell.
genomewide_renames = {
    'METRIC.Recall': 'Recall',
    'METRIC.Precision': 'Precision',
    'METRIC.F1_Score': 'F1_Score',
    'TRUTH.TP': 'TP',
    'QUERY.FP': 'FP',
    'TRUTH.FN': 'FN'
}

# WEGS, genome-wide: multiplexed samples only, one summary per WGS depth.
wegs_genomewide_frames = []
for _, sample in df_samples.iterrows():
    if sample.Plexing == 1:
        continue
    for wgs_depth in (2, 5):
        summary_csv = f'{WEGS_BENCHMARK_DIR}/{sample.Plexing}plex_{wgs_depth}X_genomewide/{sample.Filename}.norm.hard_filter.happy_benchmark.summary.csv'
        summary = pd.read_csv(summary_csv, usecols = HAPPY_COLUMNS)
        summary = summary.rename(columns = genomewide_renames)
        summary['Total_called'] = summary['TP'] + summary['FP']
        summary['ID'] = sample.Filename
        summary['HGID'] = sample.HGID
        summary['Batch'] = sample.Batch
        summary['Plexing'] = sample.Plexing
        summary['WGS_depth'] = wgs_depth
        wegs_genomewide_frames.append(summary)
df_wegs_variants = pd.concat(wegs_genomewide_frames).reset_index(drop = True)

# 30X WGS, genome-wide: files S1..S5 for every distinct (HGID, Family) pair.
wgs30x_genomewide_frames = []
individuals = df_samples[['HGID', 'Family']].drop_duplicates()
for _, individual in individuals.iterrows():
    for replicate in range(1, 6):
        summary_csv = f'{WGS30X_BENCHMARK_DIR}/Genomewide/{individual.HGID}_{hgid2naid[individual.HGID]}_{individual.Family}_S{replicate}.norm.hard_filter.happy_benchmark.summary.csv'
        summary = pd.read_csv(summary_csv, usecols = HAPPY_COLUMNS)
        summary = summary.rename(columns = genomewide_renames)
        summary['Total_called'] = summary['TP'] + summary['FP']
        summary['HGID'] = individual.HGID
        summary['Downsample'] = replicate
        wgs30x_genomewide_frames.append(summary)
df_wgs30x_variants = pd.concat(wgs30x_genomewide_frames).reset_index(drop = True)

## 4. Figures and tables

### TABLE: Average variant recall and precision rates in 30X WGS, WES, and WEGS.

In [8]:
# (label, table) pairs in display order; the WEGS entries are Plexing x WGS depth subsets.
dfs = [
    ('WGS 30X', df_wgs30x_target_variants),
    ('WES 100X', df_noplexwes_target_variants),
] + [
    (
        f'WEGS {plexing}P{depth}X',
        df_wegs_target_variants[(df_wegs_target_variants.Plexing == plexing) & (df_wegs_target_variants.WGS_depth == depth)]
    )
    for plexing, depth in [(4, 2), (4, 5), (8, 2), (8, 5)]
]

# Output column prefix -> value of the 'Type' column in the benchmark summaries.
variant_types = [('SNV', 'SNP'), ('InDel', 'INDEL')]
count_columns = ['TP', 'FP', 'FN']
rate_columns = ['Precision', 'Recall']

# Pre-create the output columns so that they appear in a fixed order.
data = {'Label': []}
for prefix, _ in variant_types:
    for column in count_columns + rate_columns:
        data[f'{prefix} {column}'] = []

# Every cell is rendered as "mean (standard error of the mean)".
for label, df in dfs:
    data['Label'].append(label)
    for prefix, happy_type in variant_types:
        df_temp = df[(df.Type == happy_type) & (df.Filter == 'ALL')]
        for column in count_columns:
            data[f'{prefix} {column}'].append(f'{np.mean(df_temp[column]):,.0f} ({stats.sem(df_temp[column]):,.0f})')
        for column in rate_columns:
            data[f'{prefix} {column}'].append(f'{np.mean(df_temp[column]):.4f} ({stats.sem(df_temp[column]):,.4f})')

pd.DataFrame(data)

Unnamed: 0,Label,SNV TP,SNV FP,SNV FN,SNV Precision,SNV Recall,InDel TP,InDel FP,InDel FN,InDel Precision,InDel Recall
0,WGS 30X,"22,338 (19)",260 (5),287 (5),0.9885 (0.0002),0.9873 (0.0002),661 (5),12 (1),11 (1),0.9823 (0.0017),0.9841 (0.0011)
1,WES 100X,"22,241 (30)",406 (7),384 (8),0.9821 (0.0003),0.9830 (0.0004),631 (6),53 (2),41 (2),0.9232 (0.0028),0.9390 (0.0029)
2,WEGS 4P2X,"22,268 (24)",426 (6),357 (5),0.9812 (0.0002),0.9842 (0.0002),638 (5),51 (2),34 (2),0.9269 (0.0024),0.9493 (0.0028)
3,WEGS 4P5X,"22,291 (25)",427 (6),334 (3),0.9812 (0.0002),0.9852 (0.0001),642 (5),49 (2),30 (1),0.9300 (0.0024),0.9552 (0.0019)
4,WEGS 8P2X,"22,247 (21)",421 (6),376 (4),0.9814 (0.0003),0.9834 (0.0002),633 (4),52 (2),40 (2),0.9240 (0.0024),0.9407 (0.0026)
5,WEGS 8P5X,"22,277 (21)",418 (6),346 (3),0.9816 (0.0002),0.9847 (0.0001),638 (4),50 (2),34 (2),0.9277 (0.0027),0.9490 (0.0020)


### EXTRA TABLE: Average SNV recall and precision rates in 30X WGS, WES, and WEGS stratified by individual.

In [9]:
# Column prefix -> table the statistics are computed from (WEGS restricted to one design).
snv_sources = {
    'WGS 30X': df_wgs30x_target_variants,
    'WES 100X': df_noplexwes_target_variants,
    'WEGS 4P2X': df_wegs_target_variants[(df_wegs_target_variants.Plexing == 4) & (df_wegs_target_variants.WGS_depth == 2)],
    'WEGS 8P5X': df_wegs_target_variants[(df_wegs_target_variants.Plexing == 8) & (df_wegs_target_variants.WGS_depth == 5)],
}
# Order in which the metrics appear as columns within each prefix.
snv_metrics = ['TP', 'FN', 'FP', 'Precision', 'Recall']

data = {'Sample': []}
for source_label in snv_sources:
    for metric in snv_metrics:
        data[f'{source_label} {metric}'] = []

# One output row per individual; every cell is "mean (standard error of the mean)".
for hgid in hgid2naid:
    data['Sample'].append(hgid)
    for source_label, calls in snv_sources.items():
        df_temp = calls[(calls.HGID == hgid) & (calls.Type == 'SNP') & (calls.Filter == 'ALL')]
        for metric in snv_metrics:
            values = df_temp[metric]
            if metric in ('Precision', 'Recall'):
                cell = f'{np.mean(values):.4f} ({stats.sem(values):,.4f})'
            else:
                cell = f'{np.mean(values):,.0f} ({stats.sem(values):,.0f})'
            data[f'{source_label} {metric}'].append(cell)

pd.DataFrame(data)

Unnamed: 0,Sample,WGS 30X TP,WGS 30X FN,WGS 30X FP,WGS 30X Precision,WGS 30X Recall,WES 100X TP,WES 100X FN,WES 100X FP,WES 100X Precision,WES 100X Recall,WEGS 4P2X TP,WEGS 4P2X FN,WEGS 4P2X FP,WEGS 4P2X Precision,WEGS 4P2X Recall,WEGS 8P5X TP,WEGS 8P5X FN,WEGS 8P5X FP,WEGS 8P5X Precision,WEGS 8P5X Recall
0,HG002,"22,433 (4)",310 (4),284 (5),0.9875 (0.0002),0.9864 (0.0002),"22,356 (3)",387 (3),408 (3),0.9821 (0.0001),0.9830 (0.0001),"22,379 (7)",364 (7),446 (6),0.9805 (0.0002),0.9840 (0.0003),"22,390 (5)",353 (5),449 (7),0.9803 (0.0003),0.9845 (0.0002)
1,HG003,"22,312 (4)",277 (4),245 (3),0.9892 (0.0001),0.9877 (0.0002),"22,202 (17)",387 (17),405 (6),0.9821 (0.0003),0.9829 (0.0008),"22,231 (9)",358 (9),425 (4),0.9813 (0.0001),0.9841 (0.0004),"22,249 (6)",340 (6),406 (5),0.9821 (0.0002),0.9850 (0.0003)
2,HG004,"22,269 (5)",274 (5),251 (6),0.9888 (0.0003),0.9878 (0.0002),"22,167 (22)",376 (22),406 (24),0.9820 (0.0010),0.9833 (0.0010),"22,195 (9)",348 (9),406 (7),0.9820 (0.0003),0.9846 (0.0004),"22,197 (5)",346 (5),402 (3),0.9822 (0.0001),0.9847 (0.0002)


### EXTRA TABLE: Average InDel recall and precision rates in 30X WGS, WES, and WEGS stratified by individual.

In [10]:
# (column prefix, table) pairs in display order; WEGS is restricted to one design each.
indel_sources = [
    ('WGS 30X', df_wgs30x_target_variants),
    ('WES 100X', df_noplexwes_target_variants),
    ('WEGS 4P2X', df_wegs_target_variants[(df_wegs_target_variants.Plexing == 4) & (df_wegs_target_variants.WGS_depth == 2)]),
    ('WEGS 8P5X', df_wegs_target_variants[(df_wegs_target_variants.Plexing == 8) & (df_wegs_target_variants.WGS_depth == 5)]),
]

# Metric -> "mean (SEM)" template; counts are whole numbers, rates use four decimals.
indel_templates = {
    'TP': '{mean:,.0f} ({sem:,.0f})',
    'FN': '{mean:,.0f} ({sem:,.0f})',
    'FP': '{mean:,.0f} ({sem:,.0f})',
    'Precision': '{mean:.4f} ({sem:,.4f})',
    'Recall': '{mean:.4f} ({sem:,.4f})',
}

data = {'Sample': []}
data.update({f'{prefix} {metric}': [] for prefix, _ in indel_sources for metric in indel_templates})

# One output row per individual.
for hgid in hgid2naid:
    data['Sample'].append(hgid)
    for prefix, calls in indel_sources:
        selected = (calls.HGID == hgid) & (calls.Type == 'INDEL') & (calls.Filter == 'ALL')
        df_temp = calls[selected]
        for metric, template in indel_templates.items():
            data[f'{prefix} {metric}'].append(
                template.format(mean = np.mean(df_temp[metric]), sem = stats.sem(df_temp[metric]))
            )

pd.DataFrame(data)

Unnamed: 0,Sample,WGS 30X TP,WGS 30X FN,WGS 30X FP,WGS 30X Precision,WGS 30X Recall,WES 100X TP,WES 100X FN,WES 100X FP,WES 100X Precision,WES 100X Recall,WEGS 4P2X TP,WEGS 4P2X FN,WEGS 4P2X FP,WEGS 4P2X Precision,WEGS 4P2X Recall,WEGS 8P5X TP,WEGS 8P5X FN,WEGS 8P5X FP,WEGS 8P5X Precision,WEGS 8P5X Recall
0,HG002,673 (1),9 (1),16 (1),0.9765 (0.0009),0.9865 (0.0009),641 (1),41 (1),60 (1),0.9147 (0.0014),0.9399 (0.0015),648 (3),34 (3),57 (1),0.9197 (0.0017),0.9498 (0.0050),647 (2),35 (2),58 (3),0.9177 (0.0038),0.9481 (0.0028)
1,HG003,676 (1),14 (1),13 (2),0.9809 (0.0028),0.9797 (0.0013),642 (3),48 (3),50 (3),0.9280 (0.0036),0.9309 (0.0040),649 (2),41 (2),52 (2),0.9254 (0.0019),0.9409 (0.0027),651 (1),39 (1),52 (1),0.9265 (0.0018),0.9432 (0.0019)
2,HG004,634 (1),9 (1),7 (1),0.9895 (0.0009),0.9860 (0.0017),608 (3),35 (3),48 (3),0.9271 (0.0048),0.9461 (0.0049),616 (2),28 (2),43 (2),0.9357 (0.0035),0.9572 (0.0026),615 (2),28 (2),40 (2),0.9393 (0.0033),0.9568 (0.0031)


### TABLE: Average genome-wide variant recall and precision rates in 30X WGS and WEGS.

In [11]:
# (label, table) pairs in display order; the WEGS entries are Plexing x WGS depth subsets.
dfs = [('WGS 30X', df_wgs30x_variants)]
for plexing, depth in [(4, 2), (4, 5), (8, 2), (8, 5)]:
    in_design = (df_wegs_variants.Plexing == plexing) & (df_wegs_variants.WGS_depth == depth)
    dfs.append((f'WEGS {plexing}P{depth}X', df_wegs_variants[in_design]))

# Metric -> (format spec of the mean, format spec of the standard error).
metric_specs = {
    'TP': (',.0f', ',.0f'),
    'FP': (',.0f', ',.0f'),
    'FN': (',.0f', ',.0f'),
    'Precision': ('.4f', ',.4f'),
    'Recall': ('.4f', ',.4f'),
}
# Output column prefix -> value of the 'Type' column in the benchmark summaries.
type_by_prefix = {'SNV': 'SNP', 'InDel': 'INDEL'}

data = {'Label': []}
for prefix in type_by_prefix:
    for metric in metric_specs:
        data[f'{prefix} {metric}'] = []

# Every cell is rendered as "mean (standard error of the mean)".
for label, df in dfs:
    data['Label'].append(label)
    for prefix, happy_type in type_by_prefix.items():
        df_temp = df[(df.Type == happy_type) & (df.Filter == 'ALL')]
        for metric, (mean_spec, sem_spec) in metric_specs.items():
            mean_text = format(np.mean(df_temp[metric]), mean_spec)
            sem_text = format(stats.sem(df_temp[metric]), sem_spec)
            data[f'{prefix} {metric}'].append(f'{mean_text} ({sem_text})')

pd.DataFrame(data)


Unnamed: 0,Label,SNV TP,SNV FP,SNV FN,SNV Precision,SNV Recall,InDel TP,InDel FP,InDel FN,InDel Precision,InDel Recall
0,WGS 30X,"3,309,667 (4,295)","15,268 (174)","26,097 (82)",0.9954 (0.0001),0.9922 (0.0000),"500,728 (2,382)","5,605 (35)","9,757 (111)",0.9889 (0.0001),0.9809 (0.0002)
1,WEGS 4P2X,"1,333,840 (44,603)","303,452 (4,673)","2,001,925 (48,123)",0.8137 (0.0033),0.4000 (0.0138),"132,918 (5,359)","56,441 (1,663)","377,567 (6,801)",0.7010 (0.0025),0.2607 (0.0112)
2,WEGS 4P5X,"1,909,331 (58,921)","240,392 (3,054)","1,426,434 (62,566)",0.8869 (0.0043),0.5726 (0.0183),"201,593 (7,908)","68,206 (1,221)","308,892 (9,249)",0.7457 (0.0041),0.3954 (0.0166)
3,WEGS 8P2X,"1,326,086 (39,124)","307,039 (4,029)","2,008,327 (42,308)",0.8109 (0.0029),0.3979 (0.0121),"131,783 (4,687)","56,193 (1,460)","378,097 (6,013)",0.7001 (0.0022),0.2588 (0.0098)
4,WEGS 8P5X,"1,914,743 (51,710)","240,678 (2,681)","1,419,670 (54,999)",0.8870 (0.0038),0.5745 (0.0161),"201,995 (6,927)","68,102 (1,067)","307,885 (8,174)",0.7463 (0.0036),0.3967 (0.0146)
