In [None]:
import os

import hail as hl
# Initialise Hail for this notebook; the application name is set to "filter_mt".
hl.init(app_name="filter_mt")

In [None]:
# Show the kernel's current working directory.
# os.getcwd() is used instead of the bare `pwd` automagic so the cell does not
# depend on IPython's automagic setting and stays valid as plain Python.
os.getcwd()

In [None]:
# Load the matrix table that has already been run through VQSR.
VQSR_path = "file:///directflow/ClinicalGenomicsPipeline/dev/2021-02-04-PIPELINE-1885-All-Hail/EricData/vqsr_finalised.mt/"
mt = hl.read_matrix_table(path=VQSR_path)

In [None]:
# Split multiallelic sites.
# We split using this function (rather than the regular split) since the
# combiner outputs a sparse matrix table.
mt = hl.experimental.sparse_split_multi(mt)

In [None]:
# Keep only rows whose info.DP is strictly greater than the threshold below;
# rows with DP exactly equal to 10 are dropped too.
# NOTE(review): if the intended cutoff is "DP of at least 10", the comparison
# should be >= — confirm with the analysis owner.
# filter_rows keeps matching rows by default, and rows where the predicate is
# missing are removed regardless.
min_depth = 10
mt = mt.filter_rows(mt.info.DP > min_depth)

In [None]:
# Keep only variants that passed all site-level filters.
# Hail's import_vcf represents a PASS site as an EMPTY `filters` set (a missing
# value corresponds to "." in the VCF), so comparing against {"PASS"} alone
# would discard every row if `filters` originated from import_vcf.
# NOTE(review): how `filters` was populated upstream is not visible here, so
# both encodings are accepted: an empty set or the literal {"PASS"}.
# Rows whose `filters` is missing still evaluate to missing and are removed.
passed_site_filters = (hl.len(mt.filters) == 0) | (mt.filters == {"PASS"})
mt = mt.filter_rows(passed_site_filters)

In [None]:
# Keep rows whose info.QUALapprox is strictly greater than 90.
min_qual_approx = 90
mt = mt.filter_rows(mt.info.QUALapprox > min_qual_approx)

In [None]:
# Hardy-Weinberg equilibrium filter.
# Step 1: aggregate the GT calls of each row into a Hardy-Weinberg test result
# and store it as the row field `hwe`.
# NOTE(review): the table was split with sparse_split_multi earlier; if it is
# still sparse at this point, reference calls may be absent from the
# aggregation — confirm whether a densify step is needed first.
hwe_result = hl.agg.hardy_weinberg_test(mt.GT)
mt = mt.annotate_rows(hwe=hwe_result)

# Step 2: keep rows whose test p-value is strictly greater than 0.001.
min_hwe_p_value = 0.001
mt = mt.filter_rows(mt.hwe.p_value > min_hwe_p_value)

In [None]:
# Compute per-variant QC metrics; they are stored in the row field
# `variant_qc`, which is the default field name for hl.variant_qc.
mt = hl.variant_qc(mt)

In [None]:
# Keep rows whose variant_qc.call_rate is strictly greater than 0.9.
# NOTE(review): call_rate depends on how many entries have a non-missing call;
# if the table is still sparse here this may be lower than expected — confirm.
min_call_rate = 0.9
mt = mt.filter_rows(mt.variant_qc.call_rate > min_call_rate)

In [None]:
# Display the (row count, column count) left after all the filters above.
# Hail evaluates lazily, so this call runs the filtering pipeline.
mt.count()

In [None]:
# Write the filtered matrix table to disk, in the same directory as the input.
# `overwrite` is not passed, so an existing table at this path is not replaced.
output_path = "file:///directflow/ClinicalGenomicsPipeline/dev/2021-02-04-PIPELINE-1885-All-Hail/EricData/vqsr_finalised.filtered.mt"
mt.write(output=output_path)