From f6fe0d9d9eff9b5182bfc8173d9e9237eefff780 Mon Sep 17 00:00:00 2001 From: evancofer Date: Tue, 28 Jan 2020 12:39:35 -0500 Subject: [PATCH 1/3] Removed chromosome chr-based filtering --- selene_sdk/predict/model_predict.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/selene_sdk/predict/model_predict.py b/selene_sdk/predict/model_predict.py index 59c29b34..32e25ece 100644 --- a/selene_sdk/predict/model_predict.py +++ b/selene_sdk/predict/model_predict.py @@ -310,8 +310,6 @@ def _get_sequences_from_bed_file(self, strand = '.' if isinstance(strand_index, int) and len(cols) > strand_index: strand = cols[strand_index] - if 'chr' not in chrom: - chrom = 'chr{0}'.format(chrom) if not str.isdigit(start) or not str.isdigit(end) \ or chrom not in self.reference_sequence.genome: na_rows.append(line) From 38b6c0823f6861e353dc86998c8723fb08a8bd9e Mon Sep 17 00:00:00 2001 From: evancofer Date: Tue, 28 Jan 2020 12:44:21 -0500 Subject: [PATCH 2/3] Replaced automatic addition of chr- for VEP --- selene_sdk/predict/_variant_effect_prediction.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/selene_sdk/predict/_variant_effect_prediction.py b/selene_sdk/predict/_variant_effect_prediction.py index 1c78d04c..8b204aff 100644 --- a/selene_sdk/predict/_variant_effect_prediction.py +++ b/selene_sdk/predict/_variant_effect_prediction.py @@ -85,12 +85,13 @@ def read_vcf_file(input_path, chrom = str(cols[0]) if 'CHR' == chrom[:3]: chrom = chrom.replace('CHR', 'chr') - elif "chr" not in chrom: - chrom = "chr" + chrom if chrom == "chrMT" and \ chrom not in reference_sequence.get_chrs(): chrom = "chrM" + elif chrom == "MT" and \ + chrom not in reference_sequence.get_chrs(): + chrom = "M" pos = int(cols[1]) name = cols[2] From 9a0aee6cea1fdbca1d5a342a15a05726a90d2610 Mon Sep 17 00:00:00 2001 From: evancofer Date: Fri, 7 Feb 2020 09:47:55 -0500 Subject: [PATCH 3/3] Added prepending of 'chr' based on ref seq --- selene_sdk/predict/_variant_effect_prediction.py | 7 +++++++ selene_sdk/predict/model_predict.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/selene_sdk/predict/_variant_effect_prediction.py b/selene_sdk/predict/_variant_effect_prediction.py index 8b204aff..7ebaf899 100644 --- a/selene_sdk/predict/_variant_effect_prediction.py +++ b/selene_sdk/predict/_variant_effect_prediction.py @@ -62,6 +62,11 @@ def read_vcf_file(input_path, """ variants = [] na_rows = [] + check_chr = True + for chrom in reference_sequence.get_chrs(): + if not chrom.startswith("chr"): + check_chr = False + break with open(input_path, 'r') as file_handle: lines = file_handle.readlines() index = 0 @@ -85,6 +90,8 @@ def read_vcf_file(input_path, chrom = str(cols[0]) if 'CHR' == chrom[:3]: chrom = chrom.replace('CHR', 'chr') + elif "chr" not in chrom and check_chr is True: + chrom = "chr" + chrom if chrom == "chrMT" and \ chrom not in reference_sequence.get_chrs(): diff --git a/selene_sdk/predict/model_predict.py b/selene_sdk/predict/model_predict.py index 32e25ece..268758d9 100644 --- a/selene_sdk/predict/model_predict.py +++ b/selene_sdk/predict/model_predict.py @@ -298,6 +298,11 @@ def _get_sequences_from_bed_file(self, sequences = [] labels = [] na_rows = [] + check_chr = True + for chrom in reference_sequence.get_chrs(): + if not chrom.startswith("chr"): + check_chr = False + break with open(input_path, 'r') as read_handle: for i, line in enumerate(read_handle): cols = line.strip().split('\t') @@ -310,6 +315,8 @@ def _get_sequences_from_bed_file(self, strand = '.' if isinstance(strand_index, int) and len(cols) > strand_index: strand = cols[strand_index] + if 'chr' not in chrom and check_chr is True: + chrom = "chr{0}".format(chrom) if not str.isdigit(start) or not str.isdigit(end) \ or chrom not in self.reference_sequence.genome: na_rows.append(line)