diff --git a/selene_sdk/predict/_variant_effect_prediction.py b/selene_sdk/predict/_variant_effect_prediction.py index 1c78d04c..7ebaf899 100644 --- a/selene_sdk/predict/_variant_effect_prediction.py +++ b/selene_sdk/predict/_variant_effect_prediction.py @@ -62,6 +62,11 @@ def read_vcf_file(input_path, """ variants = [] na_rows = [] + check_chr = True + for chrom in reference_sequence.get_chrs(): + if not chrom.startswith("chr"): + check_chr = False + break with open(input_path, 'r') as file_handle: lines = file_handle.readlines() index = 0 @@ -85,12 +90,15 @@ def read_vcf_file(input_path, chrom = str(cols[0]) if 'CHR' == chrom[:3]: chrom = chrom.replace('CHR', 'chr') - elif "chr" not in chrom: + elif "chr" not in chrom and check_chr is True: chrom = "chr" + chrom if chrom == "chrMT" and \ chrom not in reference_sequence.get_chrs(): chrom = "chrM" + elif chrom == "MT" and \ + chrom not in reference_sequence.get_chrs(): + chrom = "M" pos = int(cols[1]) name = cols[2] diff --git a/selene_sdk/predict/model_predict.py b/selene_sdk/predict/model_predict.py index 59c29b34..268758d9 100644 --- a/selene_sdk/predict/model_predict.py +++ b/selene_sdk/predict/model_predict.py @@ -298,6 +298,11 @@ def _get_sequences_from_bed_file(self, sequences = [] labels = [] na_rows = [] + check_chr = True + for chrom in reference_sequence.get_chrs(): + if not chrom.startswith("chr"): + check_chr = False + break with open(input_path, 'r') as read_handle: for i, line in enumerate(read_handle): cols = line.strip().split('\t') @@ -310,8 +315,8 @@ def _get_sequences_from_bed_file(self, strand = '.' if isinstance(strand_index, int) and len(cols) > strand_index: strand = cols[strand_index] - if 'chr' not in chrom: - chrom = 'chr{0}'.format(chrom) + if 'chr' not in chrom and check_chr is True: + chrom = "chr{0}".format(chrom) if not str.isdigit(start) or not str.isdigit(end) \ or chrom not in self.reference_sequence.genome: na_rows.append(line)