# Picking SNPs in CDS

In this script, we keep only the SNPs that fall within the coding sequence (CDS) of their transcripts.

In [1]:
import gzip

In [2]:
# Gzip-compressed VCF that is read as the source of variants.
input_vcf  = "../reference_files/transcriptomic_variants.vcf.gz"
# Gzip-compressed VCF that receives the header lines plus the variants
# located inside the CDS.
output_vcf = "../reference_files/cds_of_transcriptomic_variants.vcf.gz"

In [3]:
def get_start_stop_from_chrom_header(chrom_header):
    """Extract the CDS start and stop coordinates from a transcript header.

    The header is a "|"-separated string in which one field has the form
    ``CDS:<start>-<stop>``, for example
    ``...|UTR5:1-345|CDS:346-4518|UTR3:4519-7369|``.

    Parameters
    ----------
    chrom_header : str
        The "|"-separated transcript header to parse.

    Returns
    -------
    tuple of (int, int)
        ``(cds_start, cds_stop)`` exactly as written in the header.

    Raises
    ------
    ValueError
        If no field of the header starts with "CDS".
    """
    for component in chrom_header.split("|"):
        if component.startswith("CDS"):
            # "CDS:346-4518" -> ["346", "4518"]
            cds_start_stop = component.split(":")[1].split("-")
            return (int(cds_start_stop[0]), int(cds_start_stop[1]))

    # The previous code raised the undefined name `Error`, which surfaced as
    # a confusing NameError; ValueError reports the malformed header properly.
    raise ValueError("No CDS found!")
            

In [4]:
# Stream the input VCF once: header lines are copied through unchanged and a
# variant line is kept only when its position lies inside the CDS interval
# encoded in its first column.
with gzip.open(input_vcf, "rt") as vcf_in, \
     gzip.open(output_vcf, "wt") as vcf_out:

    for raw_record in vcf_in:
        record = raw_record.strip()

        # Meta-information and column-header lines start with "#".
        if record.startswith("#"):
            vcf_out.write(record + "\n")
            continue

        fields = record.split()

        # Lines with fewer than four whitespace-separated fields are skipped.
        if len(fields) < 4:
            continue

        position = int(fields[1])
        cds_start, cds_stop = get_start_stop_from_chrom_header(fields[0])

        # Both CDS boundaries are inclusive.
        if cds_start <= position <= cds_stop:
            vcf_out.write(record + "\n")
        

In [5]:
# Sanity check: parse a sample header whose CDS field is "CDS:346-4518".
sample_header = "ENSMUST00000120274.7|ENSMUSG00000040624.18|OTTMUSG00000030941.3|OTTMUST00000076586.1|Plekhg1-202|Plekhg1|7369|UTR5:1-345|CDS:346-4518|UTR3:4519-7369|"

get_start_stop_from_chrom_header(sample_header)

(346, 4518)