# Variant Analysis using VCF files

After downloading the VCF files and variants form chromosome 19, the coding sequence annotations on chromosome 19 are extacted with:

In [None]:
def extact_seq_annotations(file_path):
    annotations = {}
    with open (file_path, "r") as f:
        for line in f: 
            # splits line into fields
            fields = line.strip().split("\t")
            chrom, start, end, attributes, a, strand = fields

            attr_dict = dict(item.split("=") for item in attributes.split(";") if "=" in item) 
            transcript_id = attr_dict.get("transcript_id")
            # writes the transcript_id, start, and end to an output file
            with open("data/processed/chr19_exons_annotated.txt", "a") as out_f:
                out_f.write(f"{chrom}\t{start}\t{end}\n")

            annotations[chrom] = (start, end)
    return list(annotations.items())[:5]  # Return the first 5 annotations to test
    
extact_seq_annotations("data/raw/chr19_exons.bed")

[('19', ('59108550', '59109183'))]

Sorted the annotation files in terminal using 
`sort -k1,1 -k2,2n chr19_exons_annotated.txt -o ch19_exons_annotated_sorted.txt`

Now to intersect the mutations with CDS gene annotations:

In [4]:
from pybedtools import BedTool

variants = BedTool("data/raw/chr19_variants.bed")
exons = BedTool("data/processed/ch19_exons_annotated_sorted.txt")

variants_in_exons = variants.intersect(exons)

print(variants_in_exons.head())


None
