# Variant Analysis using VCF files

After downloading the VCF files and variants form chromosome 19, the coding sequence annotations on chromosome 19 are extacted with:

In [13]:
def extact_seq_annotations(file_path):
    annotations = []
    with open (file_path, "r") as f:
        for line in f: 
            # splits line into fields
            fields = line.strip().split("\t")
            chrom, start, end, attributes, a, strand = fields

            attr_dict = dict(item.split("=") for item in attributes.split(";") if "=" in item) 
            transcript_id = attr_dict.get("transcript_id")
            # writes the transcript_id, start, and end to an output file
            with open("data/processed/chr19_exons_annotated.txt", "a") as out_f:
                out_f.write(f"{chrom}\t{start}\t{end}\n")

        annotations.append((chrom, start, end))
    return print(annotations[:5])  # Return the first 5 annotations to test
    
extact_seq_annotations("data/raw/chr19_exons.bed")

def extract_vcf_variants(file_path):
    variants = []
    with open(file_path, "r") as f:
        for line in f:

            fields = line.strip().split("\t")
            chrom, start, end, attributes, a, strand = fields
            attr_dict = dict(item.split("=") for item in attributes.split(";") if "=" in item) 
            allele_freq = attr_dict.get("AF")

            with open("data/processed/chr19_variants_annotated.txt", "a") as out_f:
                out_f.write(f"{chrom}\t{start}\t{end}\t{allele_freq}\n")
            variants.append((chrom, start, end, allele_freq))

    return print(variants[:5])  # Return the first 5 variants to test

extract_vcf_variants("data/raw/chr19_variants.bed")

[('19', '59108550', '59109183')]
[('19', '80839', '80840', '0.14'), ('19', '90973', '90974', '0.0037'), ('19', '91105', '91106', '0.96'), ('19', '93541', '93542', '0.01'), ('19', '93817', '93818', '0.0041')]


Sorted the annotation files in terminal using 
`sort -k1,1 -k2,2n -u chr19_exons_annotated.txt -o ch19_exons_annotated_sorted.txt`

Now to intersect the mutations with CDS gene annotations:

In [17]:
from pybedtools import BedTool

variants = BedTool("data/processed/chr19_variants_annotated.bed")
exons = BedTool("data/processed/ch19_exons_annotated_sorted.bed")

variants_in_exons = exons.intersect(variants)
with open("data/processed/chr19_variants_in_exons.txt", "w") as out_f:
    for variant in variants_in_exons:
        out_f.write(str(variant) + "\n")

print(variants_in_exons.head())



19	105020	105021
 19	159547	159548
 19	230129	230130
 19	279506	279507
 19	279508	279509
 19	279512	279513
 19	279555	279556
 19	279644	279645
 19	279667	279668
 19	279793	279794
 None
