# Variant Analysis using VCF files

After downloading the VCF files and variants form chromosome 19, the coding sequence annotations on chromosome 19 are extacted with:

In [13]:
def extact_seq_annotations(file_path):
    annotations = []
    with open (file_path, "r") as f:
        for line in f: 
            # splits line into fields
            fields = line.strip().split("\t")
            chrom, start, end, attributes, a, strand = fields

            attr_dict = dict(item.split("=") for item in attributes.split(";") if "=" in item) 
            transcript_id = attr_dict.get("transcript_id")
            # writes the transcript_id, start, and end to an output file
            with open("data/processed/chr19_exons_annotated.txt", "a") as out_f:
                out_f.write(f"{chrom}\t{start}\t{end}\n")

        annotations.append((chrom, start, end))
    return print(annotations[:5])  # Return the first 5 annotations to test
    
extact_seq_annotations("data/raw/chr19_exons.bed")

def extract_vcf_variants(file_path):
    variants = []
    with open(file_path, "r") as f:
        for line in f:

            fields = line.strip().split("\t")
            chrom, start, end, attributes, a, strand = fields
            attr_dict = dict(item.split("=") for item in attributes.split(";") if "=" in item) 
            allele_freq = attr_dict.get("AF")

            with open("data/processed/chr19_variants_annotated.txt", "a") as out_f:
                out_f.write(f"{chrom}\t{start}\t{end}\t{allele_freq}\n")
            variants.append((chrom, start, end, allele_freq))

    return print(variants[:5])  # Return the first 5 variants to test

extract_vcf_variants("data/raw/chr19_variants.bed")

[('19', '59108550', '59109183')]
[('19', '80839', '80840', '0.14'), ('19', '90973', '90974', '0.0037'), ('19', '91105', '91106', '0.96'), ('19', '93541', '93542', '0.01'), ('19', '93817', '93818', '0.0041')]


Sorted the annotation files in terminal using 
`sort -k1,1 -k2,2n -u chr19_exons_annotated.txt -o ch19_exons_annotated_sorted.txt`

Now to intersect the mutations with CDS gene annotations:

In [10]:
from pybedtools import BedTool

variants = BedTool("data/raw/chr19_variants.bed")
exons = BedTool("data/processed/ch19_exons_annotated_sorted.txt")

variants_in_exons = variants.intersect(exons)

print(variants_in_exons.head())


19	105020	105021	RSQ=0.9089;LDAF=0.0692;AC=145;AA=G;AN=2184;VT=SNP;ERATE=0.0026;THETA=0.0125;SNPSOURCE=LOWCOV;AVGPOST=0.9851;AF=0.07;ASN_AF=0.03;AMR_AF=0.06;AFR_AF=0.09;EUR_AF=0.08	.	+
 19	159547	159548	ERATE=0.0013;AVGPOST=0.9907;LDAF=0.0047;AN=2184;THETA=0.0171;VT=SNP;AA=.;AC=0;SNPSOURCE=LOWCOV;RSQ=0.0919;AF=0	.	+
 19	230129	230130	AA=GATC;AC=1311;AF=0.60;AFR_AF=0.59;AMR_AF=0.57;AN=2184;ASN_AF=0.88;AVGPOST=0.6811;ERATE=0.0090;EUR_AF=0.42;LDAF=0.5828;RSQ=0.4753;THETA=0.0030;VT=INDEL	.	+
 19	279506	279507	AN=2184;AC=8;THETA=0.0008;RSQ=0.9326;VT=SNP;LDAF=0.0038;SNPSOURCE=LOWCOV;ERATE=0.0003;AVGPOST=0.9995;AA=t;AF=0.0037;ASN_AF=0.01;EUR_AF=0.0040	.	+
 19	279508	279509	LDAF=0.0009;RSQ=0.6705;AA=g;AN=2184;VT=SNP;SNPSOURCE=LOWCOV;AC=1;THETA=0.0051;ERATE=0.0003;AVGPOST=0.9992;AF=0.0005;ASN_AF=0.0017	.	+
 19	279512	279513	AC=7;AN=2184;LDAF=0.0032;THETA=0.0008;VT=SNP;RSQ=0.9797;SNPSOURCE=LOWCOV;AA=a;ERATE=0.0003;AVGPOST=0.9999;AF=0.0032;AFR_AF=0.01	.	+
 19	279555	279556	AC=7;RSQ=0.6699;AN=2184