### Compare variant calls and annotations from two RNA-seq experiments

In [1]:
import sys
import pandas as pd
import os

input_file1 = "/cluster/db/mecoulter/CEMartinez_RNAseq-209915706/results/bcftools/3H_locus_filtered.anno.vcf"
input_file2 = "/cluster/db/mecoulter/RNAseq2/bcftools/3H_locus_filtered.anno.vcf"


def read_variant_ids(vcf_path):
    """Return the set of ``POS_REF_ALT`` identifiers found in a VCF file.

    Lines starting with ``#`` (VCF headers) and blank lines are skipped.
    The identifier is built from columns 2, 4 and 5 of each record only,
    so the chromosome column is not part of it.

    Args:
        vcf_path: path of a tab-separated VCF file.

    Returns:
        set[str]: one ``"<position>_<ref>_<alt>"`` string per distinct variant.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a data line has fewer than five tab-separated columns.
    """
    variant_ids = set()
    # The context manager closes the handle even if parsing fails part-way.
    with open(vcf_path) as vcf:
        for line in vcf:
            if line.startswith("#") or not line.strip():
                continue
            fields = line.rstrip().split("\t")
            variant_ids.add(f"{fields[1]}_{fields[3]}_{fields[4]}")
    return variant_ids


first_set = read_variant_ids(input_file1)  # Barke - Int19/Int56
print(f"{len(first_set)} unique variants in experiment one")

second_set = read_variant_ids(input_file2)  # Barke/Int_124_17 - Int_124_52
print(f"{len(second_set)} unique variants in experiment two")

FileNotFoundError: [Errno 2] No such file or directory: '/cluster/db/mecoulter/CEMartinez_RNAseq-209915706/results/bcftools/3H_locus_filtered.anno.vcf'

* Which variants are shared between the two experiments?

In [8]:
# Compute the intersection once and reuse it for both reported figures.
shared_variants = first_set & second_set
# An empty first set would otherwise raise ZeroDivisionError; report NaN instead.
shared_proportion = len(shared_variants) / len(first_set) if first_set else float("nan")
print(f"Number of shared variants: {len(shared_variants)}, proportion (compared to first set): {shared_proportion} ")

Number of shared variants: 407, proportion (compared to first set): 0.2600638977635783 


In [27]:
# Annotated VCF analysed in this cell: the first experiment's file.
input_file = input_file1

# Per-gene annotation table and the table of expressed genes.
annotation_input = "/cluster/db/mecoulter/BaRT2v18/BaRT_2_18_annotation_genes.txt"
expression_info = "/cluster/db/mecoulter/RNAseq2/RNAquant_analysis_result/expressed_genes_both_adj_p_values.csv"

# The first CSV column becomes the index; the index values are then
# collected into a set for fast membership tests.
gene_annotation = pd.read_csv(
    expression_info,
    index_col=0,
    engine="c",
)
expressed_genes = set(gene_annotation.index)

# Sanity check
print(f"Number of expressed genes in RNAseq dataset: {len(expressed_genes)}\n")



# Number of genes carrying at least one variant of each SnpEff impact class.
impact = {"MODIFIER": 0, "LOW": 0, "MODERATE": 0, "HIGH": 0}

gene_impact = {}  # Per gene impact annotation
gene_SNP_annotation = {}  # Per gene SNP annotation, HIGH or MODERATE only

annotation_counts = {}

# The context manager closes the VCF handle once the loop finishes or fails.
with open(input_file) as vcf:
    for line in vcf:
        if line.startswith("#"):
            continue
        fields = line.rstrip().split("\t")
        INFO = fields[7]
        SNP_position = fields[1]
        # INFO entries are separated by ';', so the ANN value ends at the next
        # ';'. Cutting at '=' would truncate the list if an annotation itself
        # contained '='.
        annotations = INFO.split("ANN=")[1].split(";")[0].split(",")  # List of all possible annotations
        for annotation in annotations:
            try:
                allele, p_annotation, putative_impact, position, gene, feature_type, transcript, transcript_type = annotation.split("|")[:8]
            except ValueError:
                # Raise instead of sys.exit() so the notebook shows a traceback
                # and "Run All" stops at the offending record.
                raise ValueError(f"Error at position {SNP_position}! Annotation is {annotation}") from None
            gene_impact.setdefault(gene, set()).add(putative_impact)
            if putative_impact in ("HIGH", "MODERATE"):
                gene_SNP_annotation.setdefault(gene, set()).add(p_annotation)

# Each gene contributes at most once to every impact class it carries.
for gene, impacts in gene_impact.items():
    for putative_impact in impacts:
        impact[putative_impact] += 1

# Each gene contributes at most once to every HIGH/MODERATE annotation type.
for gene, annotations in gene_SNP_annotation.items():
    for annotation in annotations:
        annotation_counts[annotation] = annotation_counts.get(annotation, 0) + 1

print(f"Number of genes with each impact category: {impact}\n")  # One gene can have multiple categories

print(f"Number of genes with either HIGH or MODERATE variants: {len(gene_SNP_annotation.keys())}\n")

print(f"Number of genes with HIGH or MODERATE variants with each annotation category: {annotation_counts}\n")

for gene, annotations in gene_SNP_annotation.items():
    annotation_list = ",".join(annotations)
    print(f"Gene {gene} annotations: {annotation_list}")

# Effect terms regarded as the most damaging.
disruptive = {"stop_gained", "frameshift_variant", "start_lost"}

if annotation_input:
    gene_annotation = pd.read_csv(annotation_input, delimiter="\t", engine="c", index_col=0)

first_experiment_genes = set()

print("\nGenes with most impacting variants:\n")
for gene, annotations in gene_SNP_annotation.items():
    # SnpEff joins several effects of one variant with '&'
    # (e.g. "frameshift_variant&stop_gained"). Compare the individual terms,
    # otherwise such combined annotations never match the disruptive set.
    effect_terms = {term for annotation in annotations for term in annotation.split("&")}
    if not effect_terms & disruptive:
        continue
    annotation_list = ", ".join(list(annotations))
    try:
        pannzer = gene_annotation.at[gene, "Pannzer annotation"]
    except KeyError:
        # Gene absent from the annotation table: report it rather than abort.
        pannzer = "Not found"
    # Just use first in list otherwise too many. A non-string value
    # (e.g. NaN for a missing annotation) is printed unchanged.
    pannzer1 = pannzer.split(";")[0] if isinstance(pannzer, str) else pannzer
    print(f"Gene {gene} ({pannzer1}, expressed?: {gene in expressed_genes}): {annotation_list} \n")
    first_experiment_genes.add(gene)

Number of expressed genes in RNAseq dataset: 19802

Number of genes with each impact category: {'MODIFIER': 145, 'LOW': 72, 'MODERATE': 72, 'HIGH': 17}

Number of genes with either HIGH or MODERATE variants: 74

Number of genes with HIGH or MODERATE variants with each annotation category: {'missense_variant': 72, 'frameshift_variant': 9, 'splice_donor_variant&intron_variant': 2, 'disruptive_inframe_deletion': 4, 'stop_gained': 1, 'missense_variant&splice_region_variant': 3, 'splice_acceptor_variant&intron_variant': 4, 'disruptive_inframe_insertion': 1, 'conservative_inframe_insertion': 1, 'start_lost': 1, 'frameshift_variant&stop_gained': 1}

Gene BaRT2v18chr3HG122620 annotations: missense_variant
Gene BaRT2v18chr3HG122640 annotations: missense_variant,frameshift_variant
Gene BaRT2v18chr3HG122660 annotations: missense_variant
Gene BaRT2v18chr3HG122690 annotations: missense_variant
Gene BaRT2v18chr3HG122720 annotations: missense_variant
Gene BaRT2v18chr3HG122730 annotations: missense_va

In [54]:
# Annotated VCF analysed in this cell (input_file2 is assigned in an earlier cell).
input_file = input_file2

# Per-gene annotation table; read as tab-separated further down in this cell.
annotation_input = "/cluster/db/mecoulter/BaRT2v18/BaRT_2_18_annotation_genes.txt"

# Table of expressed genes (presumably with adjusted p-values, going by the file name).
expression_info = "/cluster/db/mecoulter/RNAseq2/RNAquant_analysis_result/expressed_genes_both_adj_p_values.csv"


class SnpEffAnnotation:
    """The first eight ``|``-separated fields of one SnpEff ``ANN`` entry.

    Attributes, in field order: ``allele``, ``p_annotation``,
    ``putative_impact``, ``position``, ``gene``, ``feature_type``,
    ``transcript`` and ``transcript_type``.
    """

    def __init__(self, annotation):
        """Split ``annotation`` on ``|`` and store its first eight fields.

        Args:
            annotation: one comma-separated entry of the ``ANN`` INFO value.

        Raises:
            ValueError: if ``annotation`` has fewer than eight fields.
        """
        parts = annotation.split("|")[:8]
        if len(parts) != 8:
            # Only the annotation text is known here, so the message reports
            # that instead of reading a module-level SNP position variable.
            raise ValueError(
                f"Malformed SnpEff annotation (expected at least 8 '|'-separated fields): {annotation}"
            )
        (
            self.allele,
            self.p_annotation,
            self.putative_impact,
            self.position,
            self.gene,
            self.feature_type,
            self.transcript,
            self.transcript_type,
        ) = parts

    def __repr__(self):
        """Return the annotation type (second field)."""
        return self.p_annotation

class SnpEff:
    """One SnpEff-annotated VCF record plus class-level indexes over all records.

    Every ``SnpEff(line)`` call parses one VCF data line and registers its
    annotations in the dictionaries below. These live on the class, so they
    accumulate across all instances created since the class was defined, and
    any instance gives access to the whole collection.
    """

    gene_impact = {}  # gene -> [putative impact of every annotation seen]
    gene_SNP_annotation = {}  # gene -> ["<annotation> : <position>", ...], HIGH/MODERATE only
    Instances = {}  # Gene: {position1: [annotations], position2: [annotations]}
    Instances_clean = {}  # same layout as Instances; filled by clean_up_annotations()
    position_info = {}  # position -> SnpEff record most recently parsed at that position

    def __init__(self, line):
        """Parse one tab-separated VCF data line and index its ANN entries.

        Args:
            line: a VCF record whose INFO column (8th) contains ``ANN=``.

        Raises:
            IndexError: if the line has fewer than eight columns or its INFO
                column has no ``ANN=`` entry.
        """
        self.fields = line.rstrip().split("\t")
        self.INFO = self.fields[7]
        self.SNP_position = self.fields[1]
        self.reference_allele = self.fields[3]
        self.alternative_allele = self.fields[4]
        # INFO entries are ';'-separated, so the ANN value ends at the next ';'.
        raw_annotations = self.INFO.split("ANN=")[1].split(";")[0].split(",")  # List of all possible annotations
        self.annotations = []
        for raw_annotation in raw_annotations:
            annotation = SnpEffAnnotation(raw_annotation)
            self.annotations.append(annotation)
            # Use this annotation's own impact, not a leftover module-level name.
            self.gene_impact.setdefault(annotation.gene, []).append(annotation.putative_impact)

            if annotation.putative_impact in ("HIGH", "MODERATE"):
                # Record on the class attribute rather than on an unrelated
                # module-level dictionary of the same name.
                self.gene_SNP_annotation.setdefault(annotation.gene, []).append(
                    f"{annotation.p_annotation} : {self.SNP_position}"
                )

            self.Instances.setdefault(annotation.gene, {}).setdefault(self.SNP_position, []).append(annotation)

        self.position_info[self.SNP_position] = self

    def __getitem__(self, item):
        """Return ``{position: [annotations]}`` for gene ``item``.

        The cleaned index is used once clean_up_annotations() has filled it.
        """
        if self.Instances_clean:
            return self.Instances_clean[item]
        return self.Instances[item]

    def clean_up_annotations(self):
        """Fill ``Instances_clean`` with de-duplicated annotations of interest.

        Many positions have multiple annotations because of multiple
        transcripts. Most transcripts are similar and so carry the same
        annotation type; only the first annotation of each type is kept per
        position. Annotation types not of interest (upstream, downstream,
        intergenic, intron) are dropped, and so are positions left with no
        annotation. A gene left with no positions keeps an empty entry.
        """
        not_interesting = {"downstream_gene_variant", "upstream_gene_variant",
                           "intergenic_region", "intron_variant"}

        for gene, position_annotation in self.Instances.items():
            position_annotation_clean = {}
            for position, annotations in position_annotation.items():
                annotations_cleaned = []
                seen_types = set()
                for annotation in annotations:
                    annotation_type = annotation.p_annotation
                    if annotation_type in seen_types or annotation_type in not_interesting:
                        continue
                    annotations_cleaned.append(annotation)
                    seen_types.add(annotation_type)
                if annotations_cleaned:  # Skip positions with nothing of interest
                    position_annotation_clean[position] = annotations_cleaned
            self.Instances_clean[gene] = position_annotation_clean

    def __repr__(self):
        """Return a per-gene listing of ``position : annotation,annotation``."""
        print_dict = self.Instances_clean if self.Instances_clean else self.Instances
        chunks = []
        for gene, position_annotation in print_dict.items():
            chunks.append(f"{gene}: \n")
            for position, annotations in position_annotation.items():
                chunks.append(f"{position} : {','.join([str(annotation) for annotation in annotations])}\n")
            chunks.append("\n")
        return "".join(chunks)

    def __len__(self):
        """Return the number of genes in the uncleaned index."""
        return len(self.Instances)

    def write_tsv(self, outfile, gene_annotation):
        """Write the cleaned annotations to a tab-separated file.

        Columns: gene, position, annotations, ref allele, alt allele and the
        gene's Pannzer annotation. The Pannzer annotation is written only on
        the first row of each gene; later rows leave that column empty.

        Args:
            outfile: path of the file to create or overwrite.
            gene_annotation: object supporting
                ``gene_annotation.at[gene, "Pannzer annotation"]``, such as a
                pandas DataFrame indexed by gene.
        """
        with open(outfile, "w") as out:
            out.write("Gene\tPosition\tAnnotation\tref allele\talt allele\tPanzzer Annotation\n")
            for gene, position_annotation in self.Instances_clean.items():
                try:
                    panz_annotation = gene_annotation.at[gene, "Pannzer annotation"]
                except KeyError:
                    print(f"KeyError for gene {gene}")
                    panz_annotation = "Not found"
                first_row = True
                for position, annotations in position_annotation.items():
                    ref = self.position_info[position].reference_allele
                    alt = self.position_info[position].alternative_allele
                    annotation_text = ",".join([str(annotation) for annotation in annotations])
                    # Build the row from separate pieces: a backslash line
                    # continuation inside the string literal would embed the
                    # next line's indentation in the ref-allele column.
                    out.write(f"{gene}\t{position}\t{annotation_text}\t{ref}\t{alt}\t")
                    if first_row:  # Only write annotation once
                        out.write(f"{panz_annotation}\n")
                        first_row = False
                    else:
                        out.write("\n")
        
        
        
            


# Per-gene annotation table, indexed by its first column.
gene_annotation = pd.read_csv(annotation_input, engine="c", index_col=0, sep="\t")

impact = {"MODIFIER": 0, "LOW": 0, "MODERATE": 0, "HIGH": 0}
locus = [33181340, 36970860]  # 3H locus boundaries on 3H

# Start from None so a stale object from an earlier run is never reused when
# no record falls inside the locus.
snpeff = None
# The context manager closes the VCF handle once the loop finishes or fails.
with open(input_file) as vcf:
    for line in vcf:
        if line.startswith("#"):
            continue
        position = int(line.rstrip().split("\t")[1])
        if position < locus[0] or position > locus[1]:  # Only include SNPs in 3H locus
            continue
        # Each record registers itself in the class-level indexes of SnpEff;
        # the last instance is kept only as a handle to those indexes.
        snpeff = SnpEff(line)

if snpeff is None:
    raise ValueError(f"No variants between {locus[0]} and {locus[1]} found in {input_file}")

snpeff.clean_up_annotations()
print(snpeff)
print(f"Length of Instances is {len(snpeff)}")
snpeff.write_tsv("/cluster/db/mecoulter/RNAseq2/3H_locus_snpeff_gene_annotation.tab", gene_annotation)


BaRT2v18chr3HG122920: 
33237835 : 5_prime_UTR_variant
33237849 : 5_prime_UTR_premature_start_codon_gain_variant,5_prime_UTR_variant
33241825 : missense_variant,missense_variant&splice_region_variant,3_prime_UTR_variant
33244171 : 3_prime_UTR_variant
33244188 : 3_prime_UTR_variant

BaRT2v18chr3HG122930: 
33249270 : 5_prime_UTR_variant
33249283 : 5_prime_UTR_premature_start_codon_gain_variant,5_prime_UTR_variant
33250161 : synonymous_variant
33250186 : 3_prime_UTR_variant

BaRT2v18chr3HG122940: 

BaRT2v18chr3HG122940-BaRT2v18chr3HG122950: 

BaRT2v18chr3HG122960: 

BaRT2v18chr3HG122950-BaRT2v18chr3HG122960: 

BaRT2v18chr3HG122970: 
33294871 : 3_prime_UTR_variant
33294891 : 3_prime_UTR_variant
33294929 : 3_prime_UTR_variant
33295153 : 3_prime_UTR_variant
33295177 : 3_prime_UTR_variant
33295236 : 3_prime_UTR_variant
33296098 : 3_prime_UTR_variant
33296420 : 3_prime_UTR_variant
33296572 : 3_prime_UTR_variant
33297796 : 3_prime_UTR_variant
33297999 : 3_prime_UTR_variant
33298005 : 3_prime_UTR

In [45]:
# Reload the expression table (first CSV column as index) and preview its
# first five rows.
gene_annotation = pd.read_csv(
    expression_info,
    index_col=0,
    engine="c",
)
gene_annotation.head(5)

Unnamed: 0,HID19.1-Barke.1,HID56.1-Barke.1,HEB_124_17.2-Barke.2,HEB_124_52.2-Barke.2,HEB_124_52.2-HEB_124_17.2
BaRT2v18chr1HG000020,0.002923,0.17822,0.953696,0.17687,0.936994
BaRT2v18chr1HG000040,0.008994,0.006047,0.984281,0.879735,0.940247
BaRT2v18chr1HG000050,0.0637,0.937603,1.0,0.953811,0.976945
BaRT2v18chr1HG000060,0.015122,0.169527,1.0,0.838232,0.970221
BaRT2v18chr1HG000070,0.000618,0.005162,0.985872,0.528943,0.920581
