### Compare variant calls and annotations from two RNA-seq experiments

In [1]:
import sys
import pandas as pd
import os

input_file1 = "/cluster/db/mecoulter/CEMartinez_RNAseq-209915706/results/bcftools/3H_locus_filtered.anno.vcf"
input_file2 = "/cluster/db/mecoulter/RNAseq2/bcftools/3H_locus_filtered.anno.vcf"

first_set = set()
second_set = set()

for line in open(input_file1):#Barke - Int19/Int56
    if line.startswith("#"):
        continue
    fields = line.rstrip().split("\t")
    SNP_position = fields[1]
    ref = fields[3]
    alt = fields[4]
    variant_id = f"{SNP_position}_{ref}_{alt}"
    first_set.add(variant_id)

print(f"{len(first_set)} unique variants in experiment one")
    
for line in open(input_file2):#Barke/Int_124_17 - Int_124_52
    if line.startswith("#"):
        continue
    fields = line.rstrip().split("\t")
    SNP_position = fields[1]
    ref = fields[3]
    alt = fields[4]
    variant_id = f"{SNP_position}_{ref}_{alt}"
    second_set.add(variant_id)

print(f"{len(second_set)} unique variants in experiment two")

FileNotFoundError: [Errno 2] No such file or directory: '/cluster/db/mecoulter/CEMartinez_RNAseq-209915706/results/bcftools/3H_locus_filtered.anno.vcf'

* What variants are shared between two experiments?

In [8]:
print(f"Number of shared variants: {len(first_set & second_set)}, proportion (compared to first set): {len(first_set & second_set)/len(first_set)} ")

Number of shared variants: 407, proportion (compared to first set): 0.2600638977635783 


In [27]:
input_file = input_file1 

annotation_input = "/cluster/db/mecoulter/BaRT2v18/BaRT_2_18_annotation_genes.txt"

expression_info = "/cluster/db/mecoulter/RNAseq2/RNAquant_analysis_result/expressed_genes_both_adj_p_values.csv"


gene_annotation = pd.read_csv(expression_info,engine="c",index_col=0)

expressed_genes = set(gene_annotation.index)

print(f"Number of expressed genes in RNAseq dataset: {len(expressed_genes)}\n")#Sanity check



impact = {"MODIFIER":0, "LOW": 0, "MODERATE": 0, "HIGH": 0 }

gene_impact = {} #Per gene impact annotation
gene_SNP_annotation = {} #Per gene SNP annotation, HIGH or MODERATE only

annotation_counts = {}

for line in open(input_file):
	if line.startswith("#"):
		continue
	fields = line.rstrip().split("\t")
	INFO = fields[7]
	SNP_position = fields[1]
	annotations = INFO.split("ANN=")[1].split("=")[0].split(",")#List of all possible annotations
	for annotation in annotations:
		try:
			allele, p_annotation, putative_impact, position, gene, feature_type, transcript, transcript_type  = annotation.split("|")[:8]
		except ValueError:
			print(f"Error at position {SNP_position}! Annotation is {annotation}")
			sys.exit()
		gene_impact.setdefault(gene,set()).add(putative_impact)
		if putative_impact == "HIGH" or putative_impact == "MODERATE":
			gene_SNP_annotation.setdefault(gene,set()).add(p_annotation)

for gene, impacts in gene_impact.items():
	for putative_impact in impacts:
		impact[putative_impact] += 1

for gene, annotations in gene_SNP_annotation.items():
	for annotation in annotations:
		try:
			annotation_counts[annotation] += 1
		except KeyError:
			annotation_counts[annotation] = 1

print(f"Number of genes with each impact category: {impact}\n")#One gene can have multiple categories

print(f"Number of genes with either HIGH or MODERATE variants: {len(gene_SNP_annotation.keys())}\n")

print(f"Number of genes with HIGH or MODERATE variants with each annotation category: {annotation_counts}\n")

for gene, annotations in gene_SNP_annotation.items():
	annotation_list = ",".join(list(annotations))
	print(f"Gene {gene} annotations: {annotation_list}")

disruptive = {"stop_gained", "frameshift_variant", "start_lost"}

if annotation_input:
	gene_annotation = pd.read_csv(annotation_input, delimiter="\t",engine="c",index_col=0)


gene_annotation.head()

first_experiment_genes = set()

print("\nGenes with most impacting variants:\n")
for gene, annotations in gene_SNP_annotation.items():
    if len(annotations & disruptive) >= 1:
        annotation_list = ", ".join(list(annotations))
        pannzer = gene_annotation.at[gene,"Pannzer annotation"]
        try:
            pannzer1 = pannzer.split(";")[0]#Just use first in list otherwise too many
        except:#If pannzer object is nan
            pannzer1 = pannzer
        print(f"Gene {gene} ({pannzer1}, expressed?: {gene in expressed_genes}): {annotation_list} \n")
        first_experiment_genes.add(gene)

Number of expressed genes in RNAseq dataset: 19802

Number of genes with each impact category: {'MODIFIER': 145, 'LOW': 72, 'MODERATE': 72, 'HIGH': 17}

Number of genes with either HIGH or MODERATE variants: 74

Number of genes with HIGH or MODERATE variants with each annotation category: {'missense_variant': 72, 'frameshift_variant': 9, 'splice_donor_variant&intron_variant': 2, 'disruptive_inframe_deletion': 4, 'stop_gained': 1, 'missense_variant&splice_region_variant': 3, 'splice_acceptor_variant&intron_variant': 4, 'disruptive_inframe_insertion': 1, 'conservative_inframe_insertion': 1, 'start_lost': 1, 'frameshift_variant&stop_gained': 1}

Gene BaRT2v18chr3HG122620 annotations: missense_variant
Gene BaRT2v18chr3HG122640 annotations: missense_variant,frameshift_variant
Gene BaRT2v18chr3HG122660 annotations: missense_variant
Gene BaRT2v18chr3HG122690 annotations: missense_variant
Gene BaRT2v18chr3HG122720 annotations: missense_variant
Gene BaRT2v18chr3HG122730 annotations: missense_va

In [3]:
input_file = input_file2

annotation_input = "/cluster/db/mecoulter/BaRT2v18/BaRT_2_18_annotation_genes.txt"

expression_info = "/cluster/db/mecoulter/RNAseq2/RNAquant_analysis_result/expressed_genes_both_adj_p_values.csv"


gene_annotation = pd.read_csv(expression_info,engine="c",index_col=0)

expressed_genes = set(gene_annotation.index)

#output_file = sys.argv[2]

impact = {"MODIFIER":0, "LOW": 0, "MODERATE": 0, "HIGH": 0 }

gene_impact = {} #Per gene impact annotation
gene_SNP_annotation = {} #Per gene SNP annotation, HIGH or MODERATE only

annotation_counts = {}

for line in open(input_file):
	if line.startswith("#"):
		continue
	fields = line.rstrip().split("\t")
	INFO = fields[7]
	SNP_position = fields[1]
	annotations = INFO.split("ANN=")[1].split("=")[0].split(",")#List of all possible annotations
	for annotation in annotations:
		try:
			allele, p_annotation, putative_impact, position, gene, feature_type, transcript, transcript_type  = annotation.split("|")[:8]
		except ValueError:
			print(f"Error at position {SNP_position}! Annotation is {annotation}")
			sys.exit()
		gene_impact.setdefault(gene,set()).add(putative_impact)
		if putative_impact == "HIGH" or putative_impact == "MODERATE":
			gene_SNP_annotation.setdefault(gene,set()).add(p_annotation)

for gene, impacts in gene_impact.items():
	for putative_impact in impacts:
		impact[putative_impact] += 1

for gene, annotations in gene_SNP_annotation.items():
	for annotation in annotations:
		try:
			annotation_counts[annotation] += 1
		except KeyError:
			annotation_counts[annotation] = 1

print(f"Number of genes with each impact category: {impact}")#One gene can have multiple categories

print(f"Number of genes with either HIGH or MODERATE variants: {len(gene_SNP_annotation.keys())}")

print(f"Number of genes with HIGH or MODERATE variants with each annotation category: {annotation_counts}")


for gene, annotations in gene_SNP_annotation.items():
	annotation_list = ",".join(list(annotations))
	print(f"Gene {gene} annotations: {annotation_list}")
    

disruptive = {"stop_gained", "frameshift_variant", "start_lost"}

if annotation_input:
	gene_annotation = pd.read_csv(annotation_input, delimiter="\t",engine="c",index_col=0)


gene_annotation.head()

print("\nGenes with most impacting variants:\n")
for gene, annotations in gene_SNP_annotation.items():
    if len(annotations & disruptive) >= 1:
        annotation_list = ", ".join(list(annotations))
        pannzer = gene_annotation.at[gene,"Pannzer annotation"]
        try:
            pannzer1 = pannzer.split(";")[0]#Just use first in list otherwise too many
        except:#If pannzer object is nan
            pannzer1 = pannzer
        print(f"Gene {gene} ({pannzer1}, expressed?: {gene in expressed_genes}): {annotation_list} \n")


print("\n\n")
        
print("Which genes with big variants are present in both experiments?\n")
for gene, annotations in gene_SNP_annotation.items():
    if gene in first_experiment_genes:
        if len(annotations & disruptive) >= 1:
            annotation_list = ", ".join(list(annotations))
            pannzer = gene_annotation.at[gene,"Pannzer annotation"]
            try:
                pannzer1 = pannzer.split(";")[0]#Just use first in list otherwise too many
            except:#If pannzer object is nan
                pannzer1 = pannzer
            print(f"Gene {gene} ({pannzer1}, expressed?: {gene in expressed_genes}): {annotation_list} \n")

#Create table of results:
table = pd.DataFrame({"Genes":list(gene_SNP_annotation.keys()), "Variant annotation": list(gene_SNP_annotation.values())})

table.to_csv(path_or_buf="/cluster/db/mecoulter/RNAseq2/high_impact_variant_annotation.csv",header=True)

		

Number of genes with each impact category: {'MODIFIER': 131, 'LOW': 68, 'MODERATE': 52, 'HIGH': 11}
Number of genes with either HIGH or MODERATE variants: 55
Number of genes with HIGH or MODERATE variants with each annotation category: {'missense_variant': 52, 'splice_acceptor_variant&intron_variant': 3, 'missense_variant&splice_region_variant': 4, 'stop_gained': 1, 'frameshift_variant': 4, 'splice_donor_variant&intron_variant': 2, 'stop_lost': 2, 'disruptive_inframe_insertion': 1}
Gene BaRT2v18chr3HG122610 annotations: missense_variant,splice_acceptor_variant&intron_variant
Gene BaRT2v18chr3HG122620 annotations: missense_variant
Gene BaRT2v18chr3HG122640 annotations: missense_variant
Gene BaRT2v18chr3HG122680 annotations: missense_variant
Gene BaRT2v18chr3HG122690 annotations: missense_variant
Gene BaRT2v18chr3HG122720 annotations: missense_variant,missense_variant&splice_region_variant
Gene BaRT2v18chr3HG122730 annotations: missense_variant
Gene BaRT2v18chr3HG122740 annotations: miss

NameError: name 'first_experiment_genes' is not defined