In [3]:
import os
import sys
import pandas as pd

In [2]:
# Work from the bcftools output directory so that the relative VCF filename in
# input_file resolves against it.
# NOTE(review): hard-coded absolute cluster path - this must be edited to run
# the notebook anywhere else.
os.chdir("/cluster/db/mecoulter/RNAseq2/bcftools/")

### Analyse the snpEff output for the second RNA-seq experiment to work out which genes have variants of large effect

In [5]:
# Annotated VCF to analyse; relative path, resolved against the current working directory.
input_file = "3H_locus_filtered.anno.vcf"
# Tab-separated gene annotation table, loaded later with pandas.
# NOTE(review): hard-coded absolute cluster path - adjust when running elsewhere.
annotation_input = "/cluster/db/mecoulter/BaRT2v18/BaRT_2_18_annotation_genes.txt"

In [24]:
# Parse the snpEff-annotated VCF and tally, per gene, the impact categories and
# the HIGH/MODERATE annotations attached to its variants.
impact = {"MODIFIER": 0, "LOW": 0, "MODERATE": 0, "HIGH": 0}  # Number of genes seen with each impact category

gene_impact = {}  # Per gene impact annotation: gene -> set of impact categories
gene_SNP_annotation = {}  # Per gene SNP annotation, HIGH or MODERATE only: gene -> set of annotations

annotation_counts = {}  # annotation -> number of genes with a HIGH/MODERATE variant carrying it
annotation_position = {}  # SNP position -> list of [REF, ALT, annotation, transcript], HIGH or MODERATE only

with open(input_file) as vcf:  # Context manager so the file handle is always closed
    for line in vcf:
        if line.startswith("#") or not line.strip():
            continue  # Skip VCF header lines and blank lines
        fields = line.rstrip().split("\t")
        INFO = fields[7]
        SNP_position = fields[1]  # Position of SNP on 3H in bp
        REF = fields[3]  # Reference allele
        ALT = fields[4]  # Alternative allele

        # INFO is a ";"-separated list of entries. Take the ANN entry on its own
        # so that text belonging to a following entry (e.g. ";LOF") is never
        # glued onto the last annotation, and an "=" inside an annotation cannot
        # truncate the list.
        ann_value = None
        for entry in INFO.split(";"):
            if entry.startswith("ANN="):
                ann_value = entry[len("ANN="):]
                break
        if ann_value is None:
            raise ValueError(f"No ANN field in INFO at position {SNP_position}! INFO is {INFO}")

        annotations = ann_value.split(",")  # List of all possible annotations

        for annotation in annotations:
            try:  # For notes on what each of these fields are (below) see the snpEff manual
                allele, p_annotation, putative_impact, position, gene, feature_type, transcript, transcript_type = annotation.split("|")[:8]
            except ValueError:
                # Fewer than 8 "|"-separated fields: stop with an explicit error
                # rather than print + sys.exit(), which would end the cell as if
                # it had succeeded.
                raise ValueError(f"Error at position {SNP_position}! Annotation is {annotation}") from None
            gene_impact.setdefault(gene, set()).add(putative_impact)
            if putative_impact == "HIGH" or putative_impact == "MODERATE":
                gene_SNP_annotation.setdefault(gene, set()).add(p_annotation)
                annotation_position.setdefault(SNP_position, []).append([REF, ALT, p_annotation, transcript])

# Count each gene once per impact category it carries.
for impacts in gene_impact.values():
    for putative_impact in impacts:
        impact[putative_impact] += 1

# Count each gene once per distinct HIGH/MODERATE annotation it carries.
for annotations in gene_SNP_annotation.values():
    for annotation in annotations:
        annotation_counts[annotation] = annotation_counts.get(annotation, 0) + 1

In [7]:
# Report the per-gene tallies.
# One gene can have multiple categories, so the impact counts need not sum to the number of genes.
print(f"Number of genes with each impact category: {impact}")

# Every key of gene_SNP_annotation is a gene with at least one HIGH or MODERATE annotation.
print(f"Number of genes with either HIGH or MODERATE variants: {len(gene_SNP_annotation)}")

print(f"Number of genes with HIGH or MODERATE variants with each annotation category: {annotation_counts}")

Number of genes with each impact category: {'MODIFIER': 131, 'LOW': 68, 'MODERATE': 52, 'HIGH': 11}
Number of genes with either HIGH or MODERATE variants: 55
Number of genes with HIGH or MODERATE variants with each annotation category: {'splice_acceptor_variant&intron_variant': 3, 'missense_variant': 52, 'missense_variant&splice_region_variant': 4, 'stop_gained': 1, 'frameshift_variant': 4, 'splice_donor_variant&intron_variant': 2, 'stop_lost': 2, 'disruptive_inframe_insertion': 1}


* Go through each gene that has a HIGH or MODERATE impact variant:

In [8]:
# Print the set of HIGH/MODERATE annotations recorded for each gene, comma-separated.
for gene, annotations in gene_SNP_annotation.items():
    print(f"Gene {gene} annotations: {','.join(annotations)}")

Gene BaRT2v18chr3HG122610 annotations: splice_acceptor_variant&intron_variant,missense_variant
Gene BaRT2v18chr3HG122620 annotations: missense_variant
Gene BaRT2v18chr3HG122640 annotations: missense_variant
Gene BaRT2v18chr3HG122680 annotations: missense_variant
Gene BaRT2v18chr3HG122690 annotations: missense_variant
Gene BaRT2v18chr3HG122720 annotations: missense_variant,missense_variant&splice_region_variant
Gene BaRT2v18chr3HG122730 annotations: missense_variant
Gene BaRT2v18chr3HG122740 annotations: missense_variant
Gene BaRT2v18chr3HG122750 annotations: missense_variant
Gene BaRT2v18chr3HG122760 annotations: missense_variant
Gene BaRT2v18chr3HG122770 annotations: missense_variant
Gene BaRT2v18chr3HG122790 annotations: missense_variant
Gene BaRT2v18chr3HG122800 annotations: missense_variant
Gene BaRT2v18chr3HG122810 annotations: missense_variant&splice_region_variant,missense_variant,stop_gained
Gene BaRT2v18chr3HG122810-BaRT2v18chr3HG122820 annotations: missense_variant,missense_v

* Lots of missense variants. What about genes with very high-impact variants? Read in the BaRTv2.18 gene annotation file to see what these genes are:

In [9]:
# Annotation terms treated as the most disruptive kinds of variant.
disruptive = {"stop_gained", "frameshift_variant", "start_lost"}

# Load the tab-separated gene annotation table, using its first column as the index.
gene_annotation = pd.read_csv(
    annotation_input,
    sep="\t",
    engine="c",
    index_col=0,
)
gene_annotation.head()


Unnamed: 0_level_0,Chromosome,Start,End,Strand,Sources,End support,Pannzer annotation,GO IDs,GO terms,Coding potentiality
BaRTv2 gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BaRT2v18chr1HG000020,chr1H,72060,73355,+,Illumina;HC Iso-Seq,Illumina;Enriched TSS and TES,"50S ribosomal protein L28, chloroplastic",GO:0097159;GO:0005783;GO:0003723;GO:0012505;GO...,binding;organelle envelope;intracellular membr...,Unproductive;Coding
BaRT2v18chr1HG000030,chr1H,72100,73281,-,Illumina,Illumina,,,,Non_Coding
BaRT2v18chr1HG000040,chr1H,102316,103908,+,HC Iso-Seq;HC Iso-Seq,Enriched TSS and TES;Enriched TSS and TES,,GO:0097159;GO:0005488;GO:0003676;GO:1901363,nucleic acid binding;organic cyclic compound b...,Coding;Coding
BaRT2v18chr1HG000050,chr1H,102332,108065,-,Illumina;Illumina;HC Iso-Seq;HC Iso-Seq;HC Iso...,Illumina;Illumina;Enriched TSS and TES;Enriche...,Peroxin-14,GO:0072662;GO:0044743;GO:0016558;GO:0006996;GO...,protein binding;signaling receptor binding;bin...,Coding;Coding;Unproductive;Coding;Unproductive...
BaRT2v18chr1HG000060,chr1H,110052,113271,-,Illumina;HC Iso-Seq;HC Iso-Seq;HC Iso-Seq;HC I...,Illumina;Enriched TSS and TES;Enriched TSS and...,RING-finger ubiquitin ligase;E3 ubiquitin-prot...,GO:0071310;GO:0009756;GO:0006508;GO:0031410;GO...,intracellular membrane-bounded organelle;estab...,Coding;Unproductive;Unproductive;Unproductive;...


In [14]:
# Report every gene carrying at least one disruptive variant, together with its
# Pannzer functional annotation from the gene annotation table.
print("Genes with most impacting variants:\n")

for gene, annotations in gene_SNP_annotation.items():
    # snpEff joins co-occurring effects with "&" (e.g. "missense_variant&splice_region_variant"),
    # so compare the individual terms; otherwise a compound annotation containing
    # a disruptive term would be missed.
    effect_terms = {term for annotation in annotations for term in annotation.split("&")}
    if effect_terms & disruptive:  # if gene has one or more very disruptive mutations
        annotation_list = ",".join(annotations)
        # Gene IDs reported by snpEff are not guaranteed to be rows of the
        # annotation table (e.g. hyphenated pairs of gene IDs), so guard the
        # lookup instead of letting .at raise KeyError.
        if gene in gene_annotation.index:
            pannzer = gene_annotation.at[gene, "Pannzer annotation"]
        else:
            pannzer = "gene not found in annotation file"
        print(f"Gene {gene} annotations: {annotation_list}, pannzer annotation of gene: {pannzer}\n")

Genes with most impacting variants:

Gene BaRT2v18chr3HG122810 annotations: missense_variant&splice_region_variant,missense_variant,stop_gained, pannzer annotation of gene: Glycosyltransferase, ALG3;Putative dolichyl-phosphate-mannose--glycolipid alpha-mannosyltransferase;ALG3 domain-containing protein;Asparagine-linked glycosylation 3

Gene BaRT2v18chr3HG122830 annotations: missense_variant,frameshift_variant, pannzer annotation of gene: Signal recognition particle SEC65 subunit

Gene BaRT2v18chr3HG122860 annotations: missense_variant,frameshift_variant, pannzer annotation of gene: nan

Gene BaRT2v18chr3HG123110 annotations: missense_variant,frameshift_variant,stop_lost, pannzer annotation of gene: nan

Gene BaRT2v18chr3HG123140 annotations: missense_variant,frameshift_variant, pannzer annotation of gene: Xyloglucan endotransglucosylase/hydrolase



* Look at the `annotation_position` dict: where are these high-impact variants, and what are they?

In [25]:
# Print every recorded variant whose annotation includes a disruptive term,
# once per affected transcript.
for position, infos in annotation_position.items():
    for info in infos:
        # Each record is [REF, ALT, annotation, transcript].
        ref_allele, alt_allele, variant_annotation, transcript_id = info
        # snpEff joins co-occurring effects with "&"; test each term so that
        # compound annotations containing a disruptive term are also reported.
        if disruptive.intersection(variant_annotation.split("&")):  # If annotation of SNP is a disruptive
            print(f"SNP is at position {position}, reference allele is {ref_allele}, alternative allele is {alt_allele}, predicted anotation: {variant_annotation}, transcript is {transcript_id}\n")

SNP is at position 32472443, reference allele is C, alternative allele is T, predicted anotation: stop_gained, transcript is BaRT2v18chr3HG122810.5

SNP is at position 32472443, reference allele is C, alternative allele is T, predicted anotation: stop_gained, transcript is BaRT2v18chr3HG122810.6

SNP is at position 32472443, reference allele is C, alternative allele is T, predicted anotation: stop_gained, transcript is BaRT2v18chr3HG122810.10

SNP is at position 32542652, reference allele is G, alternative allele is GA, predicted anotation: frameshift_variant, transcript is BaRT2v18chr3HG122830.1

SNP is at position 32893802, reference allele is TC, alternative allele is T, predicted anotation: frameshift_variant, transcript is BaRT2v18chr3HG122860.1

SNP is at position 34751252, reference allele is AGATTG, alternative allele is A, predicted anotation: frameshift_variant, transcript is BaRT2v18chr3HG123110.11

SNP is at position 34751252, reference allele is AGATTG, alternative allele 