/
codingSiteTypes.py
171 lines (129 loc) · 8.36 KB
/
codingSiteTypes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python
# script to annotate each coding site in the genome according to its type
# (codon position, synonymous / non-synonymous (variants only) and degeneracy.
# A vcf can be included to improve accuracy
# NOTE includion of a vcf requires that the vcf is tabix-indexed and tabix is installed. See https://github.com/samtools
import genomics
import sys, argparse, gzip, subprocess, itertools
################################################################################
def tabixStream(fileName, region = None, chrom = None, start=None, end=None, header=False):
if not region: region = chrom+":"+str(start)+"-"+str(end)
if header:
p = subprocess.Popen(['tabix', '-h', fileName, region], stdout=subprocess.PIPE, bufsize=1, universal_newlines=True)
else:
p = subprocess.Popen(['tabix',fileName, region], stdout=subprocess.PIPE, bufsize=1, universal_newlines=True)
return iter(p.communicate()[0].strip().split("\n"))
################################################################################
parser=argparse.ArgumentParser()
parser.add_argument("-a", "--annotation", help="Input annotation file (gff or gtf)", action = "store", required = True)
parser.add_argument("-f", "--format", help="Annotation file format", action = "store", choices = ("gff3", "gtf"), default="gff3")
parser.add_argument("-o", "--outFile", help="Output siteTypes file", action = "store")
parser.add_argument("-v", "--vcf", help="VCF file with variants (only ALTs will be considered)", action='store')
parser.add_argument("-r", "--ref", help="Genome .fa file. Must also have ", action='store', required = True)
parser.add_argument("--ignoreConflicts", help="Don't fail if two annotations give conflicting information about the same site", action='store_true')
parser.add_argument("--scaffoldLookup", help="Table of scaffold names (col1:reference and col2:annotation) if they differ", action = "store")
parser.add_argument("--useAnnotationScaffoldNames", help="Use the scaffold names in the annotation rather than the reference", action = "store_true")
args = parser.parse_args()
################################################################################
#get gene data
sys.stderr.write("Parsing annotation\n")
with gzip.open(args.annotation,"rt") if args.annotation.endswith(".gz") else open(args.annotation,"rt") as ann:
geneData = genomics.parseGenes(ann.readlines(), fmt=args.format)
#get scaffold names
sys.stderr.write("Loading reference genome\n")
with gzip.open(args.ref,"rt") if args.ref.endswith(".gz") else open(args.ref,"rt") as ref:
scaffolds,_sequences_ = genomics.parseFasta(ref.read(), makeUppercase=True)
sequences = {}
for i,scaffold in enumerate(scaffolds): sequences[scaffold] = _sequences_[i]
#if a scaffold lookup table is provided, we need to change the names
if args.scaffoldLookup and args.useAnnotationScaffoldNames:
#if we are going to use the annotation scaffold names, change the reference
with open(args.scaffoldLookup) as lookup:
scafNamesDict = dict([line.split() for line in lookup])
sequences_renamed = {}
scaffolds_renamed = []
for scaffold in scaffolds:
if scaffold in scafNamesDict.keys():
sequences_renamed[scafNamesDict[scaffold]] = sequences[scaffold]
scaffolds_renamed.append(scafNamesDict[scaffold])
else:
sys.stderr.write(f" WARNING!: {scaffold} is not in scaffoldLookup and will not be analysed\n")
sequences = sequences_renamed
scaffolds = scaffolds_renamed
if args.scaffoldLookup and not args.useAnnotationScaffoldNames:
#if we are going to use the reference scaffold names, change the gene data
with open(args.scaffoldLookup) as lookup:
scafNamesDict = dict([line.split()[::-1] for line in lookup])
geneData_renamed = {}
for scaffold in scaffolds: geneData_renamed[scaffold] = geneData[scafNamesDict[scaffold]]
geneData = geneData_renamed
#open output
if not args.outFile: outFile = sys.stdout
else: outFile = gzip.open(args.outFile,"wt") if args.outFile.endswith(".gz") else open(args.outFile,"wt")
outFile.write("\t".join(["scaffold","position","codon_position","substitution_type","degeneracy"]) + "\n")
for scaffold in scaffolds:
posData = {}
positionsAnalysed = set()
try:
mRNAs = geneData[scaffold].keys()
sys.stderr.write("Analysing {}: {} mRNAs\n".format(scaffold,len(mRNAs)))
except:
sys.stderr.write("Skipping {}. No annotated mRNAs\n".format(scaffold))
continue
counter = 0
for mRNA in mRNAs:
region = scaffold + ":" + str(geneData[scaffold][mRNA]["start"]) + "-" + str(geneData[scaffold][mRNA]["end"])
sys.stderr.write(" Analysing mRNA {}: {}, {} exons\n".format(mRNA, region, geneData[scaffold][mRNA]["exons"]))
siteAlleles = {}
for i in range(geneData[scaffold][mRNA]["exons"]):
start = geneData[scaffold][mRNA]["cdsStarts"][i]
end = geneData[scaffold][mRNA]["cdsEnds"][i]
#extract the sequence data for this gene, to be used below
#first get sequence from reference
siteAlleles.update(dict(zip(range(start,end+1), [set(base) for base in sequences[scaffold][start-1:end]])))
#then, get variants
if args.vcf:
vcfStream = tabixStream(args.vcf,
chrom = scaffold,
start = geneData[scaffold][mRNA]["cdsStarts"][i],
end = geneData[scaffold][mRNA]["cdsEnds"][i])
for line in vcfStream:
if line != "":
CHROM,POS,ID,REF,ALT = line.split()[:5]
for a in ALT:
if a == "A" or a == "C" or a == "G" or a == "T" : siteAlleles[int(POS)].add(a)
#remove any Ns
for alleleSet in siteAlleles:
try: alleleSet.remove("N")
except: pass
#for each CDS, extract a list of scaffold positions
cdsPositions = genomics.CDSpositions(geneData[scaffold][mRNA]['cdsStarts'],
geneData[scaffold][mRNA]['cdsEnds'],
geneData[scaffold][mRNA]['strand'], trim=True)
#make a list of lists for each codon. Each of which is a list of alleles
codonAlleles = [[siteAlleles[cdsPositions[y]] if geneData[scaffold][mRNA]["strand"] == "+"
else genomics.complement(siteAlleles[cdsPositions[y]]) for y in range(x,x+3)] for x in range(len(cdsPositions))[::3]]
#dictionary of substitution type and degeneracy for each position
_posData_ = dict(zip(cdsPositions, [x for _codonAlleles_ in codonAlleles
for x in zip(range(1,4),
genomics.synNon(*_codonAlleles_),
genomics.degeneracy(*_codonAlleles_))]))
#now check whether we don't already have site type data for these seites from a previous gene
previouslyAnalysedPositions = positionsAnalysed.intersection(cdsPositions)
#check for conflicts if any sites are included in multiple RNAs
for pos in previouslyAnalysedPositions:
if posData[pos] != _posData_[pos]:
if args.ignoreConflicts:
#print("WARNING: Position {} of {} occurs in two mRNAs giving conflicting site classifications. Setting output to NA\n".format(pos, scaffold), file=sys.stderr)
_posData_[pos] = ("NA","NA","NA")
else:
raise AssertionError("Position {} of {} occurs in two mRNAs giving conflicting site classifications.\n".format(pos, scaffold))
#update position data and the list of positions analysed
posData.update(_posData_)
positionsAnalysed.update(cdsPositions)
counter += 1
# report number of genes for this scaffold
sys.stderr.write(" Done analysing {} mRNAs. Writing output for {}\n".format(counter, scaffold))
for pos in sorted(positionsAnalysed):
outFile.write("\t".join([scaffold, str(pos), "\t".join([str(x) for x in posData[pos]])]) + "\n")
outFile.close()