/
ANNOTATION_PIPELINE_IPER_OPTION.sh
282 lines (218 loc) · 8.15 KB
/
ANNOTATION_PIPELINE_IPER_OPTION.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/bin/bash
#ANNOTATION_PIPELINE_IPER_OPTION.sh script to run pipeline
#
# written by Richard Wolfe
#
# $1 = protein sequences fasta file from prodigal
# >scaffold_0_1
# GFSP......
# $2 = gene sequences fasta file from prodigal
# >scaffold_0_1
# Note: if this file does not exist, then will just make errors on screen
# ACTG.......
# $3 = NOT_USED
# the list of contigs to pull from the above 2 files
# All_contogstopull.txt
# ORIG_scaffold_0 (will pull scaffold_0_0....scafold_0_1...)
# Note: ORIG can be anything but must have an _ before Ex: pre_fix_scaffold_0
# $4 = An id that is added in front of the scaffold_4_24 in the final 2 fasta files
# and the annotation file. Can be anything Ex: T82_8
#
# $5 = IPER or NO_IPER or PFAM
# if want to run iperscan or the PFAM script
# $6 = number of threads
#make sure 6 attributes
#check if 6th variable is empty
if [ -z $6 ]
then
echo "You did not provide 6 attributes"
exit 1
fi
#check if valid string for $5
if [ "$5" == "IPER" ]
then
echo "Iperscan option is selected"
elif [ "$5" == "PFAM" ]
then
echo "PFAM scripts will be run"
elif [ "$5" == "NO_IPER" ]
then
echo "Iperscan will not be performed"
else
echo "Variable 5 must be either IPER or NO_IPER"
exit 1
fi
#exit
#get filenames from path so we dont include the path
#in the new file names
fname1=$(basename $1)
fname2=$(basename $2)
echo $fname1
echo $fname2
#check if files already exist so we dont overwrite
#with new files
#only checks a few
for f in $fname1.3 combined.iprscan renamed_ublast_uniref90.b6 renamed_ublast_KEGG.b6 uniref90_ublast_renamed.b6 KEGG_ublast_renamed.b6 renamed.unirbh.txt renamed.unirbh.txt renamed_ublast_uniref90.b6.BIT_SCORE60.b6 renamed_ublast_KEGG.b6.BIT_SCORE60.b6 uniref90_ublast_renamed.b6_BIT_SCORE_300 KEGG_ublast_renamed.b6_BIT_SCORE_300
do
if [ -f $f ]
then
echo "Error .. File $f already exists"
exit 1
fi
done
if [ "$5" == "IPER" ]
then
#make sequences in fasta file on single line
python make_fasta_seq_single_line.py -i $1 -o $fname1.3
echo ' '
echo 'Calculating the number of lines to put in the iperscan files'
echo ' '
numLines=`cat $fname1.3 | wc -l`
numSeq1=$(( numLines / 2 ))
echo "There are $numLines lines in the file $fname1.3"
echo "There are $numSeq1 sequences in the file $fname1.3"
numLines=$(( numLines / 2))
numLines=$(( numLines / 25 ))
numLines=$(( numLines + 1 ))
numLines=$(( numLines * 2 ))
numSeq1=$(( numLines / 2 ))
echo "There will be $numLines lines in each iperscan file"
echo "There will be $numSeq1 sequences in each iperscan file"
#echo 'A sequence takes 2 lines'
echo ' '
#exit
################
#run interproscan
echo " "
echo "Running parallel interproscan"
echo " "
interproscan_parallel.sh $fname1.3 $numLines
#calculating the number of sequences in the split files
iprlines=`cat file_* | wc -l`
iprscanresults=`cat file_*.iprscan | wc -l`
iperlines=$(( iprlines - iprscanresults ))
iprsequences=$(( iperlines / 2 ))
echo ' '
echo "The total number of sequences that were in all of the 25 split files = $iprsequences"
echo ' '
#calculate the number of sequences in the iprscan results
iprseqcount=0
for f in file_a*
do
#if there is an output file from iperscan
if [ -f $f.iprscan ]
then
#count the lines in the input file
filesequences=`cat $f | wc -l`
iprseqcount=$(( iprseqcount + filesequences ))
fi
done
#This should not happen but just in case
file="file_az.iprscan"
if [ -f $file ]
then
echo ' '
echo 'ERROR MORE THAN 25 iperscan files were made'
fi
iprseqcount=$(( iprseqcount / 2 ))
echo ' '
echo "The number of sequences that made it through iperscan = $iprseqcount"
echo ' '
if (( $iprseqcount != $iprsequences ))
then
echo ' '
echo 'NOT ALL OF THE SEQUENCES WENT THROUGH IPRSCAN'
echo ' '
else
echo ' '
echo 'All of the sequences went through iprscan'
echo ' '
fi
##################################################
echo ' '
echo 'Combining the 25 iperscan results into the file combined.iprscan'
echo ' '
cat file_a*.iprscan > combined.iprscan
rm -f file_a*.iprscan
rm -f file_a*
####end if iperscan
#if we are using the local pfam ddatabase
elif [ "$5" == "PFAM" ]
then
#run the protein file through the pfam scan
#/opt/PfamScan/pfam_scan.pl -fasta $1 -dir /opt/PfamScan/Data_files -outfile pfam_scan_output.txt
python parallel_PfamScan.py -n $6 -f $1 -o pfam_scan_output.txt --OSC F
#convert the output file to the same as it would be from iperscan
python convert_pfam_to_iperscan.py -i pfam_scan_output.txt -o combined.iprscan -p /opt/PfamScan/Data_files/Pfam-A.hmm.dat
else
#iperscan was not selected
#make an empty file
echo " "
echo "Making an empty combined.iperscan file because iperscan was not run"
echo " "
echo " " > combined.iprscan
fi
#run reverse best hits on the pulled sequences
echo ' '
echo 'Running the reverse best hits'
echo ' '
reverse_best_hits.sh $1 $6
#this makes 11 files
perl perl1.pl renamed_ublast_uniref90.b6.BIT_SCORE60.b6 > renamed_ublast_uniref90.b6.BIT_SCORE60.b6.OUT1.txt
perl perl1.pl renamed_ublast_KEGG.b6.BIT_SCORE60.b6 > renamed_ublast_KEGG.b6.BIT_SCORE60.b6.OUT1.txt
perl perl2.pl renamed.unirbh.txt > renamed.unirbh.txt.OUT1.txt
perl perl2.pl renamed.keggrbh.txt > renamed.keggrbh.txt.OUT1.txt
SUBSTRING1="$4"
#Pulling all the annotations from unirbh, keggrbh, uniref, kegg, iprscan
perl perl4_NEW.pl renamed_ublast_uniref90.b6.BIT_SCORE60.b6.OUT1.txt renamed_ublast_KEGG.b6.BIT_SCORE60.b6.OUT1.txt renamed.unirbh.txt.OUT1.txt renamed.keggrbh.txt.OUT1.txt combined.iprscan > ANNOTATION_OUT_$fname1.3.txt
#Arranging the Annotations in sequence according to database preference - RBH-KEGG, RBH-UNIREF, BLAST-KEGG, BLAST-UNIREF, IPRSCAN
grep "RBH" ANNOTATION_OUT_$fname1.3.txt | grep "db=KEGG" > RBH_KEGG1
grep "RBH" ANNOTATION_OUT_$fname1.3.txt | grep "db=UNIREF" >> RBH_KEGG1
grep "BLAST" ANNOTATION_OUT_$fname1.3.txt | grep "db=KEGG" >> RBH_KEGG1
grep "BLAST" ANNOTATION_OUT_$fname1.3.txt | grep "db=UNIREF" >> RBH_KEGG1
grep "IPRSCAN" ANNOTATION_OUT_$fname1.3.txt >> RBH_KEGG1
#Pulling annotations for same contig/scaffold on same line
python pull_all_contig_annotations.py -i RBH_KEGG1 -o ANNOTATION_OUT_$fname1.3.txt_FINAL.txt
#Ranking system - All databases - A, 4 databases(One RBH atleast) - B, 3 databases(One BLAST atleast) - C, 2 databases(One BLAST atleast) - D, 1 database(IPRSCAN only) - E
perl perl6.pl ANNOTATION_OUT_$fname1.3.txt_FINAL.txt > $SUBSTRING1.ANNOTATION_OUT.txt_FINAL_RANKED.txt
#Adding headers to the .faa and .fna files with binned contigs
(
python write_annotation_to_fasta.py -i $1 -o $fname1.3.4 -a ANNOTATION_OUT_$fname1.3.txt_FINAL.txt -j $4
) &
#check if this file exists
if [ -f $2 ]
then
(
python write_annotation_to_fasta.py -i $2 -o $fname2.3.4 -a ANNOTATION_OUT_$fname1.3.txt_FINAL.txt -j $4
)&
fi
wait
#i added so prefix is added to output file
#sed -i "s/scaffold_/$4_scaffold_/g" $SUBSTRING1.ANNOTATION_OUT.txt_FINAL_RANKED.txt
sed -i "s/^/$4_/g" $SUBSTRING1.ANNOTATION_OUT.txt_FINAL_RANKED.txt
#i added so that the sequences with out an annotation are added to the results file
grep 'Unknown_Function' $fname1.3.4 > $SUBSTRING1.$fname1.3.4.5_unknown_headers
sed -i "s/>//g" $SUBSTRING1.$fname1.3.4.5_unknown_headers
awk -F' Unknown_Function' '{print $1 "\tF\tUnknown Function"}' $SUBSTRING1.$fname1.3.4.5_unknown_headers >> $SUBSTRING1.ANNOTATION_OUT.txt_FINAL_RANKED.txt
#All the required files copied to results directory
mkdir annotation_results
cp $SUBSTRING1.ANNOTATION_OUT.txt_FINAL_RANKED.txt annotation_results/$4_ANNOTATION_OUT.txt
cp $fname1.3.4 annotation_results/$4_ANNOTATED_$fname1
cp $fname2.3.4 annotation_results/$4_ANNOTATED_$fname2
rm -f $SUBSTRING1.ANNOTATION_OUT.txt_FINAL_RANKED.txt
echo " "
echo "Removing intermediate files"
echo " "
rm -f ANNOTATION_OUT_$fname1.3.txt
rm -f ANNOTATION_OUT_$fname1.3.txt_FINAL.txt
rm -f RBH_KEGG1
rm -f $fname1.3.4
rm -f $fname2.3.4
rm -f $SUBSTRING1.$fname1.3.4.5_unknown_headers
echo "Step3 Completed Successfully!"
echo "All the Analysis steps completed Successfully!"
echo "Results were written to the folder: annotation_results"
echo " "
echo ' '
echo 'Annotation pipeline has completed'
echo ' '