-
Notifications
You must be signed in to change notification settings - Fork 1
/
uploadSpecialtyGenes.pl
executable file
·220 lines (151 loc) · 5.89 KB
/
uploadSpecialtyGenes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env perl
use strict;
use warnings;
no warnings 'uninitialized';
use FindBin qw($Bin);
use Getopt::Long::Descriptive;
use JSON;
use Data::Dumper;
use lib "$Bin";
use SolrAPI;
my $solrServer = $ENV{PATRIC_SOLR};
my $solrFormat="&wt=csv&csv.separator=%09&csv.mv.separator=;";
my $solrh = SolrAPI->new();
my $json = JSON->new->allow_nonref;
my ($opt, $usage) = describe_options(
"%c %o",
[],
["genome_list=s", "File containing list of annotation files"],
["spgene_ref=s", "File containing spgene_ref data"],
["commit=s", "Commit updates to Solr, true|false", { default => "false"}],
[],
["help|h", "Print usage message and exit"]
);
print($usage->text), exit 0 if $opt->help;
die($usage->text) unless $opt->genome_list && $opt->spgene_ref;
#my $spgeneRef = $solrh->getSpGeneRef();
#source,source_id,property,locus_tag,organism,function,classification,pmid,assertion
my $spgeneRef;
open SPGREF, $opt->spgene_ref or die "Can't open spgene_ref file: $opt->spgene_ref!!\n" if $opt->spgene_ref;
while (my $line = <SPGREF>){
chomp $line;
my ($property,$source,$source_id,$gene_name,$locus_tag,$gene_id,
$gi,$genus,$species,$organism,$product,$function,$classification,
$antibiotics_class,$antibiotics,$pmid,$assertion) = split /\t/, $line;
my $key = $source."_".$source_id;
$spgeneRef->{$key} = "$property\t$source\t$source_id\t$locus_tag\t$organism\t".
"$function\t$classification\t$antibiotics_class\t$antibiotics\t$pmid\t$assertion";
#print "$key\t$spgeneRef->{$key}\n";
}
close SPGREF;
open LIST, $opt->genome_list or die "Can't open genome_list file: $opt->genome_list!!\n";
while (my $file_name = <LIST>) {
chomp $file_name;
my $genome_id = $file_name;
$genome_id =~s/.*\///;
$genome_id=~s/.PATRIC.faa.hit//;
print "Processing $genome_id\n";
# process each genome files
if (-f $file_name){
open GENOME, "$file_name" or next;
}else {
print "\t$file_name doesn't exist!!\n";
}
# global arrays to hold spgene records
my @spgenes;
# new genome, get primary identifiers and existing features
my $features = getFeatures($genome_id);
while (my $line = <GENOME>){
chomp $line;
my ($patric_id, $spgene_match) = $line=~/([^\t]+)\t(.*)/;
next unless $features->{$patric_id};
my $spgene = prepareSPGene($features->{$patric_id},$spgene_match);
push @spgenes, $spgene if $spgene;
}
close GENOME;
prepareJsonFile($genome_id, \@spgenes);
postJsonFile($genome_id) if $opt->commit eq "true";
}
close LIST;
sub getFeatures {
my ($genome_id) = @_;
my $core = "/genome_feature";
my $query = "/select?q=annotation:PATRIC AND feature_type:CDS AND genome_id:$genome_id";
my $fields = "&fl=genome_id,genome_name,taxon_id,sequence_id,accession,annotation,feature_id,patric_id,alt_locus_tag,refseq_locus_tag,gene,product,owner,public";
my $rows = "&rows=1000000";
my $sort = "";
my $solrFormat="&wt=json";
my $solrQuery = $solrServer.$core.$query.$fields.$rows.$sort.$solrFormat;
my $result = `wget -q -O - "$solrQuery"`;
my $resultObj = decode_json($result);
my $features;
foreach my $record(@{$resultObj->{'response'}->{'docs'}}){
$features->{$record->{patric_id}} = $record;
}
return $features;
}
sub prepareSPGene {
my ($feature, $spgene_match) = @_;
my $spgene;
my ($msource, $msource_id, $qcov, $scov, $identity, $evalue) = split /\t/, $spgene_match;
$msource_id=~s/^\S*\|//;
my $key = $msource.'_'.$msource_id;
my ($property,$source,$source_id,$locus_tag,$organism,$function, $classification,$antibiotics_class,$antibiotics,$pmid, $assertion)
= split /\t/, $spgeneRef->{$key};
return unless $source_id;
my ($qgenus, $qspecies, $sgenus, $sspecies);
($qgenus) = $feature->{genome_name}=~/^(\S+)/;
($qspecies) = $feature->{genome_name}=~/^(\S+ +\S+)/;
($sgenus) = $organism=~/^(\S+)/ if $organism;
($sspecies) = $organism=~/^(\S+ +\S+)/ if $organism;
my ($same_genus, $same_species, $same_genome, $evidence);
$same_genus = 1 if ($qgenus eq $sgenus && $sgenus ne "");
$same_species = 1 if ($qspecies eq $sspecies && $sspecies ne "");
$same_genome = 1 if ($feature->{genome} eq $organism && $organism ne "") ;
$evidence = ($feature->{refseq_locus_tag} && $locus_tag && $feature->{refseq_locus_tag} eq $locus_tag)? 'Literature':'BLAT';
$spgene->{owner} = $feature->{owner};
$spgene->{public} = $feature->{public};
$spgene->{genome_id} = $feature->{genome_id};
$spgene->{genome_name} = $feature->{genome_name};
$spgene->{taxon_id} = $feature->{taxon_id};
$spgene->{feature_id} = $feature->{feature_id};
$spgene->{patric_id} = $feature->{patric_id};
$spgene->{alt_locus_tag} = $feature->{alt_locus_tag};
$spgene->{refseq_locus_tag} = $feature->{refseq_locus_tag};
$spgene->{gene} = $feature->{gene};
$spgene->{product} = $feature->{product};
$spgene->{property} = $property;
$spgene->{source} = $source;
$spgene->{property_source} = $property.': '.$source;
$spgene->{source_id} = $source_id;
$spgene->{organism} = $organism;
$spgene->{function} = $function;
$spgene->{classification} = [split /;|,/, $classification];
$spgene->{antibiotics_class} = $antibiotics_class;
$spgene->{antibiotics} = [split /;/, $antibiotics];
$spgene->{pmid} = [ split /,|;/, $pmid];
$spgene->{assertion} = ""; # $assertion;
$spgene->{query_coverage} = $qcov;
$spgene->{subject_coverage} = $scov;
$spgene->{identity} = $identity;
$spgene->{e_value} = $evalue;
$spgene->{same_genus} = $same_genus;
$spgene->{same_species} = $same_species;
$spgene->{same_genome} = $same_genome;
$spgene->{evidence} = $evidence;
return $spgene;
}
sub prepareJsonFile {
my($genome_id, $spgenes) = @_;
print "\tPrepare $genome_id.sp_gene.json\n";
my $spgene_json = $json->pretty->encode($spgenes);
open SPG, ">$genome_id.sp_gene.json";
print SPG "$spgene_json";
close SPG;
}
sub postJsonFile {
my($genome_id) = @_;
print "\tPost $genome_id.sp_gene.json\n";
`post.update.sh sp_gene $genome_id.sp_gene.json`;
#`rm $genome_id.genome_feature.json`;
}