Skip to content

Commit

Permalink
Max length of gxf or fasta record filter
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew Laird committed Jun 9, 2016
1 parent 59cb2f5 commit 8dc7123
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 2 deletions.
12 changes: 12 additions & 0 deletions examples/gff3_filter_large.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"input_filter" : { "_pre" : "max_length" },
"mapping" : {
"max_length" : {
"_callback" : "run",
"_module" : "Bio::FormatTranscriber::Callback::MaxLength",
"_init" : {"length" : "100000" },
"_parameters" : {"record" : "{{record}}" },
"_filter" : 1
}
}
}
108 changes: 108 additions & 0 deletions lib/Bio/FormatTranscriber/Callback/MaxLength.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
=head1 LICENSE
Copyright [1999-2016] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut


=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
Questions may also be sent to the Ensembl help desk at
<http://www.ensembl.org/Help/Contact>.
=cut

=head1 NAME
Bio::FormatTranscriber::Callback::MaxLength
=head1 SYNOPSIS
use Bio::FormatTranscriber::Callback::MaxLength;
$callback_obj = Bio::FormatTranscriber::Callback::MaxLength->new({ length => 100000 });
$less_than_max_bool = $callback_obj->run({record => $record});
=head1 DESCRIPTION
Filter a record based on whether its length is greater than the
given maximum: run() returns 0 when the record's length() exceeds the
maximum and 1 otherwise. Throws an error if the object being filtered
doesn't support length().
Mapping entry could look like:
"max_length" : {
"_callback" : "run",
"_module" : "Bio::FormatTranscriber::Callback::MaxLength",
"_init" : { "length" : 100000 },
"_parameters" : { "record" : "{{record}}" }
}
and be called during the _pre or _post field step of a filter, e.g.
"input_filter": { "_pre" : "max_length"
}
=cut

package Bio::FormatTranscriber::Callback::MaxLength;

use strict;
use warnings;
use Carp;

use Data::Dumper;
use Scalar::Util qw(blessed looks_like_number);

use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw);

# Constructor.
#
# Arg [1]    : Hashref of initialisation parameters. The only key read is
#              'length', the maximum record length used by run().
# Returntype : Blessed hashref storing the maximum under the 'length' key.
# Exceptions : Throws if no parameters reference is given, if 'length' is
#              undefined, or if 'length' is not a number.
sub new {
    my $class  = shift;
    my $params = shift;

    # A missing or non-reference parameter set is reported the same way as a
    # missing length: in both cases no maximum was supplied.
    unless (ref $params && defined $params->{length}) {
        throw("You must specify the maximum length, otherwise, what's the point?");
    }

    # run() compares against this value with a numeric '>', so a non-numeric
    # maximum would be coerced (e.g. 'abc' becomes 0, rejecting every
    # non-empty record). Fail at construction time instead.
    unless (looks_like_number($params->{length})) {
        throw("The maximum length must be a number, got '" . $params->{length} . "'");
    }

    my $self = { length => $params->{length} };

    return bless $self, $class;
}

# Test a record against the configured maximum length.
#
# Arg [1]    : Hashref with a 'record' key holding the object to test. The
#              object must provide a length() method.
# Returntype : 0 if the record's length() is greater than the maximum given
#              to new(), 1 otherwise.
# Exceptions : Throws if the parameters are not a hashref or contain no
#              record, or if the record is not an object that can length().
sub run {
    my $self   = shift;
    my $params = shift;

    # Die if we weren't configured to receive a record
    unless (ref $params eq 'HASH' && $params->{record}) {
        throw("No record found for sequence");
    }

    my $record = $params->{record};

    # Check up front so an unsupported record type produces a descriptive
    # error rather than Perl's generic method-resolution failure.
    unless (blessed($record) && $record->can('length')) {
        throw("Record of type '" . (ref($record) || 'scalar') . "' doesn't support length()");
    }

    return $record->length() > $self->{length} ? 0 : 1;
}

1;
4 changes: 2 additions & 2 deletions lib/Bio/FormatTranscriber/Callback/fasta_header.pm
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Bio::FormatTranscriber::Callback::fasta_header
parameter_2 => { hash of substitutions },
);
$callback_obj->run({record => $record});
$updated_record = $callback_obj->run({record => $record});
=head1 DESCRIPTION
Expand Down Expand Up @@ -72,7 +72,7 @@ Mapping entry could look like:
and be called during the _pre or _post field step of a filter, ie.
"input_filter": { "_pre" : "fasta_header"
"input_filter": { "_pre" : "fasta_header"
},
=cut
Expand Down
28 changes: 28 additions & 0 deletions t/data/patch_data.gff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
##gff-version 3
##sequence-region CHR_HG126_PATCH 72325441 72740748
#!genome-build GRCh38.p5
#!genome-version GRCh38
#!genome-date 2013-12
#!genome-build-accession NCBI:GCA_000001405.20
CHR_HG126_PATCH GRCh38 chromosome 72325441 72740748 . . . ID=chromosome:CHR_HG126_PATCH;Alias=HG126_PATCH
###
CHR_HG126_PATCH ensembl gene 72374621 72447493 . - . ID=gene:ENSG00000281766;Name=RYBP;biotype=protein_coding;description=RING1 and YY1 binding protein [Source:HGNC Symbol%3BAcc:HGNC:10480];gene_id=ENSG00000281766;logic_name=assembly_patch_ensembl;version=1
CHR_HG126_PATCH ensembl transcript 72374621 72447493 . - . ID=transcript:ENST00000628983;Parent=gene:ENSG00000281766;Name=RYBP-201;biotype=protein_coding;tag=basic;transcript_id=ENST00000628983;version=1
CHR_HG126_PATCH ensembl three_prime_UTR 72374621 72378402 . - . Parent=transcript:ENST00000628983
CHR_HG126_PATCH ensembl exon 72374621 72378655 . - . Parent=transcript:ENST00000628983;Name=ENSE00003764699;constitutive=1;ensembl_end_phase=-1;ensembl_phase=2;exon_id=ENSE00003764699;rank=5;version=1
CHR_HG126_PATCH ensembl CDS 72378403 72378655 . - 1 ID=CDS:ENSP00000486012;Parent=transcript:ENST00000628983;protein_id=ENSP00000486012
CHR_HG126_PATCH ensembl exon 72379058 72379156 . - . Parent=transcript:ENST00000628983;Name=ENSE00003774199;constitutive=1;ensembl_end_phase=2;ensembl_phase=2;exon_id=ENSE00003774199;rank=4;version=1
CHR_HG126_PATCH ensembl CDS 72379058 72379156 . - 1 ID=CDS:ENSP00000486012;Parent=transcript:ENST00000628983;protein_id=ENSP00000486012
CHR_HG126_PATCH ensembl exon 72379269 72379445 . - . Parent=transcript:ENST00000628983;Name=ENSE00003765281;constitutive=1;ensembl_end_phase=2;ensembl_phase=2;exon_id=ENSE00003765281;rank=3;version=1
CHR_HG126_PATCH ensembl CDS 72379269 72379445 . - 1 ID=CDS:ENSP00000486012;Parent=transcript:ENST00000628983;protein_id=ENSP00000486012
CHR_HG126_PATCH ensembl exon 72446378 72446503 . - . Parent=transcript:ENST00000628983;Name=ENSE00003772464;constitutive=1;ensembl_end_phase=2;ensembl_phase=2;exon_id=ENSE00003772464;rank=2;version=1
CHR_HG126_PATCH ensembl CDS 72446378 72446503 . - 1 ID=CDS:ENSP00000486012;Parent=transcript:ENST00000628983;protein_id=ENSP00000486012
CHR_HG126_PATCH ensembl CDS 72447270 72447295 . - 0 ID=CDS:ENSP00000486012;Parent=transcript:ENST00000628983;protein_id=ENSP00000486012
CHR_HG126_PATCH ensembl exon 72447270 72447493 . - . Parent=transcript:ENST00000628983;Name=ENSE00003760613;constitutive=1;ensembl_end_phase=2;ensembl_phase=-1;exon_id=ENSE00003760613;rank=1;version=1
CHR_HG126_PATCH ensembl five_prime_UTR 72447296 72447493 . - . Parent=transcript:ENST00000628983
###
CHR_HG126_PATCH havana lincRNA_gene 72505075 72550889 . - . ID=gene:ENSG00000280864;Name=RP11-654C22.2;biotype=lincRNA;gene_id=ENSG00000280864;havana_gene=OTTHUMG00000159191;havana_version=1;logic_name=proj_havana;version=1
CHR_HG126_PATCH havana lincRNA 72505075 72550889 . - . ID=transcript:ENST00000629165;Parent=gene:ENSG00000280864;Name=RP11-654C22.2-001;biotype=lincRNA;havana_transcript=OTTHUMT00000353763;havana_version=1;tag=basic;transcript_id=ENST00000629165;version=1
CHR_HG126_PATCH havana exon 72505075 72505358 . - . Parent=transcript:ENST00000629165;Name=ENSE00003769648;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003769648;rank=3;version=1
CHR_HG126_PATCH havana exon 72549722 72549841 . - . Parent=transcript:ENST00000629165;Name=ENSE00003773559;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003773559;rank=2;version=1
CHR_HG126_PATCH havana exon 72550428 72550889 . - . Parent=transcript:ENST00000629165;Name=ENSE00003775139;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003775139;rank=1;version=1

0 comments on commit 8dc7123

Please sign in to comment.