Skip to content

Commit

Permalink
Merge pull request #58 from Ensembl/revert-57-registry_and_divisions_…
Browse files Browse the repository at this point in the history
…support

Revert "Registry and divisions support"
  • Loading branch information
at7 committed Jul 21, 2017
2 parents 76df543 + 5483bb2 commit 99ae5d5
Show file tree
Hide file tree
Showing 19 changed files with 664 additions and 524 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,6 @@ use base ('Bio::EnsEMBL::Hive::Process');

use Bio::EnsEMBL::Registry;

# Work out the directory that data dumps should be written to.
#
# Reads the 'pipeline_dir' and 'species_division' parameters. When the
# division holds a true value the result is "<pipeline_dir>/<division>";
# otherwise 'pipeline_dir' is returned unchanged.
#
# The $species argument is accepted but is not used in the computation.
sub data_dir {
    my ($self, $species) = @_;

    my $base_dir = $self->param('pipeline_dir');
    my $division = $self->param('species_division');

    # No division configured: dumps live directly under the pipeline dir.
    return $base_dir unless $division;

    return join('/', $base_dir, $division);
}

sub get_all_species {
my $self = shift;
my $registry = 'Bio::EnsEMBL::Registry';
Expand Down
26 changes: 16 additions & 10 deletions modules/Bio/EnsEMBL/Variation/Pipeline/ReleaseDataDumps/CleanUp.pm
Original file line number Diff line number Diff line change
Expand Up @@ -33,37 +33,38 @@ use strict;

use base ('Bio::EnsEMBL::Variation::Pipeline::ReleaseDataDumps::BaseDataDumpsProcess');

# Intentionally empty: unpacks the invocant and performs no other work.
# NOTE(review): presumably a placeholder for the Hive fetch_input phase --
# confirm against Bio::EnsEMBL::Hive::Process.
sub fetch_input {
my $self = shift;
}

sub run {
my $self = shift;
my $data_dump_dir = $self->param('pipeline_dir');
my $tmp_dir = $self->param('tmp_dir');
my $file_type = $self->param('file_type');

my $species = $self->param('species');
my $mode = $self->param('mode');
my $data_dump_dir = $self->data_dir($species);


if ($mode eq 'post_gvf_dump') {
my $working_dir = "$data_dump_dir/$file_type/$species";
opendir(my $dh, $working_dir) or die $!;
my @dir_content = readdir($dh);
closedir($dh);
foreach my $file (@dir_content) {
opendir(DIR, $working_dir) or die $!;
while (my $file = readdir(DIR)) {
if ($file =~ m/gvf$/) {
`gzip $working_dir/$file`;
}
if ($file =~ m/^Validate/) {
`mv $working_dir/$file $tmp_dir`;
}
}
closedir(DIR);
}

if ($mode eq 'post_join_dumps') {
foreach my $file_type (qw/vcf gvf/) {
my $working_dir = "$data_dump_dir/$file_type/$species";
opendir(my $dh, $working_dir) or die $!;
my @dir_content = readdir($dh);
closedir($dh);
foreach my $file (@dir_content) {
opendir(DIR, $working_dir) or die $!;
while (my $file = readdir(DIR)) {
if ($file =~ m/generic/) {
my $file_name = $file;
$file_name =~ s/_generic//;
Expand All @@ -76,9 +77,14 @@ sub run {
`gzip $working_dir/$file`;
}
}
closedir(DIR);
}
}
}

# Intentionally empty: unpacks the invocant and performs no other work,
# so nothing is flowed out of this step by this method.
# NOTE(review): presumably a placeholder for the Hive write_output phase --
# confirm against Bio::EnsEMBL::Hive::Process.
sub write_output {
my $self = shift;
}

1;

134 changes: 72 additions & 62 deletions modules/Bio/EnsEMBL/Variation/Pipeline/ReleaseDataDumps/Config.pm
Original file line number Diff line number Diff line change
Expand Up @@ -37,22 +37,24 @@ use Bio::EnsEMBL::Registry;
use FileHandle;
use JSON;


# Intentionally empty: unpacks the invocant and performs no other work.
# NOTE(review): presumably a placeholder for the Hive fetch_input phase --
# confirm against Bio::EnsEMBL::Hive::Process.
sub fetch_input {
my $self = shift;
}

# Run step: all of the work is delegated to write_config_file(), and
# whatever that call returns is handed back to the caller.
sub run {
    my ($self) = @_;

    return $self->write_config_file();
}

# Flow the location of the generated config file to branch 1.
#
# The output id is a single-key hash, 'config_file', whose value is read
# from the parameter of the same name. Always returns 1.
sub write_output {
    my ($self) = @_;

    my %output_id = ('config_file' => $self->param('config_file'));
    $self->dataflow_output_id(\%output_id, 1);

    return 1;
}

sub write_config_file {
my $self = shift;
my $species = $self->param('species');
my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($species, 'core');
my $vdba = Bio::EnsEMBL::Registry->get_DBAdaptor($species, 'variation');
my @input;
my $params = {};
my $division = $self->division($dba);


# structural_variation svs
# somatic
# incl consequences: protein info: sift, polyphen
Expand All @@ -61,53 +63,57 @@ sub write_config_file {
# individuals:
# sets: phenotypes, clinically_associated
my $config = {};

$self->variation_data_survey($config,$species,$vdba);

my $species_config = {
failed => ['failed'],
generic => ['evidence', 'validation_status'],
incl_consequences => ['incl_consequences', 'protein_coding_details', 'evidence'],
};
foreach my $attribute (qw/ancestral_allele global_maf clinical_significance/) {
if ($config->{$species}->{$attribute}) {
push @{$species_config->{generic}}, $attribute;
push @{$species_config->{incl_consequences}}, $attribute;
my $species_variation_data = $self->get_all_species();

$self->variation_data_survey($config);

foreach my $species (keys %$species_variation_data) {
my $species_config = {
failed => ['failed'],
generic => ['evidence', 'validation_status'],
incl_consequences => ['incl_consequences', 'protein_coding_details', 'evidence'],
};
foreach my $attribute (qw/ancestral_allele global_maf clinical_significance/) {
if ($config->{$species}->{$attribute}) {
push @{$species_config->{generic}}, $attribute;
push @{$species_config->{incl_consequences}}, $attribute;
}
}
if ($config->{$species}->{sift}) {
push @{$species_config->{incl_consequences}}, 'sift';
}
}
if ($config->{$species}->{sift}) {
push @{$species_config->{incl_consequences}}, 'sift';
}
if ($config->{$species}->{svs}) {
$species_config->{structural_variations} = ['structural_variations'];
if ($config->{$species}->{clinical_significance_svs}) {
push @{$species_config->{structural_variations}}, 'clinical_significance';
if ($config->{$species}->{svs}) {
$species_config->{structural_variations} = ['structural_variations'];
if ($config->{$species}->{clinical_significance_svs}) {
push @{$species_config->{structural_variations}}, 'clinical_significance';
}
}
if ($species eq 'Homo_sapiens') {
$species_config->{sets}->{clinically_associated} = ['evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{sets}->{phenotype_associated} = ['evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{incl_consequences} = ['sift', 'polyphen', 'incl_consequences', 'protein_coding_details', 'evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{somatic_incl_consequences} = ['somatic', 'sift', 'polyphen', 'incl_consequences', 'protein_coding_details', 'evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{somatic} = ['somatic', 'evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{generic} = ['evidence', 'ancestral_allele', 'clinical_significance', 'global_maf', 'variation_id', 'allele_string'];
}
$config->{$species} = $species_config;
}
if ($species eq 'Homo_sapiens') {
$species_config->{sets}->{clinically_associated} = ['evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{sets}->{phenotype_associated} = ['evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{incl_consequences} = ['sift', 'polyphen', 'incl_consequences', 'protein_coding_details', 'evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{somatic_incl_consequences} = ['somatic', 'sift', 'polyphen', 'incl_consequences', 'protein_coding_details', 'evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{somatic} = ['somatic', 'evidence', 'ancestral_allele', 'clinical_significance', 'global_maf'];
$species_config->{generic} = ['evidence', 'ancestral_allele', 'clinical_significance', 'global_maf', 'variation_id', 'allele_string'];
}
$config->{$species} = $species_config;

$params->{species} = $species;
$params->{config} = $config->{$species};
if ($division ne '') {
$params->{species_division} = $division;
}
push @input, $params;

$self->dataflow_output_id(\@input, 2);
$self->dataflow_output_id(\@input, 1);
my $pipeline_dir = $self->param('pipeline_dir');
my $config_file = "$pipeline_dir/data_dumps_config.json";
my $fh = FileHandle->new($config_file, 'w');
my $json = JSON->new->allow_nonref;
print $fh $json->encode($config);
$fh->close();
$self->param('config_file', $config_file);
}

sub variation_data_survey {
my ($self,$config,$species,$vdba)=@_;
my $vdbc = $vdba->dbc();
my $self = shift;
my $config = shift;
my $registry = 'Bio::EnsEMBL::Registry';
$registry->load_all($self->param('registry_file'));
my $vdbas = $registry->get_all_DBAdaptors(-group => 'variation');

my $queries = {
sift => 'select count(*) from protein_function_predictions;',
ancestral_allele => 'select variation_id from variation where ancestral_allele is not null limit 1;',
Expand All @@ -118,27 +124,31 @@ sub variation_data_survey {
};

foreach my $data_type (keys %$queries) {
my $sth = $vdbc->prepare($queries->{$data_type});
my $sub_set_species = query_database($vdbas, $queries->{$data_type});
foreach my $species (keys %$sub_set_species) {
$config->{$species}->{$data_type} = 1;
}
}
}

sub query_database {
my $vdbas = shift;
my $query = shift;
my $species_names = {};
foreach my $vdba (@$vdbas) {
my $species_name = $vdba->species();
my $dbh = $vdba->dbc->db_handle;
my $sth = $dbh->prepare($query);
$sth->execute();
while (my @row = $sth->fetchrow_array) {
my $count = $row[0];
if ($count > 0) {
$config->{$species}->{$data_type} = 1;
$species_names->{$species_name} = 1;
}
}
$sth->finish();
}
$vdbc->disconnect_if_idle();
}


sub division {
my ($self, $dba) = @_;
my ($division) = @{$dba->get_MetaContainer()->list_value_by_key('species.division')};
return if ! $division;
$division =~ s/^Ensembl//;

return lc($division);
return $species_names;
}

1;
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,23 @@ use strict;

use base ('Bio::EnsEMBL::Variation::Pipeline::ReleaseDataDumps::BaseDataDumpsProcess');

# Intentionally empty: unpacks the invocant and performs no other work.
# NOTE(review): presumably a placeholder for the Hive fetch_input phase --
# confirm against Bio::EnsEMBL::Hive::Process.
sub fetch_input {
my $self = shift;
}

# Dispatch to a clean-up routine according to the 'mode' parameter.
#
# Recognised modes are 'post_gvf_dump_cleanup' and 'post_gvf2vcf_cleanup';
# any other mode falls through without doing anything.
#
# NOTE(review): each clean-up routine is invoked twice below, once with
# explicit ($data_dump_dir, $tmp_dir, $species) arguments and once with
# none. This looks like the before/after lines of a diff shown together
# rather than an intended double call -- confirm against the repository
# before relying on either form.
sub run {
my $self = shift;
my $mode = $self->param('mode');
my $tmp_dir = $self->param('tmp_dir');
my $species = $self->param('species');
# Dump directory as resolved by data_dir() for this species.
my $data_dump_dir = $self->data_dir($species);

# Variant that passes the resolved directories and species explicitly.
$self->post_gvf_dump_cleanup($data_dump_dir,$tmp_dir,$species) if ($mode eq 'post_gvf_dump_cleanup');
$self->post_gvf2vcf_cleanup($data_dump_dir,$tmp_dir,$species) if ($mode eq 'post_gvf2vcf_cleanup');
# Variant that passes nothing; the callee must look up what it needs itself.
$self->post_gvf_dump_cleanup if ($mode eq 'post_gvf_dump_cleanup');
$self->post_gvf2vcf_cleanup if ($mode eq 'post_gvf2vcf_cleanup');
}

sub post_gvf_dump_cleanup {
my ($self,$data_dump_dir,$tmp_dir,$species) = @_;
my $self = shift;
my $data_dump_dir = $self->param('pipeline_dir');
my $tmp_dir = $self->param('tmp_dir');
my $species = $self->param('species');

system("gzip $data_dump_dir/gvf/$species/*.gvf");
system("cat $data_dump_dir/gvf/$species/Validate_* > $tmp_dir/GVF_Validate_$species");
system("rm $data_dump_dir/gvf/$species/Validate_*");
Expand All @@ -54,7 +58,10 @@ sub post_gvf_dump_cleanup {
}

sub post_gvf2vcf_cleanup {
my ($self,$data_dump_dir,$tmp_dir,$species) = @_;
my $self = shift;
my $data_dump_dir = $self->param('pipeline_dir');
my $tmp_dir = $self->param('tmp_dir');
my $species = $self->param('species');
system("cat $data_dump_dir/vcf/$species/Validate_* > $tmp_dir/VCF_Validate_$species");
system("rm $data_dump_dir/vcf/$species/Validate_*");
system("cat $data_dump_dir/vcf/$species/*.{err,out} > $tmp_dir/VCF_$species");
Expand All @@ -65,8 +72,6 @@ sub post_gvf2vcf_cleanup {

# Emit an empty output id on branch 2 and then on branch 1, in that order.
# The value returned is whatever the final (branch 1) dataflow call returns.
sub write_output {
    my ($self) = @_;

    $self->dataflow_output_id({}, 2);

    return $self->dataflow_output_id({}, 1);
}

1;
Loading

0 comments on commit 99ae5d5

Please sign in to comment.