Skip to content

Commit

Permalink
Merge branch 'release/104' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
James Allen committed May 11, 2021
2 parents df615b9 + a65a18f commit c7a21e9
Show file tree
Hide file tree
Showing 90 changed files with 75,432 additions and 78 deletions.
10 changes: 8 additions & 2 deletions lib/Bio/EnsEMBL/DataCheck/Checks/ChromosomesAnnotated.pm
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,14 @@ sub skip_tests {

my $sa = $self->dba->get_adaptor('Slice');

my $mca = $self->dba->get_adaptor('MetaContainer');
my $cs_version = $mca->single_value_by_key('assembly.default');

my @chromosomal = ('chromosome', 'chromosome_group', 'plasmid');

my $chr_count = 0;
foreach my $cs_name (@chromosomal) {
my $slices = $sa->fetch_all($cs_name);
my $slices = $sa->fetch_all($cs_name, $cs_version);
foreach (@$slices) {
# seq_regions that are not genuine biological chromosomes,
# but are instead collections of unmapped sequence,
Expand All @@ -67,10 +70,13 @@ sub tests {

my $sa = $self->dba->get_adaptor('Slice');

my $mca = $self->dba->get_adaptor('MetaContainer');
my $cs_version = $mca->single_value_by_key('assembly.default');

my @chromosomal = ('chromosome', 'chromosome_group', 'plasmid');

foreach my $cs_name (@chromosomal) {
my $slices = $sa->fetch_all($cs_name);
my $slices = $sa->fetch_all($cs_name, $cs_version);
foreach (@$slices) {
my @non_bio_chr = @{$_->get_all_Attributes('chromosome')};
next if scalar(@non_bio_chr);
Expand Down
4 changes: 2 additions & 2 deletions lib/Bio/EnsEMBL/DataCheck/Checks/CompareVariationFeatures.pm
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ sub tests {
my $curr_dna_dba = $self->get_dna_dba();
my $old_core_dba = $self->get_old_dba(undef, 'core');

my $desc_curr_core = 'Current core database found';
my $desc_curr_core = 'Current core database found: '.$curr_dna_dba->dbc->dbname;
my $curr_core_pass = ok(defined $curr_dna_dba, $desc_curr_core);

my $desc_old_core = 'Old core database found';
my $desc_old_core = 'Old core database found: '.$old_core_dba->dbc->dbname;
my $old_core_pass = ok(defined $old_core_dba, $desc_old_core);

if ($curr_core_pass && $old_core_pass) {
Expand Down
57 changes: 46 additions & 11 deletions lib/Bio/EnsEMBL/DataCheck/Checks/ExonBounds.pm
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ sub tests {
my $attrib = $aa->fetch_by_code('trans_spliced');
my $attrib_type_id = $attrib->[0] || 0;

my $exon_transcript_sql = qq/
my $sql_tables = qq/
exon e INNER JOIN
exon_transcript et USING (exon_id) INNER JOIN
transcript t USING (transcript_id) INNER JOIN
Expand All @@ -53,22 +53,57 @@ sub tests {
t.transcript_id = ta.transcript_id AND
ta.attrib_type_id = $attrib_type_id
)
/;
my $sql_conditions = qq/
WHERE
cs.species_id = $species_id AND
ta.transcript_id IS NULL
/;
my $exon_transcript_sql = "$sql_tables $sql_conditions";

my $desc_1 = "Transcript co-ordinates are the same as the exon extremities";
my $desc_1 = "Exon bounds match transcript bounds";
my $diag_1 = "Exon bounds do not match transcript bounds";
my $sql_1 = qq/
my $sql_1a = qq/
SELECT t.transcript_id, t.stable_id, t.seq_region_start, t.seq_region_end FROM
$exon_transcript_sql
GROUP BY t.transcript_id, t.stable_id
HAVING
MIN(e.seq_region_start) <> t.seq_region_start OR
MAX(e.seq_region_end) <> t.seq_region_end
$exon_transcript_sql AND
et.rank = 1 AND
t.seq_region_strand = 1 AND
e.seq_region_start <> t.seq_region_start
/;
is_rows_zero($self->dba, $sql_1, $desc_1, $diag_1);
my $sql_1b = qq/
SELECT t.transcript_id, t.stable_id, t.seq_region_start, t.seq_region_end FROM
$exon_transcript_sql AND
et.rank = 1 AND
t.seq_region_strand = -1 AND
e.seq_region_end <> t.seq_region_end
/;
my $sql_1c = qq/
SELECT t.transcript_id, t.stable_id, t.seq_region_start, t.seq_region_end FROM
$sql_tables INNER JOIN
(SELECT transcript_id, MAX(rank) AS max_rank FROM
exon_transcript GROUP BY transcript_id) et2
ON t.transcript_id = et2.transcript_id
$sql_conditions AND
et.rank = et2.max_rank AND
t.seq_region_strand = 1 AND
e.seq_region_end <> t.seq_region_end
/;
my $sql_1d = qq/
SELECT t.transcript_id, t.stable_id, t.seq_region_start, t.seq_region_end FROM
$sql_tables INNER JOIN
(SELECT transcript_id, MAX(rank) AS max_rank FROM
exon_transcript GROUP BY transcript_id) et2
ON t.transcript_id = et2.transcript_id
$sql_conditions AND
et.rank = et2.max_rank AND
t.seq_region_strand = -1 AND
e.seq_region_start <> t.seq_region_start
/;

is_rows_zero($self->dba, $sql_1a, "$desc_1 (1/4)", $diag_1);
is_rows_zero($self->dba, $sql_1b, "$desc_1 (2/4)", $diag_1);
is_rows_zero($self->dba, $sql_1c, "$desc_1 (3/4)", $diag_1);
is_rows_zero($self->dba, $sql_1d, "$desc_1 (4/4)", $diag_1);

my $desc_2 = "Exon and transcript have the same strand";
my $diag_2 = "Transcript and exon have different strands";
Expand Down Expand Up @@ -117,11 +152,11 @@ sub tests {

if (defined $last_transcript_id && $last_transcript_id == $transcript_id) {
if ($strand == 1) {
if ($last_end > $start) {
if ($last_start < $start && $start < $last_end) {
push(@exon_overlaps, "Exons $last_exon_id and $exon_id overlap ($last_end > $start)");
}
} else {
if ($last_start < $end) {
if ($last_start < $end && $last_end < $end) {
push(@exon_overlaps, "Exons $last_exon_id and $exon_id overlap ($last_start < $end)");
}
}
Expand Down
11 changes: 5 additions & 6 deletions lib/Bio/EnsEMBL/DataCheck/Checks/Karyotype.pm
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,11 @@ use List::Util qw/min max/;
extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Datacheck metadata for the Karyotype check.
# Each key is declared exactly once: when a key is repeated inside the
# hash passed to 'use constant', the last value silently wins, which hides
# the intended settings from the reader.
# DATACHECK_TYPE is deliberately not set to 'advisory' here, because
# index.json records this check as "critical".
# NOTE(review): presumably 'critical' is the DbCheck default when the
# constant is omitted - confirm against the base class.
use constant {
  NAME        => 'Karyotype',
  DESCRIPTION => 'Karyotype data exists for human, mouse and rat',
  GROUPS      => ['assembly', 'core'],
  DB_TYPES    => ['core'],
  TABLES      => ['attrib_type', 'coord_system', 'karyotype', 'seq_region', 'seq_region_attrib']
};

sub skip_tests {
Expand Down
2 changes: 1 addition & 1 deletion lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyAssembly.pm
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ sub tests {
cs.attrib RLIKE 'default_version' AND
at.code = 'toplevel' AND
m.meta_key = 'assembly.default' AND
cs.version <> m.meta_value AND
BINARY(cs.version) <> BINARY(m.meta_value) AND
m.species_id = $species_id AND
cs.species_id = $species_id
GROUP BY
Expand Down
14 changes: 11 additions & 3 deletions lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyConditional.pm
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,16 @@ sub repeat_analysis {
# in order to do a count, use SQL rather than API.
my $helper = $self->dba->dbc->sql_helper;
my $species_id = $self->dba->species_id;
my $mca = $self->dba->get_adaptor('MetaContainer');
my @rep_list = ();
if ($mca->get_division eq 'EnsemblPlants') {
@rep_list = qw("repeatmask_repeatmodeler");
}
else {
@rep_list = qw("repeatmask_repeatmodeler" "repeatdetector");
}
my $to_skip = join("', '", @rep_list);

my $sql = qq/
SELECT logic_name FROM
coord_system INNER JOIN
Expand All @@ -183,8 +193,7 @@ sub repeat_analysis {
analysis USING (analysis_id)
WHERE
species_id = $species_id
AND
logic_name <> "repeatmask_repeatmodeler"
AND logic_name NOT IN ('$to_skip')
GROUP BY
logic_name
ORDER BY logic_name
Expand All @@ -195,7 +204,6 @@ sub repeat_analysis {
skip 'No repeat features', 1 unless scalar(@logic_names);

my $desc = "'repeat.analysis' meta_keys exist for appropriate repeat analyses";
my $mca = $self->dba->get_adaptor('MetaContainer');
my @values = sort @{ $mca->list_value_by_key('repeat.analysis') };
is_deeply(\@values, \@logic_names, $desc);
}
Expand Down
8 changes: 5 additions & 3 deletions lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,15 @@ sub tests {

# Check that the format of meta_values conforms to expectations.
my %formats = (
'annotation.provider_url' => '^https?:\/\/.+$',
'assembly.provider_url' => '^https?:\/\/.+$',
'annotation.provider_url' => '(https?:\/\/.+|www.*\.ensembl\.org)',
'assembly.provider_url' => '(https?:\/\/.+|www.*\.ensembl\.org)',
'assembly.accession' => 'GCA_\d+\.\d+',
'assembly.date' => '\d{4}-\d{2}',
'assembly.default' => '[\w\.\-]+',
'genebuild.id' => '\d+',
'genebuild.initial_release_date' => '\d{4}-\d{2}',
'genebuild.last_geneset_update' => '\d{4}-\d{2}',
'genebuild.method' => '(full_genebuild|projection_build|import|mixed_strategy_build|external_annotation_import|maker_genebuild|curated)',
'genebuild.method' => '(full_genebuild|projection_build|import|mixed_strategy_build|external_annotation_import|maker_genebuild|curated|import_build)',
'genebuild.start_date' => '\d{4}\-\d{2}\-\S+',
'patch' => '[^\n]+',
'sample.location_param' => '[\w\.\-]+:\d+\-\d+',
Expand All @@ -71,6 +71,8 @@ sub tests {
my $desc = "Value for $meta_key has correct format";
my $format = $formats{$meta_key};
my $values = $mca->list_value_by_key($meta_key);
@$values = grep { $_ ne '' } @$values;

SKIP: {
skip "No $meta_key defined", 1 unless scalar(@$values);
foreach my $value (@$values) {
Expand Down
3 changes: 1 addition & 2 deletions lib/Bio/EnsEMBL/DataCheck/Checks/XrefTypes.pm
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ extends 'Bio::EnsEMBL::DataCheck::DbCheck';
use constant {
NAME => 'XrefTypes',
DESCRIPTION => 'Xrefs are only attached to one feature type.',
GROUPS => ['xref', 'xref_mapping'],
DATACHECK_TYPE => 'advisory',
GROUPS => ['core', 'xref', 'xref_mapping'],
DB_TYPES => ['core'],
TABLES => ['external_db', 'object_xref', 'xref'],
PER_DB => 1,
Expand Down
5 changes: 4 additions & 1 deletion lib/Bio/EnsEMBL/DataCheck/DbCheck.pm
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,10 @@ sub find_old_dbname {
die "No metadata database found in the registry" unless defined $meta_dba;

my ($sql, $params);
if ($group =~ /(funcgen|variation)/i) {
if (
$group =~ /(funcgen|variation)/i ||
$mca->single_value_by_key('schema_type') =~ /(funcgen|variation)/i
) {
$sql = q/
SELECT DISTINCT gd.dbname FROM
genome_database gd INNER JOIN
Expand Down
14 changes: 10 additions & 4 deletions lib/Bio/EnsEMBL/DataCheck/Pipeline/ConvertTapToJson.pm
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,21 @@ sub run {
my $passed = $self->param('json_passed');
my $by_species = $self->param('json_by_species');

$self->parse_results($tap, $output_file, $by_species, $passed);
if (-e $tap) {
$self->parse_results($tap, $output_file, $by_species, $passed);
}
}

# Pass the JSON output file name downstream, but only if that file exists.
#
# Reads the 'json_output_file' parameter and, when it names an existing
# file, calls dataflow_output_id once with { json_output_file => <name> }
# and 1 as the second argument. Nothing is flowed when the parameter is
# undefined or the file is absent (run() only calls parse_results when the
# TAP file exists, so there may be no JSON file to hand on).
#
# Returns: nothing meaningful; the effect is the single dataflow call.
sub write_output {
  my $self = shift;

  my $json_output_file = $self->param('json_output_file');

  # Guard against an unset parameter before the file test, and emit at
  # most one dataflow event - never one for a file that was not written.
  if (defined $json_output_file && -e $json_output_file) {
    $self->dataflow_output_id(
      { json_output_file => $json_output_file }, 1
    );
  }
}

sub parse_results {
Expand Down
8 changes: 5 additions & 3 deletions lib/Bio/EnsEMBL/DataCheck/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -1543,10 +1543,11 @@
"package_name" : "Bio::EnsEMBL::DataCheck::Checks::InterProFeatures"
},
"Karyotype" : {
"datacheck_type" : "advisory",
"datacheck_type" : "critical",
"description" : "Karyotype data exists for human, mouse and rat",
"groups" : [
"assembly"
"assembly",
"core"
],
"name" : "Karyotype",
"package_name" : "Bio::EnsEMBL::DataCheck::Checks::Karyotype"
Expand Down Expand Up @@ -2598,9 +2599,10 @@
"package_name" : "Bio::EnsEMBL::DataCheck::Checks::XrefPrefixes"
},
"XrefTypes" : {
"datacheck_type" : "advisory",
"datacheck_type" : "critical",
"description" : "Xrefs are only attached to one feature type.",
"groups" : [
"core",
"xref",
"xref_mapping"
],
Expand Down
Empty file.
Empty file.
Empty file.
36 changes: 36 additions & 0 deletions t/test-genome-DBs/drosophila/core/analysis.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
486 2017-09-01 10:14:21 flybase \N \N \N flybase \N \N \N Bio::EnsEMBL::EGPipeline::LoadGFF3::LoadGFF3 \N \N \N
487 2017-09-18 13:32:29 xref_refseq_blastn refseq_dna \N \N blastn \N blastn -word_size 11 -num_alignments 100000 -num_descriptions 100000 -lcase_masking -num_threads 3 Bio::EnsEMBL::Analysis::Runnable::BlastEG \N \N \N
488 2018-05-31 16:42:54 xrefuniprot \N \N \N xrefuniprot \N \N \N Bio::EnsEMBL::EGPipeline::Xref::LoadUniProtXrefs \N \N \N
489 2018-05-31 14:43:36 xrefchecksum \N \N \N xrefchecksum \N \N \N Bio::EnsEMBL::EGPipeline::Xref::LoadUniParc \N \N \N
490 2018-05-31 15:48:17 xrefuniparc \N \N \N xrefuniparc \N \N \N Bio::EnsEMBL::EGPipeline::Xref::LoadUniProt \N \N \N
491 2018-08-14 17:29:42 seg Seg \N \N InterProScan 5.30-69.0 \N \N \N \N \N \N
492 2018-08-14 17:29:42 tmhmm TMHMM 2.0c \N InterProScan 5.30-69.0 \N \N \N \N \N \N
493 2018-08-14 17:29:38 signalp SignalP 4.1 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
494 2017-11-23 21:17:07 goa_import GO \N \N goa_import \N \N \N \N \N \N \N
495 2018-05-31 16:44:45 gouniprot \N \N \N gouniprot \N \N \N Bio::EnsEMBL::EGPipeline::Xref::LoadUniProtGO \N \N \N
496 2018-08-14 17:29:24 cdd CDD 3.16 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
497 2018-08-14 17:29:26 hmmpanther PANTHER 12.0 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
498 2018-08-14 17:29:23 gene3d Gene3D 4.2.0 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
499 2018-08-14 17:29:31 pfam Pfam 31.0 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
500 2018-08-14 17:29:34 superfamily SuperFamily 1.75 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
501 2018-08-14 17:29:36 smart Smart 7.1 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
502 2018-08-14 17:29:29 pfscan Prosite_profiles 2018_02 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
503 2018-08-14 17:29:38 mobidblite MobiDBLite 1.5 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
504 2018-08-14 17:29:44 interpro2go InterPro2GO \N \N InterProScan 5.30-69.0 \N \N \N \N \N \N
505 2018-08-14 17:29:32 pirsf PIRSF 3.02 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
506 2018-08-14 17:29:33 scanprosite Prosite_patterns 2018_02 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
507 2018-08-14 17:29:39 ncoils ncoils 2.2.1 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
508 2018-08-14 17:29:43 interpro2pathway InterPro2Pathway \N \N InterProScan 5.30-69.0 \N \N \N \N \N \N
509 2018-08-14 17:29:38 tigrfam TIGRfam 15.0 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
510 2017-09-18 13:25:24 xref_refseq_blastp refseq_peptide \N \N blastp \N blastp -word_size 3 -num_alignments 100000 -num_descriptions 100000 -lcase_masking -seg yes -num_threads 3 Bio::EnsEMBL::Analysis::Runnable::BlastEG \N \N \N
511 2018-08-14 17:29:32 prints PRINTS 42.0 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
512 2018-08-14 17:29:26 hamap HAMAP 2018_03 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
513 2018-08-14 17:29:32 sfld SFLD 3 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
514 2018-08-14 17:29:19 blastprodom ProDom 2006.1 \N InterProScan 5.30-69.0 \N \N \N \N \N \N
515 2017-04-28 16:27:07 trnascan_align \N \N \N tRNAscan-SE 1.23 /nfs/software/ensembl/RHEL7/linuxbrew/bin/tRNAscan-SE Bio::EnsEMBL::Analysis::Runnable::tRNAscan \N \N \N
516 2017-04-28 16:27:07 mirbase miRBase 21 \N \N \N \N \N Bio::EnsEMBL::EGPipeline::RNAFeatures::miRBase \N \N \N
517 2016-06-09 11:49:06 cmscan_rfam_12.1 Rfam 12.1 /nfs/panda/ensemblgenomes/external/Rfam/12.1/Rfam.cm Infernal 1.1 /nfs/panda/ensemblgenomes/external/bin/cmscan Bio::EnsEMBL::Analysis::Runnable::CMScan \N \N \N
518 2017-04-28 16:27:07 cmscan_rfam_12.2_lca Rfam 12.2 /nfs/panda/ensemblgenomes/external/Rfam/12.2/Rfam.cm Infernal 1.1 /nfs/software/ensembl/RHEL7/linuxbrew/bin/cmscan Bio::EnsEMBL::Analysis::Runnable::CMScan \N \N \N
519 2017-08-16 17:14:44 dust \N \N \N dustmasker \N /nfs/software/ensembl/RHEL7/linuxbrew/bin/dustmasker \N Bio::EnsEMBL::Analysis::Runnable::DustMasker \N dust low_complexity_region
520 2017-08-16 17:14:44 trf \N \N \N trf 4.0 /nfs/software/ensembl/RHEL7/linuxbrew/bin/trf 2 5 7 80 10 40 500 -d -h Bio::EnsEMBL::Analysis::Runnable::TRF \N trf tandem_repeat
521 2017-08-16 17:14:45 repeatmask_repbase repbase \N \N RepeatMasker 4.0.5 /nfs/software/ensembl/RHEL7/linuxbrew/bin/RepeatMasker -nolow -gccalc -species "Drosophila melanogaster" -engine crossmatch -q Bio::EnsEMBL::Analysis::Runnable::RepeatMasker \N repeatmasker repeat_region

0 comments on commit c7a21e9

Please sign in to comment.