diff --git a/utils/create_user_data_dbs b/utils/create_user_data_dbs deleted file mode 100644 index 9fb99ed7..00000000 --- a/utils/create_user_data_dbs +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/env perl -# Copyright [2009-2014] EMBL-European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -### create_user_data_dbs -use lib '/nfs/ensembl/perllib/catalystlibs/lib/perl5/site_perl/5.8.8'; -use lib '/nfs/ensembl/perllib/catalystlibs/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi'; - - -use strict; - -# use EnsEMBL::Web::SpeciesDefs; -use FindBin qw($Bin); -use File::Basename qw( dirname ); -# Load libraries needed for reading config -use Data::Dumper; -use File::Slurp qw(slurp); -use File::Find::Object::Rule; - -BEGIN { - - unshift @INC, "$Bin/../../../conf"; - unshift @INC, "$Bin/../../.."; - unshift @INC, "$Bin/../../utils"; - eval { require SiteDefs }; - if ($@) { die "Can't use SiteDefs.pm - $@\n"; } - map { unshift @INC, $_ } @SiteDefs::ENSEMBL_LIB_DIRS; -} - - - -use utils::Tool; - -my $SPECIES_DEFS = EnsEMBL::Web::SpeciesDefs->new(); - - -die "cannot find user_data_template.ddl. We can\'t run without it\n" unless -e('user_data_template.ddl'); -my $DDL = slurp('user_data_template.ddl'); -chomp $DDL; -my @table_create_statments = split /;/ ,$DDL; - -my @SPECIES; - - - -# # Get Species List from SiteDefs - -@SPECIES = @{ utils::Tool::all_species()}; - - - -foreach my $species (@SPECIES) { - print "Getting credentientials for $species\n" . '='x20 . "\n"; - - my $user_db_name = user_db_name($species); - check_user_db_exists($species) eq 'No' ? create_from_new($species, $user_db_name) : print patch_existing($species); - -} - -sub user_db_name { - my $species = shift; - - my $user_data_name = $SPECIES_DEFS->GENOMIC_UNIT eq 'bacteria' ? '_collection_userdata' : '_userdata'; - - my $user_db_name = lcfirst($species) . $user_data_name; - - return $user_db_name; -} - - -sub check_user_db_exists { - my $species = shift; - my $user_db_name = user_db_name($species); - - my $dbh = get_dbh_from_species_type($species, 'core'); - my $query = qq{SELECT IF(EXISTS (SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '$user_db_name'), 'Yes','No') as 'exists'}; - # warn $query; - my $array_ref = $dbh->selectall_arrayref($query); - #warn Dumper $array_ref; - return $array_ref->[0]->[0]; -} - - - - - -sub get_dbh_from_species_type { - my ($species, $type) = @_; - - die "need species and database type" unless $species && $type; - - - my $database = $type eq 'core' ? $SPECIES_DEFS->get_config($species, 'databases')->{'DATABASE_CORE'}->{'NAME'}: user_db_name($species); - - # make the DB handle, get the credentials from SPECIES_DEFS - my $db_credentials = $SPECIES_DEFS->get_config($species, 'databases')->{'DATABASE_CORE'}; - my $host = $db_credentials->{HOST} || (warn "** no HOST in config for [ $species ]"); - print "Using HOST: [$host]\n". '='x20 . 
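# Sketch of the existence test performed by check_user_db_exists() above: it
# reduces to a single INFORMATION_SCHEMA lookup with a bound parameter. The
# handle, credentials and the example database name are placeholders, not part
# of the original patch.
use strict;
use DBI;

sub schema_exists {
    my ( $dbh, $schema_name ) = @_;
    my ($count) = $dbh->selectrow_array(
        q{SELECT COUNT(*) FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = ?},
        undef, $schema_name,
    );
    return $count ? 'Yes' : 'No';
}

# my $dbh = DBI->connect( $dsn, $user, $pass, { RaiseError => 1 } );
# print schema_exists( $dbh, 'homo_sapiens_userdata' ), "\n";   # hypothetical name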
"\n"; - - my $port = $db_credentials->{PORT} || warn "** no PORT in config for [ $species ]"; - my $write_user = $SPECIES_DEFS->DATABASE_WRITE_USER || die "** no WRITE_USER in config "; - my $write_pass = $SPECIES_DEFS->DATABASE_WRITE_PASS || die "** no WRITE_PASS in config "; - - my $dsn = "DBI:mysql:database=$database;host=$host"; - $dsn .= ";port=$port" if ($port); - my $dbh = DBI->connect( $dsn, $write_user, $write_pass ); - - return $dbh; - -} - - - -sub create_from_new { - my ($species, $user_db_name) = @_; - - warn "creating from new [$species]"; - # my $db_stub = qq{mysql -u $write_user -p}. $write_pass . qq{ -h $host -P $port}; - # # my $db_stub = "mysql -uadmin -pMotxY157 -hmysql-eg-live-1.ebi.ac.uk -P4159"; - # my $db_create_st = $db_stub . " -e 'create database IF NOT EXISTS $user_db_name'"; - # print 'Stub is:' . $db_create_st, "\n"; - my $dbh = get_dbh_from_species_type($species, 'core'); - - - $dbh->do("create database $user_db_name"); - - my $userdata_dbh = get_dbh_from_species_type($species,'userdata'); - - #load the schema from the DDL - - map { $userdata_dbh->do($_);} @table_create_statments; - - # need to populate the analysis,meta,meta_coord and coord_system - # tables with data. This script does this by copying the data from - # the relevant tables on the core DB. - my @tables = qw( analysis meta meta_coord coord_system); - my $core_db_name = $SPECIES_DEFS->get_config($species, 'databases')->{'DATABASE_CORE'}->{'NAME'}; - - foreach my $table (@tables){ - # copy meta table - $userdata_dbh->do("create table if not exists $user_db_name.$table select * from $core_db_name.$table"); - } - -} - - -sub calculate_applicable_patches { - - my $species = shift; - - my $userdata_dbh = get_dbh_from_species_type($species,'userdata'); - my $core_dbh = get_dbh_from_species_type($species, 'core'); - - - # look in the meta table, find out what the schema versions is - # look in patch directory and choose the right ones - # return an array of pathes to use in the order they must be applied - - - # work out the patch dir relative to this one - - my $patch_dir = "$Bin/../../../ensembl/sql"; - - my $current_core_schema = $core_dbh->selectrow_array(qq{select meta_value from meta where meta_key = 'schema_version'}); - my $user_db_schema_version = $userdata_dbh->selectrow_array(qq{select meta_value from meta where meta_key = 'schema_version'}); - - - my @found = File::Find::Object::Rule->file()->name( '*.sql' )->in( $patch_dir); - die "didn't find any Patch files in dir:'$patch_dir'\n" unless @found > 0; - - my @schema_range = ($user_db_schema_version + 1)..$current_core_schema; - my $regex; - map {$regex .= "patch_$_|"} @schema_range; - chop $regex; - my @patch_list = grep/$regex/, @found; - return \@patch_list; - - -} - - - - -__END__ - -=head1 NAME - create_user_data_dbs - -=head1 AUTHOR - -Stephen Keenan B keenan@ebi.ac.uk - diff --git a/utils/eg_ebi_search_dump.ini b/utils/eg_ebi_search_dump.ini deleted file mode 100644 index 58c46811..00000000 --- a/utils/eg_ebi_search_dump.ini +++ /dev/null @@ -1,22 +0,0 @@ -# mysql -uadmin -piPBi22yI -hmysql-eg-production-1 -P4161 - -#$user='admin'; -#$pass='iPBi22yI'; -#$port='4161'; -#$host='mysql-eg-production-1.ebi.ac.uk'; -#$dir = '/nas/panda/ensembl/indexingScratch/multispecies'; - -#$user='admin'; -#$pass='tGc3Vs2O'; -#$port='4126'; -#$host='mysql-eg-devel-1.ebi.ac.uk'; -#$dir = '/nas/panda/ensembl/indexingScratch'; - -#mysql -uensrw -pwrit3r -hmysql-eg-staging-1 -P4159 ensembl_blast -$user='ensrw'; -$pass='writ3r'; -$port='4159'; 
-$host='mysql-eg-staging-1.ebi.ac.uk'; -$dir = '/nfs/ensembl/scratch/indexing'; - - diff --git a/utils/eg_ebi_search_dump.pl b/utils/eg_ebi_search_dump.pl deleted file mode 100755 index f2ced8e3..00000000 --- a/utils/eg_ebi_search_dump.pl +++ /dev/null @@ -1,1815 +0,0 @@ -#!/usr/local/bin/perl -# Copyright [2009-2014] EMBL-European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -package ebi_search_dump; - -use strict; -use DBI; -use Carp; -use Getopt::Long; -use IO::Zlib; -use FindBin qw($Bin); -use File::Basename qw( dirname ); -use Data::Dumper; - - -my ( - $host, $user, $pass, $port, $species, $ind, - $release, $max_entries, $nogzip, $parallel, $dir, $inifile, $genomic_unit, @SPECIES -); - -# Load libraries needed for reading config -# ----------------------------------- -BEGIN{ - unshift @INC, "$Bin/../../../conf"; - unshift @INC, "$Bin/../../.."; - eval{ require SiteDefs }; - if ($@){ die "Can't use SiteDefs.pm - $@\n"; } - map{ unshift @INC, $_ } @SiteDefs::ENSEMBL_LIB_DIRS; -} - -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::DBLoader; -use Bio::SeqIO; -use utils::Tool; -use EnsEMBL::Web::SpeciesDefs; - - - - -my %rHash = map { $_ } @ARGV; -if ( $inifile = $rHash{'-inifile'} ) { - my $icontent = `cat $inifile`; - warn $icontent; - eval $icontent; -} - -GetOptions( - "host=s", \$host, "port=i", \$port, - "user=s", \$user, "pass=s", \$pass, - "species=s", \$species, "release=s", \$release, - "index=s", \$ind, "nogzip!", \$nogzip, - "max_entries=i", \$max_entries, "parallel", \$parallel, - "dir=s", \$dir, "help", \&usage, - "inifile=s", \$inifile -); - -$species ||= 'ALL'; - -@SPECIES = split ',', $species; - -# Get Species List from SiteDefs -if( @SPECIES ) { - @SPECIES = @{ utils::Tool::check_species(\@SPECIES) }; -} else { - @SPECIES = @{ utils::Tool::all_species()}; -} - -$dir ||= "."; -$release ||= 'LATEST'; -#$port ||= 3306; - - -my $SPECIES_DEFS = EnsEMBL::Web::SpeciesDefs->new(); -$host ||= $SPECIES_DEFS->DATABASE_HOST; -$port ||= $SPECIES_DEFS->DATABASE_HOST_PORT; -$user ||= 'ensro'; - - -usage() and exit unless ($release && $ind); - -my $entry_count; -my $global_start_time = time; -my $total = 0; -my $FAMILY_DUMPED; - -my $fh; -## HACK 1 - if the INDEX is set to all grab all dumper methods... -my @indexes = split ',', $ind; -@indexes = map { /dump(\w+)/ ? 
$1 : () } keys %ebi_search_dump:: - if $ind eq 'ALL'; - -warn Dumper \@indexes; - -my $dbHash = get_databases(); -warn Dumper $dbHash; - -#warn Dumper $dbcHash; - -warn "SPECIES : ", Dumper @SPECIES; - - -foreach my $species ( @SPECIES ) { -warn $species; -$species = lc($species); - my $conf = $dbHash->{$species}; - foreach my $index (@indexes) { - - my $function = "dump$index"; - no strict "refs"; - - $species =~ s/_/ /g; - if ($index ne 'Family'){ - &$function( ucfirst($species), $conf); - print $function,"\n"; - } elsif ($index eq 'Family' && !$FAMILY_DUMPED) { - &dumpFamily($conf); - - } - - } - -} - -print_time($global_start_time); -warn " Dumped $total entries ...\n"; - -# ------------------------------------------------------------------------------- - -sub text_month { - - my $m = shift; - - my @months = qw[JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC]; - - return $months[$m]; - -} - -# ------------------------------------------------------------------------------- - -sub print_time { - - my $start = shift; - - my $t = time - $start; - my $s = $t % 60; - $t = ( $t - $s ) / 60; - my $m = $t % 60; - $t = ( $t - $m ) / 60; - my $h = $t % 60; - - print "Time taken: " . $h . "h " . $m . "m " . $s . "s\n"; - -} - -#------------------------------------------------------------------------------------------------ -sub usage { - print < - - -host Database host to connect to. Defaults to ens-staging. - -port Database port to connect to. Defaults to 3306. - -species Species name. Defaults to ALL. - -index Index to create. Defaults to ALL. - -release Release of the database to dump. Defaults to 'latest'. - -user Database username. Defaults to ensro. - -pass Password for user. - -dir Directory to write output to. Defaults to /lustre/scratch1/ensembl/gp1/xml. - -nogzip Don't compress output as it's written. - -help This message. - -inifile First take the arguments from this file. Then overwrite with what is provided in the command line - -EOF - -} - -sub get_databases { - - my ( $dbHash, $dbcHash ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - - my $db = DBI->connect( $dsn, $user, $pass ); - - warn "DSN: $dsn"; - - my @dbnames = - map { $_->[0] } @{ $db->selectall_arrayref("show databases") }; - - $db->disconnect(); - warn Dumper \@dbnames; - - my $latest_release = 0; - my ( $db_species, $db_release, $db_type ); - my $compara_hash; - for my $dbname (@dbnames) { - if ( ( $db_species, $db_type, $db_release ) = - $dbname =~ /^([a-z]+_[a-z0-9]+)_([a-z]+)_(\d+)_\w+$/ ) - { - - warn join ' * ', $db_species, $db_type, $db_release, $species, "\n"; - - next if ( $species ne 'ALL' ) && ( $db_species ne lc($species) ); - $latest_release = $db_release if ( $db_release > $latest_release ); - $dbHash->{$db_species}->{$db_type}->{$db_release} = $dbname; - - } - if ( ($db_release) = $dbname =~ /ensembl_compara_(\d+)/ ) { - - #N.B Re:COMAPARA for now using - #ensembl_compara_VERSION. 
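# Sketch showing what the database-name regex in get_databases() above
# extracts; the two sample schema names are illustrative only.
use strict;

for my $dbname ( 'homo_sapiens_core_56_37a', 'anopheles_gambiae_variation_56_3j' ) {
    if ( my ( $db_species, $db_type, $db_release ) =
         $dbname =~ /^([a-z]+_[a-z0-9]+)_([a-z]+)_(\d+)_\w+$/ )
    {
        print "$dbname => species=$db_species type=$db_type release=$db_release\n";
    }
}
# prints:
#   homo_sapiens_core_56_37a => species=homo_sapiens type=core release=56
#   anopheles_gambiae_variation_56_3j => species=anopheles_gambiae type=variation release=56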
Others will follow - $compara_hash->{$db_release} = $dbname; - } - - } - - map { $dbHash->{$_}->{'compara'} = $compara_hash } keys %$dbHash; - $release = $latest_release if ( $release eq 'LATEST' ); - - return $dbHash; - -} - -sub footer { - my ($ecount) = @_; - p(""); - p("$ecount"); - p(""); - - print "Dumped $ecount entries\n"; - if ($nogzip) { - close(FILE) or die $!; - } - else { - $fh->close(); - } - $total += $ecount; -} - -sub header { - my ( $dbname, $dbspecies, $dbtype ) = @_; - - p(""); - p("]>"); - p(""); - p("$dbname"); - p("Ensembl $dbspecies $dbtype database"); - p("$release"); - p(""); - p(""); -} - -sub p { - my ($str) = @_; - - # TODO - encoding - $str .= "\n"; - if ($nogzip) { - print FILE $str or die "Can't write to file ", $!; - } - else { - print $fh $str or die "Can't write string: $str"; - } -} - -sub format_date { - my $t = shift; - - my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ]; - $y += 1900; - $d = "0" . $d if ( $d < 10 ); - my $mm = text_month($m); - return "$d-$mm-$y"; -} - -sub format_datetime { - my $t = shift; - - my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ]; - $y += 1900; - $d = "0" . $d if ( $d < 10 ); - my $ms = text_month($m); - return sprintf "$d-$ms-$y %02d:%02d:%02d", $hh, $mm, $ss; -} - -sub dumpFamily { - my ( $conf ) = @_; - - my $FAMDB = $conf->{'compara'}->{$release} or next; - - -# my $dbname = $conf->{'core'}->{$release} or next; - - my $file = "$dir/Family_all_species_core_$FAMDB.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $FAMDB to $file ... ", format_datetime($start_time), "\n"; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die("Can't open compressed stream to $file: $!"); - } - else { - open( FILE, ">$file" ) || die "Can't open $file: $!"; - } - header( $FAMDB, 'compara_all_species', $FAMDB ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - my $ecount; - my $dbh = DBI->connect( "$dsn:$FAMDB", $user, $pass ) or die "DBI::error"; - - - -# my $CORE = $conf->{'core'}->{$release}; -# my $t_sth = $dbh->prepare( qq{select meta_value from $CORE.meta where meta_key='species.taxonomy_id'}); -# $t_sth->execute; -# my $taxon_id = ( $t_sth->fetchrow ); - -# return unless $taxon_id; - - $dbh->do("SET SESSION group_concat_max_len = 100000"); - my $sth = $dbh->prepare( -qq{ select f.family_id as id, f.stable_id as fid , f.description, group_concat(m.stable_id, unhex('1D') ,m.source_name) as IDS -from $FAMDB.family as f, $FAMDB.family_member as fm, $FAMDB.member as m - where fm.family_id = f.family_id and fm.member_id = m.member_id group by fid} - ); - $sth->execute; - foreach my $xml_data ( @{ $sth->fetchall_arrayref( {} ) } ) { - - my @bits = split /,/, delete $xml_data->{IDS}; - map { push @{ $xml_data->{IDS} }, [ split /\x1D/ ] } @bits; -# $xml_data->{species} = $dbspecies; - $xml_data->{species} = ''; - p familyLineXML($xml_data); - - } - - footer( $sth->rows ); - $FAMILY_DUMPED = 1; -} - -sub familyLineXML { - my ( $xml_data ) = @_; - - my $members = scalar @{ $xml_data->{IDS} }; - - - my $description = $xml_data->{description}; - - $description =~ s//>/g; - $description =~ s/'/'/g; - $description =~ s/&/&/g; - - - - - my $xml = qq{ - -$xml_data->{fid} - $description - } . - ( - join "", - ( - map { - qq{ - } if $_->[1] =~ /(Uniprot|ENSEMBL).*/ - } @{ $xml_data->{IDS} } - ) - ) - . 
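# Sketch of XML-safe escaping for the description text built in familyLineXML()
# above; delegating to HTML::Entities (which the replacement search_dump.pl
# later in this patch loads) avoids the ampersand-first ordering problem that
# hand-rolled substitutions have to get right. The helper name is ours.
use strict;
use HTML::Entities qw(encode_entities);

sub xml_escape {
    my ($text) = @_;
    $text = '' unless defined $text;
    return encode_entities( $text, q{<>&'"} );
}

# xml_escape(q{AT-rich & <unknown> gene})  eq  'AT-rich &amp; &lt;unknown&gt; gene'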
- qq{ - - - $members - $xml_data->{species} - Ensembl_protein_family - -}; - return $xml; - -} - -sub dumpGene { -warn "in dumpGene"; - my ( $dbspecies, $conf) = @_; - - foreach my $DB ( 'core', 'vega' ) { -# foreach my $DB ( 'core' ) { - my $counter = make_counter(0); - my $SNPDB = eval {$conf->{variation}->{$release}}; - my $DBNAME = $conf->{$DB}->{$release} - or warn "$dbspecies $DB $release: no database not found"; - next unless $DBNAME; - - print "START... $DB"; - my $file = "$dir/Gene_$DBNAME.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die("Can't open compressed stream to $file: $!"); - } - else { - open( FILE, ">$file" ) || die "Can't open $file: $!"; - } - header( $DBNAME, $dbspecies, $DB ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - - warn "Dumping $DBNAME to $file ... ", format_datetime($start_time), - "\n"; - my $extra = $DB ne 'core' ? ";db=$DB" : ''; - - my $dbh = DBI->connect( "$dsn:$DBNAME", $user, $pass ) - or die "DBI::error"; - - # determine genomic unit - my $division = $dbh->selectrow_array("SELECT meta_value FROM meta WHERE meta_key = 'species.division'"); - (my $genomic_unit = $division) =~ s/^ensembl//; # eg EnsemblProtists -> protists - warn "Genomic unit [$genomic_unit]"; - die "Genomic unit not found" unless $genomic_unit; - - # SNP query - my $snp_sth = eval {$dbh->prepare("select distinct(vf.variation_name) from $SNPDB.transcript_variation as tv, $SNPDB.variation_feature as vf where vf.variation_feature_id = tv.variation_feature_id and tv.transcript_stable_id in(?)");}; - - my %xrefs = (); - my %xrefs_desc = (); - my %disp_xrefs = (); - foreach my $type (qw(Gene Transcript Translation)) { - my $T = $dbh->selectall_arrayref( - "select ox.ensembl_id, - x.display_label, x.dbprimary_acc, ed.db_name, es.synonym, x.description - from ($DBNAME.object_xref as ox, $DBNAME.xref as x, $DBNAME.external_db as ed) left join $DBNAME.external_synonym as es on es.xref_id = x.xref_id - where ox.ensembl_object_type = '$type' and ox.xref_id = x.xref_id and x.external_db_id = ed.external_db_id" - ); - foreach (@$T) { - - $xrefs{$type}{ $_->[0] }{ $_->[3] }{ $_->[1] } = 1 if $_->[1]; - $xrefs{$type}{ $_->[0] }{ $_->[3] }{ $_->[2] } = 1 if $_->[2]; -# $xrefs{$type}{ $_->[0] }{ $_->[3] }{ $_->[4] } = 1 if $_->[4]; - $xrefs_desc{$type}{ $_->[0] }{ $_->[5] } = 1 if $_->[5]; - if (my $syn = $_->[4]) { - $syn =~ s/^\'|\'$//g; - next if ($syn =~ /^(FBtr|FBpp)\d+/); - next if ($syn =~ /^CG\d+\-/); - $xrefs{$type}{ $_->[0] }{ $_->[3] . 
"_synonym" }{ $syn } = 1; - } - - - } - - warn "XREF $type query..."; - } - warn "Exons query..."; - my %exons = (); - my $T = $dbh->selectall_arrayref( - "select distinct t.gene_id, esi.stable_id - from transcript as t, exon_transcript as et, exon_stable_id as esi - where t.transcript_id = et.transcript_id and et.exon_id = esi.exon_id" - ); - foreach (@$T) { - $exons{ $_->[0] }{ $_->[1] } = 1; - } - warn "Domains query..."; - my %domains; - $T = $dbh->selectall_arrayref(' - SELECT DISTINCT g.gene_id, pf.hit_name - FROM gene g, transcript t, translation tl, protein_feature pf - WHERE g.gene_id = t.gene_id AND t.transcript_id = tl.transcript_id AND tl.translation_id = pf.translation_id - '); - foreach (@$T) { - $domains{$_->[0]}{$_->[1]} = 1; - } - - my $gene_info = $dbh->selectall_arrayref( " - select gsi.gene_id, tsi.transcript_id, trsi.translation_id, - gsi.stable_id as gsid, tsi.stable_id as tsid, trsi.stable_id as trsid, - g.description, ed.db_name, x.dbprimary_acc,x.display_label, ad.display_label, ad.description, g.source, g.status, g.biotype - from (((( $DBNAME.gene_stable_id as gsi, $DBNAME.gene as g, - $DBNAME.transcript_stable_id as tsi, - $DBNAME.analysis_description as ad, - $DBNAME.transcript as t) left join - $DBNAME.translation as tr on t.transcript_id = tr.transcript_id) left join - $DBNAME.translation_stable_id as trsi on tr.translation_id = trsi.translation_id) left join - $DBNAME.xref as x on g.display_xref_id = x.xref_id) left join - $DBNAME.external_db as ed on ed.external_db_id = x.external_db_id - where t.gene_id = gsi.gene_id and t.transcript_id = tsi.transcript_id and t.gene_id = g.gene_id - and g.analysis_id = ad.analysis_id - order by gsi.stable_id, tsi.stable_id; - " ); - warn "Gene query..."; - - my %hash = map { $_->[0] } @$gene_info; - my $ecount = scalar keys %hash, "\n\n"; - - my %old; - - foreach my $row (@$gene_info) { - - # g = gene_id, t = transcript_id , tr = translation_id , - # gs = gene_stable_id, ts = transcript_stable_id , trs = - # translation_stable_id, d = description, - # ddb= external_db_dispay_name, - # dpa = xref_primary_accession, - # dn = xref display_label, - # a = analysis_description - # display label, - # ad = analysis description descripion, - #s = gene.source, st = gene.status, bt = gene.biotype - - my ( - $gene_id, $transcript_id, - $translation_id, $gene_stable_id, - $transcript_stable_id, $translation_stable_id, - $gene_description, $extdb_db_display_name, - $xref_primary_acc, $xref_display_label, - $analysis_description_display_label, $analysis_description, - $gene_source, $gene_status, - $gene_biotype - ) = @$row; - if ( $old{'gene_id'} != $gene_id ) { - if ( $old{'gene_id'} ) { - - if ($SNPDB && $DB eq 'core') { - my @transcript_stable_ids = keys %{$old{transcript_stable_ids}}; - $snp_sth->execute("@transcript_stable_ids"); - my $snps = $snp_sth->fetchall_arrayref; - $old{snps} = $snps; - } - - -# my @transcript_stable_ids = keys $old{'transcript_ids'}; -# $snp_sth->execute(@transcript_stable_ids); -# my $snps = $snp_sth->fetchall_arrayref; -# die Dumper($snps); - p geneLineXML( $dbspecies, \%old, $counter, $genomic_unit ); - - } - %old = ( - 'gene_id' => $gene_id, - 'gene_stable_id' => $gene_stable_id, - 'description' => $gene_description, - 'translation_stable_ids' => { - $translation_stable_id ? ( $translation_stable_id => 1 ) - : () - }, - 'transcript_stable_ids' => { - $transcript_stable_id ? ( $transcript_stable_id => 1 ) - : () - }, - 'transcript_ids' => { - $transcript_id ? 
( $transcript_id => 1 ) - : () - }, - 'exons' => {}, - 'domains' => {}, - 'external_identifiers' => {}, - 'alt' => $xref_display_label - ? "($analysis_description_display_label: $xref_display_label)" - : "(novel gene)", - 'ana_desc_label' => $analysis_description_display_label, - 'ad' => $analysis_description, - 'source' => ucfirst($gene_source), - 'st' => $gene_status, - 'biotype' => $gene_biotype - ); - $old{'source'} =~ s/base/Base/; - - $old{'exons'} = $exons{$gene_id}; - foreach my $K ( keys %{ $exons{$gene_id} } ) { - $old{'i'}{$K} = 1; - } - $old{'domains'} = $domains{$gene_id}; - - foreach my $db ( keys %{ $xrefs{'Gene'}{$gene_id} || {} } ) { - foreach my $K ( keys %{ $xrefs{'Gene'}{$gene_id}{$db} } ) { - $old{'external_identifiers'}{$db}{$K} = 1; - - } - } - foreach my $db ( - keys %{ $xrefs{'Transcript'}{$transcript_id} || {} } ) - { - foreach my $K ( - keys %{ $xrefs{'Transcript'}{$transcript_id}{$db} } ) - { - $old{'external_identifiers'}{$db}{$K} = 1; - - } - } - foreach my $db ( - keys %{ $xrefs{'Translation'}{$translation_id} || {} } ) - { - foreach my $K ( - keys %{ $xrefs{'Translation'}{$translation_id}{$db} } ) - { - $old{'external_identifiers'}{$db}{$K} = 1; - } - } - - } - else { - $old{'transcript_stable_ids'}{$transcript_stable_id} = 1; - $old{'transcript_ids'}{$transcript_id} = 1; - $old{'translation_stable_ids'}{$translation_stable_id} = 1; - - foreach my $db ( - keys %{ $xrefs{'Transcript'}{$transcript_id} || {} } ) - { - foreach my $K ( - keys %{ $xrefs{'Transcript'}{$transcript_id}{$db} } ) - { - $old{'external_identifiers'}{$db}{$K} = 1; - } - } - foreach my $db ( - keys %{ $xrefs{'Translation'}{$translation_id} || {} } ) - { - foreach my $K ( - keys %{ $xrefs{'Translation'}{$translation_id}{$db} } ) - { - $old{'external_identifiers'}{$db}{$K} = 1; - - } - } - } - } - - -# my @transcript_stable_ids = keys %$old{'transcript_ids'}; -# $snp_sth->execute(@transcript_ids); -# my $snps = $snp_sth->fetchall_arrayref; -# die Dumper($snps); - - - -# $snp_sth->execute(keys %$old{transcript_ids}); -# my $snps = $snp_sth->fetchall_arrayref($snp_sth); - if ($SNPDB && $DB eq 'core') { - my @transcript_stable_ids = keys %{$old{transcript_stable_ids}}; - $snp_sth->execute("@transcript_stable_ids"); - my $snps = $snp_sth->fetchall_arrayref; - $old{snps} = $snps; - } - - p geneLineXML( $dbspecies, \%old, $counter, $genomic_unit ); - - footer( $counter->() ); - warn "FINISHED...... genes $DB ..."; - - } - -} - -sub geneLineXML { - my ( $species, $xml_data, $counter, $genomic_unit ) = @_; - - return warn "gene id not set" if $xml_data->{'gene_stable_id'} eq ''; - die "genomic_unit not passed to geneLineXML" unless $genomic_unit; - - my $gene_id = $xml_data->{'gene_stable_id'}; - my $altid = $xml_data->{'alt'} or die "altid not set"; - my $transcripts = $xml_data->{'transcript_stable_ids'} - or die "transcripts not set"; - - my $snps = $xml_data->{'snps'}; - - my $peptides = $xml_data->{'translation_stable_ids'} - or die "peptides not set"; - my $exons = $xml_data->{'exons'} or warn "!!!! exons not set for gene_id: $gene_id"; - my $domains = $xml_data->{'domains'};# or warn "!!!! domains not set for gene_id: $gene_id"; <-- not always present? - my $external_identifiers = $xml_data->{'external_identifiers'} - or die "external_identifiers not set"; - my $description = $xml_data->{'description'}; - my $type = $xml_data->{'source'} . ' ' . 
$xml_data->{'biotype'} - or die "problem setting type"; - - my $exon_count = scalar keys %$exons; - my $domain_count = scalar keys %$domains; - my $transcript_count = scalar keys %$transcripts; - $description =~ s//>/g; - $description =~ s/'/'/g; - $description =~ s/&/&/g; - - $gene_id =~ s//>/g; - - $altid =~ s//>/g; - - my $xml = qq{ - - $gene_id $altid - $description}; - - my $cross_references = qq{ - }; - - foreach my $ext_db_name ( keys %$external_identifiers ) { - if ($ext_db_name =~ /(Uniprot|GO|Interpro|Medline|Sequence_Publications|EMBL)/) { - - map { $cross_references .= qq{ - }; } keys %{ $external_identifiers->{$ext_db_name} } - - } else { - foreach my $key (keys %{ $external_identifiers->{$ext_db_name} }) { - $key =~ s//>/g; - $key =~ s/&/&/g; - $ext_db_name =~s/^Ens*/ENSEMBL/; - $cross_references .= qq{ - }; - } - - } - } - - $cross_references .= ( - join "", - ( - map { - qq{ - } - } @$snps - ) - ); - - $cross_references .= qq{}; - -my $sp = $species =~ /1163/ ? 'Aspergillus fumigatus A1163' : $species; - - my $additional_fields .= qq{ - - $sp - Gene - $type - $genomic_unit - $transcript_count } - - . ( - join "", - ( - map { - qq{ - $_} - } keys %$transcripts - ) - ) - - . qq{ $exon_count } - - . ( - join "", - ( - map { - qq{ - $_} - } keys %$exons - ) - ) - . qq{ $domain_count } - - . ( - join "", - ( - map { - qq{ - $_} - } keys %$domains - ) - ) - - . ( - join "", - ( - map { - qq{ - $_} - } keys %$peptides - ) - ) - . qq{ - }; - - - - $counter->(); - return $xml . $cross_references . $additional_fields . ''; - -} - - - - - - -sub dumpGenomicAlignment { -warn "in dump Genomic"; - my ( $dbspecies, $conf ) = @_; - - # warn Dumper $conf; - - warn "\n", '*' x 20, "\n"; - my %tables = ( - 'dna_align_feature' => [ 'DnaAlignFeature', 'DNA alignment feature' ], - 'protein_align_feature' => - [ 'ProteinAlignFeature', 'Protein alignment feature' ] - ); - my $ecount; - - foreach my $db ( 'core', 'cdna', 'otherfeatures' ) { - - my $ecount = 0; - my $DB_NAME = $conf->{$db}->{$release} or next; - my $file = "$dir/GenomicAlignment_$DB_NAME.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $DB_NAME to $file ... ", format_datetime($start_time), - "\n"; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die "Can't open compressed stream to $file: ", $!; - } - else { - open( FILE, ">$file" ) || die "Can't open $file: ", $!; - } - - header( $DB_NAME, $dbspecies, $db ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - - my $dbh = DBI->connect( "$dsn:$DB_NAME", $user, $pass ) - or die "DBI::error"; - foreach my $table ( keys %tables ) { - my $source = $tables{$table}[0]; - - # $source .= ";db=$db" unless $db eq 'core';\ - - - - # Due to the sheer number of features - generating this - # dump causes temp table to be written on the mysqld - # filesys. /tmp space can be easily filled. - # Have split the feature fetching to happen by an - # analysis_id at a time. The changes below gave a x3 - # speed up on XML dumping as compared to fetching all - # features in one query at once. - - # make a lookup for the analysis display labels. 
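# Sketch of the per-analysis, batched fetching described in the comment above:
# query one analysis_id at a time and drain the statement handle in large
# slices instead of materialising every feature in one result set. Helper and
# callback names are ours, not the script's.
use strict;
use DBI;

sub for_each_row_in_batches {
    my ( $dbh, $sql, $bind_values, $callback, $batch_size ) = @_;
    $batch_size ||= 100_000;
    my $sth = $dbh->prepare($sql);
    $sth->execute(@$bind_values);
    while ( my $batch = $sth->fetchall_arrayref( undef, $batch_size ) ) {
        last unless @$batch;                      # no rows left
        $callback->($_) for @$batch;
    }
    return $sth->rows;
}

# for_each_row_in_batches( $dbh,
#     'SELECT analysis_id, hit_name, COUNT(*) FROM dna_align_feature
#       WHERE analysis_id = ? GROUP BY analysis_id, hit_name',
#     [ $analysis_id ], sub { my ($row) = @_; ... } );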
- my $type = $tables{$table}[1]; - my $logic_name_lookup = $dbh->selectall_hashref( - "select a.analysis_id, a.logic_name - from $DB_NAME.analysis as a - ", [qw(analysis_id)] - ) or die $DBI::Err; - - my $display_label_lookup = $dbh->selectall_hashref( - "select ad.analysis_id, ad.display_label - from $DB_NAME.analysis_description as ad - ", [qw(analysis_id)] - ) or die $DBI::Err; - - my $sth = $dbh->prepare( - "select t.analysis_id,t.hit_name, count(*) as hits - from $DB_NAME.$table as t where t.analysis_id = ? - group by t.analysis_id, t.hit_name" - ) or die $DBI::Err; - - foreach my $ana_id ( - @{$dbh->selectall_arrayref("select distinct distinct(analysis_id) from $DB_NAME.$table") - } - ) - { - - my $adesc = - ( $display_label_lookup->{ $ana_id->[0] }->{display_label} - || $logic_name_lookup->{ $ana_id->[0] }->{logic_name} ); - - $sth->execute( $ana_id->[0] ) or die $DBI::Err; - - my $rows = []; # cache for batches of rows - while ( - my $row = ( - shift(@$rows) || # get row from cache, - # or reload cache: - shift( - @{ - $rows = - $sth->fetchall_arrayref( undef, 100_000 ) - || [] - } - ) - ) - ) - { - - my $hid = $row->[1]; - my $count = $row->[2]; - - my $xml = qq{ - - - $dbspecies - $source - $db - $count - $adesc - - }; - p($xml); - - } - - $ecount += $sth->rows; - - } - - } - print_time($start_time); - footer($ecount); - } -} - - -sub dumpMarker { - warn "in dump MArker"; - my ( $dbspecies, $conf ) = @_; - - # my $xml_data; - # $xml_data->{species} = $dbspecies; - - my $db = 'core'; - my $dbname = $conf->{$db}->{$release} or next; - my $file = "$dir/Marker_$dbname.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n"; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die("can't open compressed stream to $file: $!"); - } - else { - open( file, ">$file" ) || die "can't open $file: $!"; - } - header( $dbname, $dbspecies, $db ); - my $dsn = "dbi:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - my $ecount; - my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error"; - -# my $sth = $dbh->prepare(q{ -# SELECT @rownum := @rownum+1 AS rownum, ms2.name as marker, ms1.name -# from (select @rownum := 0) r, (marker_synonym as ms1, marker as m) left join -# marker_synonym as ms2 on ms2.marker_synonym_id = m.display_marker_synonym_id -# where ms1.marker_id = m.marker_id -# order by m.marker_id} -# ); -# $sth->execute( ); - my $sth = $dbh->prepare( - q{ SELECT ms2.name as marker, ms1.name - from (marker_synonym as ms1, marker as m) left join - marker_synonym as ms2 on ms2.marker_synonym_id = m.display_marker_synonym_id - where ms1.marker_id = m.marker_id - order by m.marker_id} - ); - - my $data = $dbh->selectall_hashref( $sth, [ 'marker', 'name' ] ); - foreach my $marker ( keys %$data ) { - p markerXML( $marker, $data, $dbspecies ); - } - - footer( scalar keys(%$data) ); - -} - -sub markerXML { - my ( $marker, $xml_data, $species ) = @_; - - my $xml; - - my @keys = keys %{ $xml_data->{$marker} }; - my $desc = - 'A marker with ' - . scalar @keys - . ' synonym' - . ( scalar @keys > 1 ? 's ' : ' ' ) . '(' - . join( " ", @keys ) . 
')'; - - - $desc =~ s//>/g; - - $xml = qq{ - - }; - - foreach (@keys) { - s//>/g; - - $xml .= qq{ - $_} - - } - $xml .= qq{ - $species - Marker - -}; - - return $xml; - -} - - - -sub dumpOligoProbe { - warn "in dump Oligo"; - my ( $dbspecies, $conf ) = @_; - - my $db = 'core'; - my $dbname = $conf->{$db}->{$release} or next; - my $file = "$dir/OligoProbe_$dbname.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n"; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die("Can't open compressed stream to $file: $!"); - } - else { - open( FILE, ">$file" ) || die "Can't open $file: $!"; - } - header( $dbname, $dbspecies, $db ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - my $ecount; - my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error"; - - my $sth = $dbh->prepare( - "select p.probeset, count(*) as hits, a.type - from oligo_probe as p, oligo_feature as f, oligo_array as a - where p.oligo_probe_id = f.oligo_probe_id and p.oligo_array_id = a.oligo_array_id - group by p.probeset" - ); - $sth->execute(); - - - while ( my $rowcache = $sth->fetchall_arrayref( undef, '10_000' ) ) { - my $xml; - while ( my $data = shift( @{$rowcache} ) ) { - $xml .= OligoProbeXML( $data, $dbspecies ); - - } - p $xml; - } - - footer( $sth->rows ); - -} - -sub OligoProbeXML { - my ( $xml_data, $dbspecies ) = @_; - -# my $desc =qq{$xml_data->[0], $xml_data->[2] oligo probeset $xml_data->[0] hits the genome in $xml_data->[1] locations.}; - - return qq{ - - - $xml_data->[2] - $dbspecies - OligoProbe - $xml_data->[1] - -}; - -} - - -sub dumpQTL { - warn "in dumpQTL"; - - my ( $dbspecies, $conf ) = @_; - - # print Dumper($conf); - my $xml_data; - $xml_data->{species} = $dbspecies; - my $db = 'core'; - my $dbname = $conf->{$db}->{$release} or next; - my $file = "$dir/QTL_$dbname.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $dbname to $file ... 
", format_datetime($start_time), "\n"; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die("Can't open compressed stream to $file: $!"); - } - else { - open( FILE, ">$file" ) || die "Can't open $file: $!"; - } - - header( $dbname, $dbspecies, $db ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - my $ecount; - my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error"; - - my $sth = $dbh->prepare( - "select c.name as chr, qf.seq_region_start, qf.seq_region_end, - a.logic_name as analysis, q.qtl_id, - q.trait, qs.source_database, qs.source_primary_id, - fms1.source as fm1_source, fms1.name as fm1_name, - fms2.source as fm2_source, fms2.name as fm2_name, - pms.source as pm_source, pms.name as pm_name - from ((((((seq_region as c, qtl_feature as qf, qtl_synonym as qs, - analysis as a, qtl as q) left join marker as fm1 on - fm1.marker_id = q.flank_marker_id_1) left join marker_synonym as fms1 on - fm1.display_marker_synonym_id = fms1.marker_synonym_id) left join marker as fm2 on - fm2.marker_id = q.flank_marker_id_2) left join marker_synonym as fms2 on - fm2.display_marker_synonym_id = fms2.marker_synonym_id) left join marker as pm on - pm.marker_id = q.peak_marker_id) left join marker_synonym as pms on - pm.display_marker_synonym_id = pms.marker_synonym_id - where c.seq_region_id = qf.seq_region_id and qs.qtl_id = q.qtl_id and - qf.analysis_id = a.analysis_id and qf.qtl_id = q.qtl_id - " - ); - $sth->execute(); - my $desc = ''; - my $old_qtl = 0; - my $old_ID = ''; - my $old_pos = ''; - my $counter = make_counter(0); - while ( my $T = $sth->fetchrow_hashref() ) { - - if ( $T->{qtl_id} eq $old_qtl ) { - - # $IDS .= " $T->{source_primary_id}"; - $desc .= " $T->{source_database}:$T->{source_primary_id}"; - - $xml_data->{cross_ref}->{ $T->{source_database} } = - $T->{source_primary_id}; - - # $xml_data->{source_primary_id} = $T->{source_primary_id}; - # print Dumper($T); - } - else { - $xml_data->{pm_name} = $T->{pm_name}; - $old_pos = - "$T->{chr}:" - . ( $T->{seq_region_start} - 1e4 ) . '-' - . ( $T->{seq_region_end} + 1e4 ); - $desc = "QTL exhibiting '$T->{trait}' has "; - my $f2 = $T->{pm_name} ? 1 : 0; - if ( $T->{fm1_name} || $T->{fm2_name} ) { - my $f1 = ( $T->{fm1_name} ) && ( $T->{fm2_name} ) ? 1 : 0; - - $desc .= - 'flanking marker' - . ( $f1 ? 's ' : ' ' ) - . $T->{fm1_name} - . ( $f1 ? ' and ' : '' ) - . $T->{fm2_name} - . ( $f2 ? 
'; ' : '' ); - $xml_data->{f1} = $T->{fm1_name}; - $xml_data->{f2} = $T->{fm2_name}; - - } - if ($f2) { - $desc .= "peak marker $T->{pm_name};"; - $xml_data->{pm} = $T->{pm_name}; - } - $desc .= - " and names: $T->{source_database}:$T->{source_primary_id}"; - - #my $sd = $T->{source_database}; - $xml_data->{description} = $desc; - $xml_data->{cross_ref}->{ $T->{source_database} } = - $T->{source_primary_id}; - $old_qtl = $T->{qtl_id}; - $xml_data->{pos} = $old_pos; - if ( $xml_data->{pm_name} ) { - p( QTLXML( $xml_data, $counter ) ); - } - } - } - - $xml_data->{description} = $desc; - - if ( $xml_data->{pm_name} ) { - p( QTLXML( $xml_data, $counter ) ); - } - $dbh->disconnect(); - footer( $counter->() ); -} - -sub QTLXML { - my ( $xml_data, $counter ) = @_; - - my $xml = qq{ - - $xml_data->{pm_name} - $xml_data->{description} - - $xml_data->{species} - $xml_data->{f1} - $xml_data->{f2} - $xml_data->{pm} - $xml_data->{pos} - QTL - - }; - - foreach ( keys( %{ $xml_data->{cross_ref} } ) ) { - $xml .= - qq{\n }; - } - - $xml .= qq{\n -}; - $counter->(); - return $xml; - -} - - -sub dumpSequence { -warn "in dump Sequence"; - my ( $dbspecies, $conf ) = @_; - - # my $sanger = sanger_project_names( $conf ); - my $sanger = 'SANGER STUFF'; - my %config = ( - "Homo sapiens" => [ - [ - 'Clone', - 'tilepath, cloneset_1mb, cloneset_30k, cloneset_32k', -'name,well_name,clone_name,synonym,embl_acc,sanger_project,alt_well_name,bacend_well_name' - ], - [ 'NT Contig', 'ntctgs', 'name' ], - [ 'Encode region', 'encode', 'name,synonym,description' ], - ], - "Mus musculus" => [ - [ - 'BAC', - 'cloneset_0_5mb,cloneset_1mb,bac_map,tilingpath_cloneset', - 'embl_acc,name,clone_name,well_name,synonym,alt_embl_acc' - ], - [ 'Fosmid', 'fosmid_map', 'name,clone_name' ], - [ 'Supercontig', 'superctgs', 'name' ], - ], - "Anopheles gambiae" => [ - [ 'BAC', 'bacs', 'name,synonym,clone_name' ], - [ 'BAC band', 'bacs_bands', 'name,synonym,clone_name' ], - ], - "Gallus gallus" => [ - [ 'BAC', 'bac_map', 'name,synonym,clone_name' ], - [ - 'BAC ends', 'bacends', - 'name,synonym,clone_name', 'otherfeatures' - ] - ] - ); - - my $dbname = $conf->{'core'}->{$release} or next; - - my $file = "$dir/Sequence_$dbname.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n"; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die("Can't open compressed stream to $file: $!"); - } - else { - open( FILE, ">$file" ) || die "Can't open $file: $!"; - } - header( $dbname, $dbspecies, 'core' ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - my $ecount; - my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error"; - - my $COREDB = $dbname; - my $ESTDB = $conf->{otherfeatures}->{$release}; - - my @types = @{ $config{$dbspecies} || [] }; - my $ecounter; - foreach my $arrayref (@types) { - - my ( $TYPE, $mapsets, $annotationtypes, $DB ) = @$arrayref; - - my $DB = $DB eq 'otherfeatures' ? 
$ESTDB : $COREDB; - my @temp = split( ',', $mapsets ); - my @mapsets; - foreach my $X (@temp) { - my $ID = $dbh->selectrow_array( - "select misc_set_id from $DB.misc_set where code = ?", - {}, $X ); - push @mapsets, $ID if ($ID); - } - - next unless @mapsets; - @temp = split( ',', $annotationtypes ); - my @mapannotationtypes; - foreach my $X (@temp) { - my $ID = $dbh->selectrow_array( - "select attrib_type_id from $DB.attrib_type where code = ?", - {}, $X ); - push @mapannotationtypes, $ID if ($ID); - } - next unless @mapannotationtypes; - my $Z = " ma.value"; - my $MAPSETS = join ',', @mapsets; - my $sth = $dbh->prepare( - "select mf.misc_feature_id, sr.name, - ma.value, mf.seq_region_end-mf.seq_region_start+1 as len, - at.code - from $DB.misc_feature_misc_set as ms, - $DB.misc_feature as mf, - seq_region as sr, - $DB.misc_attrib as ma, - $DB.attrib_type as at - where mf.seq_region_id = sr.seq_region_id and mf.misc_feature_id = ms.misc_feature_id and ms.misc_set_id in ($MAPSETS) and - mf.misc_feature_id = ma.misc_feature_id and ma.attrib_type_id = at.attrib_type_id - order by mf.misc_feature_id, at.code" - ); - $sth->execute(); - my ( $oldtype, $old_ID, $oldchr, $emblaccs, $oldlen, $synonyms, $NAME ); - - while ( my ( $ID, $chr, $val, $len, $type ) = $sth->fetchrow_array() ) { - - if ( $ID == $old_ID ) { - $NAME = $val - if $type eq 'well_name' - || $type eq 'clone_name' - || $type eq 'name' - || $type eq 'non_ref'; - $NAME = $val if !$NAME && $type eq 'embl_acc'; - $NAME = $val if !$NAME && $type eq 'synonym'; - $NAME = $val if !$NAME && $type eq 'sanger_project'; - push @{$emblaccs}, $val if $val; - } - else { - p seqLineXML( - $dbspecies, $TYPE, $NAME, $oldchr, - $emblaccs, $oldlen, $sanger - ) if $old_ID; - $NAME = undef; - $emblaccs = undef; - $NAME = $val - if $type eq 'well_name' - || $type eq 'clone_name' - || $type eq 'name' - || $type eq 'non_ref'; - $NAME = $val if !$NAME && $type eq 'embl_acc'; - $NAME = $val if !$NAME && $type eq 'synonym'; - $NAME = $val if !$NAME && $type eq 'sanger_project'; - $emblaccs->[0] = $val; - ( $old_ID, $oldchr, $oldlen ) = ( $ID, $chr, $len ); - $ecounter += 1; - } - } - p seqLineXML( $dbspecies, $TYPE, $NAME, $oldchr, $emblaccs, $oldlen, - $sanger ) - if $old_ID; - } - - footer($ecounter); - -# my $sth = $conf->{'dbh'}->prepare( -# "select c.name, c.length, cs.name -# from seq_region as c, coord_system as cs -# where c.coord_system_id = cs.coord_system_id" ); -# $sth->execute(); -# while( my($name,$length,$type) = $sth->fetchrow_array() ) { -# my $extra_IDS = ''; mysql $extra_desc = ''; -# if( %{$sanger->{$name}||{}} ) { -# $extra_IDS = join ' ', '',sort keys %{$sanger->{$name}}; -# $extra_desc = " and corresponds to the following Sanger projects: ".join( ', ',sort keys %{$sanger->{$name}}); -# } -# print_time O join "\t", -# (INC_SPECIES?"$conf->{'species'}\t":"").ucfirst($type), $name, -# ($type eq 'chromosome' && length( $name ) < 5) ? -# "/$conf->{'species'}/mapview?chr=$name" : -# ($length > 0.5e6 ? "/$conf->{'species'}/cytoview?region=$name" : -# "/$conf->{'species'}/contigview?region=$name" ), -# "$name$extra_IDS", "$name isnull a @{[ucfirst($type)]} (of length $length)$extra_desc\n"; -# } -} - -sub seqLineXML { - my ( $species, $type, $name, $chr, $val, $len, $sanger ) = @_; - - pop @$val; -# my $description = "$type $name is mapped to Chromosome $chr" . - -# ( -# @$val > 0 -# ? ' and has ' -# . @$val -# . " EMBL accession" -# . ( -# @$val > 1 -# ? 's' -# : '' -# ) -# . "/synonym" -# . ( -# @$val > 1 -# ? 's ' -# : ' ' -# ) -# . 
"@$val" -# : '' -# ) - -# . " and length $len bps\n"; - - my $xml = qq{ - - } - - . ( - join "", - ( - map { - qq{ - } - } @$val - ) - ) - - . qq{ - - $species - $type - $chr - $len - Genomic - - }; - - return $xml; - -} - - - - - -sub dumpSNP { - my ( $dbspecies, $conf ) = @_; - - # warn Dumper $conf; - - warn "\n", '*' x 20, "\n"; - - my $COREDB = my $dbname = $conf->{'core'}->{$release}; - - - my $dbname = $conf->{variation}->{$release} or next; - my $file = "$dir/SNP_$dbname.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $dbname to $file ... ", format_datetime($start_time), - "\n"; - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die "Can't open compressed stream to $file: $!"; - } - else { - open( FILE, ">$file" ) || die "Can't open $file: $!"; - } - - header( $dbname, $dbspecies, $dbname ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - my $ecount; - my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) - or die "DBI::error"; - my $source_hash = $dbh->selectall_hashref(qq{SELECT source_id, name FROM source} ,[qw(source_id)]); - -# my $tid_to_gene = $dbh->selectall_hashref(qq{select t.transcript_id, gsi.stable_id from $COREDB.gene as g, $COREDB.gene_stable_id as gsi, $COREDB.transcript as t where gsi.gene_id = g.gene_id and t.gene_id = g.gene_id limit 10;},[qw(transcript_id)]); - - -# my $sth = $dbh->prepare("select vf.variation_name, vf.source_id, group_concat(vs.source_id, ' ',vs.name), vf.variation_feature_id,vf.variation_id from variation_feature vf , transcript_variation tv -# left join variation_synonym vs on vf.variation_id = vs.variation_id where tv.transcript_variation_id = vf.variation_feature_id group by vf.variation_id"); - - my $sth = $dbh->prepare("select vf.variation_name, vf.source_id, group_concat(vs.source_id, ' ',vs.name), vf.consequence_type from variation_feature vf left join variation_synonym vs on vf.variation_id = vs.variation_id group by vf.variation_id"); - - - # my $vfi2gene_sth = $dbh->prepare(qq{select distinct(gsi.stable_id) from $COREDB.gene as g, $COREDB.gene_stable_id as gsi, $COREDB.transcript as t where gsi.gene_id = g.gene_id and t.gene_id = g.gene_id and transcript_id in -# (select tv.transcript_id from transcript_variation tv , variation_feature vf where vf.variation_feature_id =tv.variation_feature_id and vf.variation_feature_id = ?)}); - - $sth->execute() or die "Error:", $DBI::errstr; - - while (my $rowcache = $sth->fetchall_arrayref(undef, 10_000)) { - - - my $xml; - while (my $row = shift(@{$rowcache})) { -# $vfi2gene_sth->execute($row->[3]); -# my $gsi = $vfi2gene_sth->fetchall_arrayref; - my $name = $row->[0]; - my @synonyms = split /,/, @$row->[2]; - my $snp_source = $source_hash->{$row->[1]}->{name}; - -# my $description = -# "A $snp_source SNP with " -# . scalar @synonyms -# . ' synonym' -# . ( @synonyms > 1 | @synonyms < 1 ? 's ' : ' ' ) -# . ( @synonyms > 0 ? "( " . (join "", map{ map{ $source_hash->{$_->[0]}->{name} , ':', $_->[1] , ' ' } [split] } @synonyms ) . 
")" : '' ); - - $xml .= qq{ - - $dbspecies - SNP - $row->[3]}; - - foreach my $syn(@synonyms) { - my @syn_bits = split / /, $syn; - $syn_bits[1] =~ s/:/ /; - - my $source = $source_hash->{$syn_bits[0]}->{name}; - $xml .= qq{ - $syn_bits[1] [source; $source]}; - } -$xml .= qq{ - - -}; - - } - - p($xml); - } - - footer($sth->rows); - print_time($start_time); - -} - - - - - - - -sub dumpUnmappedFeatures { - my ( $dbspecies, $conf ) = @_; - - my $db = 'core'; - my $COREDB = $conf->{$db}->{$release} or next; - my $file = "$dir/UnmappedFeature_$COREDB.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $COREDB to $file ... ", format_datetime($start_time), "\n"; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die("Can't open compressed stream to $file: $!"); - } - else { - open( FILE, ">$file" ) || die "Can't open $file: $!"; - } - header( $COREDB, $dbspecies, $db ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - my $ecount; - my $dbh = DBI->connect( "$dsn:$COREDB", $user, $pass ) or die "DBI::error"; - - my %unmapped_queries = ( - 'None' => qq( - select a.logic_name, e.db_display_name, - uo.identifier, ur.summary_description, - 'Not mapped' - from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo, - $COREDB.unmapped_reason as ur - where a.analysis_id = uo.analysis_id and - uo.external_db_id = e.external_db_id and - uo.unmapped_reason_id = ur.unmapped_reason_id and - uo.ensembl_id = 0 - ), - 'Transcript' => qq( - select a.logic_name, e.db_display_name, - uo.identifier, ur.summary_description, - concat( 'Transcript: ', tsi.stable_id, '; Gene: ',gsi.stable_id ) - from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo, - $COREDB.unmapped_reason as ur, $COREDB.transcript_stable_id as tsi, - $COREDB.transcript as t, $COREDB.gene_stable_id as gsi - where a.analysis_id = uo.analysis_id and - uo.external_db_id = e.external_db_id and - uo.unmapped_reason_id = ur.unmapped_reason_id and - uo.ensembl_id = t.transcript_id and - uo.ensembl_object_type = 'Transcript' and - t.transcript_id = tsi.transcript_id and - t.gene_id = gsi.gene_id - ), - 'Translation' => qq( - select a.logic_name, e.db_display_name, uo.identifier, ur.summary_description, - concat( 'Translation: ',trsi.stable_id,'; Transcript: ', tsi.stable_id, '; Gene: ',gsi.stable_id ) - from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo, - $COREDB.unmapped_reason as ur, $COREDB.transcript_stable_id as tsi, - $COREDB.translation as tr, $COREDB.translation_stable_id as trsi, - $COREDB.transcript as t, $COREDB.gene_stable_id as gsi - where a.analysis_id = uo.analysis_id and - uo.external_db_id = e.external_db_id and - uo.unmapped_reason_id = ur.unmapped_reason_id and - uo.ensembl_id = tr.translation_id and - tr.transcript_id = t.transcript_id and - trsi.translation_id = tr.translation_id and - uo.ensembl_object_type = 'Translation' and - t.transcript_id = tsi.transcript_id and - t.gene_id = gsi.gene_id - ) - ); - my $entry_count = 0; - foreach my $type ( keys %unmapped_queries ) { - my $SQL = $unmapped_queries{$type}; - my $sth = $dbh->prepare($SQL); - $sth->execute; - while ( my $T = $sth->fetchrow_arrayref() ) { - - # print join "\t", ("$species\t") . 
qq(Unmapped feature), - # "$T->[1] $T->[2]", - # "$dbspecies/featureview?type=Gene;id=$T->[2]", "$T->[2] $T->[4]", - # "$T->[3]; $T->[4]\n"; - p unmappedFeatureXML( $T, $dbspecies ) - - } - $entry_count += $sth->rows - - } - - footer($entry_count); - -} - -sub unmappedFeatureXML { - my ( $xml_data, $dbspecies ) = @_; - - return qq{ - - $xml_data->[1] $xml_data->[2] - $xml_data->[3]; $xml_data->[4] - - $dbspecies - UnmappedFeature - - }; - -} - -sub dumpUnmappedGenes { - my ( $dbspecies, $conf ) = @_; - - my $db = 'core'; - my $dbname = $conf->{$db}->{$release} or next; - - my $file = "$dir/UnmappedGene_$dbname.xml"; - $file .= ".gz" unless $nogzip; - my $start_time = time; - warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n"; - - unless ($nogzip) { - $fh = new IO::Zlib; - $fh->open( "$file", "wb9" ) - || die("Can't open compressed stream to $file: $!"); - } - else { - open( FILE, ">$file" ) || die "Can't open $file: $!"; - } - header( $dbname, $dbspecies, $db ); - my $dsn = "DBI:mysql:host=$host"; - $dsn .= ";port=$port" if ($port); - my $ecount; - my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error"; - - my $COREDB = $conf->{$db}->{$release}; - - my %current_stable_ids = (); - foreach my $type (qw(gene transcript translation)) { - $current_stable_ids{$type} = { - map { @$_ } @{ - $dbh->selectall_arrayref( - "select stable_id,1 from $COREDB." . $type . "_stable_id" - ) - } - }; - } - my $species = $dbspecies; - my $sth = $dbh->prepare( - qq( - select sie.type, sie.old_stable_id, if(isnull(sie.new_stable_id),'NULL',sie.new_stable_id), ms.old_release*1.0 as X, ms.new_release*1.0 as Y - from $COREDB.mapping_session as ms, $COREDB.stable_id_event as sie - where ms.mapping_session_id = sie.mapping_session_id and ( old_stable_id != new_stable_id or isnull(new_stable_id) ) - order by Y desc, X desc - ) - ); - - $sth->execute(); - my %mapping = (); - while ( my ( $type, $osi, $nsi ) = $sth->fetchrow_array() ) { - next - if $current_stable_ids{$type}{ $osi - }; ## Don't need to cope with current IDS already searchable... - $mapping{$type}{$osi}{$nsi} = 1; - if ( $mapping{$type}{$nsi} ) { - foreach ( keys %{ $mapping{$type}{$nsi} } ) { - $mapping{$type}{$osi}{$_} = 1; - } - } - } - - foreach my $type ( keys %mapping ) { - $ecount += scalar keys %{ $mapping{$type} }, ' '; - - foreach my $osi ( keys %{ $mapping{$type} } ) { - - my @current_sis = (); - my @deprecated_sis = (); - foreach ( keys %{ $mapping{$type}{$osi} } ) { - if ( $current_stable_ids{$_} ) { - push @current_sis, $_; - } - elsif ( $_ ne 'NULL' ) { - push @deprecated_sis, $_; - } - } - if (@current_sis) { - - my $description = -qq{$type $osi is no longer in the Ensembl database but it has been mapped to the following current identifiers: @current_sis} - . ( - @deprecated_sis - ? 
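# Sketch of the transitive old-ID to new-ID chaining built from stable_id_event
# in dumpUnmappedGenes() above. Because events are walked from newest release
# to oldest, an old ID inherits everything its immediate successor has already
# been mapped to. (The real code keys this by object type first.)
use strict;

my @events = (            # (old_stable_id, new_stable_id), newest release first
    [ 'ENSG_B', 'ENSG_C' ],
    [ 'ENSG_A', 'ENSG_B' ],
);

my %mapping;
for my $event (@events) {
    my ( $old_id, $new_id ) = @$event;
    $mapping{$old_id}{$new_id} = 1;
    if ( $mapping{$new_id} ) {                    # successor already resolved
        $mapping{$old_id}{$_} = 1 for keys %{ $mapping{$new_id} };
    }
}
# %mapping is now:
#   ENSG_B => { ENSG_C => 1 }
#   ENSG_A => { ENSG_B => 1, ENSG_C => 1 }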
"; and the following deprecated identifiers: @deprecated_sis" - : '' - ); - p unmappedGeneXML( $osi, $dbspecies, $description, lc($type) ); - - } - elsif (@deprecated_sis) { - - my $description = -qq($type $osi is no longer in the Ensembl database but it has been mapped to the following identifiers: @deprecated_sis); - p unmappedGeneXML( $osi, $dbspecies, $description, lc($type) ); - } - else { - - my $description = -qq($type $osi is no longer in the Ensembl database and it has not been mapped to any newer identifiers); - p unmappedGeneXML( $osi, $dbspecies, $description, lc($type) ); - } - } - } - - footer($ecount); -} - -sub unmappedGeneXML { - my ( $id, $dbspecies, $description, $type ) = @_; - - return qq{ - - $description - - $dbspecies - Unmapped$type - - }; - -} - -sub make_counter { - my $start = shift; - return sub { $start++ } -} - -sub FamilyDumped { - my $is_dumped; - return sub { $is_dumped } -} diff --git a/utils/search_dump.pl b/utils/search_dump.pl new file mode 100755 index 00000000..95fc7c63 --- /dev/null +++ b/utils/search_dump.pl @@ -0,0 +1,975 @@ +#!/usr/local/bin/perl + +# +# Dump search XML files for indexing by the EBEye search engine. +# + +package ebi_search_dump; + +use strict; +use DBI; +use Carp; +use File::Basename qw( dirname ); +use File::Find; +use FindBin qw($Bin); +use Getopt::Long; +use IO::Zlib; +use Data::Dumper; +use HTML::Entities; + +BEGIN{ + unshift @INC, "$Bin/../../conf"; + unshift @INC, "$Bin/../.."; + eval{ require utils::Tool }; + if ($@){ warn "Can't use utils::Tool (required for ensemblgenomes)\n"; } +} + +my ( + $host, $user, $pass, $port, $species, $ind, + $release, $max_entries, $nogzip, $parallel, $dir, $inifile, + $nogenetrees, $novariation, $noxrefs, $skip_existing, $format, $noortholog +); + +my %rHash = map { $_ } @ARGV; +if ( $inifile = $rHash{'-inifile'} ) { + my $icontent = `cat $inifile`; + warn $icontent; + eval $icontent; +} + +GetOptions( + "host=s", \$host, "port=i", \$port, + "user=s", \$user, "pass=s", \$pass, + "species=s", \$species, "release=s", \$release, + "index=s", \$ind, "nogzip!", \$nogzip, + "max_entries=i", \$max_entries, "parallel", \$parallel, + "dir=s", \$dir, "help", \&usage, + "inifile=s", \$inifile, + "nogenetrees", \$nogenetrees, + "novariation", \$novariation, + "noxrefs", \$noxrefs, + "skipexisting", \$skip_existing, + "format=s", \$format, + "noortholog", \$noortholog, + ); + +$format ||= 'ensembl'; +$ind ||= 'ALL'; +$dir ||= "."; +$release ||= 'LATEST'; + + +if ($format eq 'solr') { + $novariation = 1; + $nogenetrees = 1; + $nogzip = 1; + $ind = 'Gene'; + $noortholog = 1; +} + + +usage() and exit unless ( $host && $port && $user); + +## HACK 1 - if the INDEX is set to all grab all dumper methods... +my @indexes = split ',', $ind; +@indexes = map { /dump(\w+)/ ? $1 : () } keys %ebi_search_dump:: if $ind eq 'ALL'; +#warn Dumper \@indexes; + +my $dbHash = get_databases(); +print "*** No databases found ***\n" unless %{$dbHash}; +#warn Dumper $dbHash; + +my @datasets = split ',', $species; +# restrict species to only those defined in the current eg site +@datasets = @datasets ? @{ utils::Tool::check_species(\@datasets) } : @{ utils::Tool::all_species() }; + +print "\nDatasets to process: \n " . join("\n ", @datasets) . 
"\n"; + +my $entry_count; +my $global_start_time = time; +my $total = 0; +my $fh; + +foreach my $dataset ( @datasets ) { + my $conf = $dbHash->{lc($dataset)}; + + foreach my $index (@indexes) { + my $function = "dump$index"; + no strict "refs"; + + $dataset =~ s/_/ /g; + + if ( $index eq 'Gene' ) { + &$function( ucfirst($dataset), $conf ); + } + } +} + +print_time($global_start_time); +warn " Dumped $total entries ...\n"; + +#------------------------------------------------------------------------------ + +sub text_month { + my $m = shift; + my @months = qw[JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC]; + return $months[$m]; +} + +sub print_time { + my $start = shift; + my $t = time - $start; + my $s = $t % 60; + $t = ( $t - $s ) / 60; + my $m = $t % 60; + $t = ( $t - $m ) / 60; + my $h = $t % 60; + print "Time taken: " . $h . "h " . $m . "m " . $s . "s\n"; +} + +sub usage { + print < + + -host REQUIRED. Database host to connect to. + -port REQUIRED. Database port to connect to. + -user Database username. Defaults to ensro. + -species Species name. Defaults to ALL. + -index Index to create. Defaults to ALL. + -release Release of the database to dump. Defaults to 'latest'. + -pass Password for user. + -dir Directory to write output to. Defaults to /lustre/scratch1/ensembl/gp1/xml. + -nogzip Don't compress output as it's written. + -help This message. + -inifile First take the arguments from this file. Then overwrite with what is provided in the command line + +EOF +} + +sub get_databases { + + my ( $dbHash, $dbcHash ); + + my $dsn = "DBI:mysql:host=$host"; + $dsn .= ";port=$port" if ($port); + + my $dbh = DBI->connect( $dsn, $user, $pass ); + my @dbnames = map { $_->[0] } @{ $dbh->selectall_arrayref("show databases") }; + $dbh->disconnect(); + + my $latest_release = 0; + my ( $db_species, $db_release, $db_type ); + my $compara_hash; + for my $dbname (@dbnames) { + + if ( ($db_type, $db_release) = $dbname =~ /^ensembl_compara_(\w+)_(\d+)_\w+/ ) { + + $compara_hash->{$db_type}->{$db_release} = $dbname; + + } elsif ( ( $db_species, $db_type, $db_release ) = $dbname =~ /^([a-z]+_[a-z0-9]+)(?:_collection)?_([a-z]+)_(\d+)_\w+$/ ) { + + $db_species =~ s/_collection$//; + $latest_release = $db_release if ( $db_release > $latest_release ); + $dbHash->{$db_species}->{$db_type}->{$db_release} = $dbname; + + } + } + + map { $dbHash->{$_}->{'compara'} = $compara_hash } keys %$dbHash; + $release = $latest_release if ( $release eq 'LATEST' ); + + return $dbHash; +} + +sub footer { + my ($ecount) = @_; + + unless ($format eq 'solr') { + p(""); + p("$ecount"); + p(""); +} + print "Dumped $ecount entries\n"; + if ($nogzip) { + close(FILE) or die $!; + } + else { + $fh->close(); + } + $total += $ecount; +} + +sub header { + my ( $dbname, $dataset, $dbtype ) = @_; + + p(""); + p("]>"); + p(""); + p("$dbname"); + p("Ensembl Genomes $dataset $dbtype database"); + p("$release"); + p(""); + p(""); +} + +sub p { + my ($str) = @_; + # TODO - encoding + $str .= "\n"; + if ($nogzip) { + print FILE $str or die "Can't write to file ", $!; + } + else { + print $fh $str or die "Can't write string: $str"; + } +} + +sub format_date { + my $t = shift; + my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ]; + $y += 1900; + $d = "0" . $d if ( $d < 10 ); + my $mm = text_month($m); + return "$d-$mm-$y"; +} + +sub format_datetime { + my $t = shift; + my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ]; + $y += 1900; + $d = "0" . 
$d if ( $d < 10 ); + my $ms = text_month($m); + return sprintf "$d-$ms-$y %02d:%02d:%02d", $hh, $mm, $ss; +} + +sub dumpGene { + + my ( $dataset, $conf ) = @_; + + #foreach my $DB ( 'core') { #, 'vega' ) { + my $DB = 'core'; + my $SNPDB = $novariation ? undef : eval { $conf->{variation}->{$release} }; + my $FUNCGENDB = eval { $conf->{funcgen}->{$release} }; + my $DBNAME = $conf->{$DB}->{$release} or warn "$dataset $DB $release: no database not found"; + next unless $DBNAME; + + print "\nSTART dumpGene\n"; + print "Database: $DBNAME\n"; + + my $dsn = "DBI:mysql:host=$host"; + $dsn .= ";port=$port" if ($port); + + my $dbh; + my $attempt = 0; + my $max_attempts = 100; + while (!$dbh and ++$attempt <= $max_attempts) { + eval { $dbh = DBI->connect( "$dsn:$DBNAME", $user, $pass ) }; + warn "DBI connect error: $@" if $@; + if (!$dbh) { + warn "Failed DBI connect attempt $attempt of $max_attempts\n" if !$dbh; + sleep 5; + } + } + + $dbh->do("SET sql_mode = 'NO_BACKSLASH_ESCAPES'"); # metazoa have backslahes in thier gene names and synonyms :.( + + # determine genomic unit + my $division = $dbh->selectrow_array("SELECT meta_value FROM meta WHERE meta_key = 'species.division'"); + (my $genomic_unit = lc($division)) =~ s/^ensembl//; # eg EnsemblProtists -> protists + print "Genomic unit: " . $genomic_unit . "\n"; + + my $genetree_lookup = $nogenetrees ? {} : get_genetree_lookup($genomic_unit, $conf); + + my $haplotypes = $dbh->selectall_hashref( + "SELECT gene_id FROM gene g, assembly_exception ae WHERE g.seq_region_id=ae.seq_region_id AND ae.exc_type='HAP'", + [qw(gene_id)] + ); + + my %transcript_probes; + my %transcript_probesets; + if ($FUNCGENDB) { + + print "Fetching probes...\n"; + + my $rows = $dbh->selectall_arrayref( + "SELECT x.dbprimary_acc, p.name + FROM $FUNCGENDB.probe p, $FUNCGENDB.array_chip ac, $FUNCGENDB.array a, $FUNCGENDB.status s, + $FUNCGENDB.status_name sn, $FUNCGENDB.object_xref ox, $FUNCGENDB.xref x + WHERE sn.name='MART_DISPLAYABLE' + AND sn.status_name_id=s.status_name_id + AND s.table_name='array' + AND s.table_id=a.array_id + AND p.array_chip_id = ac.array_chip_id + AND ac.array_id = a.array_id + AND p.probe_id=ox.ensembl_id + AND ox.ensembl_object_type='Probe' + AND ox.xref_id=x.xref_id + GROUP BY ox.object_xref_id" + ); + + foreach (@$rows) { + $transcript_probes{$_->[0]} ||= []; + push @{$transcript_probes{$_->[0]}}, $_->[1]; + } + + print "Fetching probe sets...\n"; + + $rows = $dbh->selectall_arrayref( + "SELECT x.dbprimary_acc, ps.name + FROM $FUNCGENDB.probe_set ps, $FUNCGENDB.probe p, $FUNCGENDB.array_chip ac, $FUNCGENDB.array a, + $FUNCGENDB.status s, $FUNCGENDB.status_name sn, $FUNCGENDB.object_xref ox, $FUNCGENDB.xref x + WHERE sn.name='MART_DISPLAYABLE' + AND sn.status_name_id=s.status_name_id + AND s.table_name='array' + AND s.table_id=a.array_id + AND ps.probe_set_id = p.probe_set_id + AND p.array_chip_id = ac.array_chip_id + AND ac.array_id = a.array_id + AND ps.probe_set_id=ox.ensembl_id + AND ox.ensembl_object_type='ProbeSet' + AND ox.xref_id=x.xref_id + GROUP BY ox.object_xref_id;" + ); + + foreach (@$rows) { + $transcript_probesets{$_->[0]} ||= []; + push @{$transcript_probesets{$_->[0]}}, $_->[1]; + } + } + + my %xrefs = (); + my %xrefs_desc = (); + my %disp_xrefs = (); + unless ($noxrefs) { + foreach my $type (qw(Gene Transcript Translation)) { + + print "Fetching $type xrefs...\n"; + + my $xrefs = []; + if ($type ne 'Translation') { + + my $table = lc($type); + + $xrefs = $dbh->selectall_arrayref( + "SELECT t.${table}_id, x.display_label, 
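# Aside: a condensed, standalone sketch of the bounded-retry DBI connect used
# at the top of dumpGene() above (the real loop tries up to 100 times with a
# 5 second pause between attempts). The DSN and credentials are placeholders.
use strict;
use warnings;
use DBI;

sub connect_with_retry {
    my ( $dsn, $user, $pass, $max_attempts ) = @_;
    $max_attempts ||= 5;
    for my $attempt ( 1 .. $max_attempts ) {
        my $dbh = eval { DBI->connect( $dsn, $user, $pass, { RaiseError => 1 } ) };
        return $dbh if $dbh;
        my $err = $@ || $DBI::errstr || 'unknown error';
        warn "connect attempt $attempt/$max_attempts failed: $err\n";
        sleep 5 if $attempt < $max_attempts;
    }
    die "could not connect to $dsn after $max_attempts attempts\n";
}

# e.g. my $dbh = connect_with_retry( "DBI:mysql:host=127.0.0.1;port=3306", 'anonymous', '' );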
x.dbprimary_acc, ed.db_name, es.synonym, x.description + FROM ${table} t + JOIN xref x ON x.xref_id = t.display_xref_id + JOIN external_db ed ON ed.external_db_id = x.external_db_id + LEFT JOIN external_synonym es ON es.xref_id = x.xref_id" + ); + } + + my $object_xrefs = $dbh->selectall_arrayref( + "SELECT ox.ensembl_id, x.display_label, x.dbprimary_acc, ed.db_name, es.synonym, x.description + FROM (object_xref AS ox, xref AS x, external_db AS ed) + LEFT JOIN external_synonym AS es ON es.xref_id = x.xref_id + WHERE ox.ensembl_object_type = '$type' AND ox.xref_id = x.xref_id AND x.external_db_id = ed.external_db_id" + ); + + foreach (@$xrefs, @$object_xrefs) { + $xrefs{$type}{ $_->[0] }{ $_->[3] }{ $_->[1] } = 1 if $_->[1]; + $xrefs{$type}{ $_->[0] }{ $_->[3] }{ $_->[2] } = 1 if $_->[2]; + ## remove the duplicates + Temp fix for metazoa data + if (my $syn = $_->[4]) { + $syn =~ s/^\'|\'$//g; + next if ($syn =~ /^(FBtr|FBpp)\d+/); + next if ($syn =~ /^CG\d+\-/); + $xrefs{$type}{ $_->[0] }{ $_->[3] . "_synonym" }{ $syn } = 1; + } + ## + $xrefs_desc{$type}{ $_->[0] }{ $_->[5] } = 1 if $_->[5]; + } + } + } + + print "Fetching exons...\n"; + + my %exons = (); + my $T = $dbh->selectall_arrayref( + "SELECT DISTINCT t.gene_id, e.stable_id + FROM transcript AS t, exon_transcript AS et, exon AS e + WHERE t.transcript_id = et.transcript_id AND et.exon_id = e.exon_id" + ); + + foreach (@$T) { + $exons{ $_->[0] }{ $_->[1] } = 1; + } + + print "Fetching domains...\n"; + + my %domains; + $T = $dbh->selectall_arrayref( + 'SELECT DISTINCT g.gene_id, pf.hit_name + FROM gene g, transcript t, translation tl, protein_feature pf + WHERE g.gene_id = t.gene_id AND t.transcript_id = tl.transcript_id AND tl.translation_id = pf.translation_id' + ); + + foreach (@$T) { + $domains{$_->[0]}{$_->[1]} = 1; + } + + print "Fetching seq regions...\n"; + + my $species_to_seq_region = $dbh->selectall_hashref( + "SELECT + meta.meta_value AS species_name, + coord_system.species_id, coord_system.coord_system_id, seq_region.seq_region_id, + coord_system.name, seq_region.name AS seqname, + seq_region.length, attrib_type.name + FROM meta, coord_system, seq_region, seq_region_attrib, attrib_type + WHERE + coord_system.coord_system_id = seq_region.coord_system_id + AND seq_region_attrib.seq_region_id = seq_region.seq_region_id + AND seq_region_attrib.attrib_type_id = attrib_type.attrib_type_id + AND meta.species_id=coord_system.species_id + AND meta.meta_key = 'species.display_name' + AND attrib_type.name = 'Top Level' + GROUP BY seq_region.seq_region_id + ORDER BY species_name, seqname, LENGTH DESC", + [ 'species_name', 'seq_region_id' ] + ); + + #warn Dumper($species_to_seq_region); + + foreach my $species (keys %$species_to_seq_region) { + my $counter = make_counter(0); + my ($species_id) = @{$dbh->selectrow_arrayref("SELECT DISTINCT(species_id) FROM meta WHERE meta_value = ? LIMIT 0,1", undef, $species)}; + my ($taxon_id) = @{$dbh->selectrow_arrayref("SELECT meta_value FROM meta WHERE meta_key = 'species.taxonomy_id' AND species_id = ?", undef, $species_id)}; + my ($production_name) = @{$dbh->selectrow_arrayref("SELECT meta_value FROM meta WHERE meta_key = 'species.production_name' AND species_id = ?", undef, $species_id)}; + + my $ortholog_lookup = get_ortholog_lookup($conf, $production_name, $genomic_unit); + my $ortholog_lookup_pan = get_ortholog_lookup($conf, $production_name, 'pan_homology'); + + (my $filename = "Gene_${species}_${DB}") =~ s/[\W]/_/g; + my $file = "$dir/$filename." . ($format eq 'solr' ? 
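# Aside: a pure-Perl sketch of the nested hash-of-hashes used above to
# de-duplicate xrefs per object ($xrefs{$type}{$id}{$db}{$label} = 1).
# The sample rows are invented for illustration.
use strict;
use warnings;

my %xrefs;
my @rows = (
    # [ object_id, display_label, primary_acc, db_name ]
    [ 1, 'BRCA2',      'P51587', 'Uniprot/SWISSPROT' ],
    [ 1, 'BRCA2',      'P51587', 'Uniprot/SWISSPROT' ],    # duplicate row collapses
    [ 1, 'GO:0006281', undef,    'GO' ],
);

for my $r (@rows) {
    my ( $id, $label, $acc, $db ) = @$r;
    $xrefs{'Gene'}{$id}{$db}{$label} = 1 if defined $label;
    $xrefs{'Gene'}{$id}{$db}{$acc}   = 1 if defined $acc;
}

for my $db ( sort keys %{ $xrefs{'Gene'}{1} } ) {
    print "$db: ", join( ', ', sort keys %{ $xrefs{'Gene'}{1}{$db} } ), "\n";
}
# GO: GO:0006281
# Uniprot/SWISSPROT: BRCA2, P51587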
'tsv' : 'xml'); + $file .= ".gz" unless $nogzip; + my $start_time = time; + + if ($skip_existing and -f $file) { + warn "**** Index file already exists - skipping ****\n"; + next; + } + + print "Dumping $species to $file\n"; + print "Start time " . format_datetime($start_time) . "\n"; + print "Num seq regions: " . (scalar keys %{ $species_to_seq_region->{$species} }) . "\n"; + + if ($nogzip) { + open( FILE, ">$file" ) || die "Can't open $file: $!"; + } else { + $fh = new IO::Zlib; + $fh->open( "$file", "wb9" ) || die("Can't open compressed stream to $file: $!"); + } + + header( $DBNAME, $dataset, $DB ) unless $format eq 'solr'; + + # prepare the gene output sub + # this is called when ready to ouput the gene line + my $output_gene = sub() { + my ($gene_data) = shift; + my @transcript_stable_ids = keys %{ $gene_data->{transcript_stable_ids} }; + + # add variation features + if ($SNPDB) { + $gene_data->{snps} = $dbh->selectcol_arrayref( + "SELECT DISTINCT(vf.variation_name) FROM $SNPDB.transcript_variation AS tv, $SNPDB.variation_feature AS vf + WHERE vf.variation_feature_id = tv.variation_feature_id AND tv.feature_stable_id IN('" . join("', '", @transcript_stable_ids) . "')" + ); + } + + # add probes and probesets + if ($FUNCGENDB) { + $gene_data->{probes} = []; + $gene_data->{probesets} = []; + foreach (@transcript_stable_ids) { + push(@{$gene_data->{probes}}, @{$transcript_probes{$_}}) if $transcript_probes{$_}; + push(@{$gene_data->{probesets}}, @{$transcript_probesets{$_}}) if $transcript_probesets{$_}; + } + } + + # add orthologs + + $gene_data->{orthologs} = $ortholog_lookup_pan->{$gene_data->{gene_stable_id}}; # want all eg + foreach my $orth ( @{ $ortholog_lookup->{$gene_data->{gene_stable_id}} || [] } ) { + if (!grep { $orth->[0] eq $_->[0] } @{ $gene_data->{orthologs} }) { # want only unique ensembl + push @{ $gene_data->{orthologs} }, $orth; + } + } + + if ($format eq 'solr') { + p geneLineTSV( $species, $dataset, $gene_data, $counter ); + } else { + p geneLineXML( $species, $dataset, $gene_data, $counter ); + } + }; + + + #my $sr_count = 0; + + foreach my $seq_region_id ( keys %{ $species_to_seq_region->{$species} } ) { + #print ++$sr_count . " "; + #$|++; + + my $gene_sql = + "SELECT g.gene_id, t.transcript_id, tr.translation_id, + g.stable_id AS gsid, t.stable_id AS tsid, tr.stable_id AS trsid, + g.description, ed.db_display_name, x.dbprimary_acc,x.display_label AS xdlgene, + ad.display_label, ad.description, g.source, g.status, g.biotype, + sr.name AS seq_region_name, g.seq_region_start, g.seq_region_end + FROM (gene AS g, + analysis_description AS ad, + transcript AS t) LEFT JOIN + translation AS tr ON t.transcript_id = tr.transcript_id LEFT JOIN + xref AS `x` ON g.display_xref_id = x.xref_id LEFT JOIN + external_db AS ed ON ed.external_db_id = x.external_db_id LEFT JOIN + seq_region AS sr ON sr.seq_region_id = g.seq_region_id + WHERE t.gene_id = g.gene_id AND g.analysis_id = ad.analysis_id AND g.seq_region_id = ? 
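# Aside: a standalone sketch of the ortholog merge in $output_gene above;
# pan-compara hits are taken first and division-level hits are appended only
# when their stable id has not been seen yet. The ids below are invented.
use strict;
use warnings;

my @pan      = ( [ 'ENSG00000139618', 'ensemblgenomes_ortholog' ] );
my @division = (
    [ 'ENSG00000139618', 'ensembl_ortholog' ],    # dropped, already present
    [ 'FBgn0050169',     'ensembl_ortholog' ],    # kept
);

my @merged = @pan;
for my $orth (@division) {
    push @merged, $orth unless grep { $orth->[0] eq $_->[0] } @merged;
}
printf "%s (%s)\n", @$_ for @merged;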
+ ORDER BY g.stable_id, t.stable_id"; + + #warn "$gene_sql $seq_region_id\n"; + + my $gene_info = $dbh->selectall_arrayref($gene_sql, undef, $seq_region_id); + next unless @$gene_info; + + my %old; + + foreach my $row (@$gene_info) { + + my ( + $gene_id, $transcript_id, + $translation_id, $gene_stable_id, + $transcript_stable_id, $translation_stable_id, + $gene_description, $extdb_db_display_name, + $xref_primary_acc, $xref_display_label, + $analysis_description_display_label, $analysis_description, + $gene_source, $gene_status, + $gene_biotype, $seq_region_name, + $seq_region_start, $seq_region_end + ) = @$row; + + if ( $old{'gene_id'} != $gene_id ) { + + # output old gene if we have one + $output_gene->(\%old) if $old{'gene_id'}; + + # start building a new gene + %old = ( + 'gene_id' => $gene_id, + 'haplotype' => $haplotypes->{$gene_id} ? 'haplotype' : 'reference', + 'gene_stable_id' => $gene_stable_id, + 'description' => $gene_description, + 'taxon_id' => $taxon_id, + 'translation_stable_ids' => { $translation_stable_id ? ( $translation_stable_id => 1 ) : () }, + 'transcript_stable_ids' => { $transcript_stable_id ? ( $transcript_stable_id => 1 ) : () }, + 'transcript_ids' => { $transcript_id ? ( $transcript_id => 1 ) : () }, + 'exons' => {}, + 'external_identifiers' => {}, + 'gene_name' => $xref_display_label ? $xref_display_label : $gene_stable_id, + 'seq_region_name' => $seq_region_name, + 'ana_desc_label' => $analysis_description_display_label, + 'ad' => $analysis_description, + 'source' => ucfirst($gene_source), + 'st' => $gene_status, + 'biotype' => $gene_biotype, + 'genomic_unit' => $genomic_unit, + 'location' => sprintf( '%s:%s-%s', $seq_region_name, $seq_region_start, $seq_region_end ), + 'exons' => $exons{$gene_id}, + 'genetrees' => $genetree_lookup->{$gene_stable_id} || [], + 'domains' => $domains{$gene_id}, + 'system_name' => $production_name, + ); + + $old{'source'} =~ s/base/Base/; + + # display name + if (!$xref_display_label or $xref_display_label eq $gene_stable_id) { + $old{'display_name'} = $gene_stable_id; + } else { + $old{'display_name'} = "$xref_display_label [$gene_stable_id]"; + } + + foreach my $K ( keys %{ $exons{$gene_id} } ) { + $old{'i'}{$K} = 1; + } + + foreach my $db ( keys %{ $xrefs{'Gene'}{$gene_id} || {} } ) { + foreach my $K ( keys %{ $xrefs{'Gene'}{$gene_id}{$db} } ) { + $old{'external_identifiers'}{$db}{$K} = 1; + } + } + + foreach my $db ( keys %{ $xrefs{'Transcript'}{$transcript_id} || {} } ) { + foreach my $K ( keys %{ $xrefs{'Transcript'}{$transcript_id}{$db} } ) { + $old{'external_identifiers'}{$db}{$K} = 1; + } + } + + foreach my $db ( keys %{ $xrefs{'Translation'}{$translation_id} || {} } ) { + foreach my $K ( keys %{ $xrefs{'Translation'}{$translation_id}{$db} } ) { + $old{'external_identifiers'}{$db}{$K} = 1; + } + } + + } else { + + $old{'transcript_stable_ids'}{$transcript_stable_id} = 1; + $old{'transcript_ids'}{$transcript_id} = 1; + $old{'translation_stable_ids'}{$translation_stable_id} = 1; + + foreach my $db ( keys %{ $xrefs{'Transcript'}{$transcript_id} || {} } ) { + foreach my $K ( keys %{ $xrefs{'Transcript'}{$transcript_id}{$db} } ) { + $old{'external_identifiers'}{$db}{$K} = 1; + } + } + + foreach my $db ( keys %{ $xrefs{'Translation'}{$translation_id} || {} } ) { + foreach my $K ( keys %{ $xrefs{'Translation'}{$translation_id}{$db} } ) { + $old{'external_identifiers'}{$db}{$K} = 1; + } + } + } + } + $output_gene->(\%old); + } + footer( $counter->() ); + } + + warn "FINISHED dumpGene ($DB)\n"; + #} #$DB loop +} + +sub geneLineXML 
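# Aside: a standalone sketch (with invented rows) of the streaming group-by
# in the gene loop above: rows arrive ordered by gene, and a change of
# gene_id flushes the previously accumulated gene before a new one starts.
use strict;
use warnings;

my @rows = (
    [ 1, 'ENSG01', 'ENST01' ],
    [ 1, 'ENSG01', 'ENST02' ],
    [ 2, 'ENSG02', 'ENST03' ],
);

my %old;
my $flush = sub {
    my ($gene) = @_;
    print "$gene->{stable_id}: ", join( ',', sort keys %{ $gene->{transcripts} } ), "\n";
};

for my $row (@rows) {
    my ( $gene_id, $gsid, $tsid ) = @$row;
    if ( !$old{gene_id} || $old{gene_id} != $gene_id ) {
        $flush->( \%old ) if $old{gene_id};    # emit the finished gene
        %old = ( gene_id => $gene_id, stable_id => $gsid, transcripts => { $tsid => 1 } );
    }
    else {
        $old{transcripts}{$tsid} = 1;          # same gene, collect transcript
    }
}
$flush->( \%old ) if $old{gene_id};            # emit the last gene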
{ + my ( $species, $dataset, $xml_data, $counter ) = @_; + + return warn "gene id not set" if $xml_data->{'gene_stable_id'} eq ''; + + my $gene_id = $xml_data->{'gene_stable_id'}; + my $genomic_unit = $xml_data->{'genomic_unit'}; + my $location = $xml_data->{'location'}; + my $transcripts = $xml_data->{'transcript_stable_ids'} or die "transcripts not set"; + my $snps = $xml_data->{'snps'}; + my $orthologs = $xml_data->{'orthologs'}; + my $peptides = $xml_data->{'translation_stable_ids'} or die "peptides not set"; + my $exons = $xml_data->{'exons'} or die "exons not set"; + my $domains = $xml_data->{'domains'}; + my $external_identifiers = $xml_data->{'external_identifiers'} or die "external_identifiers not set"; + my $description = $xml_data->{'description'}; + my $gene_name = encode_entities($xml_data->{'gene_name'}); + my $seq_region_name = $xml_data->{'seq_region_name'}; + my $type = $xml_data->{'source'} . ' ' . $xml_data->{'biotype'} or die "problem setting type"; + my $haplotype = $xml_data->{'haplotype'}; + my $taxon_id = $xml_data->{'taxon_id'}; + my $exon_count = scalar keys %$exons; + my $domain_count = scalar keys %$domains; + my $transcript_count = scalar keys %$transcripts; + my $display_name = $xml_data->{'display_name'}; + my $genetrees = $xml_data->{'genetrees'}; + my $probes = $xml_data->{'probes'}; + my $probesets = $xml_data->{'probesets'}; + my $system_name = $xml_data->{'system_name'}; + + $display_name =~ s//>/g; + + $description =~ s//>/g; + $description =~ s/'/'/g; + $description =~ s/&/&/g; + + $gene_name =~ s//>/g; + $gene_name =~ s/'/'/g; + $gene_name =~ s/&/&/g; + + $gene_id =~ s//>/g; + + my $xml = qq{ + +$display_name +$description}; + + my $synonyms = ""; + my $unique_synonyms; + my $cross_references = qq{ + +}; + + # for some types of xref, merge the subtypes into the larger type + # e.g. Uniprot/SWISSPROT and Uniprot/TREMBL become just Uniprot + # synonyms are stored as additional fields rather than cross references + foreach my $ext_db_name ( keys %$external_identifiers ) { + + if ( $ext_db_name =~ /(Uniprot|GOA|GO|Interpro|Medline|Sequence_Publications|EMBL)/ ) { + my $matched_db_name = $1; + + # synonyms + if ( $ext_db_name =~ /_synonym/ ) { + foreach my $ed_key ( keys %{ $external_identifiers->{$ext_db_name} } ) { + # $unique_synonyms->{$ed_key} = 1; + my $encoded = encode_entities($ed_key); + $synonyms .= qq{ +$encoded}; + } + } + else { # non-synonyms + map { $cross_references .= qq{ +}; + } keys %{ $external_identifiers->{$ext_db_name} } + } + + } else { + + foreach my $key ( keys %{ $external_identifiers->{$ext_db_name} } ) { + $key = encode_entities($key); + $ext_db_name =~ s/^Ens.*/ENSEMBL/; + + if ( $ext_db_name =~ /_synonym/ ) { + $unique_synonyms->{$key} = 1; + $synonyms .= qq{ +$key}; + } + else { + $cross_references .= qq{ +}; + } + } + } + } + + $cross_references .= ( join "", ( map { qq{ +} + } @$snps)); + + $cross_references .= ( join "", ( map { qq{ +} + } @$orthologs ) ); + + $cross_references .= qq{ +}; + + map { $synonyms .= qq{ +} . encode_entities($_) . qq{ } + } keys %$unique_synonyms; + + my $additional_fields .= qq{ + +$species +$system_name +Gene +$type +$location +$transcript_count +$gene_name +$seq_region_name +$haplotype} + . ($dataset ne $species ? qq{ +$dataset} : '') + . ($genomic_unit ? qq{ +$genomic_unit} : '') + . ( join "", ( map { qq{ +$_} + } map {encode_entities($_)} keys %$transcripts ) ) + . qq{ +$exon_count } + . ( join "", ( map { qq{ +$_} + } map {encode_entities($_)} keys %$exons ) ) + . 
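# Aside: the free-text fields above need XML escaping; a minimal standalone
# sketch using HTML::Entities (already loaded by this script) with an
# explicit set of unsafe characters. The sample description is invented.
use strict;
use warnings;
use HTML::Entities qw(encode_entities);

my $description = q{Breast cancer type 2 <susceptibility> protein & partners};
my $escaped     = encode_entities( $description, q{<>&"'} );
print "$escaped\n";
# Breast cancer type 2 &lt;susceptibility&gt; protein &amp; partners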
qq{ +$domain_count } + . ( join "", ( map { qq{ +$_} + } map {encode_entities($_)} keys %$domains ) ) + . ( join "", ( map { qq{ +$_} + } map {encode_entities($_)} keys %$peptides ) ) + . ( join "", ( map { qq{ +$_} + } map {encode_entities($_)} @$genetrees ) ) + . ( join "", ( map { qq{ +$_} + } map {encode_entities($_)} @$probes ) ) + . ( join "", ( map { qq{ +$_} + } map {encode_entities($_)} @$probesets ) ) + . ( join "", ( map { qq{ +$_} + } map {encode_entities($_)} keys %$unique_synonyms ) ) + . qq{ +}; + + $counter->(); + return $xml . $cross_references . $additional_fields . "\n"; +} + + +sub geneLineTSV { + my ( $species, $dataset, $xml_data, $counter ) = @_; + + + my $external_identifiers = $xml_data->{'external_identifiers'} or die "external_identifiers not set"; + my $gene_id = $xml_data->{'gene_stable_id'} or die "gen id not set"; + $gene_id =~ s//>/g; + + + my $description = $xml_data->{'description'}; + my $gene_name = encode_entities($xml_data->{'gene_name'}); + my $display_name = $xml_data->{'display_name'}; + + $display_name =~ s//>/g; + + $description =~ s//>/g; + $description =~ s/'/'/g; + $description =~ s/&/&/g; + + $gene_name =~ s//>/g; + $gene_name =~ s/'/'/g; + $gene_name =~ s/&/&/g; + + + my $xrefs; + # for some types of xref, merge the subtypes into the larger type + # e.g. Uniprot/SWISSPROT and Uniprot/TREMBL become just Uniprot + # synonyms are stored as additional fields rather than cross references + foreach my $ext_db_name ( keys %$external_identifiers ) { + my $matched_db_name = $ext_db_name; + if ( $ext_db_name =~ /(Uniprot|GOA|GO|Interpro|Medline|Sequence_Publications|EMBL)/ ) { + $matched_db_name = $1; + @{$xrefs->{$matched_db_name}} = map { encode_entities($_) } keys %{ $external_identifiers->{$ext_db_name} }; + } + } + my $xrefs_str = join ';', map { join ',', @{$xrefs->{$_}||[]} } keys %$xrefs; + + my $url = sprintf qq{http://%s.ensembl.org/%s/Gene/Summary?g=%s;r=%s}, $xml_data->{'genomic_unit'}, $xml_data->{'system_name'}, $xml_data->{gene_stable_id}, $xml_data->{'location'}; + my @fields; + push @fields, $gene_id, $xml_data->{'biotype'}, $gene_name, $xrefs_str, $url, $description; + $counter->(); + return join "\t", @fields; +} + + +sub make_counter { + my $start = shift; + return sub { $start++ } +} + +#------------------------------------------------------------------------------ +# +# Build a gene tree id lookup +# It's slow, but common to all species, so we only have to do it once +# + +my $_genetree_lookup; + +sub get_genetree_lookup { + my ($genomic_unit, $conf) = @_; + + unless ($_genetree_lookup) { + + print "Building gene tree id lookup...\n"; + + foreach my $dbtype ($genomic_unit, 'pan_homology') { + + my $dbname = $conf->{compara}->{$dbtype}->{$release}; + next unless $dbname; + + print " $dbname\n"; + + my $dsn = "DBI:mysql:host=$host"; + $dsn .= ";port=$port" if ($port); + my $compara_dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error"; + + my $sql = + "SELECT m2.stable_id AS gene, gtr.stable_id AS genetree + FROM member m + JOIN gene_tree_node gtn ON gtn.member_id = m.member_id + JOIN gene_tree_root gtr ON gtr.root_id = gtn.root_id + JOIN member m2 ON m2.member_id = m.gene_member_id + WHERE gtr.stable_id IS NOT NULL + ORDER BY m2.stable_id"; + + #warn "$sql\n"; + + my $rows = $compara_dbh->selectall_arrayref($sql); + + foreach (@$rows) { + $_genetree_lookup->{$_->[0]} ||= []; + push(@{$_genetree_lookup->{$_->[0]}}, $_->[1]); + } + + $compara_dbh->disconnect; + } + } + + return $_genetree_lookup; +} + 
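# Aside: a standalone sketch of the build-once caching used by
# get_genetree_lookup() above; the lookup lives in a lexical shared with the
# sub and is populated on the first call only. The data here is a placeholder
# standing in for the real compara queries.
use strict;
use warnings;

{
    my $cache;                                     # shared, built at most once
    sub genetree_lookup_cached {
        return $cache if $cache;
        warn "building lookup...\n";               # happens only once
        $cache = { ENSG01 => ['GENETREE01'] };     # placeholder data
        return $cache;
    }
}

genetree_lookup_cached() for 1 .. 3;               # warns on the first call only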
+#------------------------------------------------------------------------------ +# +# Build an ortholog lookup for given species/compara-db +# + +sub get_ortholog_lookup { + my ($conf, $species, $compara_db) = @_; + + return {} if ($noortholog); + + my $prefix = $compara_db eq 'pan_homology' ? 'ensemblgenomes' : 'ensembl'; + + my $orth_species = { + 'homo_sapiens' => "ensembl_ortholog", + 'mus_musculus' => "ensembl_ortholog", + 'drosophila_melanogaster' => "${prefix}_ortholog", + 'caenorhabditis_elegans' => "${prefix}_ortholog", + 'saccharomyces_cerevisiae' => "${prefix}_ortholog", + 'arabidopsis_thaliana' => "${prefix}_ortholog", + 'escherichia_coli_str_k_12_substr_mg1655' => "${prefix}_ortholog", + 'schizosaccharomyces_pombe' => "${prefix}_ortholog", + 'bacillus_subtilis_subsp_subtilis_str_168' => "${prefix}_ortholog", + }; + + return {} unless delete $orth_species->{$species}; # do we want orthologs for this species? + return {} unless my $dbname = $conf->{compara}->{$compara_db}->{$release}; # have we got a compara db? + + print "Building ortholog lookup for $species (compara_$compara_db)...\n"; + + my $dsn = "DBI:mysql:host=$host"; + $dsn .= ";port=$port" if ($port); + my $compara_dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error"; + + my $orth_species_string = join('","', keys %$orth_species); + + my $orthologs_sth = $compara_dbh->prepare(qq{ + SELECT + m1.stable_id , m2.stable_id, gdb2.name + FROM + genome_db gdb1 JOIN member m1 USING (genome_db_id) + JOIN homology_member hm1 USING (member_id) + JOIN homology h USING (homology_id) + JOIN homology_member hm2 USING (homology_id) + JOIN member m2 ON (hm2.member_id = m2.member_id) + JOIN genome_db gdb2 on (m2.genome_db_id = gdb2.genome_db_id) + WHERE + gdb1.name = "$species" + AND m2.source_name = "ENSEMBLGENE" + AND gdb2.name IN ("$orth_species_string") + AND h.description in ("ortholog_one2one", "apparent_ortholog_one2one", + "ortholog_one2many", "ortholog_many2many") + }); + $orthologs_sth->execute; + + # process rows in batches + my $lookup = {}; + my $rows = []; + while ( my $row = ( shift(@$rows) || shift( @{ $rows = $orthologs_sth->fetchall_arrayref( undef, 10_000 ) || [] } ) ) ) { + push @{ $lookup->{$row->[0]} }, [ $row->[1], $orth_species->{$row->[2]} ]; + } + + return $lookup; +} + diff --git a/utils/search_dump_extra.pl b/utils/search_dump_extra.pl new file mode 100755 index 00000000..f0146432 --- /dev/null +++ b/utils/search_dump_extra.pl @@ -0,0 +1,221 @@ +#!/usr/local/bin/perl + +use strict; +use warnings; +use FindBin qw($Bin); +use Getopt::Long; +use XML::Generator; +use Data::Dumper; +use HTML::Entities; + +BEGIN { + unshift @INC, "$Bin/../../conf"; + unshift @INC, "$Bin/../../"; + require SiteDefs; + unshift @INC, $_ for @SiteDefs::ENSEMBL_LIB_DIRS; + require EnsEMBL::Web::Hub; +} + +my $dir = '.'; +my $index_list; + +GetOptions( + 'dir=s' => \$dir, + 'index=s' => \$index_list +); + +my @indices = $index_list ? 
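# Aside: a standalone sketch of the batched fetch idiom used in
# get_ortholog_lookup() above; rows are pulled 10,000 at a time with
# fetchall_arrayref and drained from a buffer, keeping memory bounded on
# large result sets. $dbh and the SQL below are placeholders.
use strict;
use warnings;
use DBI;

sub drain_in_batches {
    my ( $dbh, $sql, $batch_size, $callback ) = @_;
    my $sth = $dbh->prepare($sql);
    $sth->execute;
    my $rows = [];
    while ( my $row = ( shift(@$rows)
            || shift( @{ $rows = $sth->fetchall_arrayref( undef, $batch_size ) || [] } ) ) ) {
        $callback->($row);              # $row is an arrayref of columns
    }                                   # loop ends once buffer and handle are both empty
}

# e.g. drain_in_batches( $dbh, 'SELECT stable_id FROM some_table', 10_000,
#                        sub { my ($row) = @_; print "$row->[0]\n" } );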
map {ucfirst} split(/,/, $index_list) : qw(Species Seqregion); +my $hub = EnsEMBL::Web::Hub->new; +my $species_defs = $hub->species_defs; +my @species = $species_defs->valid_species; +(my $division = $species_defs->ENSEMBL_SITETYPE) =~ s/ //; +my $genomic_unit = $species_defs->GENOMIC_UNIT; +my $release = $species_defs->SITE_RELEASE_VERSION; +my %core_dbs = map { $species_defs->get_config($_, 'databases')->{DATABASE_CORE}->{NAME} => 1 } (@species); +my $dbh = $hub->database('core', $species[0])->db_handle; +my $file; + +my $dispatch = { + Species => \&print_species, + Seqregion => \&print_seqregions, +}; + +foreach my $index (@indices) { + print "\n--- $index ---\n"; + + my $filename = "$dir/${index}_$division.xml"; + print "starting $filename\n"; + + open $file, '>' , $filename or die "Cannot open index file $filename: $!"; + print_header(); + $dispatch->{$index}->(); + print_footer(); + close $file; + + print "wrote $filename\n"; +} + +exit; + +#------------------------------------------------------------------------------ + +sub escape { return encode_entities($_[0], q{&<>"'\''}) } + +sub print_header { + print $file qq{ +]> + + $division + $release + }; +} + +sub print_footer { + print $file qq{ + + + } +} + +sub print_species { + + my @meta_keys = qw( + assembly.accession + assembly.default + assembly.name + species.common_name + species.display_name + species.division + species.production_name + species.scientific_name + species.taxonomy_id + ); + + my @meta_keys_multi = qw( + species.alias + species.classification + ); + + my $key_to_field_name = sub { + my $str = shift; + $str =~ s/^species\.//; + $str =~ s/\./_/; + return $str; + }; + + foreach my $db_name (sort keys %core_dbs) { + print "$db_name\n"; + + # prepare the data + + my ($collection) = $db_name =~ /^(.+)_collection/i; + + $dbh->do("use $db_name"); + + my $meta = $dbh->selectall_hashref( + 'SELECT species_id, meta_key, meta_value FROM meta WHERE meta_key IN ("' . join('", "', @meta_keys) . '") + ORDER BY species_id, meta_key, meta_id DESC', + ['species_id', 'meta_key'] + ); + + my $meta_multi = $dbh->selectall_hashref( + 'SELECT species_id, meta_key, meta_value FROM meta WHERE meta_key IN ("' . join('", "', @meta_keys_multi) . 
'") + ORDER BY species_id, meta_key, meta_id DESC', + ['species_id', 'meta_key', 'meta_value'] + ); + + # add entries + + foreach my $species_id (keys %$meta) { + my $production_name = $meta->{$species_id}->{'species.production_name'}->{meta_value}; + my $display_name = $meta->{$species_id}->{'species.display_name'}->{meta_value}; + my $tax_id = delete $meta->{$species_id}->{'species.taxonomy_id'}->{meta_value}; + + my $fields; + $fields .= qq{$collection\n} if $collection; + $fields .= qq{$genomic_unit\n}; + + foreach my $key (@meta_keys) { + if (my $value = escape($meta->{$species_id}->{$key}->{meta_value})) { + my $name = $key_to_field_name->($key); + $fields .= qq{$value\n}; + } + } + + foreach my $key (@meta_keys_multi) { + if (my $hash = $meta_multi->{$species_id}->{$key}) { + my $name = $key_to_field_name->($key); + foreach my $value (map {escape($_)} keys %$hash) { + $fields .= qq{$value\n}; + } + } + } + + print $file qq{ + + $display_name + + + + + $fields + + }; + } + } +} + +sub print_seqregions { + + my $max_len = 100000; + + foreach my $db_name (sort keys %core_dbs) { + print "$db_name\n"; + + $dbh->do("use $db_name"); + + my %production_name = map {@$_} @{ $dbh->selectall_arrayref("SELECT species_id, meta_value FROM meta WHERE meta_key = 'species.production_name'") }; + my %display_name = map {@$_} @{ $dbh->selectall_arrayref("SELECT species_id, meta_value FROM meta WHERE meta_key = 'species.display_name'") }; + my %tax_id = map {@$_} @{ $dbh->selectall_arrayref("SELECT species_id, meta_value FROM meta WHERE meta_key = 'species.taxonomy_id'") }; + + # get seq regions from top 2 levels - along with mapping to top level + my $seq_regions = $dbh->selectall_arrayref( + "SELECT DISTINCT sr.name AS seq_region_name, sr.length, asm.name AS asm_seq_region_name, cmp.asm_start, cmp.asm_end, cs.name AS coord_system_name, cs.species_id + FROM seq_region sr JOIN coord_system cs USING (coord_system_id) + LEFT JOIN assembly cmp ON cmp.cmp_seq_region_id = sr.seq_region_id + LEFT JOIN seq_region asm ON asm.seq_region_id = cmp.asm_seq_region_id + LEFT JOIN seq_region_attrib sra ON sra.seq_region_id = asm.seq_region_id + LEFT JOIN attrib_type `at` USING(attrib_type_id) + WHERE (at.name = 'Top Level' OR at.name IS NULL) + AND cs.name != 'chunk' AND cs.name != 'ignored' + AND FIND_IN_SET('default_version', cs.attrib)", + { Slice => {} } + ); + + foreach my $sr (@$seq_regions) { + my $id = $sr->{species_id}; + + my $location = $sr->{asm_seq_region_name} ? + sprintf('%s:%s-%s', $sr->{asm_seq_region_name}, $sr->{asm_start}, $sr->{asm_end} > $sr->{asm_start} + $max_len - 1 ? $sr->{asm_start} + $max_len - 1 : $sr->{asm_end}) : + sprintf('%s:%s-%s', $sr->{seq_region_name}, '1', $sr->{length} > $max_len ? 
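# Aside: a standalone sketch of the length clamping in the location strings
# above; regions longer than $max_len are reported as a window of at most
# $max_len bases so the index entry stays small. Example values are invented.
use strict;
use warnings;

my $max_len = 100_000;

sub clamped_location {
    my ( $name, $start, $end ) = @_;
    my $capped = $end > $start + $max_len - 1 ? $start + $max_len - 1 : $end;
    return sprintf '%s:%s-%s', $name, $start, $capped;
}

print clamped_location( '4', 1, 1_531_933 ), "\n";    # 4:1-100000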
$max_len : $sr->{length}); + + print $file qq{ + + $sr->{seq_region_name} + + + + + $display_name{$id} + $production_name{$id} + $sr->{length} + $location + $sr->{coord_system_name} + $genomic_unit + + }; + } + } +} + + diff --git a/utils/update_userdata_dbs.pl b/utils/update_userdata_dbs.pl new file mode 100755 index 00000000..cb5cb857 --- /dev/null +++ b/utils/update_userdata_dbs.pl @@ -0,0 +1,121 @@ +#!/usr/local/bin/perl + +use strict; +use warnings; +use DBI; +use Data::Dumper; +use Getopt::Long; + +my ($uhost, $uport, $uuser, $upass); # userdata +my ($chost, $cport, $cuser, $cpass); # core + +GetOptions( + "uhost=s" => \$uhost, "uport=s" => \$uport, "uuser=s" => \$uuser, "upass=s" => \$upass, + "chost=s" => \$chost, "cport=s" => \$cport, "cuser=s" => \$cuser, "cpass=s" => \$cpass, +) || die "Invalid options\n"; + +die "Please supply userdata mysql credentials: --uhost --uport --uuser [--upass]\n" if !(defined $uhost and defined $uport and defined $uuser); +die "Please supply core mysql credentials: --chost --cport --cuser [--cpass]\n" if !(defined $chost and defined $cport and defined $cuser); + +# global db handles + +my $udbh = DBI->connect("DBI:mysql:database=test;host=$uhost;port=$uport", $uuser, $upass); +my $cdbh = DBI->connect("DBI:mysql:database=test;host=$chost;port=$cport", $cuser, $cpass); + +my $species_db; + +# get userdata dbs + +my $udbs = $udbh->selectcol_arrayref("show databases like '%_userdata'"); + +foreach (@$udbs) { + next if /_collection_/; + my ($species) = split /_userdata/; + $species_db->{$species}->{userdata} = $_ ; +} + +# get core dbs + +my $cdbs = $cdbh->selectcol_arrayref("show databases like '%_core_%'"); + +foreach (@$cdbs) { + next if /_collection_/; + my ($species) = split /_core_/; + $species_db->{$species}->{core} = $_ ; +} + +print "Checking for missing userdata dbs...\n"; + +my @missing = sort grep { ! exists $species_db->{$_}->{userdata} } keys %$species_db; + +if (@missing) { + print "Creating missing userdata dbs...\n"; + + my $core_db = $species_db->{$missing[0]}->{'core'}; + `mysqldump --no-data --lock-tables=false -h $chost -P $cport -u $cuser $core_db > /tmp/core_schema.sql`; + + foreach my $species (@missing) { + create_userdata_db($species_db->{$species}->{core}); + $species_db->{$species}->{userdata} = "${species}_userdata"; + } +} + +print "Checking for changed assemblies...\n"; + +foreach my $species (sort keys %$species_db) { + if (!$species_db->{$species}->{core}) { + print "ORPHANED - userdata db $species doesn't have matching core db\n"; + next; + } + + #print "DB $species\n"; + my $core_db = $species_db->{$species}->{core}; + my $userdata_db = $species_db->{$species}->{userdata}; + + $cdbh->do("USE $core_db"); + $udbh->do("USE $userdata_db"); + + my $species_ids = $udbh->selectcol_arrayref('SELECT DISTINCT species_id FROM meta WHERE species_id IS NOT NULL'); + + foreach my $species_id (@$species_ids) { + my $sql = 'SELECT meta_value FROM meta WHERE species_id = ? 
AND meta_key = "assembly.name"';
+        my $old_assembly = $udbh->selectrow_array($sql, undef, $species_id);
+        my $new_assembly = $cdbh->selectrow_array($sql, undef, $species_id);
+        if ($old_assembly ne $new_assembly) {
+            print "ASSEMBLY MISMATCH - $species, old: '$old_assembly', new: '$new_assembly'\n";
+            #upgrade_assembly($species_id, $core_db, $userdata_db);
+        }
+    }
+}
+
+$udbh->disconnect;
+$cdbh->disconnect;
+
+#------------------------------------------------------------------------------
+
+sub create_userdata_db {
+    my ($core_db) = @_;
+    return unless $core_db;
+    my ($species, $rest) = split /_core_/, $core_db;
+    my $user_db = $species . "_userdata";
+
+    warn "CREATING $user_db \n";
+    `mysqldump --single_transaction -h $chost -P $cport -u $cuser $core_db analysis meta meta_coord coord_system seq_region > /tmp/${user_db}.sql`;
+
+    # creating userdata database for a new species
+    $udbh->do("CREATE DATABASE IF NOT EXISTS $user_db");
+    $udbh->do("use $user_db") or die $udbh->errstr;
+    `mysql -h $uhost -P $uport -u $uuser --password=$upass $user_db < /tmp/core_schema.sql`; # add an error handler here
+    `mysql -h $uhost -P $uport -u $uuser --password=$upass $user_db < /tmp/${user_db}.sql`; # add an error handler here
+
+    # check if tables exist
+    my $tables_exist = $udbh->selectcol_arrayref('show tables');
+    warn "ERROR: Table structure wasn't loaded into $user_db\n" if !@$tables_exist;
+    #`rm /tmp/${user_db}.sql`;
+}
+
+#sub upgrade_assembly {
+#    my ($species_id, $core_db, $userdata_db) = @_;
+#}
+
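# Aside: the two loads above are flagged "add an error handler here"; one
# possible shape for that handler, sketched standalone with system() and $?
# instead of backticks. The helper name is hypothetical; the credentials in
# the usage comment are this script's own variables.
use strict;
use warnings;

sub run_or_warn {
    my @cmd = @_;
    system(@cmd);
    if ( $? == -1 ) {
        warn "failed to execute $cmd[0]: $!\n";
        return 0;
    }
    if ( $? & 127 ) {
        warn sprintf "%s died with signal %d\n", $cmd[0], $? & 127;
        return 0;
    }
    if ( $? >> 8 ) {
        warn sprintf "%s exited with status %d\n", $cmd[0], $? >> 8;
        return 0;
    }
    return 1;
}

# e.g., mirroring the schema load above but without a shell redirect:
# run_or_warn( 'mysql', "-h$uhost", "-P$uport", "-u$uuser", "--password=$upass",
#              $user_db, '-e', "source /tmp/core_schema.sql" )
#     or warn "schema load failed for $user_db\n";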