Import scripts from CVS eg-plugins/utils and drop some old versions

1 parent 1b40b1e commit dcb6108253938f6e3c1ed1e7d00d88cf4cfcc3be @nicklangridge committed Mar 6, 2014
Showing with 1,317 additions and 2,040 deletions.
  1. +0 −203 utils/create_user_data_dbs
  2. +0 −22 utils/eg_ebi_search_dump.ini
  3. +0 −1,815 utils/eg_ebi_search_dump.pl
  4. +975 −0 utils/search_dump.pl
  5. +221 −0 utils/search_dump_extra.pl
  6. +121 −0 utils/update_userdata_dbs.pl
203 utils/create_user_data_dbs
@@ -1,203 +0,0 @@
-#!/usr/bin/env perl
-# Copyright [2009-2014] EMBL-European Bioinformatics Institute
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-### create_user_data_dbs
-use lib '/nfs/ensembl/perllib/catalystlibs/lib/perl5/site_perl/5.8.8';
-use lib '/nfs/ensembl/perllib/catalystlibs/lib64/perl5/site_perl/5.8.8/x86_64-linux-thread-multi';
-
-
-use strict;
-
-# use EnsEMBL::Web::SpeciesDefs;
-use FindBin qw($Bin);
-use File::Basename qw( dirname );
-# Load libraries needed for reading config
-use Data::Dumper;
-use File::Slurp qw(slurp);
-use File::Find::Object::Rule;
-
-BEGIN {
-
- unshift @INC, "$Bin/../../../conf";
- unshift @INC, "$Bin/../../..";
- unshift @INC, "$Bin/../../utils";
- eval { require SiteDefs };
- if ($@) { die "Can't use SiteDefs.pm - $@\n"; }
- map { unshift @INC, $_ } @SiteDefs::ENSEMBL_LIB_DIRS;
-}
-
-
-
-use utils::Tool;
-
-my $SPECIES_DEFS = EnsEMBL::Web::SpeciesDefs->new();
-
-
-die "cannot find user_data_template.ddl. We can\'t run without it\n" unless -e('user_data_template.ddl');
-my $DDL = slurp('user_data_template.ddl');
-chomp $DDL;
-my @table_create_statments = split /;/ ,$DDL;
-
-my @SPECIES;
-
-
-
-# # Get Species List from SiteDefs
-
-@SPECIES = @{ utils::Tool::all_species()};
-
-
-
-foreach my $species (@SPECIES) {
- print "Getting credentientials for $species\n" . '='x20 . "\n";
-
- my $user_db_name = user_db_name($species);
- check_user_db_exists($species) eq 'No' ? create_from_new($species, $user_db_name) : print patch_existing($species);
-
-}
-
-sub user_db_name {
- my $species = shift;
-
- my $user_data_name = $SPECIES_DEFS->GENOMIC_UNIT eq 'bacteria' ? '_collection_userdata' : '_userdata';
-
- my $user_db_name = lcfirst($species) . $user_data_name;
-
- return $user_db_name;
-}
-
-
-sub check_user_db_exists {
- my $species = shift;
- my $user_db_name = user_db_name($species);
-
- my $dbh = get_dbh_from_species_type($species, 'core');
- my $query = qq{SELECT IF(EXISTS (SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '$user_db_name'), 'Yes','No') as 'exists'};
- # warn $query;
- my $array_ref = $dbh->selectall_arrayref($query);
- #warn Dumper $array_ref;
- return $array_ref->[0]->[0];
-}
-
-
-
-
-
-sub get_dbh_from_species_type {
- my ($species, $type) = @_;
-
- die "need species and database type" unless $species && $type;
-
-
- my $database = $type eq 'core' ? $SPECIES_DEFS->get_config($species, 'databases')->{'DATABASE_CORE'}->{'NAME'}: user_db_name($species);
-
- # make the DB handle, get the credentials from SPECIES_DEFS
- my $db_credentials = $SPECIES_DEFS->get_config($species, 'databases')->{'DATABASE_CORE'};
- my $host = $db_credentials->{HOST} || (warn "** no HOST in config for [ $species ]");
- print "Using HOST: [$host]\n". '='x20 . "\n";
-
- my $port = $db_credentials->{PORT} || warn "** no PORT in config for [ $species ]";
- my $write_user = $SPECIES_DEFS->DATABASE_WRITE_USER || die "** no WRITE_USER in config ";
- my $write_pass = $SPECIES_DEFS->DATABASE_WRITE_PASS || die "** no WRITE_PASS in config ";
-
- my $dsn = "DBI:mysql:database=$database;host=$host";
- $dsn .= ";port=$port" if ($port);
- my $dbh = DBI->connect( $dsn, $write_user, $write_pass );
-
- return $dbh;
-
-}
-
-
-
-sub create_from_new {
- my ($species, $user_db_name) = @_;
-
- warn "creating from new [$species]";
- # my $db_stub = qq{mysql -u $write_user -p}. $write_pass . qq{ -h $host -P $port};
- # # my $db_stub = "mysql -uadmin -pMotxY157 -hmysql-eg-live-1.ebi.ac.uk -P4159";
- # my $db_create_st = $db_stub . " -e 'create database IF NOT EXISTS $user_db_name'";
- # print 'Stub is:' . $db_create_st, "\n";
- my $dbh = get_dbh_from_species_type($species, 'core');
-
-
- $dbh->do("create database $user_db_name");
-
- my $userdata_dbh = get_dbh_from_species_type($species,'userdata');
-
- #load the schema from the DDL
-
- map { $userdata_dbh->do($_);} @table_create_statments;
-
- # need to populate the analysis,meta,meta_coord and coord_system
- # tables with data. This script does this by copying the data from
- # the relevant tables on the core DB.
- my @tables = qw( analysis meta meta_coord coord_system);
- my $core_db_name = $SPECIES_DEFS->get_config($species, 'databases')->{'DATABASE_CORE'}->{'NAME'};
-
- foreach my $table (@tables){
- # copy meta table
- $userdata_dbh->do("create table if not exists $user_db_name.$table select * from $core_db_name.$table");
- }
-
-}
-
-
-sub calculate_applicable_patches {
-
- my $species = shift;
-
- my $userdata_dbh = get_dbh_from_species_type($species,'userdata');
- my $core_dbh = get_dbh_from_species_type($species, 'core');
-
-
- # look in the meta table, find out what the schema version is
- # look in patch directory and choose the right ones
- # return an array of patch file paths in the order they must be applied
-
-
- # work out the patch dir relative to this one
-
- my $patch_dir = "$Bin/../../../ensembl/sql";
-
- my $current_core_schema = $core_dbh->selectrow_array(qq{select meta_value from meta where meta_key = 'schema_version'});
- my $user_db_schema_version = $userdata_dbh->selectrow_array(qq{select meta_value from meta where meta_key = 'schema_version'});
-
-
- my @found = File::Find::Object::Rule->file()->name( '*.sql' )->in( $patch_dir);
- die "didn't find any Patch files in dir:'$patch_dir'\n" unless @found > 0;
-
- my @schema_range = ($user_db_schema_version + 1)..$current_core_schema;
- my $regex;
- map {$regex .= "patch_$_|"} @schema_range;
- chop $regex;
- my @patch_list = grep/$regex/, @found;
- return \@patch_list;
-
-
-}
-
-
-
-
-__END__
-
-=head1 NAME - create_user_data_dbs
-
-=head1 AUTHOR
-
-Stephen Keenan B<email> keenan@ebi.ac.uk
-
22 utils/eg_ebi_search_dump.ini
@@ -1,22 +0,0 @@
-# mysql -uadmin -piPBi22yI -hmysql-eg-production-1 -P4161
-
-#$user='admin';
-#$pass='iPBi22yI';
-#$port='4161';
-#$host='mysql-eg-production-1.ebi.ac.uk';
-#$dir = '/nas/panda/ensembl/indexingScratch/multispecies';
-
-#$user='admin';
-#$pass='tGc3Vs2O';
-#$port='4126';
-#$host='mysql-eg-devel-1.ebi.ac.uk';
-#$dir = '/nas/panda/ensembl/indexingScratch';
-
-#mysql -uensrw -pwrit3r -hmysql-eg-staging-1 -P4159 ensembl_blast
-$user='ensrw';
-$pass='writ3r';
-$port='4159';
-$host='mysql-eg-staging-1.ebi.ac.uk';
-$dir = '/nfs/ensembl/scratch/indexing';
-
-
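The replacement search_dump.pl keeps this -inifile mechanism: the file is read and eval'd as Perl before the command-line options are parsed, so command-line values take precedence. A minimal configuration file might therefore look like the sketch below; the hostname, port and credentials are placeholders for illustration, not values taken from this repository.

    # example_search_dump.ini (placeholder values only)
    $user = 'ensro';
    $pass = '';
    $port = 4159;
    $host = 'mysql-host.example.org';
    $dir  = '/scratch/ebeye_xml';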
1,815 utils/eg_ebi_search_dump.pl
@@ -1,1815 +0,0 @@
-#!/usr/local/bin/perl
-# Copyright [2009-2014] EMBL-European Bioinformatics Institute
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-package ebi_search_dump;
-
-use strict;
-use DBI;
-use Carp;
-use Getopt::Long;
-use IO::Zlib;
-use FindBin qw($Bin);
-use File::Basename qw( dirname );
-use Data::Dumper;
-
-
-my (
- $host, $user, $pass, $port, $species, $ind,
- $release, $max_entries, $nogzip, $parallel, $dir, $inifile, $genomic_unit, @SPECIES
-);
-
-# Load libraries needed for reading config
-# -----------------------------------
-BEGIN{
- unshift @INC, "$Bin/../../../conf";
- unshift @INC, "$Bin/../../..";
- eval{ require SiteDefs };
- if ($@){ die "Can't use SiteDefs.pm - $@\n"; }
- map{ unshift @INC, $_ } @SiteDefs::ENSEMBL_LIB_DIRS;
-}
-
-use Bio::EnsEMBL::DBSQL::DBAdaptor;
-use Bio::EnsEMBL::DBLoader;
-use Bio::SeqIO;
-use utils::Tool;
-use EnsEMBL::Web::SpeciesDefs;
-
-
-
-
-my %rHash = map { $_ } @ARGV;
-if ( $inifile = $rHash{'-inifile'} ) {
- my $icontent = `cat $inifile`;
- warn $icontent;
- eval $icontent;
-}
-
-GetOptions(
- "host=s", \$host, "port=i", \$port,
- "user=s", \$user, "pass=s", \$pass,
- "species=s", \$species, "release=s", \$release,
- "index=s", \$ind, "nogzip!", \$nogzip,
- "max_entries=i", \$max_entries, "parallel", \$parallel,
- "dir=s", \$dir, "help", \&usage,
- "inifile=s", \$inifile
-);
-
-$species ||= 'ALL';
-
-@SPECIES = split ',', $species;
-
-# Get Species List from SiteDefs
-if( @SPECIES ) {
- @SPECIES = @{ utils::Tool::check_species(\@SPECIES) };
-} else {
- @SPECIES = @{ utils::Tool::all_species()};
-}
-
-$dir ||= ".";
-$release ||= 'LATEST';
-#$port ||= 3306;
-
-
-my $SPECIES_DEFS = EnsEMBL::Web::SpeciesDefs->new();
-$host ||= $SPECIES_DEFS->DATABASE_HOST;
-$port ||= $SPECIES_DEFS->DATABASE_HOST_PORT;
-$user ||= 'ensro';
-
-
-usage() and exit unless ($release && $ind);
-
-my $entry_count;
-my $global_start_time = time;
-my $total = 0;
-my $FAMILY_DUMPED;
-
-my $fh;
-## HACK 1 - if the INDEX is set to all grab all dumper methods...
-my @indexes = split ',', $ind;
-@indexes = map { /dump(\w+)/ ? $1 : () } keys %ebi_search_dump::
- if $ind eq 'ALL';
-
-warn Dumper \@indexes;
-
-my $dbHash = get_databases();
-warn Dumper $dbHash;
-
-#warn Dumper $dbcHash;
-
-warn "SPECIES : ", Dumper @SPECIES;
-
-
-foreach my $species ( @SPECIES ) {
-warn $species;
-$species = lc($species);
- my $conf = $dbHash->{$species};
- foreach my $index (@indexes) {
-
- my $function = "dump$index";
- no strict "refs";
-
- $species =~ s/_/ /g;
- if ($index ne 'Family'){
- &$function( ucfirst($species), $conf);
- print $function,"\n";
- } elsif ($index eq 'Family' && !$FAMILY_DUMPED) {
- &dumpFamily($conf);
-
- }
-
- }
-
-}
-
-print_time($global_start_time);
-warn " Dumped $total entries ...\n";
-
-# -------------------------------------------------------------------------------
-
-sub text_month {
-
- my $m = shift;
-
- my @months = qw[JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC];
-
- return $months[$m];
-
-}
-
-# -------------------------------------------------------------------------------
-
-sub print_time {
-
- my $start = shift;
-
- my $t = time - $start;
- my $s = $t % 60;
- $t = ( $t - $s ) / 60;
- my $m = $t % 60;
- $t = ( $t - $m ) / 60;
- my $h = $t % 60;
-
- print "Time taken: " . $h . "h " . $m . "m " . $s . "s\n";
-
-}
-
-#------------------------------------------------------------------------------------------------
-sub usage {
- print <<EOF; exit(0);
-
-Usage: perl $0 <options>
-
- -host Database host to connect to. Defaults to ens-staging.
- -port Database port to connect to. Defaults to 3306.
- -species Species name. Defaults to ALL.
- -index Index to create. Defaults to ALL.
- -release Release of the database to dump. Defaults to 'latest'.
- -user Database username. Defaults to ensro.
- -pass Password for user.
- -dir Directory to write output to. Defaults to /lustre/scratch1/ensembl/gp1/xml.
- -nogzip Don't compress output as it's written.
- -help This message.
- -inifile First take the arguments from this file. Then overwrite with what is provided in the command line
-
-EOF
-
-}
-
-sub get_databases {
-
- my ( $dbHash, $dbcHash );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
-
- my $db = DBI->connect( $dsn, $user, $pass );
-
- warn "DSN: $dsn";
-
- my @dbnames =
- map { $_->[0] } @{ $db->selectall_arrayref("show databases") };
-
- $db->disconnect();
- warn Dumper \@dbnames;
-
- my $latest_release = 0;
- my ( $db_species, $db_release, $db_type );
- my $compara_hash;
- for my $dbname (@dbnames) {
- if ( ( $db_species, $db_type, $db_release ) =
- $dbname =~ /^([a-z]+_[a-z0-9]+)_([a-z]+)_(\d+)_\w+$/ )
- {
-
- warn join ' * ', $db_species, $db_type, $db_release, $species, "\n";
-
- next if ( $species ne 'ALL' ) && ( $db_species ne lc($species) );
- $latest_release = $db_release if ( $db_release > $latest_release );
- $dbHash->{$db_species}->{$db_type}->{$db_release} = $dbname;
-
- }
- if ( ($db_release) = $dbname =~ /ensembl_compara_(\d+)/ ) {
-
- # N.B. Re: COMPARA, for now using
- # ensembl_compara_VERSION. Others will follow
- $compara_hash->{$db_release} = $dbname;
- }
-
- }
-
- map { $dbHash->{$_}->{'compara'} = $compara_hash } keys %$dbHash;
- $release = $latest_release if ( $release eq 'LATEST' );
-
- return $dbHash;
-
-}
-
-sub footer {
- my ($ecount) = @_;
- p("</entries>");
- p("<entry_count>$ecount</entry_count>");
- p("</database>");
-
- print "Dumped $ecount entries\n";
- if ($nogzip) {
- close(FILE) or die $!;
- }
- else {
- $fh->close();
- }
- $total += $ecount;
-}
-
-sub header {
- my ( $dbname, $dbspecies, $dbtype ) = @_;
-
- p("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>");
- p("<!DOCTYPE database [ <!ENTITY auml \"&#228;\">]>");
- p("<database>");
- p("<name>$dbname</name>");
- p("<description>Ensembl $dbspecies $dbtype database</description>");
- p("<release>$release</release>");
- p("");
- p("<entries>");
-}
-
-sub p {
- my ($str) = @_;
-
- # TODO - encoding
- $str .= "\n";
- if ($nogzip) {
- print FILE $str or die "Can't write to file ", $!;
- }
- else {
- print $fh $str or die "Can't write string: $str";
- }
-}
-
-sub format_date {
- my $t = shift;
-
- my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ];
- $y += 1900;
- $d = "0" . $d if ( $d < 10 );
- my $mm = text_month($m);
- return "$d-$mm-$y";
-}
-
-sub format_datetime {
- my $t = shift;
-
- my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ];
- $y += 1900;
- $d = "0" . $d if ( $d < 10 );
- my $ms = text_month($m);
- return sprintf "$d-$ms-$y %02d:%02d:%02d", $hh, $mm, $ss;
-}
-
-sub dumpFamily {
- my ( $conf ) = @_;
-
- my $FAMDB = $conf->{'compara'}->{$release} or next;
-
-
-# my $dbname = $conf->{'core'}->{$release} or next;
-
- my $file = "$dir/Family_all_species_core_$FAMDB.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $FAMDB to $file ... ", format_datetime($start_time), "\n";
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die("Can't open compressed stream to $file: $!");
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: $!";
- }
- header( $FAMDB, 'compara_all_species', $FAMDB );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
- my $ecount;
- my $dbh = DBI->connect( "$dsn:$FAMDB", $user, $pass ) or die "DBI::error";
-
-
-
-# my $CORE = $conf->{'core'}->{$release};
-# my $t_sth = $dbh->prepare( qq{select meta_value from $CORE.meta where meta_key='species.taxonomy_id'});
-# $t_sth->execute;
-# my $taxon_id = ( $t_sth->fetchrow );
-
-# return unless $taxon_id;
-
- $dbh->do("SET SESSION group_concat_max_len = 100000");
- my $sth = $dbh->prepare(
-qq{ select f.family_id as id, f.stable_id as fid , f.description, group_concat(m.stable_id, unhex('1D') ,m.source_name) as IDS
-from $FAMDB.family as f, $FAMDB.family_member as fm, $FAMDB.member as m
- where fm.family_id = f.family_id and fm.member_id = m.member_id group by fid}
- );
- $sth->execute;
- foreach my $xml_data ( @{ $sth->fetchall_arrayref( {} ) } ) {
-
- my @bits = split /,/, delete $xml_data->{IDS};
- map { push @{ $xml_data->{IDS} }, [ split /\x1D/ ] } @bits;
-# $xml_data->{species} = $dbspecies;
- $xml_data->{species} = '';
- p familyLineXML($xml_data);
-
- }
-
- footer( $sth->rows );
- $FAMILY_DUMPED = 1;
-}
-
-sub familyLineXML {
- my ( $xml_data ) = @_;
-
- my $members = scalar @{ $xml_data->{IDS} };
-
-
- my $description = $xml_data->{description};
-
- $description =~ s/</&lt;/g;
- $description =~ s/>/&gt;/g;
- $description =~ s/'/&apos;/g;
- $description =~ s/&/&amp;/g;
-
-
-
-
- my $xml = qq{
-<entry id="$xml_data->{fid}">
-<name>$xml_data->{fid}</name>
- <description>$description</description>
- <cross_references>} .
- (
- join "",
- (
- map {
- qq{
- <ref dbname="$1" dbkey="$_->[0]"/>} if $_->[1] =~ /(Uniprot|ENSEMBL).*/
- } @{ $xml_data->{IDS} }
- )
- )
- .
- qq{
- </cross_references>
- <additional_fields>
- <field name="familymembers">$members</field>
- <field name="species">$xml_data->{species}</field>
- <field name="featuretype">Ensembl_protein_family</field>
- </additional_fields>
-</entry>};
- return $xml;
-
-}
-
-sub dumpGene {
-warn "in dumpGene";
- my ( $dbspecies, $conf) = @_;
-
- foreach my $DB ( 'core', 'vega' ) {
-# foreach my $DB ( 'core' ) {
- my $counter = make_counter(0);
- my $SNPDB = eval {$conf->{variation}->{$release}};
- my $DBNAME = $conf->{$DB}->{$release}
- or warn "$dbspecies $DB $release: no database not found";
- next unless $DBNAME;
-
- print "START... $DB";
- my $file = "$dir/Gene_$DBNAME.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die("Can't open compressed stream to $file: $!");
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: $!";
- }
- header( $DBNAME, $dbspecies, $DB );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
-
- warn "Dumping $DBNAME to $file ... ", format_datetime($start_time),
- "\n";
- my $extra = $DB ne 'core' ? ";db=$DB" : '';
-
- my $dbh = DBI->connect( "$dsn:$DBNAME", $user, $pass )
- or die "DBI::error";
-
- # determine genomic unit
- my $division = $dbh->selectrow_array("SELECT meta_value FROM meta WHERE meta_key = 'species.division'");
- (my $genomic_unit = $division) =~ s/^ensembl//; # eg EnsemblProtists -> protists
- warn "Genomic unit [$genomic_unit]";
- die "Genomic unit not found" unless $genomic_unit;
-
- # SNP query
- my $snp_sth = eval {$dbh->prepare("select distinct(vf.variation_name) from $SNPDB.transcript_variation as tv, $SNPDB.variation_feature as vf where vf.variation_feature_id = tv.variation_feature_id and tv.transcript_stable_id in(?)");};
-
- my %xrefs = ();
- my %xrefs_desc = ();
- my %disp_xrefs = ();
- foreach my $type (qw(Gene Transcript Translation)) {
- my $T = $dbh->selectall_arrayref(
- "select ox.ensembl_id,
- x.display_label, x.dbprimary_acc, ed.db_name, es.synonym, x.description
- from ($DBNAME.object_xref as ox, $DBNAME.xref as x, $DBNAME.external_db as ed) left join $DBNAME.external_synonym as es on es.xref_id = x.xref_id
- where ox.ensembl_object_type = '$type' and ox.xref_id = x.xref_id and x.external_db_id = ed.external_db_id"
- );
- foreach (@$T) {
-
- $xrefs{$type}{ $_->[0] }{ $_->[3] }{ $_->[1] } = 1 if $_->[1];
- $xrefs{$type}{ $_->[0] }{ $_->[3] }{ $_->[2] } = 1 if $_->[2];
-# $xrefs{$type}{ $_->[0] }{ $_->[3] }{ $_->[4] } = 1 if $_->[4];
- $xrefs_desc{$type}{ $_->[0] }{ $_->[5] } = 1 if $_->[5];
- if (my $syn = $_->[4]) {
- $syn =~ s/^\'|\'$//g;
- next if ($syn =~ /^(FBtr|FBpp)\d+/);
- next if ($syn =~ /^CG\d+\-/);
- $xrefs{$type}{ $_->[0] }{ $_->[3] . "_synonym" }{ $syn } = 1;
- }
-
-
- }
-
- warn "XREF $type query...";
- }
- warn "Exons query...";
- my %exons = ();
- my $T = $dbh->selectall_arrayref(
- "select distinct t.gene_id, esi.stable_id
- from transcript as t, exon_transcript as et, exon_stable_id as esi
- where t.transcript_id = et.transcript_id and et.exon_id = esi.exon_id"
- );
- foreach (@$T) {
- $exons{ $_->[0] }{ $_->[1] } = 1;
- }
- warn "Domains query...";
- my %domains;
- $T = $dbh->selectall_arrayref('
- SELECT DISTINCT g.gene_id, pf.hit_name
- FROM gene g, transcript t, translation tl, protein_feature pf
- WHERE g.gene_id = t.gene_id AND t.transcript_id = tl.transcript_id AND tl.translation_id = pf.translation_id
- ');
- foreach (@$T) {
- $domains{$_->[0]}{$_->[1]} = 1;
- }
-
- my $gene_info = $dbh->selectall_arrayref( "
- select gsi.gene_id, tsi.transcript_id, trsi.translation_id,
- gsi.stable_id as gsid, tsi.stable_id as tsid, trsi.stable_id as trsid,
- g.description, ed.db_name, x.dbprimary_acc,x.display_label, ad.display_label, ad.description, g.source, g.status, g.biotype
- from (((( $DBNAME.gene_stable_id as gsi, $DBNAME.gene as g,
- $DBNAME.transcript_stable_id as tsi,
- $DBNAME.analysis_description as ad,
- $DBNAME.transcript as t) left join
- $DBNAME.translation as tr on t.transcript_id = tr.transcript_id) left join
- $DBNAME.translation_stable_id as trsi on tr.translation_id = trsi.translation_id) left join
- $DBNAME.xref as x on g.display_xref_id = x.xref_id) left join
- $DBNAME.external_db as ed on ed.external_db_id = x.external_db_id
- where t.gene_id = gsi.gene_id and t.transcript_id = tsi.transcript_id and t.gene_id = g.gene_id
- and g.analysis_id = ad.analysis_id
- order by gsi.stable_id, tsi.stable_id;
- " );
- warn "Gene query...";
-
- my %hash = map { $_->[0] } @$gene_info;
- my $ecount = scalar keys %hash, "\n\n";
-
- my %old;
-
- foreach my $row (@$gene_info) {
-
- # g = gene_id, t = transcript_id, tr = translation_id,
- # gs = gene_stable_id, ts = transcript_stable_id, trs =
- # translation_stable_id, d = description,
- # ddb = external_db_display_name,
- # dpa = xref_primary_accession,
- # dn = xref display_label,
- # a = analysis_description display_label,
- # ad = analysis_description description,
- # s = gene.source, st = gene.status, bt = gene.biotype
-
- my (
- $gene_id, $transcript_id,
- $translation_id, $gene_stable_id,
- $transcript_stable_id, $translation_stable_id,
- $gene_description, $extdb_db_display_name,
- $xref_primary_acc, $xref_display_label,
- $analysis_description_display_label, $analysis_description,
- $gene_source, $gene_status,
- $gene_biotype
- ) = @$row;
- if ( $old{'gene_id'} != $gene_id ) {
- if ( $old{'gene_id'} ) {
-
- if ($SNPDB && $DB eq 'core') {
- my @transcript_stable_ids = keys %{$old{transcript_stable_ids}};
- $snp_sth->execute("@transcript_stable_ids");
- my $snps = $snp_sth->fetchall_arrayref;
- $old{snps} = $snps;
- }
-
-
-# my @transcript_stable_ids = keys $old{'transcript_ids'};
-# $snp_sth->execute(@transcript_stable_ids);
-# my $snps = $snp_sth->fetchall_arrayref;
-# die Dumper($snps);
- p geneLineXML( $dbspecies, \%old, $counter, $genomic_unit );
-
- }
- %old = (
- 'gene_id' => $gene_id,
- 'gene_stable_id' => $gene_stable_id,
- 'description' => $gene_description,
- 'translation_stable_ids' => {
- $translation_stable_id ? ( $translation_stable_id => 1 )
- : ()
- },
- 'transcript_stable_ids' => {
- $transcript_stable_id ? ( $transcript_stable_id => 1 )
- : ()
- },
- 'transcript_ids' => {
- $transcript_id ? ( $transcript_id => 1 )
- : ()
- },
- 'exons' => {},
- 'domains' => {},
- 'external_identifiers' => {},
- 'alt' => $xref_display_label
- ? "($analysis_description_display_label: $xref_display_label)"
- : "(novel gene)",
- 'ana_desc_label' => $analysis_description_display_label,
- 'ad' => $analysis_description,
- 'source' => ucfirst($gene_source),
- 'st' => $gene_status,
- 'biotype' => $gene_biotype
- );
- $old{'source'} =~ s/base/Base/;
-
- $old{'exons'} = $exons{$gene_id};
- foreach my $K ( keys %{ $exons{$gene_id} } ) {
- $old{'i'}{$K} = 1;
- }
- $old{'domains'} = $domains{$gene_id};
-
- foreach my $db ( keys %{ $xrefs{'Gene'}{$gene_id} || {} } ) {
- foreach my $K ( keys %{ $xrefs{'Gene'}{$gene_id}{$db} } ) {
- $old{'external_identifiers'}{$db}{$K} = 1;
-
- }
- }
- foreach my $db (
- keys %{ $xrefs{'Transcript'}{$transcript_id} || {} } )
- {
- foreach my $K (
- keys %{ $xrefs{'Transcript'}{$transcript_id}{$db} } )
- {
- $old{'external_identifiers'}{$db}{$K} = 1;
-
- }
- }
- foreach my $db (
- keys %{ $xrefs{'Translation'}{$translation_id} || {} } )
- {
- foreach my $K (
- keys %{ $xrefs{'Translation'}{$translation_id}{$db} } )
- {
- $old{'external_identifiers'}{$db}{$K} = 1;
- }
- }
-
- }
- else {
- $old{'transcript_stable_ids'}{$transcript_stable_id} = 1;
- $old{'transcript_ids'}{$transcript_id} = 1;
- $old{'translation_stable_ids'}{$translation_stable_id} = 1;
-
- foreach my $db (
- keys %{ $xrefs{'Transcript'}{$transcript_id} || {} } )
- {
- foreach my $K (
- keys %{ $xrefs{'Transcript'}{$transcript_id}{$db} } )
- {
- $old{'external_identifiers'}{$db}{$K} = 1;
- }
- }
- foreach my $db (
- keys %{ $xrefs{'Translation'}{$translation_id} || {} } )
- {
- foreach my $K (
- keys %{ $xrefs{'Translation'}{$translation_id}{$db} } )
- {
- $old{'external_identifiers'}{$db}{$K} = 1;
-
- }
- }
- }
- }
-
-
-# my @transcript_stable_ids = keys %$old{'transcript_ids'};
-# $snp_sth->execute(@transcript_ids);
-# my $snps = $snp_sth->fetchall_arrayref;
-# die Dumper($snps);
-
-
-
-# $snp_sth->execute(keys %$old{transcript_ids});
-# my $snps = $snp_sth->fetchall_arrayref($snp_sth);
- if ($SNPDB && $DB eq 'core') {
- my @transcript_stable_ids = keys %{$old{transcript_stable_ids}};
- $snp_sth->execute("@transcript_stable_ids");
- my $snps = $snp_sth->fetchall_arrayref;
- $old{snps} = $snps;
- }
-
- p geneLineXML( $dbspecies, \%old, $counter, $genomic_unit );
-
- footer( $counter->() );
- warn "FINISHED...... genes $DB ...";
-
- }
-
-}
-
-sub geneLineXML {
- my ( $species, $xml_data, $counter, $genomic_unit ) = @_;
-
- return warn "gene id not set" if $xml_data->{'gene_stable_id'} eq '';
- die "genomic_unit not passed to geneLineXML" unless $genomic_unit;
-
- my $gene_id = $xml_data->{'gene_stable_id'};
- my $altid = $xml_data->{'alt'} or die "altid not set";
- my $transcripts = $xml_data->{'transcript_stable_ids'}
- or die "transcripts not set";
-
- my $snps = $xml_data->{'snps'};
-
- my $peptides = $xml_data->{'translation_stable_ids'}
- or die "peptides not set";
- my $exons = $xml_data->{'exons'} or warn "!!!! exons not set for gene_id: $gene_id";
- my $domains = $xml_data->{'domains'};# or warn "!!!! domains not set for gene_id: $gene_id"; <-- not always present?
- my $external_identifiers = $xml_data->{'external_identifiers'}
- or die "external_identifiers not set";
- my $description = $xml_data->{'description'};
- my $type = $xml_data->{'source'} . ' ' . $xml_data->{'biotype'}
- or die "problem setting type";
-
- my $exon_count = scalar keys %$exons;
- my $domain_count = scalar keys %$domains;
- my $transcript_count = scalar keys %$transcripts;
- $description =~ s/</&lt;/g;
- $description =~ s/>/&gt;/g;
- $description =~ s/'/&apos;/g;
- $description =~ s/&/&amp;/g;
-
- $gene_id =~ s/</&lt;/g;
- $gene_id =~ s/>/&gt;/g;
-
- $altid =~ s/</&lt;/g;
- $altid =~ s/>/&gt;/g;
-
- my $xml = qq{
- <entry id="$gene_id">
- <name>$gene_id $altid</name>
- <description>$description</description>};
-
- my $cross_references = qq{
- <cross_references>};
-
- foreach my $ext_db_name ( keys %$external_identifiers ) {
- if ($ext_db_name =~ /(Uniprot|GO|Interpro|Medline|Sequence_Publications|EMBL)/) {
-
- map { $cross_references .= qq{
- <ref dbname="$1" dbkey="$_"/>}; } keys %{ $external_identifiers->{$ext_db_name} }
-
- } else {
- foreach my $key (keys %{ $external_identifiers->{$ext_db_name} }) {
- $key =~ s/</&lt;/g;
- $key =~ s/>/&gt;/g;
- $key =~ s/&/&amp;/g;
- $ext_db_name =~s/^Ens*/ENSEMBL/;
- $cross_references .= qq{
- <ref dbname="$ext_db_name" dbkey="$key"/>};
- }
-
- }
- }
-
- $cross_references .= (
- join "",
- (
- map {
- qq{
- <ref dbname="ensemblvariation" dbkey="$_->[0]"/>}
- } @$snps
- )
- );
-
- $cross_references .= qq{</cross_references>};
-
-my $sp = $species =~ /1163/ ? 'Aspergillus fumigatus A1163' : $species;
-
- my $additional_fields .= qq{
- <additional_fields>
- <field name="species">$sp</field>
- <field name="featuretype">Gene</field>
- <field name="source">$type</field>
- <field name="genomic_unit">$genomic_unit</field>
- <field name="transcript_count">$transcript_count</field> }
-
- . (
- join "",
- (
- map {
- qq{
- <field name="transcript">$_</field>}
- } keys %$transcripts
- )
- )
-
- . qq{ <field name="exon_count">$exon_count</field> }
-
- . (
- join "",
- (
- map {
- qq{
- <field name="exon">$_</field>}
- } keys %$exons
- )
- )
- . qq{ <field name="domain_count">$domain_count</field> }
-
- . (
- join "",
- (
- map {
- qq{
- <field name="domain">$_</field>}
- } keys %$domains
- )
- )
-
- . (
- join "",
- (
- map {
- qq{
- <field name="peptide">$_</field>}
- } keys %$peptides
- )
- )
- . qq{
- </additional_fields>};
-
-
-
- $counter->();
- return $xml . $cross_references . $additional_fields . '</entry>';
-
-}
-
-
-
-
-
-
-sub dumpGenomicAlignment {
-warn "in dump Genomic";
- my ( $dbspecies, $conf ) = @_;
-
- # warn Dumper $conf;
-
- warn "\n", '*' x 20, "\n";
- my %tables = (
- 'dna_align_feature' => [ 'DnaAlignFeature', 'DNA alignment feature' ],
- 'protein_align_feature' =>
- [ 'ProteinAlignFeature', 'Protein alignment feature' ]
- );
- my $ecount;
-
- foreach my $db ( 'core', 'cdna', 'otherfeatures' ) {
-
- my $ecount = 0;
- my $DB_NAME = $conf->{$db}->{$release} or next;
- my $file = "$dir/GenomicAlignment_$DB_NAME.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $DB_NAME to $file ... ", format_datetime($start_time),
- "\n";
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die "Can't open compressed stream to $file: ", $!;
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: ", $!;
- }
-
- header( $DB_NAME, $dbspecies, $db );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
-
- my $dbh = DBI->connect( "$dsn:$DB_NAME", $user, $pass )
- or die "DBI::error";
- foreach my $table ( keys %tables ) {
- my $source = $tables{$table}[0];
-
- # $source .= ";db=$db" unless $db eq 'core';\
-
-
-
- # Due to the sheer number of features - generating this
- # dump causes temp table to be written on the mysqld
- # filesys. /tmp space can be easily filled.
- # Have split the feature fetching to happen by an
- # analysis_id at a time. The changes below gave a x3
- # speed up on XML dumping as compared to fetching all
- # features in one query at once.
-
- # make a lookup for the analysis display labels.
- my $type = $tables{$table}[1];
- my $logic_name_lookup = $dbh->selectall_hashref(
- "select a.analysis_id, a.logic_name
- from $DB_NAME.analysis as a
- ", [qw(analysis_id)]
- ) or die $DBI::Err;
-
- my $display_label_lookup = $dbh->selectall_hashref(
- "select ad.analysis_id, ad.display_label
- from $DB_NAME.analysis_description as ad
- ", [qw(analysis_id)]
- ) or die $DBI::Err;
-
- my $sth = $dbh->prepare(
- "select t.analysis_id,t.hit_name, count(*) as hits
- from $DB_NAME.$table as t where t.analysis_id = ?
- group by t.analysis_id, t.hit_name"
- ) or die $DBI::Err;
-
- foreach my $ana_id (
- @{$dbh->selectall_arrayref("select distinct distinct(analysis_id) from $DB_NAME.$table")
- }
- )
- {
-
- my $adesc =
- ( $display_label_lookup->{ $ana_id->[0] }->{display_label}
- || $logic_name_lookup->{ $ana_id->[0] }->{logic_name} );
-
- $sth->execute( $ana_id->[0] ) or die $DBI::Err;
-
- my $rows = []; # cache for batches of rows
- while (
- my $row = (
- shift(@$rows) || # get row from cache,
- # or reload cache:
- shift(
- @{
- $rows =
- $sth->fetchall_arrayref( undef, 100_000 )
- || []
- }
- )
- )
- )
- {
-
- my $hid = $row->[1];
- my $count = $row->[2];
-
- my $xml = qq{
- <entry id="$hid">
- <additional_fields>
- <field name="species">$dbspecies</field>
- <field name="featuretype">$source</field>
- <field name="db">$db</field>
- <field name="genome_hits">$count</field>
- <field name="adesc">$adesc</field>
- </additional_fields>
- </entry>};
- p($xml);
-
- }
-
- $ecount += $sth->rows;
-
- }
-
- }
- print_time($start_time);
- footer($ecount);
- }
-}
-
-
-sub dumpMarker {
- warn "in dump MArker";
- my ( $dbspecies, $conf ) = @_;
-
- # my $xml_data;
- # $xml_data->{species} = $dbspecies;
-
- my $db = 'core';
- my $dbname = $conf->{$db}->{$release} or next;
- my $file = "$dir/Marker_$dbname.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die("can't open compressed stream to $file: $!");
- }
- else {
- open( file, ">$file" ) || die "can't open $file: $!";
- }
- header( $dbname, $dbspecies, $db );
- my $dsn = "dbi:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
- my $ecount;
- my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error";
-
-# my $sth = $dbh->prepare(q{
-# SELECT @rownum := @rownum+1 AS rownum, ms2.name as marker, ms1.name
-# from (select @rownum := 0) r, (marker_synonym as ms1, marker as m) left join
-# marker_synonym as ms2 on ms2.marker_synonym_id = m.display_marker_synonym_id
-# where ms1.marker_id = m.marker_id
-# order by m.marker_id}
-# );
-# $sth->execute( );
- my $sth = $dbh->prepare(
- q{ SELECT ms2.name as marker, ms1.name
- from (marker_synonym as ms1, marker as m) left join
- marker_synonym as ms2 on ms2.marker_synonym_id = m.display_marker_synonym_id
- where ms1.marker_id = m.marker_id
- order by m.marker_id}
- );
-
- my $data = $dbh->selectall_hashref( $sth, [ 'marker', 'name' ] );
- foreach my $marker ( keys %$data ) {
- p markerXML( $marker, $data, $dbspecies );
- }
-
- footer( scalar keys(%$data) );
-
-}
-
-sub markerXML {
- my ( $marker, $xml_data, $species ) = @_;
-
- my $xml;
-
- my @keys = keys %{ $xml_data->{$marker} };
- my $desc =
- 'A marker with '
- . scalar @keys
- . ' synonym'
- . ( scalar @keys > 1 ? 's ' : ' ' ) . '('
- . join( " ", @keys ) . ')';
-
-
- $desc =~ s/</&lt;/g;
- $desc =~ s/>/&gt;/g;
-
- $xml = qq{
-<entry id="$marker">
- <additional_fields>};
-
- foreach (@keys) {
- s/</&lt;/g;
- s/>/&gt;/g;
-
- $xml .= qq{
- <field name="synonym">$_</field>}
-
- }
- $xml .= qq{
- <field name="species">$species</field>
- <field name="featuretype">Marker</field>
- </additional_fields>
-</entry>};
-
- return $xml;
-
-}
-
-
-
-sub dumpOligoProbe {
- warn "in dump Oligo";
- my ( $dbspecies, $conf ) = @_;
-
- my $db = 'core';
- my $dbname = $conf->{$db}->{$release} or next;
- my $file = "$dir/OligoProbe_$dbname.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die("Can't open compressed stream to $file: $!");
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: $!";
- }
- header( $dbname, $dbspecies, $db );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
- my $ecount;
- my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error";
-
- my $sth = $dbh->prepare(
- "select p.probeset, count(*) as hits, a.type
- from oligo_probe as p, oligo_feature as f, oligo_array as a
- where p.oligo_probe_id = f.oligo_probe_id and p.oligo_array_id = a.oligo_array_id
- group by p.probeset"
- );
- $sth->execute();
-
-
- while ( my $rowcache = $sth->fetchall_arrayref( undef, '10_000' ) ) {
- my $xml;
- while ( my $data = shift( @{$rowcache} ) ) {
- $xml .= OligoProbeXML( $data, $dbspecies );
-
- }
- p $xml;
- }
-
- footer( $sth->rows );
-
-}
-
-sub OligoProbeXML {
- my ( $xml_data, $dbspecies ) = @_;
-
-# my $desc =qq{$xml_data->[0], $xml_data->[2] oligo probeset $xml_data->[0] hits the genome in $xml_data->[1] locations.};
-
- return qq{
-<entry id="$xml_data->[0]">
- <additional_fields>
- <field name="type">$xml_data->[2]</field>
- <field name="species">$dbspecies</field>
- <field name="featuretype">OligoProbe</field>
- <field name="genome_hits">$xml_data->[1]</field>
- </additional_fields>
-</entry>};
-
-}
-
-
-sub dumpQTL {
- warn "in dumpQTL";
-
- my ( $dbspecies, $conf ) = @_;
-
- # print Dumper($conf);
- my $xml_data;
- $xml_data->{species} = $dbspecies;
- my $db = 'core';
- my $dbname = $conf->{$db}->{$release} or next;
- my $file = "$dir/QTL_$dbname.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die("Can't open compressed stream to $file: $!");
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: $!";
- }
-
- header( $dbname, $dbspecies, $db );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
- my $ecount;
- my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error";
-
- my $sth = $dbh->prepare(
- "select c.name as chr, qf.seq_region_start, qf.seq_region_end,
- a.logic_name as analysis, q.qtl_id,
- q.trait, qs.source_database, qs.source_primary_id,
- fms1.source as fm1_source, fms1.name as fm1_name,
- fms2.source as fm2_source, fms2.name as fm2_name,
- pms.source as pm_source, pms.name as pm_name
- from ((((((seq_region as c, qtl_feature as qf, qtl_synonym as qs,
- analysis as a, qtl as q) left join marker as fm1 on
- fm1.marker_id = q.flank_marker_id_1) left join marker_synonym as fms1 on
- fm1.display_marker_synonym_id = fms1.marker_synonym_id) left join marker as fm2 on
- fm2.marker_id = q.flank_marker_id_2) left join marker_synonym as fms2 on
- fm2.display_marker_synonym_id = fms2.marker_synonym_id) left join marker as pm on
- pm.marker_id = q.peak_marker_id) left join marker_synonym as pms on
- pm.display_marker_synonym_id = pms.marker_synonym_id
- where c.seq_region_id = qf.seq_region_id and qs.qtl_id = q.qtl_id and
- qf.analysis_id = a.analysis_id and qf.qtl_id = q.qtl_id
- "
- );
- $sth->execute();
- my $desc = '';
- my $old_qtl = 0;
- my $old_ID = '';
- my $old_pos = '';
- my $counter = make_counter(0);
- while ( my $T = $sth->fetchrow_hashref() ) {
-
- if ( $T->{qtl_id} eq $old_qtl ) {
-
- # $IDS .= " $T->{source_primary_id}";
- $desc .= " $T->{source_database}:$T->{source_primary_id}";
-
- $xml_data->{cross_ref}->{ $T->{source_database} } =
- $T->{source_primary_id};
-
- # $xml_data->{source_primary_id} = $T->{source_primary_id};
- # print Dumper($T);
- }
- else {
- $xml_data->{pm_name} = $T->{pm_name};
- $old_pos =
- "$T->{chr}:"
- . ( $T->{seq_region_start} - 1e4 ) . '-'
- . ( $T->{seq_region_end} + 1e4 );
- $desc = "QTL exhibiting '$T->{trait}' has ";
- my $f2 = $T->{pm_name} ? 1 : 0;
- if ( $T->{fm1_name} || $T->{fm2_name} ) {
- my $f1 = ( $T->{fm1_name} ) && ( $T->{fm2_name} ) ? 1 : 0;
-
- $desc .=
- 'flanking marker'
- . ( $f1 ? 's ' : ' ' )
- . $T->{fm1_name}
- . ( $f1 ? ' and ' : '' )
- . $T->{fm2_name}
- . ( $f2 ? '; ' : '' );
- $xml_data->{f1} = $T->{fm1_name};
- $xml_data->{f2} = $T->{fm2_name};
-
- }
- if ($f2) {
- $desc .= "peak marker $T->{pm_name};";
- $xml_data->{pm} = $T->{pm_name};
- }
- $desc .=
- " and names: $T->{source_database}:$T->{source_primary_id}";
-
- #my $sd = $T->{source_database};
- $xml_data->{description} = $desc;
- $xml_data->{cross_ref}->{ $T->{source_database} } =
- $T->{source_primary_id};
- $old_qtl = $T->{qtl_id};
- $xml_data->{pos} = $old_pos;
- if ( $xml_data->{pm_name} ) {
- p( QTLXML( $xml_data, $counter ) );
- }
- }
- }
-
- $xml_data->{description} = $desc;
-
- if ( $xml_data->{pm_name} ) {
- p( QTLXML( $xml_data, $counter ) );
- }
- $dbh->disconnect();
- footer( $counter->() );
-}
-
-sub QTLXML {
- my ( $xml_data, $counter ) = @_;
-
- my $xml = qq{
-<entry id="$xml_data->{pm_name}">
- <name>$xml_data->{pm_name}</name>
- <description>$xml_data->{description}</description>
- <additional_fields>
- <field name="species">$xml_data->{species}</field>
- <field name="flanking">$xml_data->{f1}</field>
- <field name="flanking">$xml_data->{f2}</field>
- <field name="peak_marker">$xml_data->{pm}</field>
- <field name="pos">$xml_data->{pos}</field>
- <field name="featuretype">QTL</field>
- </additional_fields>
- <cross_references> };
-
- foreach ( keys( %{ $xml_data->{cross_ref} } ) ) {
- $xml .=
- qq{\n <ref dbname="$_" dbkey="$xml_data->{cross_ref}->{$_}"/>};
- }
-
- $xml .= qq{\n </cross_references>
-</entry>};
- $counter->();
- return $xml;
-
-}
-
-
-sub dumpSequence {
-warn "in dump Sequence";
- my ( $dbspecies, $conf ) = @_;
-
- # my $sanger = sanger_project_names( $conf );
- my $sanger = 'SANGER STUFF';
- my %config = (
- "Homo sapiens" => [
- [
- 'Clone',
- 'tilepath, cloneset_1mb, cloneset_30k, cloneset_32k',
-'name,well_name,clone_name,synonym,embl_acc,sanger_project,alt_well_name,bacend_well_name'
- ],
- [ 'NT Contig', 'ntctgs', 'name' ],
- [ 'Encode region', 'encode', 'name,synonym,description' ],
- ],
- "Mus musculus" => [
- [
- 'BAC',
- 'cloneset_0_5mb,cloneset_1mb,bac_map,tilingpath_cloneset',
- 'embl_acc,name,clone_name,well_name,synonym,alt_embl_acc'
- ],
- [ 'Fosmid', 'fosmid_map', 'name,clone_name' ],
- [ 'Supercontig', 'superctgs', 'name' ],
- ],
- "Anopheles gambiae" => [
- [ 'BAC', 'bacs', 'name,synonym,clone_name' ],
- [ 'BAC band', 'bacs_bands', 'name,synonym,clone_name' ],
- ],
- "Gallus gallus" => [
- [ 'BAC', 'bac_map', 'name,synonym,clone_name' ],
- [
- 'BAC ends', 'bacends',
- 'name,synonym,clone_name', 'otherfeatures'
- ]
- ]
- );
-
- my $dbname = $conf->{'core'}->{$release} or next;
-
- my $file = "$dir/Sequence_$dbname.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die("Can't open compressed stream to $file: $!");
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: $!";
- }
- header( $dbname, $dbspecies, 'core' );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
- my $ecount;
- my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error";
-
- my $COREDB = $dbname;
- my $ESTDB = $conf->{otherfeatures}->{$release};
-
- my @types = @{ $config{$dbspecies} || [] };
- my $ecounter;
- foreach my $arrayref (@types) {
-
- my ( $TYPE, $mapsets, $annotationtypes, $DB ) = @$arrayref;
-
- my $DB = $DB eq 'otherfeatures' ? $ESTDB : $COREDB;
- my @temp = split( ',', $mapsets );
- my @mapsets;
- foreach my $X (@temp) {
- my $ID = $dbh->selectrow_array(
- "select misc_set_id from $DB.misc_set where code = ?",
- {}, $X );
- push @mapsets, $ID if ($ID);
- }
-
- next unless @mapsets;
- @temp = split( ',', $annotationtypes );
- my @mapannotationtypes;
- foreach my $X (@temp) {
- my $ID = $dbh->selectrow_array(
- "select attrib_type_id from $DB.attrib_type where code = ?",
- {}, $X );
- push @mapannotationtypes, $ID if ($ID);
- }
- next unless @mapannotationtypes;
- my $Z = " ma.value";
- my $MAPSETS = join ',', @mapsets;
- my $sth = $dbh->prepare(
- "select mf.misc_feature_id, sr.name,
- ma.value, mf.seq_region_end-mf.seq_region_start+1 as len,
- at.code
- from $DB.misc_feature_misc_set as ms,
- $DB.misc_feature as mf,
- seq_region as sr,
- $DB.misc_attrib as ma,
- $DB.attrib_type as at
- where mf.seq_region_id = sr.seq_region_id and mf.misc_feature_id = ms.misc_feature_id and ms.misc_set_id in ($MAPSETS) and
- mf.misc_feature_id = ma.misc_feature_id and ma.attrib_type_id = at.attrib_type_id
- order by mf.misc_feature_id, at.code"
- );
- $sth->execute();
- my ( $oldtype, $old_ID, $oldchr, $emblaccs, $oldlen, $synonyms, $NAME );
-
- while ( my ( $ID, $chr, $val, $len, $type ) = $sth->fetchrow_array() ) {
-
- if ( $ID == $old_ID ) {
- $NAME = $val
- if $type eq 'well_name'
- || $type eq 'clone_name'
- || $type eq 'name'
- || $type eq 'non_ref';
- $NAME = $val if !$NAME && $type eq 'embl_acc';
- $NAME = $val if !$NAME && $type eq 'synonym';
- $NAME = $val if !$NAME && $type eq 'sanger_project';
- push @{$emblaccs}, $val if $val;
- }
- else {
- p seqLineXML(
- $dbspecies, $TYPE, $NAME, $oldchr,
- $emblaccs, $oldlen, $sanger
- ) if $old_ID;
- $NAME = undef;
- $emblaccs = undef;
- $NAME = $val
- if $type eq 'well_name'
- || $type eq 'clone_name'
- || $type eq 'name'
- || $type eq 'non_ref';
- $NAME = $val if !$NAME && $type eq 'embl_acc';
- $NAME = $val if !$NAME && $type eq 'synonym';
- $NAME = $val if !$NAME && $type eq 'sanger_project';
- $emblaccs->[0] = $val;
- ( $old_ID, $oldchr, $oldlen ) = ( $ID, $chr, $len );
- $ecounter += 1;
- }
- }
- p seqLineXML( $dbspecies, $TYPE, $NAME, $oldchr, $emblaccs, $oldlen,
- $sanger )
- if $old_ID;
- }
-
- footer($ecounter);
-
-# my $sth = $conf->{'dbh'}->prepare(
-# "select c.name, c.length, cs.name
-# from seq_region as c, coord_system as cs
-# where c.coord_system_id = cs.coord_system_id" );
-# $sth->execute();
-# while( my($name,$length,$type) = $sth->fetchrow_array() ) {
-# my $extra_IDS = ''; mysql $extra_desc = '';
-# if( %{$sanger->{$name}||{}} ) {
-# $extra_IDS = join ' ', '',sort keys %{$sanger->{$name}};
-# $extra_desc = " and corresponds to the following Sanger projects: ".join( ', ',sort keys %{$sanger->{$name}});
-# }
-# print_time O join "\t",
-# (INC_SPECIES?"$conf->{'species'}\t":"").ucfirst($type), $name,
-# ($type eq 'chromosome' && length( $name ) < 5) ?
-# "/$conf->{'species'}/mapview?chr=$name" :
-# ($length > 0.5e6 ? "/$conf->{'species'}/cytoview?region=$name" :
-# "/$conf->{'species'}/contigview?region=$name" ),
-# "$name$extra_IDS", "$name isnull a @{[ucfirst($type)]} (of length $length)$extra_desc\n";
-# }
-}
-
-sub seqLineXML {
- my ( $species, $type, $name, $chr, $val, $len, $sanger ) = @_;
-
- pop @$val;
-# my $description = "$type $name is mapped to Chromosome $chr" .
-
-# (
-# @$val > 0
-# ? ' and has '
-# . @$val
-# . " EMBL accession"
-# . (
-# @$val > 1
-# ? 's'
-# : ''
-# )
-# . "/synonym"
-# . (
-# @$val > 1
-# ? 's '
-# : ' '
-# )
-# . "@$val"
-# : ''
-# )
-
-# . " and length $len bps\n";
-
- my $xml = qq{
- <entry id="$name">
- <cross_references>}
-
- . (
- join "",
- (
- map {
- qq{
- <ref dbname="EMBL" dbkey="$_"/>}
- } @$val
- )
- )
-
- . qq{</cross_references>
- <additional_fields>
- <field name="species">$species</field>
- <field name="type">$type</field>
- <field name="chromosome">$chr</field>
- <field name="length">$len</field>
- <field name="featuretype">Genomic</field>
- </additional_fields>
- </entry>};
-
- return $xml;
-
-}
-
-
-
-
-
-sub dumpSNP {
- my ( $dbspecies, $conf ) = @_;
-
- # warn Dumper $conf;
-
- warn "\n", '*' x 20, "\n";
-
- my $COREDB = my $dbname = $conf->{'core'}->{$release};
-
-
- my $dbname = $conf->{variation}->{$release} or next;
- my $file = "$dir/SNP_$dbname.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $dbname to $file ... ", format_datetime($start_time),
- "\n";
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die "Can't open compressed stream to $file: $!";
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: $!";
- }
-
- header( $dbname, $dbspecies, $dbname );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
- my $ecount;
- my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass )
- or die "DBI::error";
- my $source_hash = $dbh->selectall_hashref(qq{SELECT source_id, name FROM source} ,[qw(source_id)]);
-
-# my $tid_to_gene = $dbh->selectall_hashref(qq{select t.transcript_id, gsi.stable_id from $COREDB.gene as g, $COREDB.gene_stable_id as gsi, $COREDB.transcript as t where gsi.gene_id = g.gene_id and t.gene_id = g.gene_id limit 10;},[qw(transcript_id)]);
-
-
-# my $sth = $dbh->prepare("select vf.variation_name, vf.source_id, group_concat(vs.source_id, ' ',vs.name), vf.variation_feature_id,vf.variation_id from variation_feature vf , transcript_variation tv
-# left join variation_synonym vs on vf.variation_id = vs.variation_id where tv.transcript_variation_id = vf.variation_feature_id group by vf.variation_id");
-
- my $sth = $dbh->prepare("select vf.variation_name, vf.source_id, group_concat(vs.source_id, ' ',vs.name), vf.consequence_type from variation_feature vf left join variation_synonym vs on vf.variation_id = vs.variation_id group by vf.variation_id");
-
-
- # my $vfi2gene_sth = $dbh->prepare(qq{select distinct(gsi.stable_id) from $COREDB.gene as g, $COREDB.gene_stable_id as gsi, $COREDB.transcript as t where gsi.gene_id = g.gene_id and t.gene_id = g.gene_id and transcript_id in
-# (select tv.transcript_id from transcript_variation tv , variation_feature vf where vf.variation_feature_id =tv.variation_feature_id and vf.variation_feature_id = ?)});
-
- $sth->execute() or die "Error:", $DBI::errstr;
-
- while (my $rowcache = $sth->fetchall_arrayref(undef, 10_000)) {
-
-
- my $xml;
- while (my $row = shift(@{$rowcache})) {
-# $vfi2gene_sth->execute($row->[3]);
-# my $gsi = $vfi2gene_sth->fetchall_arrayref;
- my $name = $row->[0];
- my @synonyms = split /,/, @$row->[2];
- my $snp_source = $source_hash->{$row->[1]}->{name};
-
-# my $description =
-# "A $snp_source SNP with "
-# . scalar @synonyms
-# . ' synonym'
-# . ( @synonyms > 1 | @synonyms < 1 ? 's ' : ' ' )
-# . ( @synonyms > 0 ? "( " . (join "", map{ map{ $source_hash->{$_->[0]}->{name} , ':', $_->[1] , ' ' } [split] } @synonyms ) . ")" : '' );
-
- $xml .= qq{<entry id="$name">
- <additional_fields>
- <field name="species">$dbspecies</field>
- <field name="featuretype">SNP</field>
- <field name="consequence">$row->[3]</field>};
-
- foreach my $syn(@synonyms) {
- my @syn_bits = split / /, $syn;
- $syn_bits[1] =~ s/:/ /;
-
- my $source = $source_hash->{$syn_bits[0]}->{name};
- $xml .= qq{
- <field name="synonym">$syn_bits[1] [source; $source]</field>};
- }
-$xml .= qq{
- </additional_fields>
-</entry>
-};
-
- }
-
- p($xml);
- }
-
- footer($sth->rows);
- print_time($start_time);
-
-}
-
-
-
-
-
-
-
-sub dumpUnmappedFeatures {
- my ( $dbspecies, $conf ) = @_;
-
- my $db = 'core';
- my $COREDB = $conf->{$db}->{$release} or next;
- my $file = "$dir/UnmappedFeature_$COREDB.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $COREDB to $file ... ", format_datetime($start_time), "\n";
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die("Can't open compressed stream to $file: $!");
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: $!";
- }
- header( $COREDB, $dbspecies, $db );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
- my $ecount;
- my $dbh = DBI->connect( "$dsn:$COREDB", $user, $pass ) or die "DBI::error";
-
- my %unmapped_queries = (
- 'None' => qq(
- select a.logic_name, e.db_display_name,
- uo.identifier, ur.summary_description,
- 'Not mapped'
- from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo,
- $COREDB.unmapped_reason as ur
- where a.analysis_id = uo.analysis_id and
- uo.external_db_id = e.external_db_id and
- uo.unmapped_reason_id = ur.unmapped_reason_id and
- uo.ensembl_id = 0
- ),
- 'Transcript' => qq(
- select a.logic_name, e.db_display_name,
- uo.identifier, ur.summary_description,
- concat( 'Transcript: ', tsi.stable_id, '; Gene: ',gsi.stable_id )
- from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo,
- $COREDB.unmapped_reason as ur, $COREDB.transcript_stable_id as tsi,
- $COREDB.transcript as t, $COREDB.gene_stable_id as gsi
- where a.analysis_id = uo.analysis_id and
- uo.external_db_id = e.external_db_id and
- uo.unmapped_reason_id = ur.unmapped_reason_id and
- uo.ensembl_id = t.transcript_id and
- uo.ensembl_object_type = 'Transcript' and
- t.transcript_id = tsi.transcript_id and
- t.gene_id = gsi.gene_id
- ),
- 'Translation' => qq(
- select a.logic_name, e.db_display_name, uo.identifier, ur.summary_description,
- concat( 'Translation: ',trsi.stable_id,'; Transcript: ', tsi.stable_id, '; Gene: ',gsi.stable_id )
- from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo,
- $COREDB.unmapped_reason as ur, $COREDB.transcript_stable_id as tsi,
- $COREDB.translation as tr, $COREDB.translation_stable_id as trsi,
- $COREDB.transcript as t, $COREDB.gene_stable_id as gsi
- where a.analysis_id = uo.analysis_id and
- uo.external_db_id = e.external_db_id and
- uo.unmapped_reason_id = ur.unmapped_reason_id and
- uo.ensembl_id = tr.translation_id and
- tr.transcript_id = t.transcript_id and
- trsi.translation_id = tr.translation_id and
- uo.ensembl_object_type = 'Translation' and
- t.transcript_id = tsi.transcript_id and
- t.gene_id = gsi.gene_id
- )
- );
- my $entry_count = 0;
- foreach my $type ( keys %unmapped_queries ) {
- my $SQL = $unmapped_queries{$type};
- my $sth = $dbh->prepare($SQL);
- $sth->execute;
- while ( my $T = $sth->fetchrow_arrayref() ) {
-
- # print join "\t", ("$species\t") . qq(Unmapped feature),
- # "$T->[1] $T->[2]",
- # "$dbspecies/featureview?type=Gene;id=$T->[2]", "$T->[2] $T->[4]",
- # "$T->[3]; $T->[4]\n";
- p unmappedFeatureXML( $T, $dbspecies )
-
- }
- $entry_count += $sth->rows
-
- }
-
- footer($entry_count);
-
-}
-
-sub unmappedFeatureXML {
- my ( $xml_data, $dbspecies ) = @_;
-
- return qq{
- <entry id="$xml_data->[2]">
- <name>$xml_data->[1] $xml_data->[2]</name>
- <description>$xml_data->[3]; $xml_data->[4]</description>
- <additional_fields>
- <field name="species">$dbspecies</field>
- <field name="featuretype">UnmappedFeature</field>
- </additional_fields>
- </entry>};
-
-}
-
-sub dumpUnmappedGenes {
- my ( $dbspecies, $conf ) = @_;
-
- my $db = 'core';
- my $dbname = $conf->{$db}->{$release} or next;
-
- my $file = "$dir/UnmappedGene_$dbname.xml";
- $file .= ".gz" unless $nogzip;
- my $start_time = time;
- warn "Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
-
- unless ($nogzip) {
- $fh = new IO::Zlib;
- $fh->open( "$file", "wb9" )
- || die("Can't open compressed stream to $file: $!");
- }
- else {
- open( FILE, ">$file" ) || die "Can't open $file: $!";
- }
- header( $dbname, $dbspecies, $db );
- my $dsn = "DBI:mysql:host=$host";
- $dsn .= ";port=$port" if ($port);
- my $ecount;
- my $dbh = DBI->connect( "$dsn:$dbname", $user, $pass ) or die "DBI::error";
-
- my $COREDB = $conf->{$db}->{$release};
-
- my %current_stable_ids = ();
- foreach my $type (qw(gene transcript translation)) {
- $current_stable_ids{$type} = {
- map { @$_ } @{
- $dbh->selectall_arrayref(
- "select stable_id,1 from $COREDB." . $type . "_stable_id"
- )
- }
- };
- }
- my $species = $dbspecies;
- my $sth = $dbh->prepare(
- qq(
- select sie.type, sie.old_stable_id, if(isnull(sie.new_stable_id),'NULL',sie.new_stable_id), ms.old_release*1.0 as X, ms.new_release*1.0 as Y
- from $COREDB.mapping_session as ms, $COREDB.stable_id_event as sie
- where ms.mapping_session_id = sie.mapping_session_id and ( old_stable_id != new_stable_id or isnull(new_stable_id) )
- order by Y desc, X desc
- )
- );
-
- $sth->execute();
- my %mapping = ();
- while ( my ( $type, $osi, $nsi ) = $sth->fetchrow_array() ) {
- next
- if $current_stable_ids{$type}{ $osi
- }; ## Don't need to cope with current IDS already searchable...
- $mapping{$type}{$osi}{$nsi} = 1;
- if ( $mapping{$type}{$nsi} ) {
- foreach ( keys %{ $mapping{$type}{$nsi} } ) {
- $mapping{$type}{$osi}{$_} = 1;
- }
- }
- }
-
- foreach my $type ( keys %mapping ) {
- $ecount += scalar keys %{ $mapping{$type} }, ' ';
-
- foreach my $osi ( keys %{ $mapping{$type} } ) {
-
- my @current_sis = ();
- my @deprecated_sis = ();
- foreach ( keys %{ $mapping{$type}{$osi} } ) {
- if ( $current_stable_ids{$_} ) {
- push @current_sis, $_;
- }
- elsif ( $_ ne 'NULL' ) {
- push @deprecated_sis, $_;
- }
- }
- if (@current_sis) {
-
- my $description =
-qq{$type $osi is no longer in the Ensembl database but it has been mapped to the following current identifiers: @current_sis}
- . (
- @deprecated_sis
- ? "; and the following deprecated identifiers: @deprecated_sis"
- : ''
- );
- p unmappedGeneXML( $osi, $dbspecies, $description, lc($type) );
-
- }
- elsif (@deprecated_sis) {
-
- my $description =
-qq($type $osi is no longer in the Ensembl database but it has been mapped to the following identifiers: @deprecated_sis);
- p unmappedGeneXML( $osi, $dbspecies, $description, lc($type) );
- }
- else {
-
- my $description =
-qq($type $osi is no longer in the Ensembl database and it has not been mapped to any newer identifiers);
- p unmappedGeneXML( $osi, $dbspecies, $description, lc($type) );
- }
- }
- }
-
- footer($ecount);
-}
-
-sub unmappedGeneXML {
- my ( $id, $dbspecies, $description, $type ) = @_;
-
- return qq{
- <entry id="$id">
- <description>$description</description>
- <additional_fields>
- <field name="species">$dbspecies</field>
- <field name="featuretype">Unmapped$type</field>
- </additional_fields>
- </entry>};
-
-}
-
-sub make_counter {
- my $start = shift;
- return sub { $start++ }
-}
-
-sub FamilyDumped {
- my $is_dumped;
- return sub { $is_dumped }
-}
975 utils/search_dump.pl
@@ -0,0 +1,975 @@
+#!/usr/local/bin/perl
+
+#
+# Dump search XML files for indexing by the EBEye search engine.
+#
+
+package ebi_search_dump;
+
+use strict;
+use DBI;
+use Carp;
+use File::Basename qw( dirname );
+use File::Find;
+use FindBin qw($Bin);
+use Getopt::Long;
+use IO::Zlib;
+use Data::Dumper;
+use HTML::Entities;
+
+BEGIN{
+ unshift @INC, "$Bin/../../conf";
+ unshift @INC, "$Bin/../..";
+ eval{ require utils::Tool };
+ if ($@){ warn "Can't use utils::Tool (required for ensemblgenomes)\n"; }
+}
+
+my (
+ $host, $user, $pass, $port, $species, $ind,
+ $release, $max_entries, $nogzip, $parallel, $dir, $inifile,
+ $nogenetrees, $novariation, $noxrefs, $skip_existing, $format, $noortholog
+);
+
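+# Pre-scan @ARGV for -inifile before GetOptions runs: the ini file is eval'd as
+# Perl, so it can pre-set any of the option variables declared above. Anything
+# given on the command line still wins because GetOptions is run afterwards.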
+my %rHash = map { $_ } @ARGV;
+if ( $inifile = $rHash{'-inifile'} ) {
+ my $icontent = `cat $inifile`;
+ warn $icontent;
+ eval $icontent;
+}
+
+GetOptions(
+ "host=s", \$host, "port=i", \$port,
+ "user=s", \$user, "pass=s", \$pass,
+ "species=s", \$species, "release=s", \$release,
+ "index=s", \$ind, "nogzip!", \$nogzip,
+ "max_entries=i", \$max_entries, "parallel", \$parallel,
+ "dir=s", \$dir, "help", \&usage,
+ "inifile=s", \$inifile,
+ "nogenetrees", \$nogenetrees,
+ "novariation", \$novariation,
+ "noxrefs", \$noxrefs,
+ "skipexisting", \$skip_existing,
+ "format=s", \$format,
+ "noortholog", \$noortholog,
+ );
+
+$format ||= 'ensembl';
+$ind ||= 'ALL';
+$dir ||= ".";
+$release ||= 'LATEST';
+
+
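+# Solr output is a restricted flavour of the dump: only the Gene index, written
+# uncompressed, with variation, gene-tree and orthologue data switched off.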
+if ($format eq 'solr') {
+ $novariation = 1;
+ $nogenetrees = 1;
+ $nogzip = 1;
+ $ind = 'Gene';
+ $noortholog = 1;
+}
+
+
+usage() and exit unless ( $host && $port && $user);
+
+## HACK 1 - if the INDEX is set to all grab all dumper methods...
+my @indexes = split ',', $ind;
+@indexes = map { /dump(\w+)/ ? $1 : () } keys %ebi_search_dump:: if $ind eq 'ALL';
+#warn Dumper \@indexes;
+
+my $dbHash = get_databases();
+print "*** No databases found ***\n" unless %{$dbHash};
+#warn Dumper $dbHash;
+
+my @datasets = split ',', $species;
+# restrict species to only those defined in the current eg site
+@datasets = @datasets ? @{ utils::Tool::check_species(\@datasets) } : @{ utils::Tool::all_species() };
+
+print "\nDatasets to process: \n " . join("\n ", @datasets) . "\n";
+
+my $entry_count;
+my $global_start_time = time;
+my $total = 0;
+my $fh;
+
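+# Main loop: look up each dataset's database configuration and dispatch the
+# requested dump routines (currently only the Gene dumper is wired up here).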
+foreach my $dataset ( @datasets ) {
+ my $conf = $dbHash->{lc($dataset)};
+
+ foreach my $index (@indexes) {
+ my $function = "dump$index";
+ no strict "refs";
+
+ $dataset =~ s/_/ /g;
+
+ if ( $index eq 'Gene' ) {
+ &$function( ucfirst($dataset), $conf );
+ }
+ }
+}
+
+print_time($global_start_time);
+warn " Dumped $total entries ...\n";
+
+#------------------------------------------------------------------------------
+
+sub text_month {
+ my $m = shift;
+ my @months = qw[JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC];
+ return $months[$m];
+}
+
+sub print_time {
+ my $start = shift;
+ my $t = time - $start;
+ my $s = $t % 60;
+ $t = ( $t - $s ) / 60;
+ my $m = $t % 60;
+ $t = ( $t - $m ) / 60;
+ my $h = $t;
+ print "Time taken: " . $h . "h " . $m . "m " . $s . "s\n";
+}
+
+sub usage {
+ print <<EOF; exit(0);
+
+Usage: perl $0 <options>
+
+ -host REQUIRED. Database host to connect to.
+ -port REQUIRED. Database port to connect to.
+ -user REQUIRED. Database username.
+ -species Species (dataset) name, or a comma-separated list. Defaults to all species configured for the site.
+ -index Index to create, or a comma-separated list. Defaults to ALL.
+ -release Release of the databases to dump. Defaults to 'LATEST' (the newest release found on the host).
+ -pass Password for user.
+ -dir Directory to write output to. Defaults to the current directory.
+ -format Output format: 'ensembl' (EB-eye XML, default) or 'solr' (Gene index only, uncompressed, no variation, gene-tree or orthologue data).
+ -nogzip Don't compress output as it's written.
+ -help This message.
+ -inifile Read arguments from this file first, then override them with anything given on the command line.
+
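+Example (placeholder host and credentials):
+
+  perl $0 -host <mysql-host> -port <port> -user <user> -pass <pass> \
+     -species <species_name> -index Gene -dir <output-dir>
+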
+EOF
+}
+
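+# Interrogate 'show databases' on the host and build a lookup keyed by species,
+# then database type (core, variation, funcgen, ...), then release. Compara
+# databases are attached to every species, and a -release of 'LATEST' is
+# resolved to the highest release number found on the server.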
+sub get_databases {
+
+ my ( $dbHash, $dbcHash );
+
+ my $dsn = "DBI:mysql:host=$host";
+ $dsn .= ";port=$port" if ($port);
+
+ my $dbh = DBI->connect( $dsn, $user, $pass );
+ my @dbnames = map { $_->[0] } @{ $dbh->selectall_arrayref("show databases") };
+ $dbh->disconnect();
+
+ my $latest_release = 0;
+ my ( $db_species, $db_release, $db_type );
+ my $compara_hash;
+ for my $dbname (@dbnames) {
+
+ if ( ($db_type, $db_release) = $dbname =~ /^ensembl_compara_(\w+)_(\d+)_\w+/ ) {
+
+ $compara_hash->{$db_type}->{$db_release} = $dbname;
+
+ } elsif ( ( $db_species, $db_type, $db_release ) = $dbname =~ /^([a-z]+_[a-z0-9]+)(?:_collection)?_([a-z]+)_(\d+)_\w+$/ ) {
+
+ $db_species =~ s/_collection$//;
+ $latest_release = $db_release if ( $db_release > $latest_release );
+ $dbHash->{$db_species}->{$db_type}->{$db_release} = $dbname;
+
+ }
+ }
+
+ map { $dbHash->{$_}->{'compara'} = $compara_hash } keys %$dbHash;
+ $release = $latest_release if ( $release eq 'LATEST' );
+
+ return $dbHash;
+}
+
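+# Close the current dump file: for the EB-eye XML format this writes the closing
+# </entries>, <entry_count> and </database> elements (Solr output has no such
+# wrapper), then closes the output stream and adds to the running total.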
+sub footer {
+ my ($ecount) = @_;
+
+ unless ($format eq 'solr') {
+ p("</entries>");
+ p("<entry_count>$ecount</entry_count>");
+ p("</database>");
+}
+ print "Dumped $ecount entries\n";
+ if ($nogzip) {
+ close(FILE) or die $!;
+ }
+ else {
+ $fh->close();
+ }
+ $total += $ecount;
+}
+
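+# Write the EB-eye XML preamble for a dump file: XML declaration, a minimal
+# DOCTYPE, and the <database> metadata block up to the opening <entries> tag.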
+sub header {
+ my ( $dbname, $dataset, $dbtype ) = @_;
+
+ p("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>");
+ p("<!DOCTYPE database [ <!ENTITY auml \"&#228;\">]>");
+ p("<database>");
+ p("<name>$dbname</name>");
+ p("<description>Ensembl Genomes $dataset $dbtype database</description>");
+ p("<release>$release</release>");
+ p("");
+ p("<entries>");
+}
+
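+# Write one line of output: to the plain FILE handle when --nogzip is set,
+# otherwise to the IO::Zlib compressed stream in $fh.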
+sub p {
+ my ($str) = @_;
+ # TODO - encoding
+ $str .= "\n";
+ if ($nogzip) {
+ print FILE $str or die "Can't write to file ", $!;
+ }
+ else {
+ print $fh $str or die "Can't write string: $str";
+ }
+}
+
+sub format_date {
+ my $t = shift;
+ my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ];
+ $y += 1900;
+ $d = "0" . $d if ( $d < 10 );
+ my $ms = text_month($m);
+ return "$d-$ms-$y";
+}
+
+sub format_datetime {
+ my $t = shift;
+ my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ];
+ $y += 1900;
+ $d = "0" . $d if ( $d < 10 );
+ my $ms = text_month($m);
+ return sprintf "$d-$ms-$y %02d:%02d:%02d", $hh, $mm, $ss;
+}
+
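+# Dump the Gene index for one dataset: look up the core (plus variation and
+# funcgen, where available) database names for the requested release, connect
+# to the core database and determine the genomic unit before dumping entries.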
+sub dumpGene {
+
+ my ( $dataset, $conf ) = @_;
+
+ #foreach my $DB ( 'core') { #, 'vega' ) {
+ my $DB = 'core';
+ my $SNPDB = $novariation ? undef : eval { $conf->{variation}->{$release} };
+ my $FUNCGENDB = eval { $conf->{funcgen}->{$release} };
+ my $DBNAME = $conf->{$DB}->{$release} or warn "$dataset $DB $release: no database found";
+ return unless $DBNAME;
+
+ print "\nSTART dumpGene\n";
+ print "Database: $DBNAME\n";
+
+ my $dsn = "DBI:mysql:host=$host";
+ $dsn .= ";port=$port" if ($port);
+
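+ # Retry the connection up to $max_attempts times, pausing 5 seconds between
+ # attempts, so a transient MySQL outage doesn't kill a long-running dump.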
+ my $dbh;
+ my $attempt = 0;
+ my $max_attempts = 100;
+ while (!$dbh and ++$attempt <= $max_attempts) {
+ eval { $dbh = DBI->connect( "$dsn:$DBNAME", $user, $pass ) };
+ warn "DBI connect error: $@" if $@;
+ if (!$dbh) {
+ warn "Failed DBI connect attempt $attempt of $max_attempts\n" if !$dbh;
+ sleep 5;
+ }
+ }
+
+ $dbh->do("SET sql_mode = 'NO_BACKSLASH_ESCAPES'"); # metazoa have backslashes in their gene names and synonyms :(
+
+ # determine genomic unit
+ my $division = $dbh->selectrow_array("SELECT meta_value FROM meta WHERE meta_key = 'species.division'");
+ (my $genomic_unit = lc($division)) =~ s/^ensembl//; # eg EnsemblProtists -> protists
+ print "Genomic unit: " . $genomic_unit . "\n";
+
+ my $genetree_lookup = $nogenetrees ? {} : get_genetree_lookup($genomic_unit, $conf);
+
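+ # Collect the ids of genes that lie on haplotype (exc_type='HAP')
+ # assembly-exception regions.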
+ my $haplotypes = $dbh->selectall_hashref(
+ "SELECT gene_id FROM gene g, assembly_exception ae WHERE g.seq_region_id=ae.seq_region_id AND ae.exc_type='HAP'",
+ [qw(gene_id)]
+ );