modules/Bio/EnsEMBL/Pipeline/Alignment/InformativeSites.pm

=head1 LICENSE

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2018] EMBL-European Bioinformatics Institute
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.

  Questions may also be sent to the Ensembl help desk at
  <http://www.ensembl.org/Help/Contact>.

=cut

=head1 NAME

Bio::EnsEMBL::Pipeline::Alignment::InformativeSites - 

=head1 SYNOPSIS


=head1 DESCRIPTION


=head1 METHODS

=cut

package Bio::EnsEMBL::Pipeline::Alignment::InformativeSites;

use warnings ;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment;
use Bio::EnsEMBL::Utils::Exception qw(throw warning info); 
use Bio::EnsEMBL::Utils::Argument qw(rearrange);

# Object preamble - inherits from Bio::Root::Object;

@ISA = qw(Bio::EnsEMBL::Root);


sub new {
  my ($class, @args) = @_;

  my $self = bless {},$class;

  my ($translatable_seqs,
      $alignments
     ) = rearrange([qw(TRANSLATABLE_SEQS
		       ALIGNMENTS
                      )],@args);

  throw("No sequences to align.")
    unless $translatable_seqs;

  $self->_seqs($translatable_seqs);

  throw("No alignments to reconcile.")
    unless (defined $alignments);

  $self->_alignments($alignments);

  throw("Ids of translatable sequences do not fully match with " . 
	"alignment ids.  This sometimes happens because one of " . 
	"the clustered genes has zero EST matches and hence no " . 
	"alignment.")
    unless $self->_check_ids;

  return $self
}

sub informative_sites {
  my $self = shift;

  my $gene_informative_sites = $self->_informative_site_strings;

  my %inform_site_aligns;

  foreach my $seq (@{$self->_seqs}) {
    # Avoid making sequences where no informative sites exist
    # ie. identical sequences.
    next 
      unless $gene_informative_sites->{$seq->display_id} =~ /\*/; 

    $inform_site_aligns{$seq->display_id} = 
      $self->_derive_informative_site_alignment(
	 $seq, 
         $gene_informative_sites->{$seq->display_id});
  }

  return \%inform_site_aligns
}


sub _seqs {
  my $self = shift;

  if (@_) {
    $self->{_seqs} = shift;

    throw("Was expecting the translatable sequences to be Bio::Seq objects.")
      unless (scalar @{$self->{_seqs}} > 0 &&
	      defined $self->{_seqs}->[0] &&
	      $self->{_seqs}->[0]->isa("Bio::Seq")); 

    foreach my $seq (@{$self->{_seqs}}){
      throw("Gaps found in translatable sequence.  These should be gap-free.")
	if $seq->seq =~ /-/;
    }
  }

  return $self->{_seqs}
}

sub _alignments {
  my $self = shift;

  if (@_) {
    $self->{_aligns} = shift;

    throw("Was expecting alignments to be " .
	  "Bio::EnsEMBL::Pipeline::Alignment objects.")
      unless (scalar @{$self->{_aligns}} > 0 &&
	      defined $self->{_aligns}->[0] &&
	      $self->{_aligns}->[0]->isa("Bio::EnsEMBL::Pipeline::Alignment"));

    $self->{_align_vs_id} = undef;
  }

  return $self->{_aligns}
}

sub _retrieve_align_by_id {
  my ($self, $id) = @_;

  unless (defined $self->{_align_vs_id}) {
    foreach my $align (@{$self->_alignments}){
      $self->{_align_vs_id}->{$align->alignment_name} = $align
    }
  }

  warning ("Alignment with identifier [$id] does not exist.")
    unless $self->{_align_vs_id}->{$id};

  return $self->{_align_vs_id}->{$id}
}

sub _check_ids {
  my $self = shift;

  my %alignment_names;
  my $verdict = 1;

  foreach my $align (@{$self->_alignments}){
    $alignment_names{$align->alignment_name}++;
  }

  foreach my $seq (@{$self->_seqs}){
    $verdict = 0 
      unless $alignment_names{$seq->display_id}
  }

  return $verdict
}

sub _informative_site_strings {
  my $self = shift;

  # Do a codon based alignment on our translatable sequences

  my $cba = 
    Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment->new(
      -genetic_code => 1);

  $cba->sequences($self->_seqs); 
  my $aligned_seqs = $cba->run_alignment;

  # Work through the length of this alignment marking sequence
  # variations/informative sites.  Informative site information
  # is stored as a long string, the same length as each UNALIGNED
  # sequence.  Informative sites are marked as '*' and uninformative
  # sites are marked as '-'

  my @seq_arrays;
  my @seq_name;
  foreach my $realigned_seq (@$aligned_seqs) {
    my @seq_array = split //, $realigned_seq->seq;
    push @seq_arrays, \@seq_array;
    push @seq_name, $realigned_seq->display_id;
  }

  my $alignment_length = length($aligned_seqs->[0]->seq);

  my %informative_site_string;
  for (my $i = 0; $i < $alignment_length; $i++) {
    my $informative_site = 0;
    my $reference_base = $seq_arrays[0]->[$i];
    for (my $j = 1; $j < scalar @seq_arrays; $j++) {
      if ($seq_arrays[$j]->[$i] ne $reference_base) {
	$informative_site = 1;
	last
      }
    }
    for (my $j = 0; $j < scalar @seq_arrays; $j++) {
      if ($informative_site && $seq_arrays[$j]->[$i] ne '-') {
	$informative_site_string{$seq_name[$j]} .= '*';
      } elsif ($seq_arrays[$j]->[$i] ne '-') {
	$informative_site_string{$seq_name[$j]} .= '-';
      }
    }
  }

  @seq_arrays = undef;

  return \%informative_site_string
}

sub _derive_informative_site_alignment {
  my ($self, $seq, $informative_sites_string) = @_;

  # Put gene sequence somewhere handy

  my $gene_string = $seq->seq;
  my $gene_length = length($gene_string);
#print STDERR ">gene seq\n" . $gene_string . "\n";
  # Take our existing external EvidenceAlignment and
  # put the evidence sequences somewhere accessible.

  my $ext_align = $self->_retrieve_align_by_id($seq->display_id);

  my %evidence_strings;
  foreach my $align_seq (@{$ext_align->fetch_AlignmentSeqs}){
    $evidence_strings{$align_seq->name} = $align_seq->seq;
  }

  my $align_length = length($evidence_strings{'genomic_sequence'});

  # Place the sequence for gene onto the identical sequence in the
  # external alignment.  Start by finding the 5 prime end of the 
  # gene and slide along the length of the alignment, matching identical
  # bases and hopping gaps in the sequence and intron.  Where
  # an informative site is apparently, splice out these bases from all
  # evidence sequences to create strings of informative sites.

  my %aligned_informative_site_strings;
  my $exon_string = $evidence_strings{'exon_sequence'};
  $exon_string =~ s/intron-truncated/----------------/g;
#print STDERR ">exon seq\n$exon_string\n";
  my $align_coord = 0;
  my $lost = 1;

  for (my $gene_coord = 0; $gene_coord < $gene_length; $gene_coord++) {
#print STDERR "Gene coord : " . $gene_coord . "\n";
    # This prone-to-breakage code loop shuffles along the alignment
    # and tries and place the unaligned gene.  As the gene sequence should be
    # base-for-base identical to the exonic sequence in the alignment (barring
    # missing UTR sequence), the code tries to place 10bp substrs of the gene
    # against the aligned exonic sequence.
    if ($lost) {
      my $gene_seed_string = substr($gene_string, $gene_coord, 10);
#print STDERR "  Gene seed : " . $gene_seed_string . "\n";
    LOST:
      while (1){
	my $exon_seed_string = substr($exon_string, $align_coord, 10);

	# Move along if we have starting gaps
	if ($exon_seed_string =~ /^(-+)/){
	  $align_coord += length($1);
	  next LOST;
	}

	# Make sure we havent run off the end of the sequence
	throw("Have run off the end of the alignment without matching the gene seed.  Bugger.")
	  if ($align_coord >= $align_length);

	# Get on with extending the exon seed, without gaps.

	$exon_seed_string =~ s/-//g;

#print STDERR "    Align coord : " . $align_coord . "  Seed : $exon_seed_string\n";
	my $increment = 0;

      EXTENTION:
	while (length ($exon_seed_string) < 8){
	  $increment++;
	  $exon_seed_string = substr($exon_string, $align_coord, 10 + $increment);

	  if ($exon_seed_string =~ /-$/){
	    $increment++;
	    $exon_seed_string =~ s/-//g;
	    next EXTENTION
	  }

	  $exon_seed_string =~ s/-//g;

	  throw("Have run off the end of the alignment without matching the gene seed.  Bugger.")
	    if ($align_coord + 10 + $increment >= $align_length);
#print STDERR "      Extending exon seed : " . $exon_seed_string . "\n";
	}
	
	if ($gene_seed_string =~ /^$exon_seed_string/) {
#print STDERR "Sequence FOUND\n";
	  $lost = 0;
	  last LOST
	}
	$align_coord++;
      }

    }

    # Sanity check
#print STDERR "Gene neighbourhood : " . substr($gene_string, $gene_coord, 10) . " Exon neighbourhood : " . substr($exon_string, $align_coord, 10) . "\n";
    throw("Gene and aligned exon sequence have unexpectedly become unaligned.")
      unless substr($gene_string, $gene_coord, 1) eq substr($exon_string, $align_coord, 1);

    # Do our work while we are still aligned
#print STDERR "Trying to get some work done.\n";
    if (substr($informative_sites_string, $gene_coord, 1) eq '*'){
#print STDERR "  Found an informative site.\n";
      foreach my $evidence_id (keys %evidence_strings) {
#	next if (($evidence_id eq 'genomic_sequence') ||
#	  ($evidence_id eq 'exon_sequence'));
	$aligned_informative_site_strings{$evidence_id} .= 
	  substr($evidence_strings{$evidence_id}, $align_coord, 1)
      }
    } 
else {
#print STDERR "Not an informative site, not interested and moving right along.\n";
}
    # Do a quick check that the next base of the gene will 
    # align to the alignment.  If there is gap, try hopping
    # over it, before assuming that we are lost again.

    my $hop = 1;
    if (($gene_coord + 1 <= $gene_length)&&
       (substr($exon_string, $align_coord + $hop, 1) ne '-')) {
#print STDERR "Next base is not a gap, hence not trying to hop it.\n";
      # Gene coord is about to be incremented in the next iteration 
      # of this loop, so had better do this to $align_coord as well.
      $align_coord++;
      next
    } else {
#print STDERR "Next base is a gap.  Trying to hop.\n";
      while (substr($exon_string, $align_coord + $hop, 1) eq '-'){
	$hop++;
#print STDERR "     Little hop.\n";
	while (substr($exon_string, $align_coord + $hop, 10) eq '-' x 10) {
	  $hop += 10;
#print STDERR "      Big hop.\n";
	  while (substr($exon_string, $align_coord + $hop, 100) eq '-' x 100) {
	    $hop += 100;
#print STDERR "      Really big hop.\n";
	  }
	}
      }
      $align_coord += $hop;
#print STDERR "Align coord after hop is $align_coord.\n";
#print STDERR "Gene neighbourhood : " . substr($gene_string, $gene_coord + 1, 10) . " Exon neighbourhood : " . substr($exon_string, $align_coord, 10) . "\n";
      my $exon_neighbourhood = substr($exon_string, $align_coord, 10);
      $exon_neighbourhood =~ s/-//g;
      if ((! substr($gene_string, $gene_coord + 1, 10) =~ /^$exon_neighbourhood/) 
	  && ($gene_coord < (length($gene_string) - 1))) {
	warning("Erk.  Have managed to become lost.");
	$lost = 1;
      }
    }
  }

#print STDERR "DONE.\n";

  # Finally, quickly make an alignment object out of the
  # informative site strings.

  my $informative_sites_alignment = 
    Bio::EnsEMBL::Pipeline::Alignment->new(
      -name => $seq->display_id);

  foreach my $evidence_id (keys %aligned_informative_site_strings) {
    my $new_is_seq = 
      Bio::EnsEMBL::Pipeline::Alignment::AlignmentSeq->new(
        -name => $evidence_id,
        -seq  => $aligned_informative_site_strings{$evidence_id});

    $informative_sites_alignment->add_sequence($new_is_seq);
  }

  return $informative_sites_alignment;
}