# Overview

Simple notebook that builds a table of reciprocal best hits (RBHs) between inferred ORFs and annotated features using the R package *orthologr*. Homologous groups are then filtered to keep only those in which at least one member is annotated in the original genome annotation (all other members being inferred ORFs). This procedure allows us to include in our analyses ORFs that are absent from the original annotation, provided they show a signal of homology to at least one feature of the original annotation.

In [1]:
## loading libraries
library(tidyverse)
library(magrittr)
library(glue)
library(bioseq)
library(orthologr)

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘magrittr’


The following object is masked from ‘package:purrr’:

    set_names


The following object is masked from ‘package:tidyr’:

    extract



Attaching package: ‘glue’


The following object is masked from ‘package:dplyr’:

    collapse




ERROR: Error in library(orthologr): there is no package called ‘orthologr’


In [None]:
# loading homologous groups and keeping only those with at least two
# sequences (n_seqs > 1)
#
# Every '.faa' file in the homologous-groups directory is read as an amino-acid
# FASTA. The result has one row per sequence, with columns:
#   homologous_group_file - path of the FASTA file the sequence came from
#   value                 - the sequence itself (column name produced by
#                           as_tibble() on the bioseq vector)
#   label                 - FASTA header truncated at the first space
#   n_seqs                - number of sequences in that FASTA file
homologous_groups_dir <- '../results/MEs_predicted_orfs_homologues/ME_f0_0taxa_algOMCL_e1_C70_S35_'

# the dot is escaped so that only a literal '.faa' extension is matched
homologous_group_files <- list.files(homologous_groups_dir,
                                     full.names = TRUE,
                                     pattern = '\\.faa$')

# fail early with a clear message instead of an obscure "n_seqs not found"
# error further down the pipeline
if (length(homologous_group_files) == 0) {
  stop(glue("no '.faa' files found in {homologous_groups_dir}"), call. = FALSE)
}

relevant_groups.tibble <- tibble(homologous_group_file = homologous_group_files) %>%
  dplyr::mutate(getting_sequence = purrr::map(homologous_group_file, ~{
    sequences <- bioseq::read_fasta(.x, type = 'AA')
    labels <- names(sequences)
    sequences %>%
      as_tibble() %>%
      dplyr::mutate(label = labels, n_seqs = length(labels))
  })) %>%
  # naming the list-column explicitly avoids the deprecation warning that
  # unnest() emits when `cols` is missing
  tidyr::unnest(cols = getting_sequence) %>%
  dplyr::filter(n_seqs > 1) %>%
  # keep only the sequence identifier (text before the first space)
  dplyr::mutate(label = label %>% str_split(' ') %>% purrr::map_chr(1))

In [None]:
# performing search for BRHs among predicted ORFs and annotated proteins in
# order to get a good correspondence table
#
# This part only pairs up the input files: one FASTA of annotated (PATRIC)
# features and one FASTA of getorf-predicted ORFs per genome, matched through
# a `tag` derived from the file name.

# listing PATRIC annotated features
# basename() is used instead of a fixed path-component index so the tag does
# not depend on how deep the results directory is
patric_annotated.tibble <- tibble(
  annotated_features = list.files('../results/ME_PATRIC_annotated_features',
                                  pattern = '\\.faa$',
                                  full.names = TRUE)
) %>%
  # NOTE(review): the unescaped dots in the tag regex also match any other
  # separator character; kept as-is because the file naming cannot be
  # verified from here
  dplyr::mutate(tag = basename(annotated_features) %>%
                  str_replace_all(., '.PATRIC.*$', ''))

# listing getorf ORFs
getorf_annotated.tibble <- tibble(
  predicted_ORFs = list.files('../results/MEs_predicted_orfs',
                              pattern = '\\.faa$',
                              full.names = TRUE)
) %>%
  dplyr::mutate(tag = basename(predicted_ORFs) %>%
                  str_replace_all(., '.MAG.predicted_orfs.faa|.predicted_orfs.faa|.metagenome.predicted_orfs.faa', ''))

# joining them
# left join: every annotated file is kept; `predicted_ORFs` is NA when no
# predicted-ORF file shares the tag
joint_features.tibble <- patric_annotated.tibble %>%
  dplyr::left_join(x = .,
                   y = getorf_annotated.tibble,
                   by = c('tag')) %>%
  dplyr::select(-tag)

# Running the reciprocal-best-hit (RBH) search for every pair of
# (annotated features, predicted ORFs) FASTA files and stacking the per-genome
# result tables into a single tibble. No additional filtering is applied to
# the tables returned by orthologr.

# the upstream left join leaves `predicted_ORFs` as NA when a genome has no
# predicted-ORF file; such rows cannot be searched, so they are dropped with
# a warning instead of being passed to orthologr as a file path
pairs_to_compare <- joint_features.tibble %>%
  dplyr::filter(!is.na(predicted_ORFs))

n_unpaired <- nrow(joint_features.tibble) - nrow(pairs_to_compare)
if (n_unpaired > 0) {
  warning(glue('{n_unpaired} annotated file(s) without a matching predicted-ORF file were skipped'),
          call. = FALSE)
}

RBH.tibble <- pairs_to_compare %>%
  dplyr::transmute(parsing_tables = purrr::map2(annotated_features, predicted_ORFs, ~{
    orthologr::orthologs(query_file      = .x,
                         subject_files   = .y,
                         seq_type        = "protein",
                         ortho_detection = "RBH",
                         comp_cores      = 1,
                         clean_folders   = FALSE)
  })) %>%
  # naming the list-column explicitly avoids the deprecation warning that
  # unnest() emits when `cols` is missing
  tidyr::unnest(cols = parsing_tables)

# saving RBH.tibble
RBH.tibble %>%
  readr::write_tsv(., '../results/MEs_annotated_features_vs_predicted_orfs_BRHs.tsv', col_names = TRUE)

In [None]:
# filtering relevant homologous groups: a group is kept when at least one of
# its sequence labels appears as a `subject_id` in the RBH table
filtered_groups <- relevant_groups.tibble %>%
  dplyr::filter(label %in% RBH.tibble$subject_id) %>%
  dplyr::pull(homologous_group_file) %>%
  unique()

# creating the output folder; dir.create() is portable and does not complain
# when the folder already exists
filtered_groups_dir <- '../results/MEs_predicted_orfs_filtered_homologous_groups'
dir.create(filtered_groups_dir, showWarnings = FALSE, recursive = TRUE)

# copying those groups to the folder under their original file names
# file.copy() is vectorised and, unlike a shell `cp` built with glue(), is
# safe for paths containing spaces or shell metacharacters
copied <- file.copy(from = filtered_groups,
                    to = filtered_groups_dir,
                    overwrite = TRUE)

# surface copy failures instead of ignoring them
if (!all(copied)) {
  warning('failed to copy: ', paste(filtered_groups[!copied], collapse = ', '),
          call. = FALSE)
}