# Summarize additional results for publication

Some numbers are missing from other analyses. This notebook calculates those numbers.

In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Move the working directory up one level so the relative "inputs/" and
# "outputs/" paths used in later cells resolve from there.
# NOTE(review): presumably the parent directory is the project root — confirm.
setwd("..")

In [5]:
# Load the gzip-compressed, tab-separated input table; the column-type
# message is silenced to keep the notebook output clean.
inputs <- read_tsv(
  file = "inputs/2024-11-25-top-positive-significant-clusters-orthogroups-annotations.tsv.gz",
  show_col_types = FALSE
)

In [12]:
# Load the raw peptide predictions (before any propeptide filtering) and attach
# the input-table annotations to each predicted peptide.
#   * every column is prefixed with "peptigate_" so it cannot collide with the
#     columns joined in from `inputs`
#   * a missing peptide class is labelled "sORF"
#   * `locus_tag` is the peptide id with everything from "_start" onward removed
# NOTE(review): the join assumes `locus_tag` is unique in `inputs`; duplicated
# tags there would multiply prediction rows — confirm.
initial_predictions <- read_tsv(
  "outputs/ToT_20241125/predictions/peptide_predictions.tsv",
  show_col_types = FALSE
) %>%
  rename_with(function(x) paste0("peptigate_", x), .cols = everything()) %>%
  mutate(
    peptigate_peptide_class = ifelse(
      is.na(peptigate_peptide_class),
      "sORF",
      peptigate_peptide_class
    ),
    locus_tag = sub("_start.*", "", peptigate_peptide_id)
  ) %>%
  left_join(inputs, by = "locus_tag")

In [16]:
# Load the predictions-with-metadata table written to outputs/notebooks/.
predictions <- read_tsv(
  file = "outputs/notebooks/20241125_predictions_with_metadata.tsv",
  show_col_types = FALSE
)

## How many input proteins were there? From how many orthogroups?

In [6]:
# Number of rows in the input table.
inputs %>% nrow()

In [7]:
# Number of distinct orthogroups in the input table.
n_distinct(inputs$orthogroup)

## Before filtering propeptides, how many peptides did we predict?

In [9]:
# Number of rows in the unfiltered predictions table.
initial_predictions %>% nrow()

In [10]:
# Number of distinct protein sequences among the unfiltered predictions.
n_distinct(initial_predictions$peptigate_protein_sequence)

In [14]:
# Number of distinct orthogroups among the unfiltered predictions.
n_distinct(initial_predictions$orthogroup)

## How many peptides of each type are there from the trait mapping data after filtering propeptides?

In [17]:
# Number of peptides contributed by each prediction tool.
predictions %>%
  count(prediction_tool)

# Total number of peptides across all tools.
predictions %>% nrow()

prediction_tool,n
<chr>,<int>
deeppeptide,103
less_than_100aa,201
nlpprecursor,7


In [18]:
# Number of distinct protein sequences in the predictions table.
n_distinct(predictions$protein_sequence)

## How many peptides are there for each orthogroup initially?

In [19]:
# Number of peptides per trait-mapping orthogroup (result is ungrouped).
predictions %>%
  count(traitmapping_orthogroup)

traitmapping_orthogroup,n
<chr>,<int>
OG0000079,55
OG0000143,10
OG0000189,21
OG0000194,23
OG0000305,26
OG0000385,9
OG0000746,5
OG0000880,82
OG0001663,6
OG0001774,45


In [7]:
# For each orthogroup, count the peptides whose deepsig feature is exactly
# "Signal peptide". `na.rm = TRUE` keeps a missing feature value from turning
# the whole orthogroup's count into NA; such rows are simply not counted.
predictions %>%
  group_by(traitmapping_orthogroup) %>%
  summarize(
    num_predicted_signal_peptides_on_peptides = sum(
      traitmapping_deepsig_feature == "Signal peptide",
      na.rm = TRUE
    )
  )


traitmapping_orthogroup,num_predicted_signal_peptides_on_peptides
<chr>,<int>
OG0000079,1
OG0000143,2
OG0000189,0
OG0000194,0
OG0000385,0
OG0000746,0
OG0000880,39
OG0001663,0
OG0001774,36
OG0002194,0


In [8]:
# Unique orthogroup / coefficient / fraction combinations, ordered from the
# largest fraction of the orthogroup with a predicted peptide to the smallest.
predictions %>%
  select(
    traitmapping_orthogroup,
    traitmapping_coefficient,
    fraction_of_orthogroup_with_predicted_peptide
  ) %>%
  arrange(desc(fraction_of_orthogroup_with_predicted_peptide)) %>%
  distinct()

traitmapping_orthogroup,traitmapping_coefficient,fraction_of_orthogroup_with_predicted_peptide
<chr>,<dbl>,<dbl>
OG0000880,0.31400766,0.7311828
OG0008102,0.89907992,0.7
OG0001774,1.05408448,0.66129032
OG0000079,0.08290626,0.13959391
OG0011284,2.24534051,0.11111111
OG0000194,0.35313516,0.09704641
OG0001663,0.29542423,0.09375
OG0000189,0.50428119,0.0875
OG0008888,1.69892817,0.0625
OG0000385,0.25070199,0.05844156


In [9]:
# Print the R version and the attached/loaded package versions for this run.
utils::sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Big Sur ... 10.16

Matrix products: default
BLAS/LAPACK: /Users/taylorreiter/miniconda3/envs/tidyjupyter/lib/libopenblasp-r0.3.26.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
 [5] purrr_1.0.2     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
 [9] ggplot2_3.5.0   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] bit_4.0.5        gtable_0.3.4     jsonlite_1.8.8   compiler_4.3.3  
 [5] crayon_1.5.2     tidyselect_1.2.0 IRdisplay_1.1    parallel_4.3.3  
 [9] scales_1.3.0     uuid_1.2-0       fastmap_1.1.1    IRkernel_1.3.2  
[13] R6_2.5.1         generics_0.1.3   munsell_0.5.1  