# Combine peptide predictions with metadata

This notebook combines peptide predictions with other metadata to help assess which peptides would be best suited for experimental follow-up.
The final output is a TSV file with the peptide predictions and their metadata.

## Notebook setup

In [1]:
library(tidyverse)

── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [2]:
# Move up one directory so that the relative "inputs/..." and "outputs/..."
# paths used in the cells below resolve (presumably this takes the session
# from the notebook's folder to the project root -- confirm).
# NOTE(review): this changes the working directory for the whole session.
setwd("..")

## Define some descriptive variables associated with trait mapping

In [3]:
# Species labelled as having evidence of itch suppression. The first two
# entries are not in the `ticks` vector; the remaining twelve are the same
# species listed in `ticks`.
evidence_of_itch_suppression_species <- c(
  "Sarcoptes scabiei",
  "Psoroptes ovis",
  "Amblyomma americanum",
  "Amblyomma sculptum",
  "Dermacentor andersoni",
  "Dermacentor silvarum",
  "Dermacentor variabilis",
  "Haemaphysalis longicornis",
  "Hyalomma asiaticum",
  "Ixodes persulcatus",
  "Ixodes ricinus",
  "Ixodes scapularis",
  "Rhipicephalus microplus",
  "Rhipicephalus sanguineus"
)

In [4]:
# Tick species names (binomial, space-separated).
ticks <- c(
  "Amblyomma americanum",
  "Amblyomma sculptum",
  "Dermacentor andersoni",
  "Dermacentor silvarum",
  "Dermacentor variabilis",
  "Haemaphysalis longicornis",
  "Hyalomma asiaticum",
  "Ixodes persulcatus",
  "Ixodes ricinus",
  "Ixodes scapularis",
  "Rhipicephalus microplus",
  "Rhipicephalus sanguineus"
)

## Read in & format data

In [5]:
# Load the summarized orthogroup information.
orthogroup_peptide_summary_filtered <- read_tsv(
  file = "outputs/notebooks/20241125_orthogroup_peptide_summary.tsv",
  show_col_types = FALSE
)

In [6]:
# Read in the peptigate predictions and derive helper columns:
#   - peptide_length: number of characters in protein_sequence
#   - locus_tag:      peptide_id with everything from "_start" onward removed
# Rows with a missing peptide_class are labelled "sORF", and rows whose class
# is "Propeptide" are dropped.
peptigate_predictions <- read_tsv(
  "outputs/ToT_20241125/predictions/peptide_predictions.tsv",
  show_col_types = FALSE
) %>%
  mutate(
    peptide_length = nchar(protein_sequence),
    locus_tag = gsub("_start.*", "", peptide_id),
    peptide_class = ifelse(is.na(peptide_class), "sORF", peptide_class)
  ) %>%
  filter(peptide_class != "Propeptide")

# Compare total rows against the number of unique IDs and unique sequences
nrow(peptigate_predictions)
length(unique(peptigate_predictions$peptide_id))
length(unique(peptigate_predictions$protein_sequence))

In [7]:
# Read in the per-peptide trait mapping (itch suppression) metadata.
# Hyphens in the species column are replaced with spaces, then every column
# name is prefixed with "traitmapping_".
trait_mapping_metadata <- read_tsv(
  "inputs/2024-11-25-top-positive-significant-clusters-orthogroups-annotations.tsv.gz",
  show_col_types = FALSE
) %>%
  mutate(species = gsub("-", " ", species)) %>%
  rename_with(.cols = everything(), function(x) paste0("traitmapping_", x))

# Tabulate how many prediction locus tags are present in the metadata
table(peptigate_predictions$locus_tag %in% trait_mapping_metadata$traitmapping_locus_tag)


TRUE 
 356 

In [8]:
# Read in the peptigate annotations, keeping only peptides that are present
# in the filtered peptigate_predictions table.
peptigate_annotations <- read_tsv(
  "outputs/ToT_20241125/predictions/peptide_annotations.tsv",
  show_col_types = FALSE
) %>%
  filter(peptide_id %in% peptigate_predictions$peptide_id)

# Compare total rows against the number of unique IDs and unique sequences
nrow(peptigate_annotations)
length(unique(peptigate_annotations$peptide_id))
length(unique(peptigate_annotations$sequence))

In [9]:
# Load the clustering results (mmseqs2, 80% identity). Column names are
# supplied explicitly, and only cluster members that are present in
# peptigate_predictions are kept.
clustering <- read_tsv(
  "outputs/analysis/clustering/all_peptides_0.8_cluster.tsv",
  col_names = c("mmseqs2_representative_sequence", "mmseqs2_cluster_member"),
  show_col_types = FALSE
) %>%
  filter(mmseqs2_cluster_member %in% peptigate_predictions$peptide_id)

# Number of retained peptides per cluster representative
cluster_summary <- clustering %>%
  count(mmseqs2_representative_sequence, name = "mmseqs2_num_peptides_in_cluster")

# Attach the cluster size to every cluster member
clustering <- clustering %>%
  left_join(cluster_summary, by = "mmseqs2_representative_sequence")

In [10]:
# Read in the anti-inflammatory predictions. The sequence column is dropped;
# rows are matched to peptides by peptide_id in a later join.
antiinflammatory <- read_tsv(
  "outputs/analysis/predict_antiinflammatory/autopeptideml_antiinflammatory_predictions.tsv",
  show_col_types = FALSE
) %>%
  select(-sequence)

In [11]:
# BLAST hits of predicted peptides against transcriptome shotgun assembly (TSA)
# salivary gland (sg) transcriptome peptide predictions.
tsa_sg_blastp <- read_tsv(
  "outputs/analysis/compare_tsa_sg/tsa_sg_peptides_blastp_matches.tsv",
  show_col_types = FALSE
) %>%
  # Keep a single hit per query peptide: highest bitscore, then lowest
  # e-value, then the first remaining row to break any leftover ties.
  group_by(qseqid) %>%
  slice_max(bitscore) %>%
  slice_min(evalue) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  # Keep only queries present in peptigate_predictions (which has already had
  # propeptides removed).
  filter(qseqid %in% peptigate_predictions$peptide_id) %>%
  # Prefix every column so the BLAST fields are easy to identify after joining.
  rename_with(\(col) paste0("sgpeptide_blast_", col))

## Join data together

In [12]:
# Attach the trait-mapping metadata to each prediction via its locus tag and
# flag whether the source species is in evidence_of_itch_suppression_species.
predictions <- peptigate_predictions %>%
  left_join(
    trait_mapping_metadata,
    by = join_by(locus_tag == traitmapping_locus_tag)
  ) %>%
  mutate(
    # Hyphens were already replaced when the metadata was read in; this
    # repeats the same replacement on the joined column.
    traitmapping_species = gsub("-", " ", traitmapping_species),
    evidence_of_itch_suppression = if_else(
      traitmapping_species %in% evidence_of_itch_suppression_species,
      "evidence of itch suppression",
      "no evidence of itch suppression"
    )
  )
nrow(predictions)

In [13]:
# This is deliberately a right join: orthogroup_peptide_summary_filtered has
# already removed peptides that didn't have a hit in their orthogroup to
# peptides expressed in tick salivary gland transcriptomes.
predictions <- predictions %>%
  right_join(
    orthogroup_peptide_summary_filtered,
    by = join_by(
      traitmapping_cluster,
      traitmapping_orthogroup,
      traitmapping_signif_level,
      traitmapping_signif_fdr,
      traitmapping_coefficient
    )
  )
nrow(predictions)

In [14]:
# Add the peptigate annotations; the row count is printed to check the join.
predictions <- predictions %>%
  left_join(peptigate_annotations, by = "peptide_id")
nrow(predictions)

In [15]:
# Add the anti-inflammatory predictions; the row count is printed to check the join.
predictions <- predictions %>%
  left_join(antiinflammatory, by = "peptide_id")
nrow(predictions)

In [16]:
# Add the mmseqs2 cluster representative and cluster size for each peptide.
predictions <- predictions %>%
  left_join(clustering, by = join_by(peptide_id == mmseqs2_cluster_member))
nrow(predictions)

In [17]:
# Add the selected salivary gland BLAST hit (if any) for each peptide.
predictions <- predictions %>%
  left_join(tsa_sg_blastp, by = join_by(peptide_id == sgpeptide_blast_qseqid))
nrow(predictions)

In [18]:
# Collapse duplicate amino acid sequences, keeping the first row of each;
# duplicates should be roughly equivalent in metadata.
predictions <- predictions %>%
  group_by(protein_sequence) %>%
  slice(1) %>%
  ungroup()

# After de-duplication these two counts should agree
nrow(predictions)
n_distinct(predictions$protein_sequence)

In [19]:
# Write the combined predictions and metadata to a TSV file.
write_tsv(
  predictions,
  file = "outputs/notebooks/20241125_predictions_with_metadata.tsv"
)

In [20]:
# Number of retained peptides per prediction tool
count(predictions, prediction_tool)

prediction_tool,n
<chr>,<int>
deeppeptide,103
less_than_100aa,201
nlpprecursor,7


In [21]:
# Number of distinct orthogroups represented in the final table
n_distinct(predictions$traitmapping_orthogroup)

In [22]:
# Number of distinct mmseqs2 cluster representatives in the final table
n_distinct(predictions$mmseqs2_representative_sequence)

In [23]:
# Print the R version, platform, and attached/loaded package versions used
# for this run.
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Big Sur ... 10.16

Matrix products: default
BLAS/LAPACK: /Users/taylorreiter/miniconda3/envs/tidyjupyter/lib/libopenblasp-r0.3.28.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: system (macOS)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] lubridate_1.9.4 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
 [5] purrr_1.0.2     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
 [9] ggplot2_3.5.1   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] bit_4.5.0.1       gtable_0.3.6      jsonlite_1.8.9    compiler_4.3.3   
 [5] crayon_1.5.3      tidyselect_1.2.1  IRdisplay_1.1     parallel_4.3.3   
 [9] scales_1.3.0      uuid_1.2-1        fastmap_1.2.0     IRkernel_1.3.2   
[13] R6_2.5.1          generics_0.1.