This notebook creates a metadata file for a biological test data set for pipeline development.
The metadata can be used as input to the snakemake pipeline for identifying recent horizontal gene transfer.
It focuses on fungal genera of interest to Arcadians.

This notebook runs on files that are produced by the notebook `notebooks/20230227-genomes-and-transcriptomes.ipynb`, but all input files have been added to the repository.

In [1]:
# Move the working directory up one level so that the relative "inputs/..."
# paths used in the cells below resolve.
# NOTE(review): presumably this notebook is executed from a subdirectory of the
# repository root (e.g. notebooks/) -- confirm before running from elsewhere.
setwd("..")

In [2]:
library(readr)
library(tidyr)
library(dplyr, warn.conflicts = F)

In [3]:
# Load the transcriptome (TSA) and GenBank genome/CDS metadata tables that are
# shipped with the repository. Column-type messages are silenced; FALSE is
# spelled out because `F` is an ordinary variable that can be reassigned.
tsa <- read_tsv("inputs/20230227_tsa.tsv.gz", show_col_types = FALSE)
genbank_genomes_cds <- read_tsv("inputs/20230227_genbank_genomes_cds.tsv.gz", show_col_types = FALSE)

In [4]:
# Organisms whose genera define the scope of the test data set. Most entries
# are binomial species names; the final entry ("Cordyceps") is a bare genus.
emilys_organisms <- c("Amanita muscaria", "Fomitopsis pinicola", "Ganoderma polychromum",
                      "Hericium erinaceus", "Pleurotus columbinus", "Pleurotus djamor", 
                      "Pleurotus ostreatus", "Trametes versicolor", "Stropharia rugosoannulata",
                      "Agrocybe aegerita", "Psilocybe cubensis", "Psilocybe cyanescens",
                      "Claviceps purpurea", "Epichloe hybrida", "Erynia aphidis",
                      "Ophiocordyceps caloceroides", "Entomophaga grylli",
                      "Ophiocordyceps amazonica", "Pandora formicae",
                      "Termitomyces titanicus", "Termitomyces eurrhizus",
                      "Termitomyces reticulatus", "Cordyceps")
# Keep only the genus: drop everything from the first space onwards. Entries
# without a space are already a genus and pass through unchanged. unique()
# collapses genera that are represented by several species.
emilys_genera <- unique(sub(" .*", "", emilys_organisms))

In [5]:
# add a transcriptome that was mislabelled as a metagenome:
# the "insect metagenome" label is replaced with the organism name "Pandora formicae"
tsa2 <- read_csv("inputs/20230228_wgs_selector.csv.gz", show_col_types = FALSE) %>%
  mutate(organism_an = gsub("insect metagenome", "Pandora formicae", organism_an))
# Retrieve the NCBI lineage for each distinct organism name. De-duplicating
# first avoids repeated lookups and guarantees one lineage table per name,
# which the later reshaping to one row per organism relies on.
tax <- taxize::classification(unique(tsa2$organism_an), db = "ncbi")

══  1 queries  ═══════════════



Retrieving data for taxon 'Pandora formicae'




✔  Found:  Pandora+formicae
══  Results  ═════════════════

• Total: 1 
• Found: 1 
• Not Found: 0


In [6]:
# Stack the per-query lineage tables held in `tax` into one long data frame,
# tagging every row with the name of the query it came from (`batch`).
# NCBI lineages can contain many intermediate clades, so only rows for the
# main lineage ranks are retained.
tax_long <- tax %>%
  purrr::imap(function(lineage_tbl, query_name) {
    mutate(lineage_tbl, batch = query_name)
  }) %>%
  bind_rows() %>%
  filter(rank %in% c("superkingdom", "kingdom", "phylum", "subphylum", "class", 
                     "order", "family", "genus", "species"))

# Reshape to one row per query with one column per rank holding the lineage
# name. Because `id_cols` is given explicitly, columns other than `batch`,
# `rank` and `name` are dropped by the pivot itself.
tax_names <- tax_long %>%
  pivot_wider(id_cols = batch, names_from = rank, values_from = name) %>%
  select(batch, superkingdom, kingdom, phylum, subphylum, class,
         order, family, genus, species)

# attach the lineage columns to the transcriptome record by organism name
tsa2 <- tsa2 %>%
  left_join(tax_names, by = c("organism_an" = "batch"))

In [7]:
# append the relabelled transcriptome record(s) to the main transcriptome table
tsa <- tsa %>%
  bind_rows(tsa2)

In [8]:
# restrict the transcriptome records to the genera we are interested in
tsa_filtered <- filter(tsa, genus %in% emilys_genera)

# keep (and rename) the identifier and lineage columns, then insert a `source`
# column right after the accession to mark these records as transcriptomes
tsa_filtered_formatted <- tsa_filtered %>%
  select(organism_name = organism_an, accession = prefix_s,
         superkingdom, kingdom, phylum, subphylum, class, subclass,
         order, suborder, superfamily, family, genus, species) %>%
  mutate(source = "transcriptome", .after = accession)

In [9]:
# restrict the GenBank genome records to the genera we are interested in
genbank_genomes_cds_filtered <- filter(genbank_genomes_cds, genus %in% emilys_genera)

# keep (and rename) the identifier and lineage columns, then insert a `source`
# column right after the accession to mark these records as genomes
genbank_genomes_cds_filtered_formatted <- genbank_genomes_cds_filtered %>%
  select(organism_name, accession = number_assembly_accession,
         superkingdom, kingdom, phylum, subphylum, class, subclass,
         order, suborder, superfamily, family, genus, species) %>%
  mutate(source = "genome", .after = accession)

In [10]:
# stack the transcriptome and genome metadata into a single table
out <- tsa_filtered_formatted %>%
  bind_rows(genbank_genomes_cds_filtered_formatted)

# report how many accessions are available for each genus of interest
count(out, genus)

genus,n
<chr>,<int>
Agrocybe,4
Amanita,6
Claviceps,51
Cordyceps,7
Epichloe,3
Fomitopsis,2
Ganoderma,2
Hericium,2
Ophiocordyceps,12
Pandora,1


In [11]:
# save the combined metadata table as a tab-separated file
write_tsv(x = out, file = "inputs/candidate_fungi_for_bio_test_data_set.tsv")

In [12]:
# record the R version and package versions used to run this notebook
utils::sessionInfo()

R version 4.1.3 (2022-03-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Big Sur/Monterey 10.16

Matrix products: default
BLAS/LAPACK: /Users/taylorreiter/miniconda3/envs/pltenv/lib/libopenblasp-r0.3.21.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.1.0 tidyr_1.3.0 readr_2.1.3

loaded via a namespace (and not attached):
 [1] pbdZMQ_0.3-8      zoo_1.8-11        tidyselect_1.2.0  repr_1.1.5       
 [5] taxize_0.9.100    purrr_1.0.1       lattice_0.20-45   vctrs_0.5.2      
 [9] generics_0.1.3    htmltools_0.5.4   base64enc_0.1-3   utf8_1.2.3       
[13] rlang_1.0.6       pillar_1.8.1      glue_1.6.2        httpcode_0.3.0   
[17] withr_2.5.0       bit64_4.0.5       uuid_1.1-0        foreach_1.5.2    
[21] lifecycle_1.0.3   plyr_1.8.8        stringr_1.5.0     codetools_0.2-18 
[25] evaluate_0.20     