# Filter BLAST results to reads that didn't hit human

## Notebook setup

In [2]:
library(readr)
library(dplyr, warn.conflicts = F)
library(tidyr)
library(janitor, warn.conflicts = F)
library(stringr)
library(purrr)

In [3]:
setwd("..")

## Read in NCBI BLAST results

These results are from BLASTing the reads that hit viral capsids against the full NCBI nt database to make sure that they weren't actually of human/clone origin, and instead are more likely to be viruses.

In [4]:
files <- Sys.glob("outputs/capsid_blast_pident90_blastn/*lineages.tsv")

In [6]:
blast <- files %>%
  set_names() %>%
  map_dfr(read_tsv, col_names = c('qseqid', 'sseqid', 'pident', 'length', 'mismatch',
                                  'gapopen', 'qstart', 'qend', 'qlen', 'sstart',
                                  'send', "slen", "evalue", "bitscore", "sgi",
                                  "sacc", 'staxids', 'sscinames', 'scomnames', 'slineage'),
          col_types = "ccdddddddddddccdccc", .id = "vog", show_col_types = F) %>%
  mutate(vog = gsub("_lineages.tsv", "", basename(vog)),
         vog = gsub("^[^_]*_", "", vog)) %>%
  separate(slineage, into = c("superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species", "strain"), sep = ";") %>%
  separate(qseqid, into = c("run", "read"), sep = "\\.", remove = F) %>%
  separate(read, into = c("read", "pair"), sep = "\\/") %>%
  mutate(run_vog = paste0(run, "_", vog))


“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)”
“[1m[22mOne or more parsing issues, call `problems()` on your data frame for deta

In [7]:
# rescue empty files that had no BLAST results, as these should not be disqualified as non virus just bc they weren't in the nt database
files_empty <- files[file.size(files) == 0L]
files_empty_df <- data.frame(vog = gsub("_lineages.tsv", "", basename(files_empty))) %>%
  mutate(run = gsub("_.*", "", vog),
         vog = gsub("^[^_]*_", "", vog),
         superkingdom = "maybe Viruses")

In [8]:
blast <- bind_rows(blast, files_empty_df)

## Filter out human and vector hits

In [9]:
# filter to samples/VOGs that contained human at all
human <- blast %>%
  filter(staxids == 9606) # remove human, which has taxid 9606

# remove sample/VOG combinations that matched to human
blast_human_rm <- blast %>%
  filter(!run_vog %in% human$run_vog)

In [10]:
# filter to sample/VOGs that matched vectors
vector <- blast_human_rm %>%
  filter(str_detect(string = species, pattern = "[Vv]ector"))

# remove sample/VOG combinations that matched vectors
blast_human_rm <- blast_human_rm %>%
  filter(!run_vog %in% vector$run_vog)

In [11]:
blast_human_rm_tally <- blast_human_rm %>%
  group_by(run, vog, superkingdom) %>% #, kingdom, phylum, class, order, family, genus, species) %>%
  tally() 

In [12]:
blast_has_virus <- blast_human_rm_tally %>%
  filter(str_detect(string = superkingdom, pattern = "Viruses")) %>%
  arrange(desc(vog))

In [14]:
# rejoin with sample metadata to get an idea of how long it will take to run and sequencing type
metadata <- read_csv("inputs/concat_accessions_brain_03302023.csv", show_col_types = F) %>%
  clean_names() %>%
  mutate(gb = round(bytes * 1e-9, digits = 2)) %>%
  mutate(maybe_sc = ifelse(bio_project %in% c("PRJNA736951", "PRJNA245456", "PRJEB24579", "PRJNA273155"), 
                           "possibly_sc", "not_sc"))

In [16]:
virus_abund_pident90 <- read_tsv("virus_abundances_pident90.tsv", show_col_types = F) %>% # note this file is produced by the explore_capsid_blast_results.ipynb
  select(lca_lineage_named, vog = sseqid)

# mark samples that were already qc'd (saves time since intermediate files have already been produced)
already_qcd <- c('SRR14925054', 'SRR14862871', 'ERR2262623', 'SRR9292628', 'SRR1779080')

In [17]:
blast_has_virus <- blast_has_virus %>%
  left_join(metadata, by = "run") %>%
  left_join(virus_abund_pident90, by = "vog") %>%
  mutate(already_qcd = ifelse(run %in% already_qcd, "qcd", "not_qcd")) %>%
  relocate(run, vog, lca_lineage_named, superkingdom, n, bio_project, maybe_sc, gb, already_qcd)

blast_has_virus
# write_tsv(blast_has_virus, "blast_has_virus_new.tsv")

run,vog,lca_lineage_named,superkingdom,n,bio_project,maybe_sc,gb,already_qcd,assay_type,⋯,insdc_center_alias,insdc_center_name,insdc_first_public,insdc_last_update,insdc_status,sample_name_2,submitter_id,disease_stage,treatment,invididual
<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,⋯,<chr>,<chr>,<dttm>,<dttm>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
SRR14788345,1891718.YP_007346963.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Sepolyvirales;Polyomaviridae;Alphapolyomavirus;Alphapolyomavirus cardiodermae;unclassified Alphapolyomavirus cardiodermae subspecies/strain,maybe Viruses,1,PRJNA736951,possibly_sc,25.13,not_qcd,WGS,⋯,,,,,,,,,,
SRR14862871,1891718.YP_007346963.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Sepolyvirales;Polyomaviridae;Alphapolyomavirus;Alphapolyomavirus cardiodermae;unclassified Alphapolyomavirus cardiodermae subspecies/strain,maybe Viruses,1,PRJNA736951,possibly_sc,87.33,qcd,WGS,⋯,,,,,,,,,,
SRR14862884,1891718.YP_007346963.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Sepolyvirales;Polyomaviridae;Alphapolyomavirus;Alphapolyomavirus cardiodermae;unclassified Alphapolyomavirus cardiodermae subspecies/strain,maybe Viruses,1,PRJNA736951,possibly_sc,49.02,not_qcd,WGS,⋯,,,,,,,,,,
SRR14999724,1891718.YP_007346963.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Sepolyvirales;Polyomaviridae;Alphapolyomavirus;Alphapolyomavirus cardiodermae;unclassified Alphapolyomavirus cardiodermae subspecies/strain,maybe Viruses,1,PRJNA736951,possibly_sc,23.44,not_qcd,WGS,⋯,,,,,,,,,,
SRR8750801,1891718.YP_007346963.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Sepolyvirales;Polyomaviridae;Alphapolyomavirus;Alphapolyomavirus cardiodermae;unclassified Alphapolyomavirus cardiodermae subspecies/strain,Viruses,52,PRJNA527986,not_sc,1.94,not_qcd,RNA-Seq,⋯,,,,,,,,,,
SRR1778915,1277649.YP_007354884.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Sepolyvirales;Polyomaviridae;Deltapolyomavirus;Deltapolyomavirus undecihominis;unclassified Deltapolyomavirus undecihominis subspecies/strain,Viruses,1315,PRJNA273155,possibly_sc,1.86,not_qcd,WGS,⋯,,,,,,,,grade IV,untreated,PRJNA273155_Age_65
SRR1779200,1277649.YP_007354884.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Sepolyvirales;Polyomaviridae;Deltapolyomavirus;Deltapolyomavirus undecihominis;unclassified Deltapolyomavirus undecihominis subspecies/strain,maybe Viruses,1,PRJNA273155,possibly_sc,0.33,not_qcd,WGS,⋯,,,,,,,,grade IV,untreated,PRJNA273155_Age_65
SRR14862871,10798.YP_004928146.1,Viruses;Shotokuvirae;Cossaviricota;Quintoviricetes;Piccovirales;Parvoviridae;Erythroparvovirus;unclassified Erythroparvovirus species;unclassified Erythroparvovirus subspecies/strain,Viruses,396,PRJNA736951,possibly_sc,87.33,qcd,WGS,⋯,,,,,,,,,,
SRR8750456,10617.NP_040895.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Gammapapillomavirus;Gammapapillomavirus 1;unclassified Gammapapillomavirus 1 subspecies/strain,Viruses,40,PRJNA527986,not_sc,2.17,not_qcd,RNA-Seq,⋯,,,,,,,,,,
SRR8750473,10617.NP_040895.1,Viruses;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Gammapapillomavirus;Gammapapillomavirus 1;unclassified Gammapapillomavirus 1 subspecies/strain,Viruses,56,PRJNA527986,not_sc,1.83,not_qcd,RNA-Seq,⋯,,,,,,,,,,


## Identify the best viral hit per each read from BLAST results

In [18]:
best_viral_hit_per_read <- blast_human_rm %>%
  filter(superkingdom == "Viruses") %>%
  group_by(vog, qseqid) %>%
  slice_max(bitscore) %>%
  slice_min(evalue) %>%
  slice_head(n = 1) %>%
  ungroup()

best_viral_hit_per_read %>%
  group_by(run, scomnames) %>%
  tally()

run,scomnames,n
<chr>,<chr>,<int>
SRR14862871,"Human parvovirus B19 isolate B19-490, complete genome",2
SRR1778915,"Deltapolyomavirus undecihominis isolate STL2825 major capsid protein (VP1) gene, partial cds",26
SRR1778915,"Deltapolyomavirus undecihominis isolate STL3091 major capsid protein (VP1) gene, partial cds",1
SRR1778915,"Deltapolyomavirus undecihominis isolate STL3150 major capsid protein (VP1) gene, partial cds",4
SRR1778915,"STL polyomavirus isolate 11ww, complete genome",8
SRR1778915,"STL polyomavirus isolate HB124, complete genome",4
SRR1778915,"STL polyomavirus isolate HB201, complete genome",63
SRR1778915,"STL polyomavirus isolate Y57, complete genome",1
SRR1778915,"STL polyomavirus strain WD972, complete genome",1
SRR8750456,"Human papillomavirus type 4, complete genome",4


From these results, we chose the following genomes to download and map against:
* STL polyomavirus isolate HB201, complete genome (GCF_000904055.1) (https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/904/055/GCF_000904055.1_ViralProj186434/)
* Human papillomavirus type 4, complete genome (GCF_000864845.1) https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/864/845/GCF_000864845.1_ViralProj15492/
* Human papillomavirus isolate HPV-mSK_013, complete genome (MH777161.1)
* Human parvovirus B19 isolate B19-490, complete genome (MZ695225.1)
* Severe acute respiratory syndrome coronavirus 2 genome assembly, chromosome: 1 (GCF_009858895.2) https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/
* Pbunalikevirus phiFenriz, complete genome (GCA_002597305.1) https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/597/305/GCA_002597305.1_ASM259730v1/

These are in addition to the genomes that matched the VOGs

## sessionInfo

In [21]:
sessionInfo()

R version 4.2.3 (2023-03-15)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Big Sur ... 10.16

Matrix products: default
BLAS/LAPACK: /Users/taylorreiter/miniconda3/envs/sandbox/lib/libopenblasp-r0.3.21.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] purrr_1.0.1   stringr_1.5.0 janitor_2.2.0 tidyr_1.3.0   dplyr_1.1.2  
[6] readr_2.1.4  

loaded via a namespace (and not attached):
 [1] pillar_1.9.0     compiler_4.2.3   base64enc_0.1-3  tools_4.2.3     
 [5] bit_4.0.5        digest_0.6.31    uuid_1.1-0       jsonlite_1.8.4  
 [9] lubridate_1.9.2  evaluate_0.20    lifecycle_1.0.3  tibble_3.2.1    
[13] timechange_0.2.0 pkgconfig_2.0.3  rlang_1.1.0      IRdisplay_1.1   
[17] cli_3.6.1        parallel_4.2.3   IRkernel_1.3.2   fastmap_1.1.1   
[21] withr_2.5.0      repr_1.1.6       generics_0.1.3   vctrs_0.6.1     