# Data Wrangling
###  COVIRT19 microbial subgroup seqscreen analysis 
#### Date : 21 DEC 2020 
#### Maintained by :
    Jochum, Michael D. 
    Baylor College of Medicine 
    michael.jochum@bcm.edu

#### The purpose of this code is to:
- take the raw seqscreen GO Term counts and convert them into a working phyloseq object
- conduct some preprocessing on the phyloseq object that:
- filters out batch effect samples and GO terms with little to no abundance

In [1]:
library(tidyverse)
library(phyloseq)
setwd("../")

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



load the seqscreen count table and fix come naming issues

In [2]:
raw<-as_tibble(read.table("./datasets/Combined_BALF_GO_Terms_parent_propagated.tsv", sep = "\t", row.names = NULL, header = T, quote = "", comment.char = ""))
colnames(raw)<-gsub("NA_tax","unclass", colnames(raw))%>%str_replace_all("NC1_SRR7796663", "NC1.SRR7796663")

“cannot open file './datasets/Combined_BALF_GO_Terms_parent_propagated.tsv': No such file or directory”


ERROR: Error in file(file, "rt"): cannot open the connection


In [None]:
df<-raw %>%
  select(GO_term,namespace,depth,name,ends_with("_counts"))%>%
  pivot_longer(cols = -c(GO_term,namespace,depth,name),
               names_to =  c("sample","type","abund"),
               names_pattern = "(.*)_(.*)_(.*)")%>%
  select(-abund)%>%
  filter(value>1)%>%
  pivot_wider(names_from = sample, values_from=value, values_fill=0)

In [3]:
df$depth<-as.character(df$depth)

ERROR: Error in df$depth: object of type 'closure' is not subsettable


In [4]:
term<-df%>%filter(type!="NA")%>%filter(type%in%c("bac","arc","vir"))%>%group_by(GO_term,namespace,depth,name)%>%
  summarise(across(.cols = where(is.numeric), sum))

“`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help


ERROR: Error in UseMethod("filter_"): no applicable method for 'filter_' applied to an object of class "function"


In [5]:
term_tax<-term%>%select(GO_term,namespace,depth,name)
term_tax<-data.frame(term_tax, row.names=1)
term_counts<-data.frame(term[5:172], row.names = term$GO_term)

ERROR: Error in select(., GO_term, namespace, depth, name): object 'term' not found


In [6]:
term_counts_phy <- otu_table(term_counts, taxa_are_rows=TRUE)
term_tax_phy <- tax_table(as.matrix(term_tax), errorIfNULL=TRUE)

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'otu_table': object 'term_counts' not found


In [7]:
term_sam<-as.data.frame(read.table("../Combined_BALF_GO_Terms_metadata2.txt",header = T, sep = "\t",row.names = 1))
rownames(term_sam)<-rownames(term_sam)%>%str_replace_all("NC1_SRR7796663", "NC1.SRR7796663")
term_sam$accession<-rownames(term_sam)

“cannot open file '../Combined_BALF_GO_Terms_metadata2.txt': No such file or directory”


ERROR: Error in file(file, "rt"): cannot open the connection


In [8]:
term_sam$outcome<-term_sam$outcome%>%
  str_replace_all("recovered", "Recovered")%>%
  str_replace_all("deceased","Deceased")%>%
  str_replace_all('stabilized',"Stabilized")
term_sam$sex<-term_sam$sex%>%
  str_replace_all("M", "male")%>%
  str_replace_all("F", "female")%>%
  str_replace_all("na", "<NA>") # this is mixing the string "<NA>" with actual NAs, probably not related to our problem, but def not a good idea in general

ERROR: Error in stri_replace_all_regex(string, pattern, fix_replacement(replacement), : object 'term_sam' not found


In [9]:
term_pseq <- phyloseq(term_counts_phy, term_tax_phy, sample_data(term_sam))
term_pseq# [ 14581 taxa and 167 samples ] [ 27077 taxa and 167 samples ]

ERROR: Error in phyloseq(term_counts_phy, term_tax_phy, sample_data(term_sam)): object 'term_counts_phy' not found


In [10]:
filtme<-c("GO:0003674")
term_pseq <- prune_taxa(taxa=taxa_names(term_pseq)!=filtme, term_pseq)
filtme<-c("GO:0008150")
term_pseq <- prune_taxa(taxa=taxa_names(term_pseq)!=filtme, term_pseq)
term_pseq #[ 14579 taxa and 167 samples ]

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'taxa' in selecting a method for function 'prune_taxa': error in evaluating the argument 'physeq' in selecting a method for function 'taxa_names': object 'term_pseq' not found


In [11]:
term_pseq_no_neg<-term_pseq
term_pseq_no_neg<-subset_samples(term_pseq, sample_type!="neg_control")
term_pseq_no_neg# [ 14579 taxa and 162 samples ]:
term_pseq_no_neg<-subset_samples(term_pseq_no_neg, sample_type!="Unknown")
term_pseq_no_neg#  [ 14579 taxa and 141 samples ]:
term_pseq_no_neg<-subset_samples(term_pseq_no_neg, case!="Control_Sick")
term_pseq_no_neg# [ 14597 taxa and 105 samples ]
term_pseq_no_neg<-subset_samples(term_pseq_no_neg,publication!="Michalovich")
term_pseq_no_neg# [ 14597 taxa and 102 samples ]
term_pseq_no_neg<-subset_samples(term_pseq_no_neg, bioproject!="PRJNA605907")
term_pseq_no_neg# [ 14597 taxa and 86 samples ]
term_pseq_no_neg<-prune_taxa(taxa = taxa_sums(term_pseq_no_neg)>0,x = term_pseq_no_neg)
term_pseq_no_neg# [ 13534 taxa and 86 samples ]
term_pseq_no_neg<-prune_samples(samples = sample_sums(term_pseq_no_neg)>0,x = term_pseq_no_neg)
term_pseq_no_neg# [ 13534 taxa and 86 samples ] # [ 25426 taxa and 86 samples ]
term_pseq_no_neg_gonames<-term_pseq_no_neg

ERROR: Error in eval(expr, envir, enclos): object 'term_pseq' not found


In [12]:
tax<-data.frame(tax_table(term_pseq_no_neg))
names<-paste(rownames(tax),tax$name,sep="-")
taxa_names(term_pseq_no_neg)<-names

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'tax_table': object 'term_pseq_no_neg' not found


In [13]:
save.image(file = "./images/0_preprocessing.RDA")

“cannot open compressed file './images/0_preprocessing.RDATmp', probable reason 'No such file or directory'”


ERROR: Error in gzfile(file, "wb"): cannot open the connection


“cannot remove file './images/0_preprocessing.RDATmp', reason 'No such file or directory'”
