###  COVIRT19 microbial subgroup seqscreen analysis 
#### Date : 21 DEC 2020 
#### Maintained by :
    Jochum, Michael D. 
    Baylor College of Medicine 
    michael.jochum@bcm.edu

#### The purpose of this code is to:
- take the raw seqscreen GO Term counts and convert them into a working phyloseq object
- conduct some preprocessing on the phyloseq object that
  filters out batch-effect samples and GO terms with little to no abundance

In [53]:
library(tidyverse)
library(phyloseq)
# Move the working directory up one level. The relative paths passed to
# read.table() and save.image() later in this notebook are resolved from here.
# NOTE(review): this makes the notebook depend on the directory it is launched
# from -- confirm the expected launch location before re-running.
setwd("../")

Load the seqscreen count table and fix some naming issues

In [37]:
# Read the combined seqscreen GO-term table as a tibble. Quote and comment
# parsing are switched off so such characters inside fields are read literally.
raw <- as_tibble(read.table(
  "./datasets/Combined_BALF_GO_Terms_parent_propagated.tsv",
  sep = "\t",
  row.names = NULL,
  header = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  quote = "",
  comment.char = ""
))
# Harmonise column names: relabel "NA_tax" as "unclass" and replace the
# underscore in "NC1_SRR7796663" with a dot.
# NOTE(review): presumably the dot keeps this sample ID intact when column
# names are later split on underscores -- confirm.
colnames(raw) <- gsub("NA_tax", "unclass", colnames(raw)) %>%
  str_replace_all("NC1_SRR7796663", "NC1.SRR7796663")

In [38]:
# Reshape the count columns so that each sample becomes its own column.
df <- raw %>%
  # keep the four GO annotation columns plus every "*_counts" column
  select(GO_term, namespace, depth, name, ends_with("_counts")) %>%
  # split each count column name into three underscore-separated pieces
  pivot_longer(
    cols = ends_with("_counts"),
    names_to = c("sample", "type", "abund"),
    names_pattern = "(.*)_(.*)_(.*)"
  ) %>%
  # the third piece of the column name is not needed afterwards
  select(!abund) %>%
  # rows with a value of 1 or less are dropped here ...
  filter(value > 1) %>%
  # ... and the resulting gaps are filled with 0 when widening again
  pivot_wider(names_from = sample, values_from = value, values_fill = 0)

In [39]:
# Store `depth` as text rather than as a number.
# NOTE(review): presumably so it is treated as an annotation and not as a
# numeric count column further down -- confirm.
df <- mutate(df, depth = as.character(depth))

In [40]:
# Sum the per-sample counts of the "bac", "arc" and "vir" rows for each GO term.
# The former `type != "NA"` filter was redundant: `%in%` already excludes every
# value (including NA) that is not one of the three listed types.
term <- df %>%
  filter(type %in% c("bac", "arc", "vir")) %>%
  group_by(GO_term, namespace, depth, name) %>%
  # `.groups = "drop"` returns an ungrouped tibble and silences the
  # "regrouping output" message
  summarise(across(.cols = where(is.numeric), .fns = sum), .groups = "drop")

`summarise()` regrouping output by 'GO_term', 'namespace', 'depth' (override with `.groups` argument)



In [41]:
# Annotation table: one row per GO term, with the GO ID as row name.
term_tax <- data.frame(
  select(ungroup(term), GO_term, namespace, depth, name),
  row.names = 1
)
# Count table: every column that is not one of the four annotation columns.
# Selecting by name replaces the hard-coded `5:172` range, which breaks (or
# silently drops samples) when the number of sample columns changes.
# NOTE(review): this is identical to `term[5:172]` only if `term` has exactly
# 172 columns -- confirm with ncol(term).
term_counts <- data.frame(
  term[!(names(term) %in% c("GO_term", "namespace", "depth", "name"))],
  row.names = term$GO_term
)

In [42]:
# Wrap the annotation table (as a character matrix) and the count table
# (GO terms in rows) in their phyloseq component classes.
term_tax_phy <- tax_table(
  as.matrix(term_tax),
  errorIfNULL = TRUE
)
term_counts_phy <- otu_table(
  term_counts,
  taxa_are_rows = TRUE
)

In [43]:
# Read the sample metadata; the first column supplies the row names.
# read.table() already returns a data.frame, so no extra conversion is needed.
# NOTE(review): this path points one level above the working directory while
# the count table is read from ./datasets -- confirm the file location.
term_sam <- read.table(
  "../Combined_BALF_GO_Terms_metadata2.txt",
  header = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  sep = "\t",
  row.names = 1
)
# Apply the same "NC1_SRR7796663" -> "NC1.SRR7796663" rename used for the count
# table columns, then keep the row names as an explicit `accession` column.
rownames(term_sam) <- rownames(term_sam) %>%
  str_replace_all("NC1_SRR7796663", "NC1.SRR7796663")
term_sam$accession <- rownames(term_sam)

In [44]:
# Capitalise the outcome labels.
term_sam$outcome <- term_sam$outcome %>%
  str_replace_all("recovered", "Recovered") %>%
  str_replace_all("deceased", "Deceased") %>%
  str_replace_all("stabilized", "Stabilized")
# Expand the one-letter sex codes. The patterns are anchored with ^...$ so only
# whole values are rewritten; the unanchored "M"/"F"/"na" patterns would corrupt
# any value merely containing those characters (e.g. "Male" -> "maleale").
term_sam$sex <- term_sam$sex %>%
  str_replace_all("^M$", "male") %>%
  str_replace_all("^F$", "female") %>%
  # NOTE: this stores the literal string "<NA>" alongside real NA values. It is
  # kept as-is so anything reading the saved image sees the same labels, but it
  # is not a good idea in general.
  str_replace_all("^na$", "<NA>")

In [45]:
# Combine counts, GO annotations and sample metadata into one phyloseq object,
# then print it to check the dimensions.
term_pseq <- phyloseq(
  term_counts_phy,
  term_tax_phy,
  sample_data(term_sam)
)
term_pseq # [ 14581 taxa and 167 samples ] [ 27077 taxa and 167 samples ]

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 14581 taxa and 167 samples ]
sample_data() Sample Data:       [ 167 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 14581 taxa by 3 taxonomic ranks ]

In [46]:
# Remove two GO IDs from the object, one at a time.
# NOTE(review): these look like the molecular_function and biological_process
# root terms -- confirm against the GO annotation.
for (filtme in c("GO:0003674", "GO:0008150")) {
  term_pseq <- prune_taxa(taxa = taxa_names(term_pseq) != filtme, term_pseq)
}
term_pseq # [ 14579 taxa and 167 samples ]

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 14579 taxa and 167 samples ]
sample_data() Sample Data:       [ 167 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 14579 taxa by 3 taxonomic ranks ]

In [47]:
# Remove unwanted samples one filter at a time, printing the object after each
# step so the remaining sample and taxa counts can be checked.
# (The initial copy `term_pseq_no_neg <- term_pseq` was dead code: it was
# overwritten by the very next statement.)
term_pseq_no_neg <- subset_samples(term_pseq, sample_type != "neg_control")
term_pseq_no_neg # [ 14579 taxa and 162 samples ]
term_pseq_no_neg <- subset_samples(term_pseq_no_neg, sample_type != "Unknown")
term_pseq_no_neg # [ 14579 taxa and 141 samples ]
term_pseq_no_neg <- subset_samples(term_pseq_no_neg, case != "Control_Sick")
term_pseq_no_neg # [ 14579 taxa and 105 samples ]
term_pseq_no_neg <- subset_samples(term_pseq_no_neg, publication != "Michalovich")
term_pseq_no_neg # [ 14579 taxa and 102 samples ]
term_pseq_no_neg <- subset_samples(term_pseq_no_neg, bioproject != "PRJNA605907")
term_pseq_no_neg # [ 14579 taxa and 86 samples ]
# Drop GO terms that have no counts left in the remaining samples ...
term_pseq_no_neg <- prune_taxa(
  taxa = taxa_sums(term_pseq_no_neg) > 0,
  x = term_pseq_no_neg
)
term_pseq_no_neg # [ 13534 taxa and 86 samples ]
# ... and samples that have no counts at all.
term_pseq_no_neg <- prune_samples(
  samples = sample_sums(term_pseq_no_neg) > 0,
  x = term_pseq_no_neg
)
term_pseq_no_neg # [ 13534 taxa and 86 samples ] # [ 25426 taxa and 86 samples ]
# Keep a copy of the filtered object under a second name.
term_pseq_no_neg_gonames <- term_pseq_no_neg

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 14579 taxa and 162 samples ]
sample_data() Sample Data:       [ 162 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 14579 taxa by 3 taxonomic ranks ]

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 14579 taxa and 141 samples ]
sample_data() Sample Data:       [ 141 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 14579 taxa by 3 taxonomic ranks ]

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 14579 taxa and 105 samples ]
sample_data() Sample Data:       [ 105 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 14579 taxa by 3 taxonomic ranks ]

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 14579 taxa and 102 samples ]
sample_data() Sample Data:       [ 102 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 14579 taxa by 3 taxonomic ranks ]

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 14579 taxa and 86 samples ]
sample_data() Sample Data:       [ 86 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 14579 taxa by 3 taxonomic ranks ]

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 13534 taxa and 86 samples ]
sample_data() Sample Data:       [ 86 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 13534 taxa by 3 taxonomic ranks ]

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 13534 taxa and 86 samples ]
sample_data() Sample Data:       [ 86 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 13534 taxa by 3 taxonomic ranks ]

In [48]:
# Relabel every taxon as "<GO ID>-<GO name>".
# NOTE: `names` is also the name of a base R function; the variable name is
# kept because it ends up in the saved workspace image.
tax <- data.frame(tax_table(term_pseq_no_neg))
names <- paste0(rownames(tax), "-", tax[["name"]])
taxa_names(term_pseq_no_neg) <- names

In [49]:
# Save the whole workspace for later use. The output directory is created
# first (a no-op when it already exists) so the save cannot fail on a fresh
# checkout.
dir.create("./images", showWarnings = FALSE, recursive = TRUE)
save.image(file = "./images/0_preprocessing.RDA")