06-A-unconstrained-ordination-bonus.Rmd

---
title: "Prepping data for unconstrained ordination"
author: "Anna Rasmussen"
date: "11/1/2017"
output: html_document
---

```{r include = FALSE}
# This code block sets up the r session when the page is rendered to html
# include = FALSE means that it will not be included in the html document

# Write every code block to the html document 
knitr::opts_chunk$set(echo = TRUE)

# Write the results of every code block to the html document 
knitr::opts_chunk$set(eval = TRUE)

# Define the directory where images generated by knit will be saved
knitr::opts_chunk$set(fig.path = "images/06-A/")

# Set the web address where R will look for files from this repository
# Do not change this address
repo_url <- "https://raw.githubusercontent.com/fukamilab/BIO202/master/"
```


```{r defining terms, echo=FALSE, message=FALSE, warning=FALSE}
## Defining terms, samples, month names, etc
salinities <- c("0 - <.5", ".5 - <5", "5 - <18", "18 -  <30", "30 - <40" )
system.order<-c("Fresh", "Oligohaline", "Mesohaline", "Polyhaline", "Euhaline")
stat.order<-as.character(c(657, 649, 3, 6, 9, 13, 15, 18, 21, 24, 27, 30, 34, 36))
seas.order <- c("Spring", "Summer", "Fall", "Winter")
surf.samp<- c("C29_657S", "C29_3S", "C29_6S", "C29_9S")
months <- c("Apr", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar")
cruise.dates <- c("4/23/2013", "7/23/2013", "8/28/2013","9/26/2013","10/24/2013","11/19/2013", "12/3/2013", "1/14/2014","2/11/2014","3/11/2014")
cruise.names <- c("C29", "C30","C31","C32","C33","C34","C35","C36","C37","C38")
```


```{r, echo=FALSE, message=FALSE, warning=FALSE}
## make dataframe of environmental variables
envmat.all <-read.csv("~/Documents/SFB_JGI_first_plate/timeseries_all_data.csv", header=T, row.names = 1)

envmat.all$Station.Number<-as.factor(envmat.all$Station.Number)
envmat.all$system<-factor(envmat.all$system, levels=system.order)
```

```{r, echo=FALSE, message=FALSE, warning=FALSE}
variables.cols <- c(31, 7,9, 11, 14,15, 21, 22, 34:36, 72, 74, 29, 73, 5, 32)

envdat.corr <- na.omit(envmat.all %>% 
  select (., variables.cols))
rownames(envdat.corr) = envdat.corr$sample

envdat.corr <- envdat.corr[,-c(1)]
```

```{r V4, echo=F, message=FALSE, warning=FALSE}
### Making a V4 phyloseq object
#Phyloseq object and environmental variables for sequenced samples
library(phyloseq)
library(ggplot2)

otumat.V4 <- read.csv("~/Documents/SFB_JGI_first_plate/SFBayDeltaitags/itags/16S-V4-ver2/2-1992333/otu.tsv", sep= "") %>% 
  select(-taxonomy) ## remove non-numeric data
rownames(otumat.V4) <- paste(otumat.V4$OTU) ##make OTU numbers the row names
otumat.V4 <- otumat.V4 %>% 
  select(-OTU) ##remove OTU column

taxmat.V4 <- read.table("~/Documents/SFB_JGI_first_plate/SFBayDeltaitags/itags/16S-V4-ver2/2-1992333/tax.tsv", sep= "") %>%
  select(c(1,3))
colnames(taxmat.V4) <- c("OTU", "tax") ## Define columns
taxmat.V4<-taxmat.V4 %>% 
  separate(tax, into=c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), sep= ",") ## separate taxa names
rownames(taxmat.V4) <- paste0(taxmat.V4$OTU) ### Make rownames from the column "OTU"
taxmat.V4 <- taxmat.V4 %>% 
  select(-OTU) %>% ## remove column with OTU names
  as.matrix(.)

JGItre.V4<-read_tree("~/Documents/SFB_JGI_first_plate/SFBayDeltaitags/itags/16S-V4-ver2/2-1992333/otu.tre")

OTU.V4 = otu_table(otumat.V4, taxa_are_rows = TRUE)
TAX.V4 = tax_table(taxmat.V4)
ENV.V4= sample_data(envdat.corr)

physeq.V4 <- phyloseq(OTU.V4, TAX.V4, ENV.V4, JGItre.V4)
physeq.nd<- physeq.V4 %>% 
  filter_taxa(., function(x) sum(x > 2) > 1, TRUE) ### remove doubltons
```


```{r , echo=FALSE, message=FALSE, warning=FALSE}
### DESeq 2 variance stabilization and geometric mean stabilizes variance from different library size.
### This normailizationn and transformation process is based on a negative binomial hierarchical model. 
### In the formula, differences in libary size are accounted for. This transformation is basically a log 
### transformation at larger values with wider spread in lower count values. See "Waste not want not: why rarefying ### microbiome data is inadmissable" paper by McMurdie and Holmes to get the following info in more detail:
### The negative binomial distribution is used to model the counts, K, for OTU i, in sample j according to:
### Kij ~ NB(sj, mui, phii)
### where sj is a linear scaling factor for sample j that accounts for its library size, mui is the mean proportion ### for OTU i, and phii is the dispersion parameter for OTU i. The variance is vi = sj*mui+phii*sj^2*mui^2 , with the ### NB distribution becoming Poisson when phi = 0

library(DESeq2)

gm_mean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) ##geometric mean function
}

ps_dds1 = phyloseq_to_deseq2(physeq.nd, ~system) ##convert phyloseq object to deseq object, "treatments" are salinity level
geoMeans = apply(counts(ps_dds1), 1, gm_mean)
ps_dds1 = estimateSizeFactors(ps_dds1, geoMeans = geoMeans)
ps_dds1 = estimateDispersions(ps_dds1)
abund=getVarianceStabilizedData(ps_dds1) #this is the long processeing step, stabilizing the variance
rownames(abund)=substr(rownames(abund),1,5) %>% make.names(unique=T)

otusf1 <- as.numeric(row.names(otu_table(physeq.nd))) #Get original OTU names
abund2 <- abund
rownames(abund2) <- otusf1 #change new OTU names back to old OTU names so matches taxonomy and tree files
physeq.VST <- physeq.nd
otu_table(physeq.VST) <- otu_table(abund2, taxa_are_rows=T) #update the otu table of phyloseq object

```

```{r}
otusnorm <-as.data.frame(abund2)
write.csv(otusnorm, row.names=TRUE, "~/Documents/Stanford/Eco_stats_202/data/OTUsnormalized.csv")
```

```{r, echo=TRUE, message=FALSE, warning=FALSE}
envdat.phy <- as.data.frame(sample_data(physeq2))
write.csv(envdat.phy, row.names = TRUE, "~/Documents/Stanford/Eco_stats_202/data/envdat_phy.csv")
```