## R notebook for producing figures as in "Viromes from Northern Minnesota peat reveal distinct terrestrial and aquatic niches for globally distributed viral populations"
Anneliek ter Horst

- Accumulation plots
- Waffle plots
- Venn diagrams
- PCOA plots
- Lollipop chart
- Worldmap chart

In [None]:
# Load libraries and set working directory
setwd("/Users/Anneliek/R_scripts/")

library(vegan)
library(labdsv)
library(ggplot2)
library(plyr)
library(phyloseq)
library(pheatmap)  
library(tidyr)
library(RColorBrewer)
library(scales)
library(Ecfun)
library(waffle)
library(grid)
library(broom)
library(tidyverse)
library(ape)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(forcats)
library(scales)
library(ggpmisc)
library(maps)

## Accumulation plots
- in: viral OTU table from viromes
- viral OTU table from metagenomes from transect
- viral OTU table from metagenomes from chambers

In [None]:
# Accumulation curves for the virome and the two bulk-metagenome vOTU tables.
# Each table is read, transposed (specaccum() treats rows as sites), run
# through specaccum(), reshaped, and finally all three are drawn in one plot.

# Load in the virome OTU data (this table contains integers instead of values with a comma)
votu1 <- read.table("../Data/virome_ints.txt",
                    sep = "\t", header = TRUE, row.names = 1)
votu <- t(votu1)

# Load in the bulk OTU data
otu1 <- read.table("../Data/bulk_ints_own.txt",
                   sep = "\t", header = TRUE, row.names = 1)
otu <- t(otu1)

# Load in the bulk OTU data (bulk metaGs JGI)
all_otu1 <- read.table("../Data/bulk_ints_82.txt",
                       sep = "\t", header = TRUE, row.names = 1)
all_otu <- t(all_otu1)

# Calculate accumulation curves using the random method with 100 permutations
# for vOTUs
sp <- specaccum(votu, method = "random", permutations = 100)

# for 5 metaG
sp.otu <- specaccum(otu, method = "random", permutations = 100)

# for 82 metaG
sp.all_otu <- specaccum(all_otu, method = "random", permutations = 100)

# Reshape a specaccum permutation matrix (one row per number of sites, one
# column per permutation) into long format: Sites, Permutation, Species.
tidy_permutations <- function(perm_matrix) {
  # Name unnamed columns V1, V2, ... up front so as_tibble() does not have to
  # repair (and warn about) missing column names
  if (is.null(colnames(perm_matrix))) {
    colnames(perm_matrix) <- paste0("V", seq_len(ncol(perm_matrix)))
  }
  as_tibble(perm_matrix) %>%
    mutate(Sites = seq_len(nrow(.))) %>%
    gather(key = "Permutation", value = "Species", -Sites)
}

# Consensus richness per number of sites sampled, as a plain data frame.
consensus_richness <- function(accum) {
  data.frame(Sites = accum$sites, Species = accum$richness)
}

# Viromes: every permutation plus the consensus curve
perm <- sp$perm
perm.tidy <- tidy_permutations(perm)
richness <- consensus_richness(sp)

# Bulk metagenomes (own)
perm.otu <- sp.otu$perm
perm.otu.tidy <- tidy_permutations(perm.otu)
richness.otu <- consensus_richness(sp.otu)

# Bulk metagenomes (JGI)
perm.all_otu <- sp.all_otu$perm
perm.all_otu.tidy <- tidy_permutations(perm.all_otu)
richness.all_otu <- consensus_richness(sp.all_otu)

# Stack the three data sets, labelling each with its extraction type
all.perm <- rbind(mutate(perm.tidy, Extraction = "Viromes"),
                  mutate(perm.otu.tidy, Extraction = "Bulk_own"),
                  mutate(perm.all_otu.tidy, Extraction = "Bulk_all"))

all.richness <- rbind(mutate(richness, Extraction = "Viromes"),
                      mutate(richness.otu, Extraction = "Bulk_own"),
                      mutate(richness.all_otu, Extraction = "Bulk_all"))

# `palet` is not defined anywhere in this notebook; fall back to a
# three-colour palette only when the session does not already provide one
if (!exists("palet")) {
  palet <- c("#1b9e77", "#d95f02", "#7570b3")
}

# Points are the individual permutations, lines the consensus richness
accum_plot <- ggplot(all.perm, aes(Sites, Species, color = Extraction)) +
  geom_point(alpha = 0.5, shape = 1, size = 2.5) +
  #scale_color_brewer(palette = my_palette) +
  scale_colour_manual(values = palet) +
  geom_line(data = all.richness, size = 1) +
  theme_bw() +
  theme(text = element_text(size = 18))

# Print explicitly so the figure is also written when the code is source()d
pdf("accumplot_all_permute.pdf")
print(accum_plot)
dev.off()


## Statistical tests
- Mantel using pearson method
- PERMANOVA

In [None]:
# Mantel test (Pearson method) between the vOTU and the environmental
# distance matrices.
# NOTE(review): otu.dist and cat_meta.ordered are created in the PCoA cell
# further down, and env.dist is not created anywhere in this notebook, so
# all three must already exist in the session before this cell is run.
mantel(otu.dist, env.dist, permutations = 99999, method = "pearson", na.rm = TRUE)

# PERMANOVA for continuous data (depth).
# adonis() is deprecated in vegan; adonis2() takes the same formula and data.
pmanova <- adonis2(as.dist(otu.dist) ~ depth, data = cat_meta.ordered)

## Waffle plots 
- Square form of a pie chart
- You can alternatively load in a dataframe 

In [None]:
# Category counts are hardcoded here; a data frame could be loaded instead
parts <- c(
  "Previously unknown (61.8%)" = 2717,
  "Unclassified, previously uncovered (36.6%)" = 1609,
  "Taxonomically classified (1.5%)" = 67
)

# One fill colour per category, in the same order as `parts`
waffle_colours <- c("#005a32", "#41ab5d", "#a1d99b")

# Text sizes applied on top of the waffle plot
waffle_text <- theme(
  axis.title.x = element_text(size = 14),
  text = element_text(size = 14)
)

# Produce the waffle plot: counts are divided by 10 before drawing, 19 rows
pdf("figures/200819_waffle_plot.pdf")
waffle(parts / 10, rows = 19, size = 0.2, colors = waffle_colours) +
  waffle_text
dev.off()

## Venn diagrams
- in: csv file with 1 and 0 for each of the clusters and the categories present
- 0 means that vOTUs of that category aren't present in the cluster, 1 means they are

In [None]:
# Three-way Venn diagram of clusters shared between marine, soil and
# freshwater, built from a 0/1 presence table.

# draw.triple.venn() lives in VennDiagram, which the library cell at the top
# of the notebook does not load
library(VennDiagram)

# load in csv with 0 and 1 data
data_soil_water <- read.delim("../Data/200402_soil_water_ncbi.csv",
                              header = TRUE, sep = ",")

# define the sizes of each of the circles and overlaps in the venn:
# number of rows flagged 1 for each category / combination of categories
area1 <- nrow(subset(data_soil_water, marine == 1))
area2 <- nrow(subset(data_soil_water, soil == 1))
area3 <- nrow(subset(data_soil_water, freshwater == 1))
n12 <- nrow(subset(data_soil_water, marine == 1 & soil == 1))
n13 <- nrow(subset(data_soil_water, marine == 1 & freshwater == 1))
n23 <- nrow(subset(data_soil_water, soil == 1 & freshwater == 1))
n123 <- nrow(subset(data_soil_water, marine == 1 & soil == 1 & freshwater == 1))

# draw the venn, hardcoded the number of singleton clusters for each
venn.plot <- draw.triple.venn(
  area1 = area1 + 26018,
  area2 = area2 + 15661,
  area3 = area3 + 5396,
  n12 = n12,
  n13 = n13,
  n23 = n23,
  n123 = n123,
  category = c("marine", "soil", "freshwater"),
  fill = c("red", "blue", "green"),
  lty = "blank",
  cex = 2,
  cat.cex = 2,
  cat.col = c("red", "blue", "green"),
  euler.d = TRUE,
  scaled = TRUE
)

ggsave("../figures/200506_3way_marine_freshwater_soil.pdf", venn.plot)

# ggsave() opens and closes its own device, so the only device that can still
# be open is the one draw.triple.venn() drew on. Close it if there is one;
# an unconditional dev.off() errors when only the null device is left.
if (dev.cur() > 1) {
  dev.off()
}



## Boxplots
- In: table with the total number of populations from each sample


In [None]:
# Boxplot of the number of populations per sample, one box per column of
# the input table, on a logarithmic y axis.

# Load data in
total_pops <- read.table("../Data/total_populations_against_own_db.txt",
                         sep = "\t", header = TRUE)

# print means for both bulk and virome (populations)
colMeans(total_pops)

pdf("boxplot.pdf")

# Enlarge axis annotation and axis labels for this device. These were
# previously passed as positional arguments to boxplot(), which only worked
# through the side effect of evaluating the par() calls.
par(cex.axis = 1.3, cex.lab = 1.3)

# boxplot() is base graphics, so no ggplot2 theme is added to it
boxplot(total_pops,
        col = c("#1c5ec9", "#f6b420", "#fee391", "#df3123"), # box colors
        names = c("5 bulk", "2015 bulk", "2016 bulk", " 5 viromes"), # box names
        ylab = "viral genomes per sample (log scale)", # y-axis name
        log = "y") # logarithmic y axis

dev.off()


## PCOA plots
- In: Viral OTU table
- Table with available metadata

In [None]:
# PCoA of the vOTU table (Hellinger transform, Bray-Curtis dissimilarity),
# coloured by depth and shaped by year, written to a PDF.

# Load the files required and transpose
otu <- read.table("../Data/190807_JGI_samples_20152016.txt",
                  sep = "\t", header = TRUE, row.names = 1)

# transpose data
otu <- t(otu)

# remove rows with all zeros. Every column is checked: the previous filter
# skipped the first column, so a row whose only counts were in that column
# was dropped as if it were empty.
otu <- otu[rowSums(otu != 0) > 0, , drop = FALSE]

# categorical metadata
cat_meta <- read.table("../Data/191203_all_meta_correct.csv",
                       sep = ",", header = TRUE, row.names = 1)

# print head of the metadata and of the vOTU table
head(cat_meta)
head(otu)

# transform the data
otu.xform <- decostand(otu, method = "hellinger")

# create dissimilarity matrix for the vOTU table with bray method
otu.dist <- as.matrix(vegdist(otu.xform, method = "bray"))

# nmds
otu.nmds <- metaMDS(otu.dist)
otu.nmds$stress

# Rows are not matched by name automatically, so reorder the metadata to
# follow the row order of the ordination
cat_meta.ordered <- cat_meta[match(row.names(otu.nmds$points), row.names(cat_meta)), ]

# perform pcoa with ape package pcoa (namespace-qualified because the result
# is stored in a variable that is also called `pcoa`)
pcoa <- ape::pcoa(as.dist(otu.dist))

# make a dataframe named axes, put pcoa values in there
axes <- as.data.frame(pcoa$vectors)

# Give df extra column with the rownames in it
axes$SampleID <- rownames(axes)

# put the metadata in the same dataframe, with correct sample name
cat_meta.ordered$SampleID <- rownames(cat_meta.ordered)

# relative eigenvalue of each pcoa axis, as a percentage
eigval <- round(pcoa$values$Relative_eig * 100, digits = 2)

# merge those dfs on the sample name
axes <- merge(cat_meta.ordered, axes, by.x = "SampleID", by.y = "SampleID")

# Put those eigenvalues in a df so they are easy to get to
eigval <- data.frame(PC = seq_along(eigval), Eigval = eigval)
#head(eigval) # see top eigenvalues
eigval$Eigval[1:4] # percentages of the first four axes

axes

# set plot (the figure size is controlled by pdf(), not by ggplot())
p <- ggplot(axes, aes(Axis.1, Axis.2))

# set color of the points as the factor depth, shape as year, set size and see-throughness
pcoa_plot <- p +
  geom_point(aes(colour = as.character(depth), shape = as.character(Year)),
             size = 4, alpha = 0.7, stroke = 1) +

  # set text for the axis labels
  xlab(paste0("PCo1 (", eigval$Eigval[1], " %)")) +
  ylab(paste0("PCo2 (", eigval$Eigval[2], " %)")) +

  # dotted lines through the plot
  #geom_vline(xintercept = 0, linetype = 2) +
  #geom_hline(yintercept = 0, linetype = 2) +

  # set the colors of the points with the colorbrewer palette
  scale_color_brewer(name = "Depth", palette = "Dark2") +

  # you can set colors manually by this:
  #scale_color_manual(values = c("#E78AC3", "#A6D854", "#FFD92F", "#FFD92F")) +

  # set shapes of the points (two shapes are provided)
  scale_shape_manual(name = "Year", values = c(16, 17)) +

  # put the colour legend title on top, centred
  guides(color = guide_legend(title.position = "top", title.hjust = 0.5)) +
  theme_bw() +

  # set text size for whole graph, remove the grid lines, legend on the left
  theme(text = element_text(size = 20),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "left")

# write to pdf; print explicitly so this also works when the code is source()d
pdf("depth_year20152016.pdf")
print(pcoa_plot)
dev.off()

## Lollipop chart
- in: Table with all the Rho values that were tested

In [None]:
# Lollipop chart of the significant Rho values (P < 0.01).

# Load in the data
rho_vals <- read.table("../Data/200211_depth_comm_comp.txt",
                       sep = "\t", header = TRUE)

# keep only p vals under 0.01 (which() also drops rows where P is NA)
rho_sign <- rho_vals[which(rho_vals$P < 0.01), ]
rho_sign

# Build the figure: v2 on the x axis ordered by descending depht_rho, Rho on
# the y axis, points coloured by v1 and joined per v2 by a line.
# NOTE(review): `depht_rho` looks like a misspelling of "depth_rho" - confirm
# against the column names of the input table before changing it.
rho_plot <- rho_sign %>%
  mutate(v2 = fct_reorder(v2, desc(depht_rho))) %>%
  ggplot(aes(x = v2, y = Rho)) +
  geom_line(aes(group = v2), alpha = 0.7, size = 0.5) +
  geom_point(aes(color = v1), size = 3) +
  scale_color_brewer(palette = "Set2") +
  # you can set colors manually by this:
  #scale_color_manual(values = c("#E78AC3", "#A6D854")) +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 80, hjust = 1),
        text = element_text(size = 18))

# Write to PDF; print explicitly so this also works when the code is source()d
pdf("../figures/200416_Rho_depthviralcc_all.pdf")
print(rho_plot)
dev.off()


## Worldmap chart
- in: Table with, for each location, the x and y coordinates, the number of vOTUs found there, and the color you want its dot to have

In [None]:
# World map with one point per location, sized by the square root of the
# number of viruses found there and coloured by the `color` column.

# read in the datapoints of clusters found at different places
# NOTE(review): this path starts with "Data/" while the other cells read from
# "../Data/" - confirm which one is intended.
VCs <- read.csv("Data/200408num_of_virus_on_map.csv",
                sep = ",", header = TRUE, row.names = 1)
# show table
head(VCs)

# pull the world map (namespace-qualified because a variable called
# `map_data` is created further down)
world_map <- ggplot2::map_data("world")

# Create a base plot with ggplot2
p <- ggplot() +
  coord_fixed() +
  xlab("") +
  ylab("")

# Add map to base plot
base_world_messy <- p +
  geom_polygon(data = world_map, aes(x = long, y = lat, group = group),
               colour = "#b8e186", fill = "#b8e186")

# Strip the map down of the lat/long lines.
# legend.position was set to the colour string "#e7f2fb", which is not a
# valid position; "none" states the legend-free result explicitly.
cleanup <- theme(panel.grid.major = element_blank(),
                 panel.grid.minor = element_blank(),
                 panel.background = element_rect(fill = "#e7f2fb", colour = "#e7f2fb"),
                 axis.line = element_line(colour = "#e7f2fb"),
                 legend.position = "none",
                 axis.ticks = element_blank(),
                 axis.text.x = element_blank(),
                 axis.text.y = element_blank())

# clean up the messy world
base_world <- base_world_messy + cleanup

# map points to this world; alpha uses factor(color) like the colour
# aesthetic, because scale_alpha_manual() needs a discrete variable
map_data <- base_world +
  geom_point(data = VCs,
             aes(x = lon, y = lat, size = sqrt(tot_viruses),
                 color = factor(color), alpha = factor(color))) +
  scale_color_manual(name = "color", labels = c("SPRUCE", "Other soils"),
                     values = c("#de77ae", "#8e0152")) +
  scale_alpha_manual(name = "color", values = c(0.5, 0.8))

# write to PDF
pdf("200511_viruses_on_map_green.pdf")

# print to file
print(map_data)

dev.off()
