# BCEENet_LandscapeGenomicsCURE.R

## Set up environment

# Install the BceenetPCAPackage sub-directory of the stjohn3/R_packages GitHub
# repository, then load it.
# NOTE(review): `force=TRUE` is passed, so this presumably reinstalls the
# package (and needs network access) every time the script is run -- confirm
# that this is intended for deployed copies of the app.
devtools::install_github("stjohn3/R_packages",subdir="BceenetPCAPackage", force=TRUE)

library(BceenetPCAPackage)


library(shiny)
library(shinydashboard)
library(base)
library(datasets)
library(graphics)
library(grDevices)
library(methods)
library(stats)
library(seqinr)
library(tidyverse)
library(dplyr)
library(magrittr)
library(factoextra)
library(stringr)
library(stringi)
library(utils)
library(ggrepel)
library(fuzzyjoin)
library(RColorBrewer)
library(scales)
library(sjmisc)
library(ggpubr)
library(DT)



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Functions --------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Extract candidate voucher (catalog) numbers from the headers of a FASTA file.
#
# Arguments:
#   file.path.fasta  Path to the FASTA file to read.
#
# Returns a data frame with one row per header that yielded a usable number:
#   annotation               the full FASTA header
#   potential.catalognumber  the header token chosen as the voucher candidate
#   catalog.number.only      the first run of digits in that token
# Headers whose digit run is 3 characters or shorter (or that have no
# candidate at all) are dropped.
get.potential.voucher.numbers <- function(file.path.fasta) {
  # Read the FASTA file and keep one header string per sequence.
  temp.fasta <- read.fasta(file = file.path(file.path.fasta))
  annotation.list <- data.frame(
    annotation = as.character(unlist(getAnnot(temp.fasta))),
    stringsAsFactors = FALSE
  )

  # Split a header on `sep` and return the first token that contains none of
  # ">", "-", "(" and that matches `required.pattern`; NA when no token does.
  pick.candidate <- function(header, sep, required.pattern) {
    tokens <- scan(text = header, what = "", sep = sep, quiet = TRUE)
    keep <- !stringr::str_detect(tokens, ">") &
      !stringr::str_detect(tokens, "-") &
      !stringr::str_detect(tokens, "\\(") &
      stringr::str_detect(tokens, required.pattern)
    # which() drops NA tests (scan() turns the token "NA" into a missing value).
    candidates <- tokens[which(keep)]
    if (length(candidates) == 0) {
      return(NA_character_)
    }
    candidates[1]
  }

  # Headers are sometimes separated by spaces and sometimes by underscores.
  # A header with more than one whitespace-delimited token is treated as
  # space separated (candidate must contain a digit); otherwise it is split
  # on "_" (candidate must contain "MVZ").
  annotation.list$potential.catalognumber <- vapply(
    annotation.list$annotation,
    function(header) {
      token.count <- length(scan(text = header, what = "", sep = "", quiet = TRUE))
      if (token.count > 1) {
        pick.candidate(header, sep = "", required.pattern = "\\d")
      } else {
        pick.candidate(header, sep = "_", required.pattern = "MVZ")
      }
    },
    character(1),
    USE.NAMES = FALSE
  )

  # Strip any prefix from the voucher number and discard short "straggler"
  # numbers; rows without a candidate (NA) are removed by the filter as well.
  annotation.list <- annotation.list %>%
    dplyr::mutate(catalog.number.only = str_extract(potential.catalognumber, "[0-9]+")) %>%
    dplyr::filter(str_length(catalog.number.only) > 3)

  annotation.list
}

# Match voucher numbers taken from FASTA headers to VertNet specimen records.
#
# Arguments:
#   input.file  Data frame with the columns `annotation`,
#               `potential.catalognumber` and `catalog.number.only`
#               (the output of get.potential.voucher.numbers()).
#
# Reads the object `ecoregions`, which is not defined in this file
# (presumably a data set supplied by BceenetPCAPackage -- confirm).
#
# Returns a data frame with the columns scientific, catalognum, decimallat,
# decimallon, EcoLabel and annotation: one row per (VertNet record, FASTA
# header) pair whose catalog number matches and whose genus is confirmed by
# the header text. The result has zero rows when nothing matches.
#
# Unlike earlier versions, no scratch objects are written to the global
# environment.
match.vernet.to.fasta <- function(input.file) {
  annotation.list <- input.file

  # Keep only the relevant columns of the ecoregion data frame.
  lat.long <- ecoregions %>%
    dplyr::select(scientific, catalognum, decimallat, decimallon, New_label) %>%
    dplyr::mutate(catalog.number.only = str_extract(catalognum, "(?<=:)[0-9]+")) %>%
    dplyr::mutate(catalog.number.only = ifelse(is.na(catalog.number.only) == TRUE, catalognum, catalog.number.only)) %>%
    unique()
  lat.long <- as.data.frame(lat.long)

  vertnet.ids <- as.character(lat.long$catalognum)
  fasta.ids <- as.character(annotation.list$catalog.number.only)

  # For every FASTA voucher number, find the VertNet catalog numbers that
  # contain it as a literal substring.
  pair.list <- lapply(seq_along(fasta.ids), function(a) {
    if (is.na(fasta.ids[a])) {
      return(NULL)
    }
    vertnet.rows <- which(grepl(fasta.ids[a], vertnet.ids, fixed = TRUE))
    data.frame(
      vertnet.row = vertnet.rows,
      fasta.row = rep(a, length(vertnet.rows))
    )
  })
  pairs <- do.call(rbind, pair.list)
  if (is.null(pairs)) {
    pairs <- data.frame(vertnet.row = integer(0), fasta.row = integer(0))
  }
  # Same row order as a VertNet-outer / FASTA-inner nested loop.
  pairs <- pairs[order(pairs$vertnet.row, pairs$fasta.row), , drop = FALSE]

  vertnet.side <- lat.long[pairs$vertnet.row, , drop = FALSE]
  fasta.side <- annotation.list[pairs$fasta.row, , drop = FALSE]

  matching.id.data.frame <- data.frame(
    scientific = as.character(vertnet.side$scientific),
    catalognum = vertnet.side$catalognum,
    decimallat = vertnet.side$decimallat,
    decimallon = vertnet.side$decimallon,
    EcoLabel = vertnet.side$New_label,
    vernet.catalog.number.only = vertnet.side$catalog.number.only,
    annotation = as.character(fasta.side$annotation),
    potential.catalognumber = fasta.side$potential.catalognumber,
    genbank.catalog.number.only = fasta.side$catalog.number.only,
    stringsAsFactors = FALSE
  )

  # Genus = first whitespace-delimited word of the VertNet scientific name.
  matching.id.data.frame$Genus <- vapply(
    matching.id.data.frame$scientific,
    function(name) scan(text = name, what = "", quiet = TRUE)[1],
    character(1),
    USE.NAMES = FALSE
  )

  # TRUE only when `pattern` is a literal substring of `text`; a missing
  # value on either side counts as "not contained".
  contains <- function(text, pattern) {
    isTRUE(grepl(pattern, text, fixed = TRUE))
  }

  # Double check that each matched ID belongs to the right genus: the header
  # must mention the VertNet genus, or the alternative spelling/name that is
  # accepted for two specific genera.
  matching.id.data.frame$matching <- vapply(
    seq_len(nrow(matching.id.data.frame)),
    function(i) {
      genus <- matching.id.data.frame$Genus[i]
      header <- matching.id.data.frame$annotation[i]
      confirmed <- contains(header, genus) ||
        (identical(genus, "Artemisiospiza") && contains(header, "Amphispiza")) ||
        (identical(genus, "Cyanocitta") && contains(header, "Cyanosita"))
      as.numeric(confirmed)
    },
    numeric(1)
  )

  Matched.vertnet.fasta.samples <- matching.id.data.frame %>%
    dplyr::filter(matching == 1) %>%
    dplyr::select(scientific:EcoLabel, annotation)

  Matched.vertnet.fasta.samples
}

# Re-read a FASTA file and keep only the sequences that were matched.
#
# Arguments:
#   file.path.fasta            Path to the FASTA file.
#   list.matching.annotations  Data frame with an `annotation` column.
#
# Returns the subset of the sequence list selected by
# str_contains(annotations, sequence names).
# NOTE(review): this relies on str_contains() returning one logical per
# sequence name (TRUE when the name occurs in the annotations) -- confirm
# against the sjmisc documentation.
subset.fasta.file <- function(file.path.fasta, list.matching.annotations) {
  all.sequences <- read.fasta(file = file.path(file.path.fasta))
  matched.annotations <- list.matching.annotations$annotation
  is.matched <- str_contains(matched.annotations, names(all.sequences)) == TRUE
  all.sequences[is.matched]
}

# Turn a list of sequences into the numeric table used as PCA input.
#
# Arguments:
#   subsetted.fasta  Named list of sequences (as returned by
#                    subset.fasta.file()).
#
# Steps:
#   1) break every sequence into single characters and convert them to
#      numbers with s2n() (A = 0, C = 1, G = 2, T = 3; anything else NA),
#   2) for every position compute the proportion of individuals carrying
#      each nucleotide,
#   3) keep positions whose most common nucleotide has a proportion <= 0.95,
#   4) replace remaining missing values with 0.01.
#
# Returns a data frame with one row per individual and one column per kept
# position. Stops with an error when fewer than two sequences are supplied.
#
# Unlike earlier versions, the intermediate table is kept local instead of
# being written to the global environment.
make.pca.data.frame <- function(subsetted.fasta) {
  fasta.data <- as.alignment(
    nb = length(subsetted.fasta),
    nam = names(subsetted.fasta),
    seq = getSequence(subsetted.fasta),
    com = NA
  )

  #### get number of individuals in dataset ####
  num.inds <- as.numeric(fasta.data$nb)
  if (num.inds < 2) {
    stop("At least two sequences are needed to build the PCA input.", call. = FALSE)
  }

  #### one row of nucleotide codes per individual ####
  # NOTE(review): the table is deliberately grown with rbind() from an empty
  # data frame. The column names appear to be generated by rbind() from the
  # first row (names beginning with "X"), and the starts_with("X") selection
  # further down depends on that naming -- confirm before restructuring.
  store.df <- data.frame()
  for (i in seq_len(num.inds)) {
    ind.sample <- unlist(str_split(unlist(fasta.data$seq[[i]]), pattern = ""))
    store.df <- rbind(store.df, s2n(ind.sample))
  }

  #### count each nucleotide code per position ####
  count.code <- function(code) {
    vapply(
      store.df,
      function(column) as.numeric(sum(column == code, na.rm = TRUE)),
      numeric(1),
      USE.NAMES = FALSE
    )
  }

  count.of.SNPS.seperated <- data.frame(
    position = as.character(names(store.df)),
    count.A = count.code(0), # zero is A
    count.T = count.code(3), # three is T
    count.C = count.code(1), # one is C
    count.G = count.code(2), # two is G
    stringsAsFactors = FALSE
  )

  count.of.SNPS.seperated <- count.of.SNPS.seperated %>%
    dplyr::mutate(
      individuals = num.inds,
      prop.A = (count.A / num.inds),
      prop.T = (count.T / num.inds),
      prop.C = (count.C / num.inds),
      prop.G = (count.G / num.inds)
    )

  #### filter loci ####
  # similarity = proportion of the most common nucleotide at a position.
  list.of.positions.to.keep.for.PCA <- count.of.SNPS.seperated %>%
    dplyr::select("position", "prop.A", "prop.T", "prop.C", "prop.G") %>%
    pivot_longer(
      cols = prop.A:prop.G,
      names_to = "Nucleotide",
      values_to = "Proportion"
    ) %>%
    group_by(position) %>%
    dplyr::summarise(similarity = max(Proportion)) %>%
    dplyr::filter(similarity <= .95)

  #### PCA input ####
  kept.positions <- list.of.positions.to.keep.for.PCA$position
  pca.graph.data <- store.df[, kept.positions, drop = FALSE] %>%
    dplyr::select(starts_with("X"))

  # Replace missing calls with 0.01, column by column, so the result keeps
  # its individuals-by-positions shape.
  pca.graph.data[] <- lapply(pca.graph.data, function(column) {
    column[is.na(column)] <- 0.01
    column
  })
  rownames(pca.graph.data) <- NULL

  pca.graph.data
}

# Run the PCA and attach the specimen metadata to the first two components.
#
# Arguments:
#   fasta.to.pca.data          Numeric table, one row per sequence
#                              (output of make.pca.data.frame()).
#   matched.vertnet.and.fasta  Metadata with an `annotation` column and the
#                              columns `scientific` ... `EcoLabel`.
#   subsetted.fasta            Named list of the sequences behind the rows of
#                              `fasta.to.pca.data`, in the same order.
#
# Returns a data frame with the columns annotation, scientific ... EcoLabel,
# PC1 and PC2, one row per sequence, with the sequence names as row names.
#
# Each PCA row is paired with its metadata row by sequence name rather than
# by position, so the result is correct even when the FASTA file is not
# sorted by header. When several metadata rows belong to one sequence the
# first one is used.
run.pca.analysis <- function(fasta.to.pca.data, matched.vertnet.and.fasta, subsetted.fasta) {
  sequence.names <- names(subsetted.fasta)

  locus.pca <- prcomp(as.data.frame(fasta.to.pca.data))
  row.names(locus.pca$x) <- sequence.names
  pc.scores <- locus.pca$x[, c(1:2)]

  metadata <- as.data.frame(matched.vertnet.and.fasta)
  annotation.text <- as.character(metadata$annotation)

  # Sequence name as it appears in the header: the first space-delimited
  # word without its leading ">".
  # NOTE(review): assumes the names of `subsetted.fasta` were derived from
  # the headers in this way -- confirm against the FASTA reader in use.
  first.word <- vapply(
    strsplit(annotation.text, " ", fixed = TRUE),
    function(words) words[1],
    character(1)
  )
  annotation.names <- substr(first.word, 2, nchar(first.word))
  metadata.row <- match(sequence.names, annotation.names)

  # Fall back to the first header that merely contains the sequence name.
  for (k in which(is.na(metadata.row))) {
    containing <- which(grepl(sequence.names[k], annotation.text, fixed = TRUE))
    if (length(containing) > 0) {
      metadata.row[k] <- containing[1]
    }
  }
  if (anyNA(metadata.row)) {
    stop(
      "No metadata row found for sequence(s): ",
      paste(sequence.names[is.na(metadata.row)], collapse = ", "),
      call. = FALSE
    )
  }

  aligned.metadata <- metadata[metadata.row, , drop = FALSE]
  rownames(aligned.metadata) <- NULL

  PCA.ggplot.data <- cbind(pc.scores, aligned.metadata)

  # annotation, then every column from `scientific` through `EcoLabel`,
  # then the two components.
  column.names <- names(PCA.ggplot.data)
  first.column <- match("scientific", column.names)
  last.column <- match("EcoLabel", column.names)
  keep <- unique(c("annotation", column.names[first.column:last.column], "PC1", "PC2"))

  PCA.ggplot.data[, keep, drop = FALSE]
}

# Proportion of variance explained by the first two principal components.
#
# Arguments:
#   fasta.to.pca.data          Numeric table passed to prcomp().
#   matched.vertnet.and.fasta  Not used by this function.
#   subsetted.fasta            Not used by this function.
#
# Returns a one-row matrix taken from the "Proportion of Variance" row of
# the PCA summary, restricted to its first two columns.
PC.Labels <- function(fasta.to.pca.data, matched.vertnet.and.fasta, subsetted.fasta) {
  pca.fit <- prcomp(as.data.frame(fasta.to.pca.data))

  # summary()'s importance table, flattened into a data frame
  importance.table <- as.data.frame(summary(pca.fit)["importance"])
  variance.row <- importance.table["Proportion of Variance", 1:2]

  as.matrix(variance.row)
}

# Summarise the PCA results per ecoregion.
#
# Arguments:
#   data.frame.graph.pca  Data frame with the columns EcoLabel, decimallat,
#                         decimallon, PC1, PC2 and catalognum.
#   title.input           Not used by this function.
#
# Returns one row per EcoLabel with the number of individuals, the latitude
# and longitude ranges (as "min - max" text), the mean PC1 and PC2, and a
# comma-separated list of catalog numbers.
make.table.of.ecoregiongroups <- function(data.frame.graph.pca, title.input) {
  by.ecoregion <- group_by(data.frame.graph.pca, EcoLabel)

  dplyr::summarise(
    by.ecoregion,
    number.inds = n(),
    latitude = paste(min(decimallat), "-", max(decimallat)),
    longitude = paste(min(decimallon), "-", max(decimallon)),
    PC1 = mean(PC1),
    PC2 = mean(PC2),
    list.IDs = str_c(catalognum, collapse = ",")
  )
}

# Draw the PC1 / PC2 scatter plot, one point per specimen.
#
# Arguments:
#   data.frame.graph.pca  Data frame with PC1, PC2, EcoLabel, decimallat and
#                         decimallon (output of run.pca.analysis()).
#   title.input           Plot title; converted with toString().
#   PC.lables.input       Proportions of variance for PC1 and PC2 (output of
#                         PC.Labels()); shown as percentages in the axis
#                         labels.
#
# Reads the object `colors`, which is not defined in this file (presumably a
# lookup table with decimallat, decimallon and color supplied by
# BceenetPCAPackage -- confirm).
#
# Returns the ggplot object, including its title.
plot.PCA.Results <- function(data.frame.graph.pca, title.input, PC.lables.input) {
  # Attach the point colour for each coordinate pair.
  PCA.ggplot.data <- left_join(data.frame.graph.pca, colors, by = c("decimallat", "decimallon")) %>%
    unique()

  pc1.label <- paste0("PC1 ", round(PC.lables.input[1] * 100, digits = 2), "%")
  pc2.label <- paste0("PC2 ", round(PC.lables.input[2] * 100, digits = 2), "%")

  Final.PCA.plot <- ggplot(data = PCA.ggplot.data, aes(x = PC1, y = PC2, colour = color)) +
    geom_point(
      aes(shape = EcoLabel),
      size = 4,
      stroke = 2,
      position = position_jitterdodge(jitter.width = .9, jitter.height = .9, dodge.width = .9),
      alpha = 1
    ) +
    # `color` already holds the colour to draw, so use it as is.
    scale_colour_identity() +
    scale_shape_manual(values = c(
      "Northern California Coast Ranges and Coast" = 16,
      "Central California Coast Ranges and Coast" = 17,
      "Klamath Mountains" = 18,
      "Southern California Coast" = 19,
      "Central Valley" = 3,
      "Southern California Mountains and Valleys" = 4,
      "Sierra Nevada" = 5,
      "Southern Cascades" = 6,
      "Mojave Sonoran Desert" = 8,
      "Modoc Plateau" = 1,
      "Basin" = 11,
      "Colorado Desert" = 0
    )) +
    labs(x = pc1.label, y = pc2.label) +
    theme_classic(18) +
    theme(legend.position = "none")

  # Return the complete plot. The title is added with a plain `+`: piping
  # into return() here would only wrap ggtitle(), because %>% binds more
  # tightly than +.
  Final.PCA.plot + ggtitle(toString(title.input))
}



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# UI ---------------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Define UI for data upload app ----
# Dashboard layout: a header, a sidebar with the FASTA upload, the graph
# title box and external links, and a body with the PCA plot above the
# ecoregion summary table.
ui <- dashboardPage(
  skin = "green",
  # tags$head(tags$style(HTML('.main-header .logo {
  #   font-family: "Georgia", Times, "Times New Roman", serif;
  #   font-weight: bold;
  #   font-size: 24px; }'),

  # App title ----
  dashboardHeader(
    title = "Landscape Genetics CURE (BCEENET)",
    titleWidth = 750
  ),

  # Sidebar layout with input and output definitions ----
  dashboardSidebar(

    # Input: Select a file ----
    # A single file only: the server passes the uploaded path straight to the
    # FASTA reader, which takes one path at a time.
    fileInput("file1", "Choose Fasta File",
      multiple = FALSE,
      accept = c(
        "text/csv",
        "text/comma-separated-values,text/plain",
        ".fasta",
        ".fa"
      )
    ),

    # Horizontal line ----
    tags$hr(),

    textInput("graph_title", label = h4("Type Graph Title Here"), value = "Enter text..."),

    # Horizontal line ----
    tags$hr(),

    # About info
    menuItem("Open for More Info",
      tabName = "More Info",
      menuSubItem("BCEENET info",
        icon = icon("home"),
        href = "https://bceenetwork.org",
        newtab = TRUE
      ),
      menuSubItem("Source code (Github)",
        icon = icon("github"),
        href = "https://github.com/BNHM/BCEENET-Shiny",
        newtab = TRUE
      ),
      menuSubItem("Activity 3-Genetic Analysis DOC",
        icon = icon("file"),
        href = "https://github.com/BNHM/BCEENET-Shiny/blob/831954ec4e7ae638152064d1deed33ac07ef599b/Activity3-Genetic_Analysis_v1.docx",
        newtab = TRUE
      ),
      menuSubItem("Activity 4-Interpret the PCA DOC",
        icon = icon("file"),
        href = "https://github.com/BNHM/BCEENET-Shiny/blob/831954ec4e7ae638152064d1deed33ac07ef599b/Activity4-Interpret_the_PCA_v1.docx",
        newtab = TRUE
      )
    )
  ),

  dashboardBody(

    # Output: PCA plot ----
    fluidRow(
      mainPanel(plotOutput(outputId = "pca_plot"))
    ),

    # Output: Data file ----
    fluidRow(
      mainPanel(DTOutput(outputId = "pca_table_output"))
    )
  )
)



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Server -----------------------------------------------------------------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Define server logic to read selected file
# Server logic: run the FASTA -> PCA pipeline once per uploaded file and
# feed both the plot and the summary table from that single result.
server <- function(input, output) {

  # Shared analysis. Nothing is computed (and both outputs stay empty) until
  # a file has been uploaded.
  pca.analysis <- shiny::reactive({
    shiny::req(input$file1)
    fasta.path <- input$file1$datapath

    matched.ID.list <- get.potential.voucher.numbers(fasta.path) %>%
      match.vernet.to.fasta()

    # Show a readable message instead of a downstream error when nothing in
    # the upload could be linked to a specimen record.
    shiny::validate(
      shiny::need(
        nrow(matched.ID.list) > 0,
        "None of the sequences in this file could be matched to a specimen record."
      )
    )

    # Read and subset the FASTA once; every later step reuses this object.
    matched.fasta <- subset.fasta.file(fasta.path, matched.ID.list)
    PCA.dataframe <- make.pca.data.frame(matched.fasta)

    list(
      results = run.pca.analysis(PCA.dataframe, matched.ID.list, matched.fasta),
      labels = PC.Labels(PCA.dataframe, matched.ID.list, matched.fasta)
    )
  })

  output$pca_plot <- renderPlot({
    analysis <- pca.analysis()

    plot.PCA.Results(analysis$results, input$graph_title, PC.lables.input = analysis$labels)
  })

  # DT:: is spelled out because more than one attached package provides a
  # function called renderDataTable.
  output$pca_table_output <- DT::renderDataTable({
    analysis <- pca.analysis()

    output.table <- make.table.of.ecoregiongroups(analysis$results, input$graph_title)

    datatable(output.table,
      escape = FALSE,
      extensions = c("Buttons"),
      options = list(
        "dom" = 'tB',
        buttons = list(list(extend = 'copy', title = NULL)),
        pageLength = 100
      ),
      rownames = FALSE
    )
  })
}

# Run the app ----
# Build the Shiny application from the `ui` and `server` objects defined above.
 shinyApp(ui, server)