Patterns_genetic_diversity_differentiation

####PACKAGES####
library(readxl)
library(xlsx)
library(dplyr)
library(radiator)
library(ggplot2)
library(RColorBrewer)
library(mapplots)
library(fields)
library(LEA) #BiocManager::install(c("LEA"))
library(adegenet)
library(reshape2)
library(hierfstat)
library(ape)
library(pcadapt)
library(OutFLANK) #devtools::install_github("StoreyLab/qvalue") AND THEN devtools::install_github("whitlock/OutFLANK")
library(genepop)
library(strataG)
library(tidyr)
library(stackr) #devtools::install_github("thierrygosselin/stackr")
library(igraph)
library(StAMPP)
library(poppr)
library(DECIPHER)

#------------------------------------------------------------------------------------------#
####GENETIC DIVERSITY STATISTICS####

#here we are using the dataset that has been filtered
#loci_filtered_df - end product of Loci_depth_filtering in Data_filtering folder

#Ho, He and Fis#
hierf <- write_hierfstat(loci_filtered_df) #convert dataframe to hierfstat structure

hierf_stats <- basic.stats(hierf) #retrieve Ho, Hs and Fis from data

hierf_Ho <- as.data.frame(hierf_stats[["Ho"]])
hierf_Hs <- as.data.frame(hierf_stats[["Hs"]])
hierf_Fis <- as.data.frame(hierf_stats[["Fis"]])

#calculate the mean per area
hierf_Ho_mean <- hierf_Ho %>% gather("Area", "Ho", 1:5) %>% group_by(Area) %>% summarise(mean = mean(Ho)) %>% spread(1, 2, 2)
hierf_Hs_mean <- hierf_Hs %>% gather("Area", "Hs", 1:5) %>% group_by(Area) %>% summarise(mean = mean(Hs)) %>% spread(1, 2, 2)
hierf_Fis_mean <- hierf_Fis %>% gather("Area", "Fis", 1:5) %>% filter(Fis != "NaN") %>% group_by(Area) %>% summarise(mean = mean(Fis)) %>% spread(1, 2, 2)

Summary_table <- rbind(hierf_Ho_mean, hierf_Hs_mean, hierf_Fis_mean)

#Ar - allelic richness#

Ar <- allelic.richness(hierf,min.n=NULL,diploid=TRUE)
Ar <- as.data.frame(Ar$Ar)
Ar <- gather(Ar, "Area", "Ar", 1:n) #n = columns to gather
Ar <- aggregate(Ar~Area, Ar, mean)

#Pi - nucleotide diversity#

#nucleotide diversity - data from Stacks (populations.sumstats.tsv) - NB replicate removed prior to running populations again, this can just be done by removing the indiviual from the population file

populations.sumstats <- read.delim("/path/to/file/populations.sumstats.tsv", comment.char="#")

populations.sumstats <- rename(populations.sumstats, LOCUS = Locus.ID, POS = Col)
populations.sumstats$LOCUS <- as.character(populations.sumstats$LOCUS) #need loci to be read as character
populations.sumstats$POS <- populations.sumstats$POS +1 #To correct the POS need at add 1 is using SNP data
populations.sumstats$POS <- as.character(populations.sumstats$POS) #need SNP position to be read as character

Pi <- semi_join(populations.sumstats, loci_filtered_df, by = LOCUS) #select only those loci common between the dataframes and remove 'by = LOCUS' is using SNP data

mean_Pi <- aggregate(Pi~Pop.ID, Pi, mean) #If the sum is taken you get the haplotype diversity

#------------------------------------------------------------------------------------------#
####GENETIC DIFFERENTIATION####

#Fst#

Strata <- write_gtypes(loci_filtered_df)
StrataFst <- popStructTest(Strata, nrep = 100000, stats = "fst", type = "both", keep.null = FALSE, quietly = FALSE, max.cores = NULL)

#extract p-values and adjust
p.adjust(pvalue/s,method="fdr") #this can be done manually typing pvalues in or making an adjust pvalue column

#AMOVA#

poppr <- write_genind(loci_filtered_df)

#a way to define population hierarchy if needed - as an example
population_hierarchy <- c()
#note that this order is dependant on the order of your individuals in your data
population_hierarchy$Pop <- as.character(c(rep("state1", no_indv), rep("state2", no_indv), rep("state3", no_indv), rep("state1", no_indv), rep("state3", no_indv)))
population_hierarchy$Subpop <- as.character(c(rep("location_w/i_state1", no_indv), rep("location_w/i_state2", no_indv), rep("location_w/i_state3", no_indv), rep("location_w/i_state1", no_indv), rep("location_w/i_state3", no_indv)))
population_hierarchy <- as.data.frame(population_hierarchy)

#adding this info back into the poppr dataframe that was created
poppr$other$population_hierarchy <- population_hierarchy
strata(poppr) <- data.frame(other(poppr)$population_hierarchy)

poppr.gen <- as.genclone(poppr) #convert to genclone
poppr.amova <- poppr.amova(poppr.gen, hier = ~Pop, missing = "ignore", cutoff = "ignore", method = "ade4", clonecorrect = FALSE, within = FALSE, dist = NULL, squared = FALSE, freq = TRUE, correction = "quasieuclid", sep = "_", filter = FALSE, threshold = 0, algorithm = "farthest_neighbor", threads = 1L, quiet = FALSE)
set.seed(1000)
poppr_signif   <- randtest(poppr.amova, nrepet = 100000) #p.adjust.method = "bonferroni" only if using hierarchical populations

#------------------------------------------------------------------------------------------#
####GENETIC CLUSTERING####

#DAPC#
DAPC <- write_genind(loci_filtered_df)

mycol = col_choice = brewer.pal(5, "Dark2")

xvalDapc(tab(DAPC, NA.method = "mean"), pop(DAPC)) #cross-validation to suggest the number of PCs to retain in the analysis
#https://grunwaldlab.github.io/Population_Genetics_in_R/DAPC.html good explanation for cross-validation

set.seed(999) #PCs to retain was 30 and another more in-depth cross validation was done to be certain
system.time(DAPCX <- xvalDapc(tab(DAPC, NA.method = "mean"), pop(DAPC),
                              n.pca = 20:40, n.rep = 1000,
                              parallel = "snow", ncpus = 4L))

#the xval shows that 30 pcs should be retained

dapc1 <- dapc(DAPC, DAPC$pop, n.pca = 30, n.da = 2) #CHANGE PCs and 2 DFs - PCs graph and DFs
 
scatter(dapc1, col = mycol, cex = 2, legend = TRUE, clabel = F, posi.leg = "topright", scree.pca = TRUE,
        posi.pca = "topleft", posi.da = "bottomleft", cleg = 0.75)
        
#PCA#
PCA <- write_genepop(loci_filtered_df)

stacks_pop <- read.genepop(PCA, ncode = 3L, quiet = FALSE)

stacks_pop_freq <- tab(stacks_pop, freq=TRUE, NA.method="mean")

stacks_pop_PCA <- dudi.pca(stacks_pop_freq, center=TRUE, scale=FALSE, scannf = F, nf = 2)

barplot(stacks_pop_PCA$eig, main="PCA eigenvalues", col=heat.colors(length(stacks_pop_PCA$eig))) #determine number of PCs to retain

col_choice = brewer.pal(5, "Set1") #using Rcolorbrewer

#plotting for axes x=1 and y=2. If want to see individuals s.label(stacks_pop_PCA$li)
s.class(stacks_pop_PCA$li, fac=pop(stacks_pop), col=col_choice)

add.scatter.eig(stacks_pop_PCA$eig[1:10],2,1,2, ratio=.15)

#if want to get a structure-like plot using LEA#
write_structure(loci_filtered_df, parallel.core = 1, filename = "./structure")

structure <- read.delim("./structure.str", header=FALSE) #for some reason still keeping the header
str_df <- structure[2:173,] #remove header

str_df <- rename(str_df, "Individual" = "V1")
str_df$Individual <- as.character(str_df$Individual)

#prepare coordinate file (e.g., coords) where you have the individual's and the long and lat of their collection location and location name, and join to str_df
coords_str <- left_join(str_df, coords, by = "Individual")

#Need to bring Long and Lat to columns 1 and 2 for correct formatting of .str df. Remove column one that contains the names of the individuals.
coords_str <- coords_str[,c(986, 987, 2:985)] #these numbers will change depending on your data - c(long, lat, loci_1:loci_n)

write.table(coords_str, "./coords_str.str", sep="\t", row.names = FALSE, col.names = FALSE)

input.file = "./coords_str.str"

snmf <- struct2geno(input.file, ploidy = 2, FORMAT = 2, extra.row = 0, extra.column = 3) #return output file in console

#choose K values to investigate - e.g., K = 1:5
obj.snmf = snmf("coords_str.str.geno", K = 1:5, project = "new", repetitions = 100, CPU = 1, alpha = 200, tolerance = 0.00001, entropy = T, percentage = 0.05, iterations = 500, ploidy = 2, seed = 10)

plot(obj.snmf, col = "blue4", cex = 1.4, pch = 19) #ID best K value

#in this case K = 2 was the best value
best = which.min(cross.entropy(obj.snmf, K = 2))

qmatrix = Q(obj.snmf, K = 2, run = best)

#now need to get individuals name back and as df maintains same order they can just be pulled out again
idv_names <- structure[2:173,1]
idv_names <- unique(idv_names)
idv_names <- as.data.frame(idv_names)
idv_names$no <- as.character(rep(1:86))

qmat_df <- as.data.frame(qmatrix)
qmat_df$no <- row.names(qmat_df)

join <- left_join(qmat_df, idv_names)

names(join) <- c("Group1", "Group2", "no", "ind") #add as many groups as there are best K values

struc <- melt(join, id.vars=3:4) #id.vars needs to match "no" and "ind" columns
#can add locations using left join and df coords, and join by individuals. This will allow you to plot by location if you would like.

Set1 <- brewer.pal(8, "Set1")

ggplot(struc, aes(ind, value, fill = variable)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  xlab("Individual") +
  ylab("Admixture coefficients")

#------------------------------------------------------------------------------------------#
####OUTLIER DETECTION####

#---BAYESCAN---#
#write Bayescan file - for some reason can't easily run radiators write_bayescan and takes a much longer than hierfstat does to convert the file.

write.bayescan(hierf, diploid = T, fn = "./path/to/save/BS_outliers_input")

#ran bayescan v 2.1 using the desktop app
#Default parameters were kept except for prior odds parameter was increased to 100 in order to try and eliminate false positives
#analyse results in R
Bayescan_results_fst <- read.csv("./path/to/file/BS_outliers_output.txt", sep="")
  
#filter for outliers qvalue < 0.1
Bayescan_results_fst$no <- rep(1:983)
Bayescan_results_fst <- filter(Bayescan_results_fst, qval < 0.1)
outliers_ref_vcf_BS <- left_join(Bayescan_results_fst, loci_no)
outliers_ref_vcf_BS$Bayescan <- rep("YES")

#---OUTFLANK---#
#tutorial https://adnguyen.github.io/2017_Ecological_Genomics/Tutorial/2017-04-03_OUTFLANK-yourself.html

#have previously already converted files to .geno for STRUCTURE analyses
ssw.geno.in<-read.fwf("./path/to/coords_str.str.geno",width=rep(1,86))
ssw.geno<-t(ssw.geno.in)

OF_SNPs<-MakeDiploidFSTMat(ssw.geno,locusNames=as.list(loci_no$loci_no),popNames=as.list(strata.tsv$STRATA)) #use strata file to get individual population assignments

OF_out<-OutFLANK(FstDataFrame = OF_SNPs,NumberOfSamples = n,qthreshold=0.1) #number of samples - The number of spatial locations included in the data set

OutFlank_outliers <- as.data.frame(OF_out$results)
OutFlank_outliers <- filter(OutFlank_outliers, qvalues <0.1)
OutFlank_outliers <- rename(loci_no = LocusName, OutFlank_outliers)
OutFlank_outliers$OutFlank <- rep("YES")

#---LEA---#
#LEA outlier detection can be used for multiallelic data
best = which.min(cross.entropy(obj.snmf, K = 2)) #Remember K could change with different datasets
p.val.snmf <- snmf.pvalues(obj.snmf, entropy = TRUE, ploidy = 2, K = 2, run = best, genomic.control = TRUE)
plot(-log10(p.val.snmf$pvalues), pch = 19, col = "blue", cex = .7)
p.val.snmf <- as.data.frame(p.val.snmf)
p.val.snmf$qval <- qvalue(p.val.snmf$pvalues)$qvalue
p.val.snmf$no <- rep(1:983)

#ID which loci are detected as outliers
loci_no <- unique(loci_filtered_df$LOCUS)
loci_no <- as.data.frame(loci_no)
loci_no$no <- rep(1:983)

snmf_outliers <- left_join(p.val.snmf, loci_no)
snmf_outliers <- filter(snmf_outliers, qval < 0.1)
snmf_outliers$LEA <- rep("YES")

#---DAPC---#

#Loadingplot for DAPC
dapc.LP <- dapc(DAPC, n.pca=3,n.da=2)

loadingplot(dapc.LP$var.contr, axis=1, thres=.005, lab.jitter=1) 

DAPC_loci_LP <- as.data.frame(dapc.LP$var.contr)
DAPC_loci_LP$loci_names <- row.names(DAPC_loci_LP)
DAPC_loci_LP <- filter(DAPC_loci_LP, LD1 > 0.009)

#---summarize outliers obtained from packages---#
#DAPC needs the most prep to get a comparable dataframe
DAPC_outliers <- as.data.frame(DAPC_loci_LP)
DAPC_outliers$loci_names <- rownames(DAPC_outliers)
DAPC_outliers <- filter(DAPC_outliers, LD1 > 0.009)
DAPC_outliers <- DAPC_outliers %>% separate(loci_names, into  = c("loci_no", "END"), sep = "[.]") 
DAPC_outliers$loci_no <- gsub("^1__",'',DAPC_outliers$loci_no)
DAPC_outliers$loci_no <- gsub("__\\d+",'',DAPC_outliers$loci_no)
#DAPC_outliers$loci_no <- gsub("[.]\\d+",'',DAPC_outliers$loci_no)
DAPC_outliers$adegenet <- rep("YES")
DAPC_outliers <- DAPC_outliers[,c("loci_no", "adegenet")]
DAPC_outliers <- unique(DAPC_outliers)

snmf_outliers <- snmf_outliers[,5:6] 

outliers_ref_vcf_BS <- outliers_ref_vcf_BS[7:8]

OutFlank_outliers <- OutFlank_outliers[,c("loci_no", "OutFlank")]

#compare them
outliers <- full_join(DAPC_outliers, snmf_outliers, by = "loci_no")
outliers <- full_join(outliers, outliers_ref_vcf_BS)
outliers <- full_join(outliers, OutFlank_outliers)