-
Notifications
You must be signed in to change notification settings - Fork 0
/
Patterns_genetic_diversity_differentiation
284 lines (203 loc) · 12.4 KB
/
Patterns_genetic_diversity_differentiation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
####PACKAGES####
library(readxl)
library(xlsx)
library(dplyr)
library(radiator)
library(ggplot2)
library(RColorBrewer)
library(mapplots)
library(fields)
library(LEA) #BiocManager::install(c("LEA"))
library(adegenet)
library(reshape2)
library(hierfstat)
library(ape)
library(pcadapt)
library(OutFLANK) #devtools::install_github("StoreyLab/qvalue") AND THEN devtools::install_github("whitlock/OutFLANK")
library(genepop)
library(strataG)
library(tidyr)
library(stackr) #devtools::install_github("thierrygosselin/stackr")
library(igraph)
library(StAMPP)
library(poppr)
library(DECIPHER)
#------------------------------------------------------------------------------------------#
####GENETIC DIVERSITY STATISTICS####
#here we are using the dataset that has been filtered
#loci_filtered_df - end product of Loci_depth_filtering in Data_filtering folder
#Ho, He and Fis#
hierf <- write_hierfstat(loci_filtered_df) #convert dataframe to hierfstat structure
hierf_stats <- basic.stats(hierf) #retrieve Ho, Hs and Fis from data
hierf_Ho <- as.data.frame(hierf_stats[["Ho"]])
hierf_Hs <- as.data.frame(hierf_stats[["Hs"]])
hierf_Fis <- as.data.frame(hierf_stats[["Fis"]])
#calculate the mean per area
hierf_Ho_mean <- hierf_Ho %>% gather("Area", "Ho", 1:5) %>% group_by(Area) %>% summarise(mean = mean(Ho)) %>% spread(1, 2, 2)
hierf_Hs_mean <- hierf_Hs %>% gather("Area", "Hs", 1:5) %>% group_by(Area) %>% summarise(mean = mean(Hs)) %>% spread(1, 2, 2)
hierf_Fis_mean <- hierf_Fis %>% gather("Area", "Fis", 1:5) %>% filter(Fis != "NaN") %>% group_by(Area) %>% summarise(mean = mean(Fis)) %>% spread(1, 2, 2)
Summary_table <- rbind(hierf_Ho_mean, hierf_Hs_mean, hierf_Fis_mean)
#Ar - allelic richness#
Ar <- allelic.richness(hierf,min.n=NULL,diploid=TRUE)
Ar <- as.data.frame(Ar$Ar)
Ar <- gather(Ar, "Area", "Ar", 1:n) #n = columns to gather
Ar <- aggregate(Ar~Area, Ar, mean)
#Pi - nucleotide diversity#
#nucleotide diversity - data from Stacks (populations.sumstats.tsv) - NB replicate removed prior to running populations again, this can just be done by removing the indiviual from the population file
populations.sumstats <- read.delim("/path/to/file/populations.sumstats.tsv", comment.char="#")
populations.sumstats <- rename(populations.sumstats, LOCUS = Locus.ID, POS = Col)
populations.sumstats$LOCUS <- as.character(populations.sumstats$LOCUS) #need loci to be read as character
populations.sumstats$POS <- populations.sumstats$POS +1 #To correct the POS need at add 1 is using SNP data
populations.sumstats$POS <- as.character(populations.sumstats$POS) #need SNP position to be read as character
Pi <- semi_join(populations.sumstats, loci_filtered_df, by = LOCUS) #select only those loci common between the dataframes and remove 'by = LOCUS' is using SNP data
mean_Pi <- aggregate(Pi~Pop.ID, Pi, mean) #If the sum is taken you get the haplotype diversity
#------------------------------------------------------------------------------------------#
####GENETIC DIFFERENTIATION####
#Fst#
Strata <- write_gtypes(loci_filtered_df)
StrataFst <- popStructTest(Strata, nrep = 100000, stats = "fst", type = "both", keep.null = FALSE, quietly = FALSE, max.cores = NULL)
#extract p-values and adjust
p.adjust(pvalue/s,method="fdr") #this can be done manually typing pvalues in or making an adjust pvalue column
#AMOVA#
poppr <- write_genind(loci_filtered_df)
#a way to define population hierarchy if needed - as an example
population_hierarchy <- c()
#note that this order is dependant on the order of your individuals in your data
population_hierarchy$Pop <- as.character(c(rep("state1", no_indv), rep("state2", no_indv), rep("state3", no_indv), rep("state1", no_indv), rep("state3", no_indv)))
population_hierarchy$Subpop <- as.character(c(rep("location_w/i_state1", no_indv), rep("location_w/i_state2", no_indv), rep("location_w/i_state3", no_indv), rep("location_w/i_state1", no_indv), rep("location_w/i_state3", no_indv)))
population_hierarchy <- as.data.frame(population_hierarchy)
#adding this info back into the poppr dataframe that was created
poppr$other$population_hierarchy <- population_hierarchy
strata(poppr) <- data.frame(other(poppr)$population_hierarchy)
poppr.gen <- as.genclone(poppr) #convert to genclone
poppr.amova <- poppr.amova(poppr.gen, hier = ~Pop, missing = "ignore", cutoff = "ignore", method = "ade4", clonecorrect = FALSE, within = FALSE, dist = NULL, squared = FALSE, freq = TRUE, correction = "quasieuclid", sep = "_", filter = FALSE, threshold = 0, algorithm = "farthest_neighbor", threads = 1L, quiet = FALSE)
set.seed(1000)
poppr_signif <- randtest(poppr.amova, nrepet = 100000) #p.adjust.method = "bonferroni" only if using hierarchical populations
#------------------------------------------------------------------------------------------#
####GENETIC CLUSTERING####
#DAPC#
DAPC <- write_genind(loci_filtered_df)
mycol = col_choice = brewer.pal(5, "Dark2")
xvalDapc(tab(DAPC, NA.method = "mean"), pop(DAPC)) #cross-validation to suggest the number of PCs to retain in the analysis
#https://grunwaldlab.github.io/Population_Genetics_in_R/DAPC.html good explanation for cross-validation
set.seed(999) #PCs to retain was 30 and another more in-depth cross validation was done to be certain
system.time(DAPCX <- xvalDapc(tab(DAPC, NA.method = "mean"), pop(DAPC),
n.pca = 20:40, n.rep = 1000,
parallel = "snow", ncpus = 4L))
#the xval shows that 30 pcs should be retained
dapc1 <- dapc(DAPC, DAPC$pop, n.pca = 30, n.da = 2) #CHANGE PCs and 2 DFs - PCs graph and DFs
scatter(dapc1, col = mycol, cex = 2, legend = TRUE, clabel = F, posi.leg = "topright", scree.pca = TRUE,
posi.pca = "topleft", posi.da = "bottomleft", cleg = 0.75)
#PCA#
PCA <- write_genepop(loci_filtered_df)
stacks_pop <- read.genepop(PCA, ncode = 3L, quiet = FALSE)
stacks_pop_freq <- tab(stacks_pop, freq=TRUE, NA.method="mean")
stacks_pop_PCA <- dudi.pca(stacks_pop_freq, center=TRUE, scale=FALSE, scannf = F, nf = 2)
barplot(stacks_pop_PCA$eig, main="PCA eigenvalues", col=heat.colors(length(stacks_pop_PCA$eig))) #determine number of PCs to retain
col_choice = brewer.pal(5, "Set1") #using Rcolorbrewer
#plotting for axes x=1 and y=2. If want to see individuals s.label(stacks_pop_PCA$li)
s.class(stacks_pop_PCA$li, fac=pop(stacks_pop), col=col_choice)
add.scatter.eig(stacks_pop_PCA$eig[1:10],2,1,2, ratio=.15)
#if want to get a structure-like plot using LEA#
write_structure(loci_filtered_df, parallel.core = 1, filename = "./structure")
structure <- read.delim("./structure.str", header=FALSE) #for some reason still keeping the header
str_df <- structure[2:173,] #remove header
str_df <- rename(str_df, "Individual" = "V1")
str_df$Individual <- as.character(str_df$Individual)
#prepare coordinate file (e.g., coords) where you have the individual's and the long and lat of their collection location and location name, and join to str_df
coords_str <- left_join(str_df, coords, by = "Individual")
#Need to bring Long and Lat to columns 1 and 2 for correct formatting of .str df. Remove column one that contains the names of the individuals.
coords_str <- coords_str[,c(986, 987, 2:985)] #these numbers will change depending on your data - c(long, lat, loci_1:loci_n)
write.table(coords_str, "./coords_str.str", sep="\t", row.names = FALSE, col.names = FALSE)
input.file = "./coords_str.str"
snmf <- struct2geno(input.file, ploidy = 2, FORMAT = 2, extra.row = 0, extra.column = 3) #return output file in console
#choose K values to investigate - e.g., K = 1:5
obj.snmf = snmf("coords_str.str.geno", K = 1:5, project = "new", repetitions = 100, CPU = 1, alpha = 200, tolerance = 0.00001, entropy = T, percentage = 0.05, iterations = 500, ploidy = 2, seed = 10)
plot(obj.snmf, col = "blue4", cex = 1.4, pch = 19) #ID best K value
#in this case K = 2 was the best value
best = which.min(cross.entropy(obj.snmf, K = 2))
qmatrix = Q(obj.snmf, K = 2, run = best)
#now need to get individuals name back and as df maintains same order they can just be pulled out again
idv_names <- structure[2:173,1]
idv_names <- unique(idv_names)
idv_names <- as.data.frame(idv_names)
idv_names$no <- as.character(rep(1:86))
qmat_df <- as.data.frame(qmatrix)
qmat_df$no <- row.names(qmat_df)
join <- left_join(qmat_df, idv_names)
names(join) <- c("Group1", "Group2", "no", "ind") #add as many groups as there are best K values
struc <- melt(join, id.vars=3:4) #id.vars needs to match "no" and "ind" columns
#can add locations using left join and df coords, and join by individuals. This will allow you to plot by location if you would like.
Set1 <- brewer.pal(8, "Set1")
ggplot(struc, aes(ind, value, fill = variable)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
xlab("Individual") +
ylab("Admixture coefficients")
#------------------------------------------------------------------------------------------#
####OUTLIER DETECTION####
#---BAYESCAN---#
#write Bayescan file - for some reason can't easily run radiators write_bayescan and takes a much longer than hierfstat does to convert the file.
write.bayescan(hierf, diploid = T, fn = "./path/to/save/BS_outliers_input")
#ran bayescan v 2.1 using the desktop app
#Default parameters were kept except for prior odds parameter was increased to 100 in order to try and eliminate false positives
#analyse results in R
Bayescan_results_fst <- read.csv("./path/to/file/BS_outliers_output.txt", sep="")
#filter for outliers qvalue < 0.1
Bayescan_results_fst$no <- rep(1:983)
Bayescan_results_fst <- filter(Bayescan_results_fst, qval < 0.1)
outliers_ref_vcf_BS <- left_join(Bayescan_results_fst, loci_no)
outliers_ref_vcf_BS$Bayescan <- rep("YES")
#---OUTFLANK---#
#tutorial https://adnguyen.github.io/2017_Ecological_Genomics/Tutorial/2017-04-03_OUTFLANK-yourself.html
#have previously already converted files to .geno for STRUCTURE analyses
ssw.geno.in<-read.fwf("./path/to/coords_str.str.geno",width=rep(1,86))
ssw.geno<-t(ssw.geno.in)
OF_SNPs<-MakeDiploidFSTMat(ssw.geno,locusNames=as.list(loci_no$loci_no),popNames=as.list(strata.tsv$STRATA)) #use strata file to get individual population assignments
OF_out<-OutFLANK(FstDataFrame = OF_SNPs,NumberOfSamples = n,qthreshold=0.1) #number of samples - The number of spatial locations included in the data set
OutFlank_outliers <- as.data.frame(OF_out$results)
OutFlank_outliers <- filter(OutFlank_outliers, qvalues <0.1)
OutFlank_outliers <- rename(loci_no = LocusName, OutFlank_outliers)
OutFlank_outliers$OutFlank <- rep("YES")
#---LEA---#
#LEA outlier detection can be used for multiallelic data
best = which.min(cross.entropy(obj.snmf, K = 2)) #Remember K could change with different datasets
p.val.snmf <- snmf.pvalues(obj.snmf, entropy = TRUE, ploidy = 2, K = 2, run = best, genomic.control = TRUE)
plot(-log10(p.val.snmf$pvalues), pch = 19, col = "blue", cex = .7)
p.val.snmf <- as.data.frame(p.val.snmf)
p.val.snmf$qval <- qvalue(p.val.snmf$pvalues)$qvalue
p.val.snmf$no <- rep(1:983)
#ID which loci are detected as outliers
loci_no <- unique(loci_filtered_df$LOCUS)
loci_no <- as.data.frame(loci_no)
loci_no$no <- rep(1:983)
snmf_outliers <- left_join(p.val.snmf, loci_no)
snmf_outliers <- filter(snmf_outliers, qval < 0.1)
snmf_outliers$LEA <- rep("YES")
#---DAPC---#
#Loadingplot for DAPC
dapc.LP <- dapc(DAPC, n.pca=3,n.da=2)
loadingplot(dapc.LP$var.contr, axis=1, thres=.005, lab.jitter=1)
DAPC_loci_LP <- as.data.frame(dapc.LP$var.contr)
DAPC_loci_LP$loci_names <- row.names(DAPC_loci_LP)
DAPC_loci_LP <- filter(DAPC_loci_LP, LD1 > 0.009)
#---summarize outliers obtained from packages---#
#DAPC needs the most prep to get a comparable dataframe
DAPC_outliers <- as.data.frame(DAPC_loci_LP)
DAPC_outliers$loci_names <- rownames(DAPC_outliers)
DAPC_outliers <- filter(DAPC_outliers, LD1 > 0.009)
DAPC_outliers <- DAPC_outliers %>% separate(loci_names, into = c("loci_no", "END"), sep = "[.]")
DAPC_outliers$loci_no <- gsub("^1__",'',DAPC_outliers$loci_no)
DAPC_outliers$loci_no <- gsub("__\\d+",'',DAPC_outliers$loci_no)
#DAPC_outliers$loci_no <- gsub("[.]\\d+",'',DAPC_outliers$loci_no)
DAPC_outliers$adegenet <- rep("YES")
DAPC_outliers <- DAPC_outliers[,c("loci_no", "adegenet")]
DAPC_outliers <- unique(DAPC_outliers)
snmf_outliers <- snmf_outliers[,5:6]
outliers_ref_vcf_BS <- outliers_ref_vcf_BS[7:8]
OutFlank_outliers <- OutFlank_outliers[,c("loci_no", "OutFlank")]
#compare them
outliers <- full_join(DAPC_outliers, snmf_outliers, by = "loci_no")
outliers <- full_join(outliers, outliers_ref_vcf_BS)
outliers <- full_join(outliers, OutFlank_outliers)