/
RADSeq_CountsToTrend_forGithub.R
75 lines (61 loc) · 3.33 KB
/
RADSeq_CountsToTrend_forGithub.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#Webster et al., 2019, RAD-seq starvation resistance manuscript
#This is the script to go from SNP count data to trait data used for GWAS
#The majority of this script was written by Brad Moore
#set working directory so that df.unique.csv and snps.meta.data.csv are in that directory
#read in df.unique.csv and snps.meta.data.csv
#df.unique.csv is the count data, and snps.meta.data.csv describes the replicates
#The ans dataframe in this script will have the separate replicates, and the tmp4 dataframe has the averages
getwd()
setwd("/Users/amywebster/Dropbox (Duke Bio_Ea)/Baugh Lab/baugh_lab_users/Amy/NIL_wormsizer/RADSeq_NIL_paper/RADseq_AmyAnalysis/")
df.unique<-read.csv("df.unique.csv",header = T)
snps.meta.data<-read.csv("snps.meta.data.csv",header = T)
#for the analysis for the RAD-seq paper, we only used indirect replicates 1 and 2
df.unique2<-subset(df.unique,exp.id=="R2D14ind"|exp.id=="R2D1ind"|exp.id=="R2D21ind"|exp.id=="R2D24ind"|exp.id=="R2D7ind"|exp.id=="RID16ind"|exp.id=="RID21ind")
head(df.unique2)
quick.lm <- function(x) {
ans <- c()
for (j in unique(x$strain)) {
for (k in unique(x$exp.id)) {
tmp <- coef(lm(count ~ total + 0, data=x[x$strain == j & x$exp.id == k,]))
ans <- rbind(ans, data.frame(slope=tmp["total"], strain=j, exp.id=k))
}
}
ans
}
my.lm <- quick.lm(df.unique2)
my.lm <- merge(my.lm, snps.meta.data, by.x='exp.id', by.y='id')
head(my.lm)
ans <- c()
for (exp in 'indirect') { # look at only indirect results
tmp <- with(my.lm, my.lm[experiment == exp,])
for (repl in unique(tmp$replicate.id)) {
tmp2 <- with(tmp, tmp[tmp$replicate.id == repl,])
for (s in unique(tmp$strain)) {
tmp.lm <- lm(slope ~ day, data=with(tmp2, tmp2[strain == s,]))
ans <- rbind(ans, data.frame(experiment=exp, replicate.id=repl, strain=s, trend=coef(tmp.lm)["day"], r.squared=summary(tmp.lm)$r.squared, rank=0))
}
idx <- with(ans, experiment == exp & replicate.id == repl)
ans$rank[idx] <- (ecdf(ans$trend[idx]))(ans$trend[idx])
}
}
#Once you have ans, you can use aggregate to get the mean of indirect replicates 1 and 2
trend.lm <- ans
trend.aggr <- aggregate(cbind(trend, r.squared) ~ strain, data=trend.lm, FUN=mean)
trend.lm$strain <- factor(trend.lm$strain, levels=with(trend.aggr, strain[order(trend, decreasing=FALSE)]))
trend.aggr$strain <- factor(trend.aggr$strain, levels=levels(trend.lm$strain))
trend.lm
#df.unique2 <- merge(df.unique, snps.meta.data, by.x='exp.id', by.y='id')
my.r = trend.lm # added 8/29/2018
my.r.aggr <- aggregate(cbind(rank, trend, r.squared) ~ strain, data=my.r, FUN=mean)
#combine trend.r.aggr and my.r.aggr to get the labels in order
tmp3 <- merge(trend.aggr, my.r.aggr, by='strain')
head(tmp3)
with(tmp3, cor(trend.x, trend.y))
names(tmp3) <- c('strain', 'mean.trend.unique', 'mean.squared.unique', 'rank', 'mean.trend.quadOLS', 'mean.rsquared.quadOLS')
#write.csv(tmp3, file='20160408-indirect-results.csv',row.names=FALSE)
tmp4 <- merge(tmp3, with(my.r, my.r[replicate.id == 'R1', c('strain', 'rank')]), by='strain', suffixes=c('.mean', '.R1'))
tmp4 <- merge(tmp4, with(my.r, my.r[replicate.id == 'R2', c('strain', 'rank')]), by='strain', suffixes=c('.mean', '.R2'))
names(tmp4)[8] <- 'rank.R2'
tmp4 <- merge(tmp4, with(trend.lm, trend.lm[replicate.id == 'R1', c('strain', 'trend')], by='strain'))
head(tmp4)
write.csv(tmp4,file='2019-RADSeq-results.csv',row.names = FALSE)