-
Notifications
You must be signed in to change notification settings - Fork 23
/
ds.meanSdGp.R
307 lines (277 loc) · 12.8 KB
/
ds.meanSdGp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#'
#' @title Computes the mean and standard deviation across groups defined by one factor
#' @description This function calculates the mean and SD of a continuous variable for each class of
#' a single factor.
#' @details This function calculates the mean, standard deviation (SD), N (number of observations)
#' and the standard error of the mean (SEM) of a continuous variable broken down into subgroups
#' defined by a single factor.
#'
#' There are important differences between \code{ds.meanSdGp} function compared to
#' the function \code{ds.meanByClass}:
#'
#' (A) \code{ds.meanSdGp} does not actually subset the data it simply calculates the required statistics
#' and reports them. This means you cannot use this function if you wish to physically break the
#' data into subsets. On the other hand, it makes the function very much faster than \code{ds.meanByClass}
#' if you do not need to create physical subsets. \cr
#' (B) \code{ds.meanByClass} allows you to specify up to
#' three categorising factors, but \code{ds.meanSdGp} only allows one. However, this is not a serious
#' problem. If you have two factors (e.g. sex with two levels \code{[0,1]} and \code{BMI.categorical} with
#' three levels \code{[1,2,3]}) you simply need to create a new factor that combines the two together in a
#' way that gives each combination of levels a different value in the new factor. So, in the
#' example given, the calculation \code{newfactor = (3*sex) + BMI} gives you six values: \cr
#' (1) \code{sex = 0} and \code{BMI = 1} -> \code{newfactor = 1} \cr
#' (2) \code{sex = 0} and \code{BMI = 2} -> \code{newfactor = 2} \cr
#' (3) \code{sex = 0} and \code{BMI = 3} -> \code{newfactor = 3} \cr
#' (4) \code{sex = 1} and \code{BMI = 1} -> \code{newfactor = 4} \cr
#' (5) \code{sex = 1} and \code{BMI = 2} -> \code{newfactor = 5} \cr
#' (6) \code{sex = 1} and \code{BMI = 3} -> \code{newfactor = 6} \cr
#'
#' (C) At present, \code{ds.meanByClass} calculates the sample size in each group to mean the
#' total sample size (i.e. it
#' includes all observations in each group regardless of whether or not they include missing values
#' for the continuous variable or the factor). The calculation of sample size in each group by
#' \code{ds.meanSdGp} always reports the number of observations that are non-missing both for the
#' continuous variable and the factor. This makes sense - in the case of \code{ds.meanByClass},
#' the total size of the physical subsets was important,
#' but when it comes down only to \code{ds.meanSdGp} which
#' undertakes analysis without physical subsetting, it is only the observations with non-missing
#' values in both variables that contribute to the calculation of means and SDs within each group
#' and so it is logical to consider those counts as primary. The only reference \code{ds.meanSdGp} makes
#' to missing counts is in the reporting of \code{Ntotal} and \code{Nmissing} overall (ie not broken down by
#' group).
#'
#' For the future, we plan to extend \code{ds.meanByClass} to report both total and non-missing
#' counts in subgroups.
#'
#' Depending on the variable \code{type} can be carried out different analysis:\cr
#' (1) \code{"combine"}: a pooled table of results is generated. \cr
#' (2) \code{"split"} a table of results is generated for each study. \cr
#' (3) \code{"both"} both sets of outputs are produced.
#'
#' Server function called: \code{meanSdGpDS}
#' @param x a character string specifying the name of a numeric continuous
#' variable.
#' @param y a character string specifying the name of a categorical
#' variable of class factor.
#' @param type a character string that represents the type of analysis to carry out.
#' This can be set as: \code{"combine"}, \code{"split"} or \code{"both"}.
#' Default \code{"both"}.
#' For more information see \strong{Details}.
#' @param do.checks logical. If TRUE the administrative checks
#' are undertaken to ensure that the input objects are defined in all studies and that the
#' variables are of equivalent class in each study.
#' Default is FALSE to save time.
#' @param datasources a list of \code{\link{DSConnection-class}}
#' objects obtained after login. If the \code{datasources} argument is not specified
#' the default set of connections will be used: see \code{\link{datashield.connections_default}}.
#' @return \code{ds.meanSdGp} returns to the client-side the mean, SD, Nvalid and SEM combined
#' across studies and/or separately for each study, depending on the argument \code{type}.
#'
#' @author DataSHIELD Development Team
#' @seealso \code{\link{ds.subsetByClass}} to subset by the classes of factor vector(s).
#' @seealso \code{\link{ds.subset}} to subset by complete cases (i.e. removing missing values), threshold,
#' columns and rows.
#' @export
#' @examples
#' \dontrun{
#'
#' ## Version 6, for version 5 see the Wiki
#'
#' # connecting to the Opal servers
#'
#' require('DSI')
#' require('DSOpal')
#' require('dsBaseClient')
#'
#' builder <- DSI::newDSLoginBuilder()
#' builder$append(server = "study1",
#' url = "http://192.168.56.100:8080/",
#' user = "administrator", password = "datashield_test&",
#' table = "SURVIVAL.EXPAND_NO_MISSING1", driver = "OpalDriver")
#' builder$append(server = "study2",
#' url = "http://192.168.56.100:8080/",
#' user = "administrator", password = "datashield_test&",
#' table = "SURVIVAL.EXPAND_NO_MISSING2", driver = "OpalDriver")
#' builder$append(server = "study3",
#' url = "http://192.168.56.100:8080/",
#' user = "administrator", password = "datashield_test&",
#' table = "SURVIVAL.EXPAND_NO_MISSING3", driver = "OpalDriver")
#' logindata <- builder$build()
#'
#' connections <- DSI::datashield.login(logins = logindata, assign = TRUE, symbol = "D")
#'
#'
#' #Example 1: Calculate the mean, SD, Nvalid and SEM of the continuous variable age.60 (age in
#' #years centralised at 60), broken down by time.id (a six level factor relating to survival time)
#' #and report the pooled results combined across studies.
#'
#' ds.meanSdGp(x = "D$age.60",
#' y = "D$time.id",
#' type = "combine",
#' do.checks = FALSE,
#' datasources = connections)
#'
#' #Example 2: Calculate the mean, SD, Nvalid and SEM of the continuous variable age.60 (age in
#' #years centralised at 60), broken down by time.id (a six level factor relating to survival time)
#' #and report both study-specific results and the pooled results combined across studies.
#' #Save the returned output to msg.b.
#'
#' ds.meanSdGp(x = "D$age.60",
#' y = "D$time.id",
#' type = "both",
#' do.checks = FALSE,
#' datasources = connections)
#'
#' # clear the Datashield R sessions and logout
#' datashield.logout(connections)
#' }
#'
ds.meanSdGp <- function(x=NULL, y=NULL, type='both', do.checks=FALSE, datasources=NULL){
# look for DS connections
if(is.null(datasources)){
datasources <- datashield.connections_find()
}
if(is.null(x)){
stop("Please provide the name of the input vector!", call.=FALSE)
}
if(is.null(y)){
stop("Please provide the name of the input vector!", call.=FALSE)
}
# the input variable might be given as column table (i.e. D$x)
# or just as a vector not attached to a table (i.e. x)
# we have to make sure the function deals with each case
xnames <- extract(x)
ynames <- extract(y)
xvarname <- xnames$elements
yvarname <- ynames$elements
xobj2lookfor <- xnames$holders
yobj2lookfor <- ynames$holders
xvariable <- xvarname
yvariable <- yvarname
if(do.checks)
{
# check if the input object(s) is(are) defined in all the studies
if(is.na(xobj2lookfor)){
defined <- isDefined(datasources, xvarname)
}else{
defined <- isDefined(datasources, xobj2lookfor)
}
if(is.na(yobj2lookfor)){
defined <- isDefined(datasources, yvarname)
}else{
defined <- isDefined(datasources, yobj2lookfor)
}
# call the internal function that checks the input object is of the same class in all studies.
typ1 <- checkClass(datasources, x)
typ2 <- checkClass(datasources, y)
}
# names of the studies
stdnames <- names(datasources)
# call the server side function that calculates mean and DS by group in each study
calltext <- paste0("meanSdGpDS(", x, ",", y, ")")
output <- DSI::datashield.aggregate(datasources, as.symbol(calltext))
numsources <- length(output)
all.tables.valid<-1
for(i in 1:numsources){
if(unlist(output[[i]]$Table_valid==FALSE)) all.tables.valid <- all.tables.valid-1
}
if(all.tables.valid<1){
warning.message<-"At least 1 cell count is 1-nfilter, please regroup"
return(warning.message)
}
if(all.tables.valid==1){
# COMBINE OR BOTH
if(type!="split"){
numsources <- length(output)
mean.matrix <- NULL
sd.matrix <- NULL
n.matrix <- NULL
Nvalid <- 0
Nmissing <- 0
Ntotal <- 0
for(j in 1:numsources){
mean.matrix <- rbind(mean.matrix,as.numeric(unlist(output[[j]][2])))
sd.matrix <- rbind(sd.matrix,as.numeric(unlist(output[[j]][3])))
n.matrix <- rbind(n.matrix,as.numeric(unlist(output[[j]][4])))
Nvalid <- Nvalid+as.numeric(unlist(output[[j]][5]))
Nmissing <- Nmissing+as.numeric(unlist(output[[j]][6]))
Ntotal <- Ntotal+as.numeric(unlist(output[[j]][7]))
}
var.matrix <- sd.matrix^2
nsum.vector <- rep(1,numsources)
# Calculate weighted means across studies in each group
mean.gp <- (diag(t(mean.matrix)%*%n.matrix))/(t(n.matrix)%*%nsum.vector)
# Calculate weighted SDs across studies in each group
var.gp <- (diag(t(var.matrix)%*%n.matrix))/(t(n.matrix)%*%nsum.vector)
SD.gp <- sqrt(var.gp)
N.gp <- (t(n.matrix) %*% nsum.vector)
SEM.gp <- SD.gp/sqrt(N.gp)
# create names
names.gp <- rep(NA,length(mean.gp))
for(k in 1:length(mean.gp)){
names.gp[k] <- paste0(yvarname,"_",k)
}
dimnames(mean.gp) <- c(list(names.gp),list("Mean_gp"))
dimnames(SD.gp) <- c(list(names.gp),list("SD_gp"))
dimnames(N.gp) <- c(list(names.gp),list("Nvalid_gp"))
dimnames(SEM.gp) <- c(list(names.gp),list("SEM_gp"))
}
# SPLIT OR BOTH
if(type!="combine"){
numsources <- length(output)
mean.matrix <- NULL
sd.matrix <- NULL
n.matrix <- NULL
Nvalid <- 0
Nmissing <- 0
Ntotal <- 0
for(j in 1:numsources){
mean.matrix <- rbind(mean.matrix,as.numeric(unlist(output[[j]][2])))
sd.matrix <- rbind(sd.matrix,as.numeric(unlist(output[[j]][3])))
n.matrix <- rbind(n.matrix,as.numeric(unlist(output[[j]][4])))
Nvalid <- Nvalid+as.numeric(unlist(output[[j]][5]))
Nmissing <- Nmissing+as.numeric(unlist(output[[j]][6]))
Ntotal <- Ntotal+as.numeric(unlist(output[[j]][7]))
}
var.matrix <- sd.matrix^2
mean.gp.study <- t(mean.matrix)
SD.gp.study <- t(sd.matrix)
N.gp.study <- t(n.matrix)
SEM.gp.study <- SD.gp.study/sqrt(N.gp.study)
# create names
names.gp <- rep(NA,dim(mean.gp.study)[1])
for(k in 1:dim(mean.gp.study)[1]){
names.gp[k] <- paste0(yvarname,"_",k)
}
names.study <- names(datasources)
dimnames(mean.gp.study) <- c(list(names.gp),list(names.study))
dimnames(SD.gp.study) <- c(list(names.gp),list(names.study))
dimnames(N.gp.study) <- c(list(names.gp),list(names.study))
dimnames(SEM.gp.study) <- c(list(names.gp),list(names.study))
}
if(type=="combine"){
result <- list(mean.gp,SD.gp,N.gp,SEM.gp,Nvalid,Nmissing,Ntotal)
names(result) <- list("Mean_gp","StDev_gp","Nvalid_gp","SEM_gp","Total_Nvalid","Total_Nmissing","Total_Ntotal")
return(result)
}
if(type=="split"){
result <- list(mean.gp.study,SD.gp.study,N.gp.study,SEM.gp.study,Nvalid,Nmissing,Ntotal)
names(result) <- list("Mean_gp_study","StDev_gp_study","Nvalid_gp_study","SEM_gp_study","Total_Nvalid","Total_Nmissing","Total_Ntotal")
return(result)
}
if(type!="combine" & type!="split"){
mean.gp.study <- cbind(mean.gp.study,mean.gp)
SD.gp.study <- cbind(SD.gp.study,SD.gp)
N.gp.study <- cbind(N.gp.study,N.gp)
SEM.gp.study <- cbind(SEM.gp.study,SEM.gp)
dimnames(mean.gp.study) <- c(list(names.gp),list(c(names.study,"COMBINE")))
dimnames(SD.gp.study) <- c(list(names.gp),list(c(names.study,"COMBINE")))
dimnames(N.gp.study) <- c(list(names.gp),list(c(names.study,"COMBINE")))
dimnames(SEM.gp.study) <- c(list(names.gp),list(c(names.study,"COMBINE")))
result <- list(mean.gp.study,SD.gp.study,N.gp.study,SEM.gp.study,Nvalid,Nmissing,Ntotal)
names(result) <- list("Mean_gp_study","StDev_gp_study","Nvalid_gp_study","SEM_gp_study","Total_Nvalid","Total_Nmissing","Total_Ntotal")
return(result)
}
}
}
#ds.meanSdGp