-
Notifications
You must be signed in to change notification settings - Fork 23
/
ds.table2D.R
242 lines (222 loc) · 11.4 KB
/
ds.table2D.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#'
#' @title Generates 2-dimensional contingency tables
#' @description The function ds.table2D is a client-side wrapper function. It calls the server-side function
#' 'table2DDS' that generates a 2-dimensional contingency table for each data source.
#' @details The table returned by the server side function might be valid (non disclosive - no table cell have
#' counts between 1 and the minimal number agreed by the data owner and set in the data repository as the "nfilter.tab") or
#' invalid (potentially disclosive - one or more table cells have a count between 1 and the minimal number agreed
#' by the data owner). If a 2-dimensional table is invalid all the cells are set to NA except the total counts.
#' In this way, it is possible to combine total counts across all the data sources but it is not possible to
#' identify the cell(s) that had the small counts which render the table invalid.
#' @param x a character, the name of a numerical vector with discrete values - usually a factor.
#' @param y a character, the name of a numerical vector with discrete values - usually a factor.
#' @param type a character which represent the type of table to ouput: pooled table or one table for each
#' data source or both. If \code{type} is set to 'combine', a pooled 2-dimensional table is returned; If \code{type}
#' is set to 'split' a 2-dimensional table is returned for each data source. If \code{type} is set to 'both' (default)
#' a pooled 2-dimensional table plus a 2-dimensional table for each data source are returned.
#' @param warningMessage a boolean, if set to TRUE (deafult) a warning is displayed if any returned table is invalid. Warning
#' messages are suppressed if this parameter is set to FALSE. However the analyst can still view 'validity' information
#' which are stored in the output object 'validity' - see the list of output objects.
#' @param datasources a list of \code{\link{DSConnection-class}} objects obtained after login. If the <datasources>
#' the default set of connections will be used: see \link{datashield.connections_default}.
#' @return A list object containing the following items:
#' \item{colPercent}{table(s) that hold column percentages for each level/category. Inner cells are reported as
#' missing if one or more cells are 'invalid'.}
#' \item{rowPercent}{table(s) that hold row percentages for each level/category. Inner cells are reported as
#' missing if one or more cells are 'invalid'.}
#' \item{chi2Test}{Chi-squared test for homogeneity.}
#' \item{counts}{table(s) that hold counts for each level/category. If some cell counts are invalid (see 'Details'
#' section) only the total (outer) cell counts are displayed in the returned individual study tables or in the pooled
#' table.}
#' \item{validity}{a text that informs the analyst about the validity of the output tables. If any tables are invalid the
#' studies they are originated from are also mentioned in the text message.}
#' @author Amadou Gaye, Paul Burton, Demetris Avraam, for DataSHIELD Development Team
#' @seealso \link{ds.table1D} for the tabulating one vector.
#' @export
#' @examples
#' \dontrun{
#'
#' # load the file that contains the login details
#' data(logindata)
#'
#' # login and assign all the variables to R
#' conns <- datashield.login(logins=logindata,assign=TRUE)
#'
#' # Example 1: generate a two dimensional table, outputting combined contingency
#' # tables - default behaviour
#' output <- ds.table2D(x='D$DIS_DIAB', y='D$GENDER')
#' # display the 5 results items, one at a time to avoid having too much information
#' # displayed at the same time
#' output$counts
#' output$rowPercent
#' output$colPercent
#' output$chi2Test
#' output$validity
#'
#' # Example 2: generate a two dimensional table, outputting study specific contingency tables
#' ds.table2D(x='D$DIS_DIAB', y='D$GENDER', type='split')
#' # display the 5 results items, one at a time to avoid having too much information displayed
#' at the same time
#' output$counts
#' output$rowPercent
#' output$colPercent
#' output$chi2Test
#' output$validity
#'
#' # Example 3: generate a two dimensional table, outputting combined contingency tables
#' # *** this example shows what happens when one or studies return an invalid table ***
#' output <- ds.table2D(x='D$DIS_CVA', y='D$GENDER', type='combine')
#' output$counts
#' output$rowPercent
#' output$colPercent
#' output$chi2Test
#' output$validity
#'
#' # Example 4: same example as above but output is given for each study,
#' # separately (i.e. type='split')
#' # *** this example shows what happens when one or studies return an invalid table ***
#' output <- ds.table2D(x='D$DIS_CVA', y='D$GENDER', type='split')
#' output$counts
#' output$rowPercent
#' output$colPercent
#' output$chi2Test
#' output$validity
#'
#' # clear the Datashield R sessions and logout
#' datashield.logout(conns)
#'
#' }
#'
ds.table2D <- function(x=NULL, y=NULL, type='both', warningMessage=TRUE, datasources=NULL){
.Deprecated("ds.table")
# look for DS connections
if(is.null(datasources)){
datasources <- datashield.connections_find()
}
if(is.null(x)){
stop("Please provide the name of the x vector!", call.=FALSE)
}
if(is.null(y)){
stop("Please provide the name of the y vector!", call.=FALSE)
}
if(!(type=="combine" || type=="split" || type=="both")){
stop("Function argument 'type' has to be either 'combine', 'split' or 'both'")
}
# the input variable might be given as column table (i.e. D$x)
# or just as a vector not attached to a table (i.e. x)
# we have to make sure the function deals with each case
xnames <- extract(x)
ynames <- extract(y)
xvarname <- xnames$elements
yvarname <- ynames$elements
xobj2lookfor <- xnames$holders
yobj2lookfor <- ynames$holders
xvariable <- xvarname
yvariable <- yvarname
# check if the input object(s) is(are) defined in all the studies
if(is.na(xobj2lookfor)){
defined <- isDefined(datasources, xvarname)
}else{
defined <- isDefined(datasources, xobj2lookfor)
}
if(is.na(yobj2lookfor)){
defined <- isDefined(datasources, yvarname)
}else{
defined <- isDefined(datasources, yobj2lookfor)
}
# call the internal function that checks the input object is of the same class in all studies.
# typ1 <- checkClass(datasources, x)
# typ2 <- checkClass(datasources, y)
# names of the studies
stdnames <- names(datasources)
# call the server side function that produces a 1-dimensional table for each study
cally <- paste0("table2DDS(", x, ",", y, ")")
output <- DSI::datashield.aggregate(datasources, as.symbol(cally))
# extract contingency (count) tables and validity information for each study
countTables <- vector("list", length(stdnames))
validityInfo <- vector("list", length(stdnames))
for(i in 1:length(stdnames)){
countTables[[i]] <- output[[i]][[1]]
validityInfo[[i]] <- output[[i]][[2]]
}
names(countTables) <- paste0(stdnames,"-",paste0(x,"(row)|",y,"(col)"))
names(validityInfo) <- stdnames
# check if any study is invalid - if yes then only add up the outer columns (i.e. the total)
invalids <- which(unlist(validityInfo) == "invalid table - invalid counts present")
# pooled counts table
pooledCounts <- countTables[[1]]
pooledCounts[] <- 0
for(i in 1:length(stdnames)){
pooledCounts <- pooledCounts + countTables[[i]]
}
# pooled row and column percentage tables
pooledRowPercent <- rowPercent(pooledCounts)
pooledColPercent <- colPercent(pooledCounts)
# if a table is invalid in one or more studies alter the pooled tables chi-sqaured test output accordingly
if(length(invalids > 0)){
pooledCounts[1:(dim(pooledCounts)[1]-1), 1:(dim(pooledCounts)[2]-1)] <- NA
pooledRowPercent[1:(dim(pooledCounts)[1]-1), 1:(dim(pooledCounts)[2]-1)] <- NA
pooledColPercent[1:(dim(pooledCounts)[1]-1), 1:(dim(pooledCounts)[2]-1)] <- NA
pooledChi2test <- "Error: all entries of the table must be non-negative and finite"
}else{
pooledContingencyTable <- pooledCounts[1:(dim(pooledCounts)[1]-1), 1:(dim(pooledCounts)[2]-1)]
pooledChi2test <- stats::chisq.test(pooledContingencyTable)
}
# study specific row and column percentage tables (one for each study)
rowPercentTables <- vector("list", length(stdnames))
colPercentTables <- vector("list", length(stdnames))
for(i in 1:length(stdnames)){
rowPercentTables[[i]] <- rowPercent(countTables[[i]])
colPercentTables[[i]] <- colPercent(countTables[[i]])
}
names(rowPercentTables) <- paste0(stdnames,"-",paste0(x,"(row)|",y,"(col)"))
names(colPercentTables) <- paste0(stdnames,"-",paste0(x,"(row)|",y,"(col)"))
# chi-squared tests for each study separetely
chi2Tests <- vector("list", length(stdnames))
for(i in 1:length(stdnames)){
if(length(invalids) > 0 & i%in%invalids){
chi2Tests[[i]] <- "Error: all entries of the table must be non-negative and finite"
}else{
contingencyTable <- countTables[[i]][1:(dim(countTables[[i]])[1]-1), 1:(dim(countTables[[i]])[2]-1)]
options(warn = -1) # suppress warning temporarily to avoid 'nuisance' message, analyst will found out chi2 results anyway.
chi2Tests[[i]] <- stats::chisq.test(contingencyTable )
options(warn = 0) # put warning back
}
}
names(chi2Tests) <- paste0(stdnames,"-",paste0(x,"(row)|",y,"(col)"))
# return the right output depending what the user specified: 'combine' or 'split' analysis
if(type=="combine" || type=="both"){
if(length(invalids > 0)){
if(warningMessage){
message(paste0("WARNING: Invalid contingency table from '", paste(stdnames[invalids], collapse=", "), "' !\n Only total values are returned in the output table(s)."))
}
validityValue <- paste0("Invalid contingency table from '", paste(stdnames[invalids], collapse=", "), "'!")
}else{
validityValue <- "All tables are valid!"
}
# adding labels to pooled outputs before returning them
pooledCounts <- structure(list(pooledCounts), names=paste0("pooled-",x,"(row)|",y,"(col)"))
pooledRowPercent <- structure(list(pooledRowPercent), names=paste0("pooled-",x,"(row)|",y,"(col)"))
pooledColPercent <- structure(list(pooledColPercent), names=paste0("pooled-",x,"(row)|",y,"(col)"))
pooledChi2test <- structure(list(pooledChi2test), names=paste0("pooled-",x,"(row)|",y,"(col)"))
out.all.studies <- list(colPercent.all.studies=pooledColPercent, rowPercent.all.studies=pooledRowPercent, chi2Test.all.studies=pooledChi2test, counts.all.studies=pooledCounts, validity=validityValue)
output.object <- out.all.studies
}
if(type=="split" || type=="both"){
if(length(invalids > 0)){
if(warningMessage){
message(paste0("WARNING: Invalid contingency table from '", paste(stdnames[invalids], collapse=", "), "' !\n Only total values are returned in the output table(s)."))
}
validityValue <- paste0("Invalid contingency table from '", paste(stdnames[invalids], collapse=", "), "'!")
}else{
validityValue <- "All tables are valid!"
}
out.split <- (list(colPercent=colPercentTables, rowPercent=rowPercentTables, chi2Test=chi2Tests, counts=countTables, validity=validityValue))
output.object <- out.split
if(type=="both"){
output.object <- list(colPercent=colPercentTables, colPercent.all.studies=pooledColPercent,rowPercent=rowPercentTables,rowPercent.all.studies=pooledRowPercent,
chi2Test=chi2Tests, chi2Test.all.studies=pooledChi2test, counts=countTables, counts.all.studies=pooledCounts, validity=validityValue)
}
}
return(output.object)
}