Skip to content

Commit

Permalink
Update compare_diagnoses.R
Browse files Browse the repository at this point in the history
 - reindent code
- make print function similar to reshape_diagnosis
- spell out object names
  • Loading branch information
lilymedina committed Feb 28, 2019
1 parent 2bc8c2e commit 9ed620a
Showing 1 changed file with 152 additions and 170 deletions.
322 changes: 152 additions & 170 deletions R/compare_diagnoses.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,132 +85,130 @@ compare_diagnoses <- function(base_design,

#' @export
compare_diagnoses_internal <- function(diagnosis1, diagnosis2, merge_by_estimator, alpha) {

# 1.Housekeeping
if(is.null(diagnosis1$bootstrap_replicates ))
if (is.null(diagnosis1$bootstrap_replicates))
stop("Can't compare diagnoses witouth bootstrap replicates")

if(length(diagnosis1$parameters_d[, "design_label"]) + length(diagnosis2$parameters_d[, "design_label"])> 2)
if (length(diagnosis1$parameters_d[, "design_label"]) + length(diagnosis2$parameters_d[, "design_label"]) > 2)
stop("Please only send design or diagnosis objects with one unique design_label.")


diagnosands <- base::intersect(diagnosis1$diagnosand_names,
diagnosis2$diagnosand_names)

# merge_by_set, used to merge diagnosands_df, must at least contain estimand_label
# at its largest possible cardinality, merge_by_set contains c("estimand_label", "term", " "estimator_label")
# if group_by_set is different in both designs, merge_by_set contains labels the intersection from both.
# NOTE: Need to test how comparison is done when only on diagnosis has term or estimator_label
diagnosands <- base::intersect(diagnosis1$diagnosand_names, diagnosis2$diagnosand_names)

# merge_by_set, used to merge diagnosands_df, must at least contain estimand_label at its
# largest possible cardinality, merge_by_set contains c('estimand_label', 'term', '
# 'estimator_label') if group_by_set is different in both designs, merge_by_set contains
# labels the intersection from both. NOTE: Need to test how comparison is done when only
# on diagnosis has term or estimator_label

group_by_set1 <- diagnosis1$group_by_set
group_by_set2 <- diagnosis2$group_by_set

mand <- "estimand_label" %in% group_by_set1 & "estimand_label" %in% group_by_set2
mator <- "estimator_label" %in% group_by_set1 & "estimator_label" %in% group_by_set2
if( mand + mator < 1)
stop(paste0("diagnosands_df must contain at least estimand_label or estimator_label in common" ))

estimand_in_set <- "estimand_label" %in% group_by_set1 & "estimand_label" %in% group_by_set2
estimator_in_set <- "estimator_label" %in% group_by_set1 & "estimator_label" %in% group_by_set2
merge_by_set <- NULL

if(mand) merge_by_set <- c("estimand_label")
if(merge_by_estimator){
if(!mator)
if (estimand_in_set + estimator_in_set < 1) {
stop(paste0("diagnosands_df must contain at least estimand_label or estimator_label in common"))
}

if (estimand_in_set) {
merge_by_set <- c("estimand_label")
}
if (merge_by_estimator) {
if (!estimator_in_set)
warning("Estimator label not used in columns for merging.
At least one of the designs doesn't contain estimator_label.")
else
merge_by_set <- c(merge_by_set, "estimator_label")
At least one of the designs doesn't contain estimator_label.") else {
merge_by_set <- c(merge_by_set, "estimator_label")
}
}

if("term" %in% group_by_set1 & "term" %in% group_by_set2)
if ("term" %in% group_by_set1 & "term" %in% group_by_set2) {
merge_by_set <- c(merge_by_set, "term")

comparison_df <- merge(diagnosis1$diagnosands_df , diagnosis2$diagnosands_df,
by = merge_by_set, suffixes = c("_1", "_2"), stringsAsFactors = FALSE)
}

comparison_df <- merge(diagnosis1$diagnosands_df, diagnosis2$diagnosands_df, by = merge_by_set,
suffixes = c("_1", "_2"), stringsAsFactors = FALSE)

if(nrow(comparison_df) == 0 ){
if (nrow(comparison_df) == 0) {
warning("Can't merge diagnosands data frames. Diagnoses don't have labels in common")
return(list(diagnosis1 = diagnosis1 , diagnosis2 = diagnosis2))
return(list(diagnosis1 = diagnosis1, diagnosis2 = diagnosis2))
}

c_names <- colnames(comparison_df)
suffix <- c("_1", "_2")
dropcols <- grepl("[[:digit:]]+$", c_names ) | c_names %in% c("design_label", "estimand_label", "estimator_label", "term")

# Grab standard.error columns before droping cols that are not present in both diagnosands_df
# just in case diagnosis2 doesn't include bootrstrap_replicates
filter_se <- grepl("^se", c_names)
diagnosands_se <- comparison_df[, filter_se]
if(diagnosis2$bootstrap_sims == 0){
diagnosands_se1 <- diagnosands_se; diagnosands_se2 <- matrix(data = rep(NA, length(diagnosands_se)))
} else {
diagnosands_se1 <- diagnosands_se[, grepl("_1$", colnames(diagnosands_se))]
diagnosands_se2 <- diagnosands_se[, grepl("_2$", colnames(diagnosands_se))]}

c_names <- colnames(comparison_df)
filter_se <- grepl("^se", c_names)
suffix <- c("_1", "_2")
keepcols <- grepl("[[:digit:]]+$", c_names) | grepl("label+$", c_names)| grepl("term", c_names) | filter_se
comparison_df <- comparison_df[, keepcols]
c_names <- colnames(comparison_df)


diagnosands_se <- comparison_df[, filter_se]
if (diagnosis2$bootstrap_sims == 0) {
diagnosands_se1 <- diagnosands_se
diagnosands_se2 <- matrix(data = rep(NA, length(diagnosands_se)))
} else {
diagnosands_se1 <- diagnosands_se[, endsWith(colnames(diagnosands_se), "_1")]
diagnosands_se2 <- diagnosands_se[, endsWith(colnames(diagnosands_se), "_2")]
}

comparison_df <- comparison_df[, dropcols]

# Compute bootstrap confidence interval

bootstrap_df <- diagnosis1$bootstrap_replicates
group_by_list <- bootstrap_df[, group_by_set1, drop = FALSE]
labels_df <- split(group_by_list, lapply(group_by_list, addNA), drop = TRUE)
labels_df <- lapply(labels_df, head, n = 1)
bootstrap_df <- split(bootstrap_df , lapply(group_by_list, addNA), drop = TRUE)
group_by_set <- diagnosis1$group_by_set
lower.bound <- lapply(bootstrap_df,FUN = function(x) {

bootstrap_df <- diagnosis1$bootstrap_replicates
group_by_list <- bootstrap_df[, group_by_set1, drop = FALSE]
labels_df <- split(group_by_list, lapply(group_by_list, addNA), drop = TRUE)
labels_df <- lapply(labels_df, head, n = 1)
bootstrap_df <- split(bootstrap_df, lapply(group_by_list, addNA), drop = TRUE)
group_by_set <- diagnosis1$group_by_set

lower.bound <- lapply(bootstrap_df, FUN = function(x) {
d <- x[, diagnosands]
set <- head(x[,group_by_set], 1)
q <- sapply(d, function(d) quantile(d, c(lower = 0.5*alpha), na.rm = TRUE))
set <- head(x[, group_by_set], 1)
q <- sapply(d, function(d) quantile(d, c(lower = 0.5 * alpha), na.rm = TRUE))
q <- setNames(q, diagnosands)
q <- cbind(set, t(q))
})

upper.bound <- lapply(bootstrap_df,FUN = function(x) {
d <- x[, diagnosands]
set <- head(x[,group_by_set], 1)
q <- sapply(d, function(d) quantile(d, 1-0.5*alpha, na.rm = TRUE))
q <- setNames(q, diagnosands)
q <- cbind(set, t(q))
})

upper.bound <- lapply(bootstrap_df, FUN = function(x) {
d <- x[, diagnosands]
set <- head(x[, group_by_set], 1)
q <- sapply(d, function(d) quantile(d, 1 - 0.5 * alpha, na.rm = TRUE))
q <- setNames(q, diagnosands)
q <- cbind(set, t(q))
})

lower.bound <- rbind_disjoint(lower.bound)
upper.bound <- rbind_disjoint(upper.bound)


# Split columns by diagnosands
filter_mean <- gsub("_[[:digit:]]+$","", colnames(comparison_df)) %in% diagnosands
diagnosands_mean <- comparison_df[, filter_mean]
filter_sims <- grepl("^n_sims", colnames(comparison_df))
sims_df <- comparison_df[,filter_sims]
filter_se <- grepl("^se", colnames(comparison_df))
comparison_df <- comparison_df[!filter_mean & !filter_se & !filter_sims]
filter_mean <- gsub("_[[:digit:]]+$", "", colnames(comparison_df)) %in% diagnosands
diagnosands_mean <- comparison_df[, filter_mean]
filter_sims <- grepl("^n_sims", colnames(comparison_df))
sims_df <- comparison_df[, filter_sims]
filter_se <- grepl("^se", colnames(comparison_df))
identifiers_df <- comparison_df[!filter_mean & !filter_se & !filter_sims]

# Suppress warnings on rownames
out <- suppressWarnings( do.call(rbind, lapply(1:nrow(diagnosands_mean), function(pair){
data.frame(comparison_df[pair, ],
diagnosand = diagnosands,
t(mapply(function(mean_1, mean_2, se_1, se_2, l, u, n1 , n2)
c(mean_1 = mean_1, mean_2 = mean_2, se_1 = se_1, se_2 = se_2,
n_sims1 = n1, n_sims2 = n2, conf.low_1 = l, conf.upper_1 = u,
in_interval = ((mean_2 >= l) & (mean_2 <= u))),
#args
mean_1 = diagnosands_mean[pair, 1:length(diagnosands)],
mean_2 = diagnosands_mean[pair, (length(diagnosands)+1):ncol(diagnosands_mean)],
se_1 = diagnosands_se1[pair,],
se_2 = diagnosands_se2[pair,],
l = lower.bound[pair, diagnosands],
u = upper.bound[pair, diagnosands],
n1 = sims_df[pair, 1],
n2 = sims_df[pair, 2])),

stringsAsFactors = FALSE)})))

c_names <- colnames(out)

#comparison_df <- rapply(object = out , f = round, classes = "numeric", how = "replace", digits = 2)
comparison_df <- out
comparison_df <- suppressWarnings(do.call(rbind, lapply(1:nrow(diagnosands_mean), function(pair){
data.frame(identifiers_df[pair, ], diagnosand = diagnosands,
t(mapply(
function(mean_1, mean_2, se_1, se_2, l, u, n1, n2){
c(mean_1 = mean_1, mean_2 = mean_2, se_1 = se_1, se_2 = se_2,
n_sims1 = n1, n_sims2 = n2, conf.low_1 = l, conf.upper_1 = u,
in_interval = ((mean_2 >= l) & (mean_2 <= u)))
},
mean_1 = diagnosands_mean[pair, 1:length(diagnosands)],
mean_2 = diagnosands_mean[pair, (length(diagnosands) + 1):ncol(diagnosands_mean)],
se_1 = diagnosands_se1[pair, ],
se_2 = diagnosands_se2[pair,],
l = lower.bound[pair, diagnosands],
u = upper.bound[pair, diagnosands],
n1 = sims_df[pair, 1],
n2 = sims_df[pair, 2])),
stringsAsFactors = FALSE)
})))

c_names <- colnames(comparison_df)

# Prepare output
# Add "_1" or "_2" to labels as needed.
Expand All @@ -220,22 +218,21 @@ compare_diagnoses_internal <- function(diagnosis1, diagnosis2, merge_by_estimato
c_names[c_names == "estimator_label"] <- paste0("estimator_label", suffix[c( "estimator_label" %in% group_by_set1, "estimator_label" %in% group_by_set2)])
if("term" %in% c_names & !"term" %in% merge_by_set)
c_names[c_names == "term"] <- paste0("term", suffix[c("term" %in% group_by_set1, "term" %in% group_by_set2)])
comparison_df <- setNames( comparison_df, c_names)
comparison_df <- setNames(comparison_df, c_names)

if( all(is.na(comparison_df$se_2))) comparison_df$se_2 <- NULL

attr(comparison_df, "alpha") <- alpha
if (all(is.na(comparison_df$se_2))) comparison_df$se_2 <- NULL



out <- list(compared.diagnoses_df = comparison_df,
diagnosis1 = diagnosis1, diagnosis2 = diagnosis2)

attr(out, "alpha") <- alpha

out <- list(compared.diagnoses_df =comparison_df ,
diagnosis1 = diagnosis1,
diagnosis2 = diagnosis2)

class(out) <- "compared.diagnoses"

invisible(out)
}


}


#' @export
Expand All @@ -260,85 +257,70 @@ print.summary.compared_diagnoses <- function(x, ...){
bootstrap_rep2 <- x$diagnosis2$bootstrap_sims
n_sims1 <- nrow(x$diagnosis1$simulations_df)
n_sims2 <- nrow(x$diagnosis2$simulations_df)
x <- x[["compared.diagnoses_df"]]
sx <- subset(x, x[,"in_interval"] == 0)
compared.diagnoses_df <- x[["compared.diagnoses_df"]]
compared.diagnoses_df <- subset(compared.diagnoses_df, compared.diagnoses_df[, "in_interval"] == 0)
alpha <- attr(x, "alpha")
atext <- paste0("(confidence level = ", alpha, ")")

if (nrow(compared.diagnoses_df) == 0) {
print(paste("No visible differences between objects (at a confidence level =", atext,")"))
return(x)
}

atext <- paste0("(alpha level = ", attr(x, "alpha"), ")")

if(nrow(sx) == 0) {
print(paste("No divergences found ", atext))
return(x)}
if (n_sims1 == n_sims2) {
cat(paste0("\n Comparison of research designs diagnoses based on ", n_sims1, " simulations", atext))
} else {
cat(paste0("\n Comparison of research designs diagnoses based on ", n_sims1, " simulations from `base_design` and ",
n_sims2, " from `comparison_design`", atext))
}
if (bootstrap_rep1 == bootstrap_rep2) {
cat(paste0("\nDiagnosand estimates with bootstrapped standard errors in parentheses (", bootstrap_rep1,
")."))
} else {
cat(paste0("\nDiagnosand estimates with bootstrapped standard errors in parentheses (base_design = ", bootstrap_rep1,
", comparison_design = ", bootstrap_rep2, ")."))
}


if(n_sims1 == n_sims2 )
cat(paste0("\n Comparison of research designs diagnoses based on ", n_sims1, " simulations", atext))
else
cat(paste0("\n Comparison of research designs diagnoses based on ", n_sims1, " simulations from `base_design` and ", n_sims2, " from `comparison_design`", atext))
if(bootstrap_rep1 == bootstrap_rep2)
cat(paste0("\nDiagnosand estimates with bootstrapped standard errors in parentheses (", bootstrap_rep1,")." ))
else
cat(paste0("\nDiagnosand estimates with bootstrapped standard errors in parentheses (base_design = ", bootstrap_rep1,", omparison_design = ", bootstrap_rep2, ")." ))

column_names <- colnames(compared.diagnoses_df)
identifiers <- column_names %in% c("estimand_label", "term") | grepl("^estimator_label", column_names)
identifiers_columns <- column_names[identifiers]

# Make diagnosand only df
clean_comparison_df <- compared.diagnoses_df[, c(identifiers_columns, "diagnosand", "mean_1", "mean_2"), drop = FALSE]

se_1 <- paste0("(",format_num(sx$se_1, 2), ")")

se_2 <- ifelse(bootstrap_rep2 !=0,
paste0("(",format_num(sx$se_2, 2), ")"),
rep("-", length(sx$se_1)))
clean_values_df <- data.frame(lapply(clean_comparison_df[, c("mean_1", "mean_2"), drop = FALSE], format_num,
digits = 2), stringsAsFactors = FALSE)

# if(bootstrap_rep2 !=0) se_2 <- paste0("(",format_num(sx$se_2, 2), ")")
# else se_2 <- rep(length(sx$se_1), "-")
diagnosands_only_df <- cbind(clean_comparison_df[, c(identifiers_columns, "diagnosand"), drop = FALSE], clean_values_df)


sx <- sx[ ,!colnames(sx) %in% c("in_interval", "se_1", "se_2", "n_sims1", "n_sims2", "conf.low_1", "conf.upper_1")]

colnames(diagnosands_only_df)[colnames(diagnosands_only_df) %in% c("mean_1", "mean_2")] <- c("base", "comparison")
# Make se only df
se_only_df <- compared.diagnoses_df[, grepl("^se", colnames(compared.diagnoses_df)), drop = FALSE]

# Prep vars to shape data frame
mand <- startsWith(colnames(sx), "estimand")
term <- startsWith(colnames(sx), "term")
mator <- startsWith(colnames(sx), "estimator")
n_labels <- sum(mand, mator, term)

r <- nrow(sx)
k <- ncol(sx)

label_cols <- matrix(replicate(n_labels + 1 , rep("", r)),
ncol = n_labels + 1 )

se_rows <- data.frame(label_cols,
se_1,
se_2,
stringsAsFactors = FALSE)

# create a data frame double the size of diagnosands_df
# odd rows correspond to diagnosands' means
# even rows to se(diagnosands)
designs <- startsWith(colnames(sx), "design")
out <- data.frame(rbind(sx, sx), stringsAsFactors = FALSE)
diagnosands_cols <- startsWith(colnames(out), "mean")
colnames(out)[diagnosands_cols] <- c("base", "comparison")
m <- nrow(out)
odd <- (1:m) %% 2 != 0
even <- (1:m) %% 2 == 0
diagnosands_m <- sapply(sx[,diagnosands_cols], format_num, digits=2)


out[odd, diagnosands_cols ] <- diagnosands_m
out[even, !designs] <- se_rows
out[odd, "diagnosand"] <- sx$diagnosand

# Force new factor

out[, designs] <- sapply(out[, designs] ,
function(x) factor(x, levels = c(levels(x), " ")))
out[even, designs] <- " "


se_only_df <- data.frame(lapply(se_only_df, function(se) {
paste0("(", format_num(se, digits = 2), ")")

}), stringsAsFactors = FALSE)

se_only_df <- cbind(clean_comparison_df[, c(identifiers_columns, "diagnosand"), drop = FALSE], se_only_df)
colnames(se_only_df) <- colnames(diagnosands_only_df)

# Merge
return_df <- rbind_disjoint(list(diagnosands_only_df, se_only_df), infill = "")

return_df <- return_df[do.call(order, as.list(return_df[, c(identifiers_columns, "diagnosand")])), , drop = FALSE]
r <- nrow(return_df)
return_df[(1:r)%%2 == 0, c(identifiers_columns, "diagnosand")] <- ""


cat("\n\n")
print(out, row.names = FALSE)
cat(paste0("\n\n Displaying diagnosands that statistically diverge between base and comparison designs. See help file for details." ))
invisible(out)
print(return_df, row.names = FALSE)
cat(paste0("\n\n Displaying diagnosands of the comparison design that lie outside the ", (1-alpha)*100 ,"% bootstrap confidence interval of their counterparts in the base_design."))
invisible(return_df)
}


Expand Down

0 comments on commit 9ed620a

Please sign in to comment.