Merge pull request #86 from jasenfinch/devel

v0.14.9
jasenfinch · Jan 27, 2022 · 80181f5 · 80181f5
2 parents e17b352 + 882ea24
commit 80181f5
Show file tree

Hide file tree

Showing 147 changed files with 1,001 additions and 263 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: metabolyseR
 Title: Methods for Pre-Treatment, Data Mining and Correlation Analyses of Metabolomics Data
-Version: 0.14.8
+Version: 0.14.9
 Authors@R: person("Jasen", "Finch", email = "jsf9@aber.ac.uk", role = c("aut", "cre"))
 Description: A tool kit for pre-treatment, modelling, feature selection and correlation analyses of metabolomics data.
 URL: https://jasenfinch.github.io/metabolyseR
@@ -57,6 +57,7 @@ Collate: allClasses.R
     info.R
     join.R
     keep.R
+    mds.R
     metabolyse.R
     metabolyseR.R
     modelling.R
@@ -80,6 +81,7 @@ Collate: allClasses.R
     remove.R
     randomForest.R
     rsd.R
+    roc.R
     show-method.R
     split.R
     transform.R

diff --git a/NAMESPACE b/NAMESPACE
@@ -43,6 +43,7 @@ export(keepClasses)
 export(keepFeatures)
 export(keepSamples)
 export(linearRegression)
+export(mds)
 export(metabolyse)
 export(metrics)
 export(modellingMethods)
@@ -72,13 +73,15 @@ export(preTreated)
 export(preTreatmentElements)
 export(preTreatmentMethods)
 export(preTreatmentParameters)
+export(proximity)
 export(randomForest)
 export(raw)
 export(reAnalyse)
 export(removeClasses)
 export(removeFeatures)
 export(removeSamples)
 export(response)
+export(roc)
 export(rsd)
 export(sinfo)
 export(split)
@@ -113,6 +116,7 @@ importFrom(crayon,green)
 importFrom(crayon,red)
 importFrom(crayon,yellow)
 importFrom(doFuture,registerDoFuture)
+importFrom(dplyr,across)
 importFrom(dplyr,arrange)
 importFrom(dplyr,arrange_all)
 importFrom(dplyr,bind_cols)
@@ -124,14 +128,18 @@ importFrom(dplyr,full_join)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_all)
 importFrom(dplyr,group_by_at)
+importFrom(dplyr,group_map)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_all)
 importFrom(dplyr,mutate_at)
+importFrom(dplyr,mutate_if)
 importFrom(dplyr,n)
+importFrom(dplyr,relocate)
 importFrom(dplyr,rename)
 importFrom(dplyr,rowwise)
 importFrom(dplyr,select)
+importFrom(dplyr,select_if)
 importFrom(dplyr,summarise)
 importFrom(dplyr,summarise_all)
 importFrom(dplyr,ungroup)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,13 @@
+# metabolyseR 0.14.9
+
+* Suppressed name repair console message encountered during random forest permutation testing.
+
+* Added the [`proximity()`](https://jasenfinch.github.io/metabolyseR/reference/modelling-accessors.html) method for extracting sample proximities from the [`RandomForest`](https://jasenfinch.github.io/metabolyseR/reference/RandomForest-class.html) S4 class.
+
+* Added the [`mds()`](https://jasenfinch.github.io/metabolyseR/reference/mds.html) method to perform multidimensional scaling on sample proximities from the [`RandomForest`](https://jasenfinch.github.io/metabolyseR/reference/RandomForest-class.html) S4 class.
+
+* Added the [`roc()`](https://jasenfinch.github.io/metabolyseR/reference/roc.html) method to calculate receiver-operator characteristic curves from the [`RandomForest`](https://jasenfinch.github.io/metabolyseR/reference/RandomForest-class.html) S4 class.
+
 # metabolyseR 0.14.8
 
 * An error is now thrown during random forest classification when less than two classes are specified.

diff --git a/R/mds.R b/R/mds.R
@@ -0,0 +1,103 @@
+#' Multidimensional scaling (MDS)
+#' @rdname mds
+#' @description Multidimensional scaling of random forest proximities.
+#' @param x S4 object of class `RandomForest`, `Analysis` or a list
+#' @param dimensions The number of dimensions by which the data are to be represented.
+#' @param idx sample information column to use for sample names. If `NULL`, the sample row number will be used. Sample names should be unique for each row of data.
+#' @return 
+#' A tibble containing the scaled dimensions.
+#' @examples
+#' library(metaboData)
+#' 
+#' x <- analysisData(abr1$neg[,200:300],abr1$fact) %>%
+#'        occupancyMaximum(cls = 'day') %>%
+#'        transformTICnorm()
+#'        
+#' rf <- randomForest(x,cls = 'day')
+#' 
+#' mds(rf)
+#' @export
+
+setGeneric("mds", function(x,dimensions = 2,idx = NULL) 
+  standardGeneric("mds")
+)
+
+#' @rdname mds
+#' @importFrom dplyr mutate_if group_map select_if across
+
+setMethod('mds',signature = 'RandomForest',
+          function(x,dimensions = 2,idx = NULL){
+
+            group_vars <- switch(type(x),
+                                 classification = c('Response','Comparison'),
+                                 regression = 'Response',
+                                 unsupervised = NULL)
+
+            dissimilarities <- x %>% 
+              proximity(idx = idx) %>% 
+              mutate(across(Sample1:Sample2,as.character)) %>% 
+              spread(Sample2,Proximity) %>% 
+              mutate_if(is.numeric,~ 1 - .x)
+
+            mds_dimensions <- dissimilarities %>% 
+              group_by_at(group_vars) %>% 
+              group_map(~ .x %>% 
+                          select_if(is.numeric) %>% 
+                          cmdscale(k = dimensions) %>% 
+                          set_colnames(str_c('Dimension ',
+                                             seq_len(dimensions))) %>% 
+                          as_tibble() %>% 
+                          bind_cols(select_if(.x %>% 
+                                                ungroup(),
+                                              is.character)) %>%
+                          relocate(contains('Dimension'),
+                                   .after = last_col()),
+                        .keep = TRUE
+              ) %>% 
+              bind_rows() %>% 
+              rename(Sample = Sample1)
+
+            if (is.null(idx)){
+              mds_dimensions <- mds_dimensions %>% 
+                mutate(Sample = as.numeric(Sample)) %>% 
+                arrange(across(c(group_vars,'Sample')))
+            }
+
+            return(mds_dimensions)
+          }
+)
+
+#' @rdname mds
+
+setMethod('mds',signature = 'list',
+          function(x,dimensions = 2,idx = NULL){
+            object_classes <- x %>%
+              map_chr(class)
+
+            if (FALSE %in% (object_classes == 'RandomForest')) {
+              message(
+                str_c('All objects contained within supplied list ',
+                      'that are not of class RandomForest will be ignored.'))
+            }
+
+            x <- x[object_classes == 'RandomForest']
+
+            if (length(x) > 0) {
+              x %>%
+                map(mds,dimensions = dimensions,idx = idx) %>%
+                bind_rows()  
+            } else {
+              tibble()
+            }
+
+          })
+
+#' @rdname mds
+
+setMethod('mds',signature = 'Analysis',
+          function(x,dimensions = 2,idx = NULL){
+            x %>% 
+              analysisResults('modelling') %>% 
+              mds(dimensions = dimensions,
+                  idx = idx)
+          })
diff --git a/R/metabolyseR.R b/R/metabolyseR.R
@@ -7,5 +7,5 @@ globalVariables(
     'Occupancy','adjustedPvalue','adjusted.p.value','Label','response',
     '-log10(p)','DF1','Sample1','Proximity','Dimension 1','Dimension 2',
     'x','.level','sensitivity','specificity','Mode','RSD','Median','Colour',
-    'Index','TIC','y','label','batch','correction','N','term','Metric','Frequency','|r|'
+    'Index','TIC','y','label','batch','correction','N','term','Metric','Frequency','|r|','idx_1','idx_2'
   ))
diff --git a/R/modelling-accessors.R b/R/modelling-accessors.R
@@ -5,6 +5,7 @@
 #' @param cls sample information column to use
 #' @param metric importance metric for which to retrieve explanatory features
 #' @param threshold threshold below which explanatory features are extracted
+#' @param idx sample information column to use for sample names. If `NULL`, the sample row number will be used. Sample names should be unique for each row of data.
 #' @param ... arguments to parse to method for specific class
 #' @section Methods:
 #' * `binaryComparisons`: Return a vector of all possible binary comparisons for a given sample information column.
@@ -13,6 +14,7 @@
 #' * `metrics`: Retrieve the model performance metrics for a random forest analysis
 #' * `importanceMetrics`: Retrieve the available feature importance metrics for a random forest analysis.
 #' * `importance`: Retrieve feature importance results.
+#' * `proximity`: Retrieve the random forest sample proximities.
 #' * `explanatoryFeatures`: Retrieve explanatory features.
 #' @examples 
 #' library(metaboData)
@@ -40,6 +42,9 @@
 #' ## Retrieve the feature importance results
 #' importance(rf_analysis)
 #' 
+#' ## Retrieve the sample proximities
+#' proximity(rf_analysis)
+#' 
 #' ## Retrieve the explanatory features
 #' explanatoryFeatures(rf_analysis,metric = 'FalsePositiveRate',threshold = 0.05)
 #' @export
@@ -210,6 +215,98 @@ setMethod('importance',signature = 'Analysis',
 #' @rdname modelling-accessors
 #' @export
 
+setGeneric("proximity", function(x,idx = NULL) 
+  standardGeneric("proximity")
+)
+
+#' @rdname modelling-accessors
+#' @importFrom dplyr relocate
+
+setMethod('proximity',signature = 'RandomForest',
+          function(x,idx = NULL){
+
+            group_vars <- switch(type(x),
+                                 classification = c('Response','Comparison'),
+                                 regression = 'Response',
+                                 unsupervised = NULL) %>% 
+              c(.,'Sample1','Sample2')
+
+            proximities <- x@proximities %>% 
+              group_by_at(group_vars) %>% 
+              summarise(Proximity = mean(Proximity),
+                        .groups = 'drop')
+
+            if (!is.null(idx)){
+              sample_idx <- x %>% 
+                clsExtract(cls = idx)
+
+              if (any(duplicated(sample_idx))){
+                stop(str_c('Duplicated sample names found in sample information column `',
+                           idx,
+                           '`. The specified sample names should be unique to each sample.'),
+                     call. = FALSE)
+              }
+
+              sample_idx <- sample_idx %>% 
+                tibble(idx = .) %>% 
+                rowid_to_column(var = 'row')
+
+              proximities <- proximities %>% 
+                left_join(sample_idx,
+                          by = c('Sample1' = 'row')) %>% 
+                rename(idx_1 = idx) %>% 
+                left_join(sample_idx,
+                          by = c('Sample2' = 'row')) %>% 
+                rename(idx_2 = idx) %>% 
+                select(-Sample1,
+                       -Sample2,
+                       Sample1 = idx_1,
+                       Sample2 = idx_2) %>% 
+                relocate(Proximity,.after = Sample2)
+            }
+
+            return(proximities)
+          }
+)
+
+#' @rdname modelling-accessors
+
+setMethod('proximity',signature = 'list',
+          function(x,idx = NULL){
+            object_classes <- x %>%
+              map_chr(class)
+
+            if (FALSE %in% (object_classes == 'RandomForest')) {
+              message(
+                str_c('All objects contained within supplied list ',
+                      'that are not of class RandomForest will be ignored.'))
+            }
+
+            x <- x[object_classes == 'RandomForest']
+
+            if (length(x) > 0) {
+              x %>%
+                map(proximity,idx = idx) %>%
+                bind_rows()  
+            } else {
+              tibble()
+            }
+
+          })
+
+#' @rdname modelling-accessors
+
+setMethod('proximity',signature = 'Analysis',
+          function(x,idx = NULL){
+            x %>% 
+              analysisResults(element = 'modelling') %>% 
+              proximity(idx = idx)
+          })
+
+
+#' @rdname modelling-accessors
+#' @export
+
 setGeneric('explanatoryFeatures', function(x,...) 
   standardGeneric("explanatoryFeatures")
 )