# Summarize One Variable by Groups

Given a categorical variable (based on one variable or on the interaction of multiple variables) and a continuous variable, obtain statistics for the continuous variable conditional on the categorical variable, and also unconditionally.

Store the results in a table, and also flatten them into a single wide row with appropriate keys/variable names for all group statistics.

Pick which statistics are included in the final wide row.

## Program

In [54]:
# Single Variable Group Statistics
#
# Summarize one numeric variable within the groups defined by one or more
# grouping variables. Requires dplyr and tidyr to be attached (for example via
# library(tidyverse)).
#
# Arguments:
#   df                 data frame holding the grouping and numeric columns
#   vars.group         character vector with one or more grouping column names
#   var.numeric        single string, name of the numeric column to summarize
#   str.stats.group    'main' (mean, min, max, sd) or 'all' (mean, median, sd,
#                      IQR, mad, min, max, first, last, n_distinct); only used
#                      when str.stats.specify is NULL
#   str.stats.specify  optional character vector of summary function names;
#                      when supplied it overrides str.stats.group
#
# Returns a list with two elements:
#   df_table_grp_stats  one row per group, one column per statistic
#   df_row_grp_stats    the same statistics flattened into one wide row, with
#                       columns named <var>.<group vars>.<group value>.<stat>
#                       (the group value is prefixed with 'g' when there is a
#                       single grouping variable)
# Underscores in the column names of both outputs are replaced by '.'.
ff_summ_by_group_summ_one <- function(df, vars.group, var.numeric, str.stats.group = 'main', str.stats.specify = NULL){

    # Validate inputs up front so that failures are explicit
    if (!is.character(vars.group) || length(vars.group) < 1) {
        stop("vars.group must be a character vector naming at least one column", call. = FALSE)
    }
    if (!is.character(var.numeric) || length(var.numeric) != 1) {
        stop("var.numeric must be a single column name", call. = FALSE)
    }
    strs.cols.missing <- setdiff(c(vars.group, var.numeric), names(df))
    if (length(strs.cols.missing) > 0) {
        stop("columns not found in df: ", paste0(strs.cols.missing, collapse = ', '), call. = FALSE)
    }

    # List of statistics
    # https://rdrr.io/cran/dplyr/man/summarise.html
    strs.center <- c('mean', 'median')
    strs.spread <- c('sd', 'IQR', 'mad')
    strs.range <- c('min', 'max')
    strs.pos <- c('first', 'last')
    strs.count <- c('n_distinct')

    # Grouping of Statistics
    # is.null() rather than missing(): an explicit str.stats.specify = NULL must
    # fall back to str.stats.group as well.
    if (is.null(str.stats.specify)) {
        if (!is.character(str.stats.group) || length(str.stats.group) != 1 ||
            !(str.stats.group %in% c('main', 'all'))) {
            stop("str.stats.group must be 'main' or 'all'", call. = FALSE)
        }
        if (str.stats.group == 'main') {
            strs.all <- c('mean', 'min', 'max', 'sd')
        } else {
            strs.all <- c(strs.center, strs.spread, strs.range, strs.pos, strs.count)
        }
    } else {
        if (!is.character(str.stats.specify) || length(str.stats.specify) < 1) {
            stop("str.stats.specify must be a non-empty character vector of function names", call. = FALSE)
        }
        strs.all <- str.stats.specify
    }

    # Group Sort
    # Keep only the columns that are used, so that missing values in unrelated
    # columns do not remove rows. Sorting by the numeric variable within groups
    # makes 'first' and 'last' the smallest and largest value of each group.
    df.select <- df %>%
                    select(one_of(c(vars.group, var.numeric))) %>%
                    drop_na() %>%
                    group_by(!!!syms(vars.group)) %>%
                    arrange(!!!syms(c(vars.group, var.numeric)))

    # Table of Statistics
    # Function names are passed as a character vector (funs() is deprecated);
    # ungroup so the returned table does not carry leftover grouping.
    df.table.grp.stats <- df.select %>%
                            summarize_at(var.numeric, strs.all) %>%
                            ungroup()

    # Add Stat Name
    # With a single statistic the result column keeps the variable name, so
    # rename it to the name of the statistic.
    if (length(strs.all) == 1 && var.numeric %in% names(df.table.grp.stats)) {
        names(df.table.grp.stats)[names(df.table.grp.stats) == var.numeric] <- unname(strs.all)
    }

    # Row of Statistics
    # Label for each group row: 'g<value>' for one grouping variable, otherwise
    # the group values joined by '.'.
    if (length(vars.group) == 1) {
        strs.group.label <- paste0('g', df.table.grp.stats[[vars.group]])
    } else {
        strs.group.label <- as.character(interaction(as.list(df.table.grp.stats[vars.group])))
    }
    strs.stat.prefix <- paste0(var.numeric, '.',
                               paste0(vars.group, collapse = '.'), '.',
                               strs.group.label)

    # Long format (one row per group and statistic), then one wide row whose
    # column names are <prefix>_<statistic>.
    df.row.grp.stats <- df.table.grp.stats %>%
                            select(-one_of(vars.group)) %>%
                            mutate(stat.prefix = !!strs.stat.prefix) %>%
                            gather(variable, value, -stat.prefix) %>%
                            unite('stat.name', one_of(c('stat.prefix', 'variable'))) %>%
                            spread(stat.name, value)

    # Clean up name strings
    names(df.table.grp.stats) <- gsub(x = names(df.table.grp.stats), pattern = "_", replacement = "\\.")
    names(df.row.grp.stats) <- gsub(x = names(df.row.grp.stats), pattern = "_", replacement = "\\.")

    # Return
    list(df_table_grp_stats = df.table.grp.stats,
         df_row_grp_stats = df.row.grp.stats)
}

## Data

Load data and test

In [55]:
# Library
library(tidyverse)

# Load Sample Data
# NOTE(review): setwd() changes the working directory for the rest of the
# session and the path is machine-specific; consider passing the full path to
# read_csv() instead.
setwd('C:/Users/fan/R4Econ/_data/')
df <- read_csv('height_weight.csv')

Parsed with column specification:
cols(
  S.country = col_character(),
  vil.id = col_double(),
  indi.id = col_double(),
  sex = col_character(),
  svymthRound = col_double(),
  momEdu = col_double(),
  wealthIdx = col_double(),
  hgt = col_double(),
  wgt = col_double(),
  hgt0 = col_double(),
  wgt0 = col_double(),
  prot = col_double(),
  cal = col_double(),
  p.A.prot = col_double(),
  p.A.nProt = col_double()
)


## Function Testing By Gender Groups 

### Select Variables

Two variables are needed: a grouping variable that is a factor, and a numeric variable.

In [56]:
# Grouping column and numeric column passed to the function below
vars.group <- 'sex'
var.numeric <- 'hgt'

In [57]:
# Keep only the grouping and numeric columns, then drop rows with missing values
df.select <- df %>% select(one_of(vars.group, var.numeric)) %>% drop_na()

### Main Statistics 

In [58]:
# Single Variable Group Statistics: 'main' statistic group, one grouping variable
ff_summ_by_group_summ_one(df.select, vars.group = vars.group, var.numeric = var.numeric, str.stats.group = 'main')

sex,mean,min,max,sd
Female,82.81198,41.2,170.6,29.79351
Male,84.68152,41.3,182.9,31.75037

hgt.sex.gFemale.max,hgt.sex.gFemale.mean,hgt.sex.gFemale.min,hgt.sex.gFemale.sd,hgt.sex.gMale.max,hgt.sex.gMale.mean,hgt.sex.gMale.min,hgt.sex.gMale.sd
170.6,82.81198,41.2,29.79351,182.9,84.68152,41.3,31.75037


### Specify Two Specific Statistics

In [59]:
# Request only mean and sd through str.stats.specify
ff_summ_by_group_summ_one(df.select, vars.group = vars.group, var.numeric = var.numeric, str.stats.specify = c('mean', 'sd'))

sex,mean,sd
Female,82.81198,29.79351
Male,84.68152,31.75037

hgt.sex.gFemale.mean,hgt.sex.gFemale.sd,hgt.sex.gMale.mean,hgt.sex.gMale.sd
82.81198,29.79351,84.68152,31.75037


### Specify One Specific Statistic

In [60]:
# Request a single statistic (mean) through str.stats.specify
ff_summ_by_group_summ_one(df.select, vars.group = vars.group, var.numeric = var.numeric, str.stats.specify = c('mean'))

sex,mean
Female,82.81198
Male,84.68152

hgt.sex.gFemale.mean,hgt.sex.gMale.mean
82.81198,84.68152


## Function Testing By Country and Gender Groups

### Select Variables

Two kinds of variables are needed: grouping variables that are factors, and a numeric variable. Here the groups are defined jointly by two grouping variables.

In [61]:
# Two grouping columns (groups are their combinations) and the numeric column
vars.group <- c('S.country', 'sex')
var.numeric <- 'hgt'

In [62]:
# Keep only the grouping and numeric columns, then drop rows with missing values
df.select <- df %>% select(one_of(vars.group, var.numeric)) %>% drop_na()

### Main Statistics 

In [63]:
# 'main' statistic group, two grouping variables
ff_summ_by_group_summ_one(df.select, vars.group = vars.group, var.numeric = var.numeric, str.stats.group = 'main')

S.country,sex,mean,min,max,sd
Cebu,Female,84.61326,41.3,170.6,32.53651
Cebu,Male,87.02836,41.3,182.9,34.9909
Guatemala,Female,76.58771,41.2,119.9,15.71801
Guatemala,Male,77.0471,41.5,124.7,15.11444

hgt.S.country.sex.Cebu.Female.max,hgt.S.country.sex.Cebu.Female.mean,hgt.S.country.sex.Cebu.Female.min,hgt.S.country.sex.Cebu.Female.sd,hgt.S.country.sex.Cebu.Male.max,hgt.S.country.sex.Cebu.Male.mean,hgt.S.country.sex.Cebu.Male.min,hgt.S.country.sex.Cebu.Male.sd,hgt.S.country.sex.Guatemala.Female.max,hgt.S.country.sex.Guatemala.Female.mean,hgt.S.country.sex.Guatemala.Female.min,hgt.S.country.sex.Guatemala.Female.sd,hgt.S.country.sex.Guatemala.Male.max,hgt.S.country.sex.Guatemala.Male.mean,hgt.S.country.sex.Guatemala.Male.min,hgt.S.country.sex.Guatemala.Male.sd
170.6,84.61326,41.3,32.53651,182.9,87.02836,41.3,34.9909,119.9,76.58771,41.2,15.71801,124.7,77.0471,41.5,15.11444


### Specify Two Specific Statistics

In [64]:
# Request only mean and sd through str.stats.specify
ff_summ_by_group_summ_one(df.select, vars.group = vars.group, var.numeric = var.numeric, str.stats.specify = c('mean', 'sd'))

S.country,sex,mean,sd
Cebu,Female,84.61326,32.53651
Cebu,Male,87.02836,34.9909
Guatemala,Female,76.58771,15.71801
Guatemala,Male,77.0471,15.11444

hgt.S.country.sex.Cebu.Female.mean,hgt.S.country.sex.Cebu.Female.sd,hgt.S.country.sex.Cebu.Male.mean,hgt.S.country.sex.Cebu.Male.sd,hgt.S.country.sex.Guatemala.Female.mean,hgt.S.country.sex.Guatemala.Female.sd,hgt.S.country.sex.Guatemala.Male.mean,hgt.S.country.sex.Guatemala.Male.sd
84.61326,32.53651,87.02836,34.9909,76.58771,15.71801,77.0471,15.11444


### Specify One Specific Statistic

In [65]:
# Request a single statistic (mean) through str.stats.specify
ff_summ_by_group_summ_one(df.select, vars.group = vars.group, var.numeric = var.numeric, str.stats.specify = c('mean'))

S.country,sex,mean
Cebu,Female,84.61326
Cebu,Male,87.02836
Guatemala,Female,76.58771
Guatemala,Male,77.0471

hgt.S.country.sex.Cebu.Female.mean,hgt.S.country.sex.Cebu.Male.mean,hgt.S.country.sex.Guatemala.Female.mean,hgt.S.country.sex.Guatemala.Male.mean
84.61326,87.02836,76.58771,77.0471
