Back to **[Fan](https://fanwangecon.github.io/)**'s R4Econ Homepage **[Table of Content](https://fanwangecon.github.io/R4Econ/)**

# Generate Joint Quantiles from Multiple Continuous Variables as a Categorical Variable with Linear Index

There are one or more continuous variables. For each variable, find which quantile each observation belongs to. Then also generate a joint/interaction variable covering all combinations of quantiles across the different variables.

The program has these features:

1. Quantile breaks are generated from group_by-level characteristics, i.e. from individual-level characteristics when the data is a panel.
2. The resulting quantile variables are applied to the full panel, at the within-group observation level.
3. Robust to non-unique quantile breaks (non-unique breaks are grouped together).
4. Quantile categories have detailed labels (specifying which non-unique groupings belong to each quantile).


When joining multiple quantile variables together:

1. First, optionally keep only observations where all quantile base variables are non-missing
2. Calculate quantiles for each variable, with different quantile levels for different sub-groups of variables
3. Summary statistics (means and counts) by the multiple quantile-categorical variables

## Program

### Support Functions

In [183]:
# Quantiles of a single column, returned as a two-column table:
# 'quant.perc' holds the names that quantile() attaches to its result and the
# second column, named after `var`, holds the quantile values.
#   var  : column name (string) looked up in `df`; coerced with as.numeric()
#   df   : data frame containing the column
#   prob : probabilities passed to quantile(); NA values are removed
gen_quantiles <- function(var, df, prob=c(0.25, 0.50, 0.75)) {
    col.values <- as.numeric(df[[var]])
    col.quantiles <- quantile(col.values, prob, na.rm=TRUE)
    enframe(col.quantiles, 'quant.perc', var)
}
# Build the suffix used to tag quantile variables, e.g. c(0, 0.5, 1) gives
# 'Qs0e1n2': 's' is followed by the smallest probability, 'e' by the largest,
# and 'n' by the number of probabilities minus one.
#   seq.quantiles : numeric vector of quantile probabilities
# Returns a single character string. The value is returned visibly (the
# function no longer ends in an assignment to an unused local).
f_Q_suffix <- function(seq.quantiles) {
    paste0('Qs', min(seq.quantiles),
           'e', max(seq.quantiles),
           'n', (length(seq.quantiles)-1))
}
# Label for one unique break point: the positions in `arr.quantiles` whose
# value equals the break, comma-separated in parentheses, followed by the
# quantile suffix, e.g. '(1,2) of Qs0e1n3'.
#   arr.quantiles            : all break values (may contain repeats)
#   arr.sort.unique.quantile : the break value(s) to locate in arr.quantiles
#   seq.quantiles            : quantile probabilities, used for the suffix
f_Q_label <- function(arr.quantiles, 
                      arr.sort.unique.quantile,
                      seq.quantiles) {
    idx.matching <- which(arr.quantiles %in% arr.sort.unique.quantile)
    idx.text <- paste0(idx.matching, collapse=',')
    paste0('(', idx.text, ') of ', f_Q_suffix(seq.quantiles))
}
# Turn a temporary '<var>_q' column name into '<var>_<quantile suffix>'.
#   name          : character vector of column names ending in '_q'
#   seq.quantiles : quantile probabilities, used to build the suffix
# Only a trailing '_q' is replaced. Matching the first '_q' anywhere would
# corrupt variable names that themselves contain '_q' (e.g. 'hh_quant_q');
# names that do not end in '_q' are returned unchanged.
f_var_rename <- function(name, seq.quantiles) {
    quantile.suffix <- paste0('_', f_Q_suffix(seq.quantiles))
    sub('_q$', quantile.suffix, name)
}

# Stop with an error unless the value variables are constant within each group.
#   df                   : data frame to check
#   vars.group_by        : names of the grouping columns
#   vars.values_in_group : names of the columns expected to be constant
#                          within every group
# On failure, prints the tabulation of distinct-value counts and the two
# variable lists, then calls stop().
f_check_distinct_ingroup <- function(df, vars.group_by, vars.values_in_group) {

    # One pasted key per row, and the number of distinct keys in each group
    df.keyed <- df %>%
        group_by(!!!syms(vars.group_by)) %>%
        mutate(quant_vars_paste = paste(!!!(syms(vars.values_in_group)), sep='-')) %>%
        mutate(unique_in_group = n_distinct(quant_vars_paste))

    # Keep one row per group, then tabulate groups by their distinct-key count
    df.uniqus.in.group <- df.keyed %>%
        slice(1L) %>%
        ungroup() %>%
        group_by(unique_in_group) %>%
        summarise(n=n())

    # Any count above one means some group holds more than one value combination
    has.varying.group <- any(df.uniqus.in.group$unique_in_group > 1)
    if (has.varying.group) {
        print(df.uniqus.in.group)
        print(paste('vars.values_in_group', vars.values_in_group, sep=':'))
        print(paste('vars.group_by', vars.group_by, sep=':'))
        stop("The variables for which quantiles are to be taken are not identical within the group variables")
    }
}

### Data Slicing and Quantile Generation

- Function 1: generate quantiles based on group-specific characteristics. The groups could also be at the panel-observation level.

In [184]:
# First Step, given groups, generate quantiles based on group characteristics
# vars.cts2quantile <- c('wealthIdx', 'hgt0', 'wgt0')
# seq.quantiles <- c(0, 0.3333, 0.6666, 1.0)
# vars.group_by <- c('indi.id')
# vars.arrange <- c('indi.id', 'svymthRound')
# vars.continuous <- c('wealthIdx', 'hgt0', 'wgt0')
# Compute quantile break points from one row per group.
#   df                : panel data frame
#   vars.cts2quantile : names of the continuous columns to take quantiles of
#   seq.quantiles     : quantile probabilities
#   vars.group_by     : grouping column names
#   vars.arrange      : sort column names; after sorting, the first row of
#                       each group is the one kept
# Returns a list with
#   df.sliced.quantiles : one row per probability, a 'quant.perc' column and
#                         one column per variable in vars.cts2quantile
#   df.grp.L1           : the one-row-per-group data the quantiles come from
df_sliced_quantiles <- function(df, vars.cts2quantile, seq.quantiles, 
                                vars.group_by, vars.arrange) {

    # Keep the first row of each group after sorting
    df.grp.L1 <- df %>%
        group_by(!!!syms(vars.group_by)) %>%
        arrange(!!!syms(vars.arrange)) %>%
        slice(1L) %>%
        ungroup()

    # One quantile table per variable, merged on the shared 'quant.perc'
    # column produced by gen_quantiles. The key is given explicitly so the
    # join never silently keys on value columns and emits no
    # "Joining, by = ..." message.
    df.sliced.quantiles <- lapply(vars.cts2quantile, gen_quantiles,
                                  df=df.grp.L1, prob=seq.quantiles) %>%
        reduce(function(df.left, df.right) {
            full_join(df.left, df.right, by='quant.perc')
        })

    list(df.sliced.quantiles=df.sliced.quantiles,
         df.grp.L1=df.grp.L1)
}

## Data Cutting

- Function 2: cut groups for full panel dataframe based on group-specific characteristics quantiles.

In [185]:
# Cut a continuous variable into quantile categories, optionally with
# extended labels.
#   var                 : values to cut; the expression supplied for `var`
#                         (captured with substitute()) is also used to look up
#                         the matching column of df.sliced.quantiles
#   df.sliced.quantiles : table of quantile break points, one column per variable
#   seq.quantiles       : quantile probabilities (used for labels only)
#   include.lowest      : passed to cut()
#   fan.labels          : if TRUE, append '; <f_Q_label text>' to each level
#   print               : if TRUE, print the unique breaks (and label suffixes)
# Returns the factor produced by cut().
f_cut <- function(var, df.sliced.quantiles, seq.quantiles, include.lowest=TRUE, fan.labels=TRUE, print=FALSE) {

    # Unevaluated expression given for `var`, used as the column key
    var.key <- substitute(var)

    # Break points for this variable; cut() needs them unique and sorted
    breaks.all <- df.sliced.quantiles[[var.key]]
    breaks.unique <- sort(unique(breaks.all))
    if (print) {
        print(breaks.unique)
    }

    # Standard interval labels from cut()
    var.quantile <- cut(var, breaks=breaks.unique, include.lowest=include.lowest)

    # Nothing more to do when the standard labels are wanted
    if (!fan.labels) {
        return(var.quantile)
    }

    # One label suffix per interval, keyed on each interval's lower break
    n.breaks <- length(breaks.unique)
    levels.suffix <- lapply(breaks.unique[1:(n.breaks-1)],
                            f_Q_label,
                            arr.quantiles=breaks.all,
                            seq.quantiles=seq.quantiles)
    if (print) {
        print(levels.suffix)
    }
    levels(var.quantile) <- paste0(levels(var.quantile), '; ', levels.suffix)

    return(var.quantile)
}

In [186]:
# Combo Quantile Function
# vars.cts2quantile <- c('wealthIdx', 'hgt0', 'wgt0')
# seq.quantiles <- c(0, 0.3333, 0.6666, 1.0)
# vars.group_by <- c('indi.id')
# vars.arrange <- c('indi.id', 'svymthRound')
# vars.continuous <- c('wealthIdx', 'hgt0', 'wgt0')
# Add quantile-category columns for a set of continuous variables that share
# the same quantile probabilities.
#   df                : panel data frame
#   vars.cts2quantile : names of the continuous columns to cut
#   seq.quantiles     : quantile probabilities
#   vars.group_by     : grouping column names
#   vars.arrange      : sort column names
# Returns a list with
#   df.with.cut.quant   : df plus the new quantile-category columns
#   df.sliced.quantiles : break-point table from df_sliced_quantiles
#   df.grp.L1           : one-row-per-group data from df_sliced_quantiles
#   vars.quantile.cut   : data frame holding only the new category columns
df_cut_by_sliced_quantiles <- function(df, vars.cts2quantile, seq.quantiles, 
                                       vars.group_by, vars.arrange) {

    # Stops if the quantile base variables vary within a group
    f_check_distinct_ingroup(df, vars.group_by, vars.values_in_group=vars.cts2quantile)

    # Step 1: break points computed from one row per group
    df.sliced <- df_sliced_quantiles(df, vars.cts2quantile, seq.quantiles, vars.group_by, vars.arrange)
    df.breaks <- df.sliced$df.sliced.quantiles
    suffix.q <- f_Q_suffix(seq.quantiles)

    # Step 2: cut every requested variable on the full data
    df.with.cut.quant <- df %>%
        mutate_at(vars.cts2quantile,
                  funs(q=f_cut(., df.breaks,
                               seq.quantiles=seq.quantiles,
                               include.lowest=TRUE, fan.labels=TRUE)))

    # Step 3: give the new columns their final '<var>_<suffix>' names.
    # The two branches handle the different names the mutate_at() step yields:
    # columns containing '_q' with several variables, a column `q` with one.
    if (length(vars.cts2quantile) > 1) {
        df.with.cut.quant <- rename_at(df.with.cut.quant,
                                       vars(contains('_q')),
                                       funs(f_var_rename(., seq.quantiles=seq.quantiles)))
    } else {
        new.var.name <- paste0(vars.cts2quantile[1], '_', suffix.q)
        df.with.cut.quant <- rename(df.with.cut.quant, !!new.var.name := q)
    }

    # Columns whose name matches a base variable and also the quantile suffix
    pattern.base.vars <- paste0(vars.cts2quantile, collapse='|')
    vars.quantile.cut <- df.with.cut.quant %>%
        select(matches(pattern.base.vars)) %>%
        select(matches(suffix.q))

    list(df.with.cut.quant = df.with.cut.quant,
         df.sliced.quantiles = df.breaks,
         df.grp.L1 = df.sliced$df.grp.L1,
         vars.quantile.cut = vars.quantile.cut)
}

## Different Vars Different Probabilities Joint Quantiles

- Accommodate multiple continuous variables
- Different percentiles
- list of lists
- generate joint categorical variables
- keep only values that exist for all quantile base vars

In [187]:
# Adapter so df_cut_by_sliced_quantiles can be applied over a list of
# specifications with lapply().
#   quantile.grp.list : list with elements `vars` (column names) and `prob`
#                       (quantile probabilities for those columns)
#   df, vars.group_by, vars.arrange : forwarded unchanged
df_cut_by_sliced_quantiles_grps <- function(quantile.grp.list, df, vars.group_by, vars.arrange) {
   df_cut_by_sliced_quantiles(df,
                              quantile.grp.list$vars,
                              quantile.grp.list$prob,
                              vars.group_by,
                              vars.arrange)
}
# Means and counts of the quantile base variables by joint quantile group,
# computed on all rows and on the first row of each group.
#   df.with.cut.quant.all : data with quantile-category and joint index columns
#   vars.cts2quantile     : names of the continuous base variables to summarise
#   vars.group_by         : grouping column names used to pick the first rows
#   vars.arrange          : sort column names used before picking the first rows
#   vars.quantile.cut.all : names of the quantile-category columns
#   var.qjnt.grp.idx      : name of the joint group index column
# Returns a list with df.group.panel.cnt.mean (all rows) and
# df.group.slice1.cnt.mean (first row per group).
df_cut_by_sliced_quantiles_joint_results_grped <- function(df.with.cut.quant.all, vars.cts2quantile, vars.group_by, vars.arrange,
                                                           vars.quantile.cut.all, var.qjnt.grp.idx) {

    # Shared summary: group by quantile categories plus the joint index
    summarise_by_joint_groups <- function(df.in) {
        df.in %>%
            group_by(!!!syms(vars.quantile.cut.all), !!sym(var.qjnt.grp.idx)) %>%
            summarise_at(vars.cts2quantile, funs(mean, n()))
    }

    # First row of each group after sorting
    df.first.rows <- df.with.cut.quant.all %>%
        group_by(!!!syms(vars.group_by)) %>%
        arrange(!!!syms(vars.arrange)) %>%
        slice(1L)

    list(df.group.panel.cnt.mean=summarise_by_joint_groups(df.with.cut.quant.all),
         df.group.slice1.cnt.mean=summarise_by_joint_groups(df.first.rows))
}

In [188]:
# # Joint Quantile Group Name
# var.qjnt.grp.idx <- 'group.index'
# # Generate Categorical Variables of Quantiles
# vars.group_by <- c('indi.id')
# vars.arrange <- c('indi.id', 'svymthRound')
# # Quantile Variables and Quantiles
# vars.cts2quantile.wealth <- c('wealthIdx')
# seq.quantiles.wealth <- c(0, .5, 1.0)
# vars.cts2quantile.wgthgt <- c('hgt0', 'wgt0')
# seq.quantiles.wgthgt <- c(0, .3333, 0.6666, 1.0)
# drop.any.quantile.na <- TRUE
# # collect to list
# list.cts2quantile <- list(list(vars=vars.cts2quantile.wealth,
#                                prob=seq.quantiles.wealth),
#                           list(vars=vars.cts2quantile.wgthgt,
#                                prob=seq.quantiles.wgthgt))

# Cut several groups of continuous variables into quantile categories, each
# group with its own probabilities, and add a joint index over all resulting
# quantile-category variables.
#   df                   : panel data frame
#   var.qjnt.grp.idx     : name (string) of the joint group index column to add
#   list.cts2quantile    : list of lists, each with `vars` (column names) and
#                          `prob` (quantile probabilities for those columns)
#   vars.group_by        : grouping column names
#   vars.arrange         : sort column names
#   drop.any.quantile.na : if TRUE, first drop rows with NA in any grouping,
#                          sorting or quantile base variable
#   toprint              : if TRUE, print dimensions and summaries on the way
# Returns a list with
#   df.with.cut.quant        : data with all quantile columns and the index
#   df.sliced.quantiles      : list of break-point tables, one per element of
#                              list.cts2quantile
#   df.grp.L1                : one-row-per-group data from the first element
#   vars.quantile.cut        : names of all new quantile-category columns
#   df.group.panel.cnt.mean  : means/counts by joint group, all rows
#   df.group.slice1.cnt.mean : means/counts by joint group, first row per group
df_cut_by_sliced_quantiles_joint <- function(df, var.qjnt.grp.idx, 
                                             list.cts2quantile,
                                             vars.group_by, vars.arrange,
                                             drop.any.quantile.na = TRUE,
                                             toprint = TRUE) {

  # Original dimensions
  if (toprint) {
    print(dim(df))
  }

  # All continuous variables across the list of specifications
  vars.cts2quantile <- unlist(lapply(list.cts2quantile, function(elist) elist$vars))

  # Keep only rows that are non-NA for all grouping, sorting and quantile variables
  if (drop.any.quantile.na) {
    df.select <- df %>% drop_na(c(vars.group_by, vars.arrange, vars.cts2quantile))
  } else {
    df.select <- df
  }

  if (toprint) {
    print(dim(df.select))
  }

  # Apply the quantile-cut function to every element of the list of lists
  df.cut.list <- lapply(list.cts2quantile, df_cut_by_sliced_quantiles_grps,
                        df=df.select, vars.group_by=vars.group_by, vars.arrange=vars.arrange)

  # Merge the per-specification panels back together on their shared columns
  df.with.cut.quant.all <- lapply(df.cut.list, function(elist) elist$df.with.cut.quant) %>% reduce(left_join)
  df.sliced.quantiles.all <- lapply(df.cut.list, function(elist) elist$df.sliced.quantiles)

  if (toprint) {
    print(dim(df.with.cut.quant.all))
  }

  # Names of the newly created quantile-category variables
  vars.quantile.cut.all <- unlist(lapply(df.cut.list, function(elist) names(elist$vars.quantile.cut)))
  if (toprint) {
    print(vars.quantile.cut.all)
    print(summary(df.with.cut.quant.all %>% select(one_of(vars.quantile.cut.all))))
  }

  # Joint index: one integer per combination of the quantile-category variables
  df.with.cut.quant.all <- df.with.cut.quant.all %>% mutate(!!var.qjnt.grp.idx := group_indices(., !!!syms(vars.quantile.cut.all)))

  # Means and counts by joint group
  df.group.print <- df_cut_by_sliced_quantiles_joint_results_grped(df.with.cut.quant.all, vars.cts2quantile,
                                                 vars.group_by, vars.arrange,
                                                 vars.quantile.cut.all, var.qjnt.grp.idx)

  # Same element names as df_cut_by_sliced_quantiles returns, but combined
  # across all specifications, plus the two summary tables
  list(df.with.cut.quant = df.with.cut.quant.all,
       df.sliced.quantiles = df.sliced.quantiles.all,
       df.grp.L1 = (df.cut.list[[1]])$df.grp.L1,
       vars.quantile.cut = vars.quantile.cut.all,
       df.group.panel.cnt.mean = df.group.print$df.group.panel.cnt.mean,
       df.group.slice1.cnt.mean = df.group.print$df.group.slice1.cnt.mean)

}

## Use Program

### Load Data

In [189]:
# Library
library(tidyverse)

# Load Sample Data
# Read through the full path rather than calling setwd(), so loading the data
# does not change the session's working directory.
df <- read_csv(file.path('C:/Users/fan/R4Econ/_data', 'height_weight.csv'))

Parsed with column specification:
cols(
  S.country = col_character(),
  vil.id = col_double(),
  indi.id = col_double(),
  sex = col_character(),
  svymthRound = col_double(),
  momEdu = col_double(),
  wealthIdx = col_double(),
  hgt = col_double(),
  wgt = col_double(),
  hgt0 = col_double(),
  wgt0 = col_double(),
  prot = col_double(),
  cal = col_double(),
  p.A.prot = col_double(),
  p.A.nProt = col_double()
)


## Use Program

### Hgt0 3 Groups

In [190]:
# Name of the joint quantile-group index column
var.qjnt.grp.idx <- 'group.index'
# Single specification: hgt0 with four probabilities (three groups)
list.cts2quantile <- list(list(vars='hgt0', prob=c(0, .3333, 0.6666, 1.0)))
results <- df_cut_by_sliced_quantiles_joint(
    df = df,
    var.qjnt.grp.idx = var.qjnt.grp.idx,
    list.cts2quantile = list.cts2quantile,
    vars.group_by = 'indi.id',
    vars.arrange = c('indi.id', 'svymthRound'),
    drop.any.quantile.na = TRUE,
    toprint = FALSE
)
# Means and counts by group, first row per individual
results[['df.group.slice1.cnt.mean']]

hgt0_Qs0e1n3,group.index,mean,n
"[40.6,48.5]; (1) of Qs0e1n3",1,47.04103,580
"(48.5,50.2]; (2) of Qs0e1n3",2,49.38948,561
"(50.2,58]; (3) of Qs0e1n3",3,51.65563,568


### Wealth 5 Groups Guatemala

In [191]:
# Name of the joint quantile-group index column
var.qjnt.grp.idx <- 'wltQuintle.index'
# Single specification: wealthIdx at probabilities 0, 0.2, ..., 1 (five groups)
list.cts2quantile <- list(list(vars='wealthIdx', prob=seq(0, 1.0, 0.20)))
results <- df_cut_by_sliced_quantiles_joint(
    df = filter(df, S.country == 'Guatemala'),
    var.qjnt.grp.idx = var.qjnt.grp.idx,
    list.cts2quantile = list.cts2quantile,
    vars.group_by = 'indi.id',
    vars.arrange = c('indi.id', 'svymthRound'),
    drop.any.quantile.na = TRUE,
    toprint = FALSE
)
# Means and counts by group, first row per individual
results[['df.group.slice1.cnt.mean']]

wealthIdx_Qs0e1n5,wltQuintle.index,mean,n
"[1,1.6]; (1) of Qs0e1n5",1,1.245033,151
"(1.6,2.1]; (2) of Qs0e1n5",2,1.822302,139
"(2.1,2.3]; (3) of Qs0e1n5",3,2.245324,139
"(2.3,2.9]; (4) of Qs0e1n5",4,2.697761,134
"(2.9,6.6]; (5) of Qs0e1n5",5,3.77027,111


### Hgt0 2 groups, Wgt0 2 groups too

In [192]:
# Name of the joint quantile-group index column
var.qjnt.grp.idx <- 'group.index'
# Single specification: hgt0 and wgt0 share probabilities 0, 0.5, 1
list.cts2quantile <- list(list(vars=c('hgt0', 'wgt0'), prob=c(0, .5, 1.0)))
results <- df_cut_by_sliced_quantiles_joint(
    df = df,
    var.qjnt.grp.idx = var.qjnt.grp.idx,
    list.cts2quantile = list.cts2quantile,
    vars.group_by = 'indi.id',
    vars.arrange = c('indi.id', 'svymthRound'),
    drop.any.quantile.na = TRUE,
    toprint = FALSE
)
# Means and counts by group, first row per individual
results[['df.group.slice1.cnt.mean']]

Joining, by = "quant.perc"


hgt0_Qs0e1n2,wgt0_Qs0e1n2,group.index,hgt0_mean,wgt0_mean,hgt0_n,wgt0_n
"[40.6,49.4]; (1) of Qs0e1n2","[1.4e+03,3.01e+03]; (1) of Qs0e1n2",1,47.41534,2650.312,652,652
"[40.6,49.4]; (1) of Qs0e1n2","(3.01e+03,5.49e+03]; (2) of Qs0e1n2",2,48.53904,3244.168,228,228
"(49.4,58]; (2) of Qs0e1n2","[1.4e+03,3.01e+03]; (1) of Qs0e1n2",3,50.42376,2828.974,202,202
"(49.4,58]; (2) of Qs0e1n2","(3.01e+03,5.49e+03]; (2) of Qs0e1n2",4,51.30479,3483.461,626,626


## Hgt0 3 groups, Wealth 2 groups, Cebu Only

In [193]:
# Name of the joint quantile-group index column
var.qjnt.grp.idx <- 'group.index'
# Two specifications: wealthIdx with three probabilities (two groups),
# hgt0 with four probabilities (three groups)
list.cts2quantile <- list(
    list(vars='wealthIdx', prob=c(0, .5, 1.0)),
    list(vars='hgt0', prob=c(0, .333, 0.666, 1.0))
)
results <- df_cut_by_sliced_quantiles_joint(
    df = filter(df, S.country == 'Cebu'),
    var.qjnt.grp.idx = var.qjnt.grp.idx,
    list.cts2quantile = list.cts2quantile,
    vars.group_by = 'indi.id',
    vars.arrange = c('indi.id', 'svymthRound'),
    drop.any.quantile.na = TRUE,
    toprint = FALSE
)
# Means and counts by group, first row per individual
results[['df.group.slice1.cnt.mean']]

Joining, by = c("S.country", "vil.id", "indi.id", "sex", "svymthRound", "momEdu", "wealthIdx", "hgt", "wgt", "hgt0", "wgt0", "prot", "cal", "p.A.prot", "p.A.nProt")


wealthIdx_Qs0e1n2,hgt0_Qs0e1n3,group.index,wealthIdx_mean,hgt0_mean,wealthIdx_n,hgt0_n
"[5.2,8.3]; (1) of Qs0e1n2","[41.1,48.4]; (1) of Qs0e1n3",1,7.154074,46.88741,270,270
"[5.2,8.3]; (1) of Qs0e1n2","(48.4,50.1]; (2) of Qs0e1n3",2,7.177323,49.23197,269,269
"[5.2,8.3]; (1) of Qs0e1n2","(50.1,58]; (3) of Qs0e1n3",3,7.13178,51.33814,236,236
"(8.3,19.3]; (2) of Qs0e1n2","[41.1,48.4]; (1) of Qs0e1n3",4,11.075978,47.16927,179,179
"(8.3,19.3]; (2) of Qs0e1n2","(48.4,50.1]; (2) of Qs0e1n3",5,11.237297,49.30973,185,185
"(8.3,19.3]; (2) of Qs0e1n2","(50.1,58]; (3) of Qs0e1n3",6,11.644928,51.69758,207,207


### Results of Wealth + Wgt0 + Hgt0 Joint Groups in Cebu

Wealth index, height at month 0 and weight at month 0 are each split below and above the median, giving up to eight joint groups.

In [194]:
# Name of the joint quantile-group index column
var.qjnt.grp.idx <- 'wltHgt0Wgt0.index'
# Two specifications, both with probabilities 0, 0.5, 1:
# wealthIdx on its own, then hgt0 and wgt0 together
list.cts2quantile <- list(
    list(vars='wealthIdx', prob=c(0, .5, 1.0)),
    list(vars=c('hgt0', 'wgt0'), prob=c(0, .5, 1.0))
)
results <- df_cut_by_sliced_quantiles_joint(
    df = filter(df, S.country == 'Cebu'),
    var.qjnt.grp.idx = var.qjnt.grp.idx,
    list.cts2quantile = list.cts2quantile,
    vars.group_by = 'indi.id',
    vars.arrange = c('indi.id', 'svymthRound'),
    drop.any.quantile.na = TRUE,
    toprint = FALSE
)
# Means and counts by group, first row per individual
results[['df.group.slice1.cnt.mean']]

Joining, by = "quant.perc"
Joining, by = c("S.country", "vil.id", "indi.id", "sex", "svymthRound", "momEdu", "wealthIdx", "hgt", "wgt", "hgt0", "wgt0", "prot", "cal", "p.A.prot", "p.A.nProt")


wealthIdx_Qs0e1n2,hgt0_Qs0e1n2,wgt0_Qs0e1n2,wltHgt0Wgt0.index,wealthIdx_mean,hgt0_mean,wgt0_mean,wealthIdx_n,hgt0_n,wgt0_n
"[5.2,8.3]; (1) of Qs0e1n2","[41.1,49.2]; (1) of Qs0e1n2","[1.4e+03,2.98e+03]; (1) of Qs0e1n2",1,7.161039,47.27338,2606.668,308,308,308
"[5.2,8.3]; (1) of Qs0e1n2","[41.1,49.2]; (1) of Qs0e1n2","(2.98e+03,5.49e+03]; (2) of Qs0e1n2",2,7.270588,48.37745,3155.769,102,102,102
"[5.2,8.3]; (1) of Qs0e1n2","(49.2,58]; (2) of Qs0e1n2","[1.4e+03,2.98e+03]; (1) of Qs0e1n2",3,7.003093,50.15155,2781.425,97,97,97
"[5.2,8.3]; (1) of Qs0e1n2","(49.2,58]; (2) of Qs0e1n2","(2.98e+03,5.49e+03]; (2) of Qs0e1n2",4,7.160075,50.96791,3327.95,268,268,268
"(8.3,19.3]; (2) of Qs0e1n2","[41.1,49.2]; (1) of Qs0e1n2","[1.4e+03,2.98e+03]; (1) of Qs0e1n2",5,10.925806,47.4043,2631.88,186,186,186
"(8.3,19.3]; (2) of Qs0e1n2","[41.1,49.2]; (1) of Qs0e1n2","(2.98e+03,5.49e+03]; (2) of Qs0e1n2",6,11.302469,48.46667,3196.432,81,81,81
"(8.3,19.3]; (2) of Qs0e1n2","(49.2,58]; (2) of Qs0e1n2","[1.4e+03,2.98e+03]; (1) of Qs0e1n2",7,11.253659,50.1622,2778.694,82,82,82
"(8.3,19.3]; (2) of Qs0e1n2","(49.2,58]; (2) of Qs0e1n2","(2.98e+03,5.49e+03]; (2) of Qs0e1n2",8,11.718468,51.39955,3430.7,222,222,222


## Line by Line--Quantiles Var by Var

The idea of the function is to generate the quantile levels first, and then use those levels to generate the quantile categories, rather than doing both in one step. Splitting this into two steps makes it clear which quantiles are used for category generation, and a dataframe holding these quantiles is saved as a separate output of the function.

### Dataframe of Variables' Group-by Level Quantiles

Quantiles from different variables. Note that these variables are specific to the individual, not to the individual/month. So we first need to slice the data, so that we keep only the first row for each individual.

Do this in several steps to clarify group_by level. No speed loss. 

In [195]:
# Inputs: grouping, ordering, variables to take quantiles of, probabilities
vars.group_by <- 'indi.id'
vars.arrange <- c('indi.id', 'svymthRound')
vars.cts2quantile <- c('wealthIdx', 'hgt0', 'wgt0')
seq.quantiles <- c(0, 0.3333, 0.6666, 1.0)
# Break points computed from one row per individual
df.sliced <- df_sliced_quantiles(df = df,
                                 vars.cts2quantile = vars.cts2quantile,
                                 seq.quantiles = seq.quantiles,
                                 vars.group_by = vars.group_by,
                                 vars.arrange = vars.arrange)
df.sliced.quantiles <- df.sliced[['df.sliced.quantiles']]
df.grp.L1 <- df.sliced[['df.grp.L1']]

Joining, by = "quant.perc"
Joining, by = "quant.perc"


In [196]:
df.sliced.quantiles

quant.perc,wealthIdx,hgt0,wgt0
0%,1.0,40.6,1402.5
33.33%,5.2,48.5,2843.472
66.66%,8.3,50.2,3208.689
100%,19.3,58.0,5493.8


In [197]:
# Deciles (10% to 90%) of every column, computed on the one-row-per-group data;
# join messages are suppressed
suppressMessages(
    names(df) %>%
        lapply(gen_quantiles, df=df.grp.L1, prob=seq(0.1,0.9,0.10)) %>%
        reduce(full_join)
)

"NAs introduced by coercion"

quant.perc,S.country,vil.id,indi.id,sex,svymthRound,momEdu,wealthIdx,hgt,wgt,hgt0,wgt0,prot,cal,p.A.prot,p.A.nProt
10%,,3,203.2,,0,5.7,1.7,46.3,1396.94,46.6,2500.28,0.5,0.5,24.28,0.5
20%,,4,405.4,,0,6.9,2.3,47.3,1839.64,47.7,2686.28,0.5,0.5,172.3,0.5
30%,,6,607.6,,0,7.7,3.3,48.0,2271.69,48.3,2803.89,0.5,0.5,721.08,1.06
40%,,8,809.8,,0,8.6,6.3,48.7,2669.16,48.8,2909.68,0.5,0.5,1009.88,19.0
50%,,9,1012.0,,0,9.3,7.3,49.4,3050.1,49.4,3013.0,0.5,0.5,1273.3,110.95
60%,,13,1214.2,,0,10.4,8.3,49.9,3439.5,49.9,3126.08,0.5,3.88,1614.4,221.92
70%,,14,1416.4,,0,11.36,8.3,50.5,3857.28,50.4,3249.52,0.7,8.26,2679.54,256.8
80%,,17,1618.6,,0,12.7,9.3,51.2,4258.12,51.04,3417.86,1.2,11.5,4761.14,298.12
90%,,26,1820.8,,0,14.6,11.3,52.3,4703.62,52.0,3682.83,1.6,15.6,10867.72,365.46


### Cut Quantile Categorical Variables

Using the quantiles we have generated, cut the continuous variables to generate categorical quantile variables in the full dataframe.

Note that we can only cut based on unique breaks, but sometimes quantile break-points are the same if some values are often observed, and also if there are too few observations with respect to quantile groups. 

To resolve this issue, we only look at unique quantiles. 

We need several support Functions:
1. support functions to generate suffix for quantile variables based on quantile cuts
2. support for labeling variables of resulting quantiles beyond bracketing

In [198]:
# Try the labeling helper on the wealthIdx break points
arr.quantiles <- df.sliced.quantiles[['wealthIdx']]
arr.quantiles
arr.sort.unique.quantiles <- sort(unique(df.sliced.quantiles[['wealthIdx']]))
arr.sort.unique.quantiles
# Labels for the first and second unique break
f_Q_label(arr.quantiles, arr.sort.unique.quantiles[1], seq.quantiles)
f_Q_label(arr.quantiles, arr.sort.unique.quantiles[2], seq.quantiles)
# Labels for every unique break except the last
n.unique.breaks <- length(arr.sort.unique.quantiles)
lapply(arr.sort.unique.quantiles[1:(n.unique.breaks-1)],
       f_Q_label,
       arr.quantiles=arr.quantiles,
       seq.quantiles=seq.quantiles)

In [199]:
# Inputs: grouping, ordering, variables to cut, probabilities
vars.group_by <- 'indi.id'
vars.arrange <- c('indi.id', 'svymthRound')
vars.cts2quantile <- c('wealthIdx', 'hgt0', 'wgt0')
seq.quantiles <- c(0, 0.3333, 0.6666, 1.0)
# Cut the full panel using the break points from one row per individual
df.cut <- df_cut_by_sliced_quantiles(df = df,
                                     vars.cts2quantile = vars.cts2quantile,
                                     seq.quantiles = seq.quantiles,
                                     vars.group_by = vars.group_by,
                                     vars.arrange = vars.arrange)
vars.quantile.cut <- df.cut[['vars.quantile.cut']]
df.with.cut.quant <- df.cut[['df.with.cut.quant']]
df.grp.L1 <- df.cut[['df.grp.L1']]

Joining, by = "quant.perc"
Joining, by = "quant.perc"


In [200]:
# Cut Variables Generated
names(vars.quantile.cut)
summary(vars.quantile.cut)

                  wealthIdx_Qs0e1n3                      hgt0_Qs0e1n3  
 [1,5.2]; (1) of Qs0e1n3   :10958   [40.6,48.5]; (1) of Qs0e1n3:10232  
 (5.2,8.3]; (2) of Qs0e1n3 :13812   (48.5,50.2]; (2) of Qs0e1n3: 9895  
 (8.3,19.3]; (3) of Qs0e1n3:10295   (50.2,58]; (3) of Qs0e1n3  : 9908  
                                    NA's                       : 5030  
                              wgt0_Qs0e1n3  
 [1.4e+03,2.84e+03]; (1) of Qs0e1n3 :10105  
 (2.84e+03,3.21e+03]; (2) of Qs0e1n3:10056  
 (3.21e+03,5.49e+03]; (3) of Qs0e1n3: 9858  
 NA's                               : 5046  

In [201]:
# options(repr.matrix.max.rows=50, repr.matrix.max.cols=20)
# df.with.cut.quant

### Individual Variables' Quantile Cuts Review Results

In [202]:
# Count rows per quantile category of one variable.
#   df            : data frame containing S.country, indi.id, svymthRound and
#                   the quantile-category column for `var.cts`
#   var.cts       : base variable name; columns matching it are kept
#   seq.quantiles : probabilities used to rebuild the category column name
# Returns one row per category with an '<column>_n' count for each kept column.
f.count <- function(df, var.cts, seq.quantiles) {
    # Name of the category column, rebuilt from its temporary '<var>_q' form
    var.quantile.name <- f_var_rename(paste0(var.cts,'_q'), seq.quantiles)
    pattern.var <- paste0(var.cts, collapse='|')

    df.kept <- select(df, S.country, indi.id, svymthRound, matches(pattern.var))
    df.grouped <- group_by(df.kept, !!sym(var.quantile.name))
    summarise_all(df.grouped, funs(n=n()))
}

In [203]:
# Full Panel Results
lapply(vars.cts2quantile, f.count, df=df.with.cut.quant, seq.quantiles=seq.quantiles)

wealthIdx_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,wealthIdx_n
"[1,5.2]; (1) of Qs0e1n3",10958,10958,10958,10958
"(5.2,8.3]; (2) of Qs0e1n3",13812,13812,13812,13812
"(8.3,19.3]; (3) of Qs0e1n3",10295,10295,10295,10295

hgt0_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,hgt0_n
"[40.6,48.5]; (1) of Qs0e1n3",10232,10232,10232,10232
"(48.5,50.2]; (2) of Qs0e1n3",9895,9895,9895,9895
"(50.2,58]; (3) of Qs0e1n3",9908,9908,9908,9908
,5030,5030,5030,5030

wgt0_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,wgt0_n
"[1.4e+03,2.84e+03]; (1) of Qs0e1n3",10105,10105,10105,10105
"(2.84e+03,3.21e+03]; (2) of Qs0e1n3",10056,10056,10056,10056
"(3.21e+03,5.49e+03]; (3) of Qs0e1n3",9858,9858,9858,9858
,5046,5046,5046,5046


In [204]:
# Results Individual Slice
lapply(vars.cts2quantile, f.count, 
       df=(df.with.cut.quant %>% group_by(!!!syms(vars.group_by)) %>% arrange(!!!syms(vars.arrange)) %>% slice(1L)), 
       seq.quantiles = seq.quantiles)

wealthIdx_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,wealthIdx_n
"[1,5.2]; (1) of Qs0e1n3",683,683,683,683
"(5.2,8.3]; (2) of Qs0e1n3",768,768,768,768
"(8.3,19.3]; (3) of Qs0e1n3",572,572,572,572

hgt0_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,hgt0_n
"[40.6,48.5]; (1) of Qs0e1n3",580,580,580,580
"(48.5,50.2]; (2) of Qs0e1n3",561,561,561,561
"(50.2,58]; (3) of Qs0e1n3",568,568,568,568
,314,314,314,314

wgt0_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,wgt0_n
"[1.4e+03,2.84e+03]; (1) of Qs0e1n3",569,569,569,569
"(2.84e+03,3.21e+03]; (2) of Qs0e1n3",569,569,569,569
"(3.21e+03,5.49e+03]; (3) of Qs0e1n3",570,570,570,570
,315,315,315,315


## Differential Quantiles for Different Variables Then Combine to Form New Groups
Collect together the different quantile base variables and the percentile cut rules that apply to each.

### Input Parameters

In [205]:
# Generate Categorical Variables of Quantiles
vars.group_by <- c('indi.id')
vars.arrange <- c('indi.id', 'svymthRound')

In [206]:
# Quantile Variables and Quantiles
vars.cts2quantile.wealth <- c('wealthIdx')
seq.quantiles.wealth <- c(0, .5, 1.0)
vars.cts2quantile.wgthgt <- c('hgt0', 'wgt0')
seq.quantiles.wgthgt <- c(0, .3333, 0.6666, 1.0)
drop.any.quantile.na <- TRUE
# collect to list
list.cts2quantile <- list(list(vars=vars.cts2quantile.wealth,
                               prob=seq.quantiles.wealth),
                          list(vars=vars.cts2quantile.wgthgt,
                               prob=seq.quantiles.wgthgt))

## Check if Within Group Variables Are The Same

Need to make sure quantile variables are unique within groups

In [207]:
vars.cts2quantile <- unlist(lapply(list.cts2quantile, function(elist) elist$vars))
f_check_distinct_ingroup(df, vars.group_by, vars.values_in_group=vars.cts2quantile)

### Keep only non-NA for all Quantile Variables

In [208]:
# Original dimensions
dim(df)
# All continuous variables from the list of specifications
vars.cts2quantile <- unlist(lapply(list.cts2quantile, function(elist) elist$vars))
vars.cts2quantile
# Keep only rows that are non-NA for all grouping, sorting and quantile
# variables. The else branch makes sure df.select is always defined, so later
# cells never fail or reuse a stale df.select when the flag is FALSE.
if (drop.any.quantile.na) {
    df.select <- df %>% drop_na(c(vars.group_by, vars.arrange, vars.cts2quantile))
} else {
    df.select <- df
}
dim(df.select)

### Apply Quantiles for Each Quantile Variable

In [209]:
# Dealing with a list of quantile variables
df.cut.wealth <- df_cut_by_sliced_quantiles(df.select, vars.cts2quantile.wealth, seq.quantiles.wealth, vars.group_by, vars.arrange)
summary(df.cut.wealth$vars.quantile.cut)
# summary((df.cut.wealth$df.with.cut.quant)[['wealthIdx_Qs0e1n2']])
# df.cut.wealth$df.with.cut.quant %>% filter(is.na(wealthIdx_Qs0e1n2))
# df.cut.wealth$df.with.cut.quant %>% filter(indi.id == 500)

                  wealthIdx_Qs0e1n2
 [1,7.3]; (1) of Qs0e1n2   :14936  
 (7.3,19.3]; (2) of Qs0e1n2:15083  

In [210]:
df.cut.wgthgt <- df_cut_by_sliced_quantiles(df.select, vars.cts2quantile.wgthgt, seq.quantiles.wgthgt, vars.group_by, vars.arrange)
summary(df.cut.wgthgt$vars.quantile.cut)

Joining, by = "quant.perc"


                      hgt0_Qs0e1n3  
 [40.6,48.5]; (1) of Qs0e1n3:10216  
 (48.5,50.2]; (2) of Qs0e1n3: 9895  
 (50.2,58]; (3) of Qs0e1n3  : 9908  
                              wgt0_Qs0e1n3  
 [1.4e+03,2.84e+03]; (1) of Qs0e1n3 :10105  
 (2.84e+03,3.21e+03]; (2) of Qs0e1n3:10056  
 (3.21e+03,5.49e+03]; (3) of Qs0e1n3: 9858  

### Apply Quantiles Functionally

In [211]:
# Adapter so df_cut_by_sliced_quantiles can be applied over a list of
# specifications with lapply().
#   quantile.grp.list : list with elements `vars` (column names) and `prob`
#                       (quantile probabilities for those columns)
#   df, vars.group_by, vars.arrange : forwarded unchanged
df_cut_by_sliced_quantiles_grps <- function(quantile.grp.list, df, vars.group_by, vars.arrange) {
    df_cut_by_sliced_quantiles(df,
                               quantile.grp.list$vars,
                               quantile.grp.list$prob,
                               vars.group_by,
                               vars.arrange)
}

In [212]:
# Apply function
df.cut.list <- lapply(list.cts2quantile, df_cut_by_sliced_quantiles_grps, 
                      df=df.select, vars.group_by=vars.group_by, vars.arrange=vars.arrange)

Joining, by = "quant.perc"


In [213]:
# Reduce Resulting Matrixes Together
df.with.cut.quant.all <- lapply(df.cut.list, function(elist) elist$df.with.cut.quant) %>% reduce(left_join)
dim(df.with.cut.quant.all)

Joining, by = c("S.country", "vil.id", "indi.id", "sex", "svymthRound", "momEdu", "wealthIdx", "hgt", "wgt", "hgt0", "wgt0", "prot", "cal", "p.A.prot", "p.A.nProt")


In [214]:
# Obtain Newly Created Quantile Group Variables
vars.quantile.cut.all <- unlist(lapply(df.cut.list, function(elist) names(elist$vars.quantile.cut)))
vars.quantile.cut.all

### Summarize by Groups

Summarize by all groups.

In [215]:
summary(df.with.cut.quant.all %>% select(one_of(vars.quantile.cut.all)))

                  wealthIdx_Qs0e1n2                      hgt0_Qs0e1n3  
 [1,7.3]; (1) of Qs0e1n2   :14936   [40.6,48.5]; (1) of Qs0e1n3:10216  
 (7.3,19.3]; (2) of Qs0e1n2:15083   (48.5,50.2]; (2) of Qs0e1n3: 9895  
                                    (50.2,58]; (3) of Qs0e1n3  : 9908  
                              wgt0_Qs0e1n3  
 [1.4e+03,2.84e+03]; (1) of Qs0e1n3 :10105  
 (2.84e+03,3.21e+03]; (2) of Qs0e1n3:10056  
 (3.21e+03,5.49e+03]; (3) of Qs0e1n3: 9858  

In [216]:
# df.with.cut.quant.all %>%
#     group_by(!!!syms(vars.quantile.cut.all)) %>%
#     summarise_at(vars.cts2quantile, funs(mean, n()))

### Generate Joint Quantile Vars Unique Groups

In [217]:
# Generate Joint Quantile Index Variable
var.qjnt.grp.idx <- 'group.index'
df.with.cut.quant.all <- df.with.cut.quant.all %>% mutate(!!var.qjnt.grp.idx := group_indices(., !!!syms(vars.quantile.cut.all)))

In [218]:
arr.group.idx <- t(sort(unique(df.with.cut.quant.all[[var.qjnt.grp.idx]])))
arr.group.idx

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18


In [219]:
df.with.cut.quant.all %>% group_by(!!!syms(vars.quantile.cut.all), !!sym(var.qjnt.grp.idx)) %>%
        summarise_at(vars.cts2quantile, funs(mean, n()))

wealthIdx_Qs0e1n2,hgt0_Qs0e1n3,wgt0_Qs0e1n3,group.index,wealthIdx_mean,hgt0_mean,wgt0_mean,wealthIdx_n,hgt0_n,wgt0_n
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",1,5.306477,46.56259,2497.543,3304,3304,3304
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",2,5.0773,47.61424,2992.62,1348,1348,1348
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",3,3.639227,47.74586,3428.613,362,362,362
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",4,6.042504,49.24233,2671.04,1134,1134,1134
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",5,5.355495,49.34579,3030.472,2184,2184,2184
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",6,4.360647,49.61496,3480.88,1484,1484,1484
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",7,6.254082,51.16327,2665.767,196,196,196
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",8,5.451432,50.96835,3047.78,1466,1466,1466
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",9,4.055986,51.83008,3660.124,3458,3458,3458
"(7.3,19.3]; (2) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",10,9.860733,46.79267,2539.984,3438,3438,3438


In [220]:
# Same summary as above, but computed on a single row per vars.group_by
# group: sort by vars.arrange, keep the first row of each group, then
# regroup by the joint quantile categories and their index
df.with.cut.quant.all %>%
    group_by(!!!syms(vars.group_by)) %>%
    arrange(!!!syms(vars.arrange)) %>%
    slice(1L) %>%
    group_by(!!!syms(c(vars.quantile.cut.all, var.qjnt.grp.idx))) %>%
    summarise_at(vars.cts2quantile, funs(mean, n()))

wealthIdx_Qs0e1n2,hgt0_Qs0e1n3,wgt0_Qs0e1n3,group.index,wealthIdx_mean,hgt0_mean,wgt0_mean,wealthIdx_n,hgt0_n,wgt0_n
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",1,5.200526,46.55632,2498.762,190,190,190
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",2,4.958974,47.60256,2992.736,78,78,78
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",3,3.563636,47.73182,3430.941,22,22,22
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",4,5.989063,49.24375,2671.014,64,64,64
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",5,5.246032,49.34603,3031.429,126,126,126
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",6,4.235227,49.61136,3484.544,88,88,88
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",7,6.218182,51.15455,2665.818,11,11,11
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",8,5.360714,50.96905,3048.073,84,84,84
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",9,3.944928,51.83623,3667.147,207,207,207
"(7.3,19.3]; (2) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",10,9.860733,46.79267,2539.984,191,191,191


### Change Values Based on Index

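The joint quantile group index runs from 1 to 18 (2 wealth quantiles × 3 height quantiles × 3 weight quantiles). Use the index to look up a group-specific amount and add it to an input variable.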

In [221]:
# Alternative (disabled) schedule with a quadratic term:
# arr.group.idx.subsidy <- arr.group.idx*2 - ((arr.group.idx)^2)*0.01
# Active schedule: the amount added for a group is twice its group index
arr.group.idx.subsidy <- 2 * arr.group.idx
# Look up each row's amount by position using its group index, add it to
# prot, then compare the group means of the adjusted and original values
df.with.cut.quant.all %>%
    mutate(
        more_prot = prot + arr.group.idx.subsidy[!!sym(var.qjnt.grp.idx)]
    ) %>%
    group_by(!!!syms(c(vars.quantile.cut.all, var.qjnt.grp.idx))) %>%
    summarise_at(c('more_prot', 'prot'), funs(mean(., na.rm=TRUE)))

wealthIdx_Qs0e1n2,hgt0_Qs0e1n3,wgt0_Qs0e1n3,group.index,more_prot,prot
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",1,14.08242,12.08242
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",2,15.89847,11.89847
"[1,7.3]; (1) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",3,27.15484,21.15484
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",4,18.90528,10.90528
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",5,22.32498,12.32498
"[1,7.3]; (1) of Qs0e1n2","(48.5,50.2]; (2) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",6,28.6312,16.6312
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",7,25.47638,11.47638
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","(2.84e+03,3.21e+03]; (2) of Qs0e1n3",8,28.02607,12.02607
"[1,7.3]; (1) of Qs0e1n2","(50.2,58]; (3) of Qs0e1n3","(3.21e+03,5.49e+03]; (3) of Qs0e1n3",9,34.69356,16.69356
"(7.3,19.3]; (2) of Qs0e1n2","[40.6,48.5]; (1) of Qs0e1n3","[1.4e+03,2.84e+03]; (1) of Qs0e1n3",10,30.73473,10.73473
