Back to **[Fan](https://fanwangecon.github.io/)**'s R4Econ Homepage **[Table of Content](https://fanwangecon.github.io/R4Econ/)**

# Generate Joint Quantiles from Multiple Continuous Variables as a Categorical Variable with Linear Index

Given one or more continuous variables, find which quantile each observation belongs to for each of the variables. Then also generate a joint/interaction variable of all combinations of quantiles from the different variables.

The program has these features:

1. Quantile breaks are generated based on group_by characteristics, meaning quantiles are computed over individual-level characteristics when the data is a panel
2. Quantile variables apply to the full panel at the within-group observation level.
3. Robust to non-unique breaks for quantiles (non-unique grouped together)
4. Quantile categories have detailed labeling (specifying which non-unique groupings belong to quantile)

## Program

### Support Functions

In [15]:
# Quantiles for any variable
# Computes the requested quantiles of one column and returns them as a
# two-column tibble: 'quant.perc' (the quantile names, e.g. '25%') and a value
# column named after the variable.
#   var  : name of the column in df, given as a string
#   df   : data frame that holds the column
#   prob : probabilities handed to quantile()
gen_quantiles <- function(var, df, prob=c(0.25, 0.50, 0.75)) {
    # Coerce to numeric first; values that cannot be coerced become NA
    # (with a coercion warning) and are then dropped by na.rm=TRUE
    arr.values <- as.numeric(df[[var]])
    arr.cutoffs <- quantile(arr.values, probs=prob, na.rm=TRUE)
    enframe(arr.cutoffs, name='quant.perc', value=var)
}
# Support Functions for Variable Suffix
# Builds the suffix that identifies a quantile specification, e.g.
# c(0, 0.3333, 0.6666, 1.0) -> 'Qs0e1n3':
#   Qs<min> : smallest value in seq.quantiles
#   e<max>  : largest value in seq.quantiles
#   n<k>    : length(seq.quantiles) - 1
# The string is the last evaluated expression, so it is returned visibly and
# prints when the function is called interactively.
f_Q_suffix <- function(seq.quantiles) {
    paste0('Qs', min(seq.quantiles),
           'e', max(seq.quantiles),
           'n', (length(seq.quantiles)-1))
}
# Support Functions for Quantile Labeling
# Builds the label for one quantile break: the positions in arr.quantiles whose
# value matches the break, followed by the quantile-specification suffix,
# e.g. '(1,2) of Qs0e1n3'.
#   arr.quantiles            : all quantile values of a variable (may repeat)
#   arr.sort.unique.quantile : break value(s) to look up in arr.quantiles
#   seq.quantiles            : probabilities used, forwarded to f_Q_suffix()
f_Q_label <- function(arr.quantiles,
                      arr.sort.unique.quantile,
                      seq.quantiles) {
    idx.matched <- which(arr.quantiles %in% arr.sort.unique.quantile)
    str.positions <- paste0(idx.matched, collapse=',')
    str.suffix <- f_Q_suffix(seq.quantiles)
    paste0('(', str.positions, ') of ', str.suffix)
}
# Generate New Variable Names with Quantile Suffix
# Replaces the trailing '_q' placeholder of a column name with
# '_<quantile suffix>', e.g. 'wealthIdx_q' -> 'wealthIdx_Qs0e1n3'.
#   name          : column name(s) carrying the '_q' placeholder at the end
#   seq.quantiles : probabilities used, forwarded to f_Q_suffix()
# Names that do not end in '_q' are returned unchanged.
f_var_rename <- function(name, seq.quantiles) {
    quantile.suffix <- paste0('_', f_Q_suffix(seq.quantiles))
    # Anchor at the end of the name: an earlier '_q' inside the name
    # (e.g. 'p_quality_q') must stay intact, only the placeholder is replaced
    sub('_q$', quantile.suffix, name)
}

### Data Slicing and Quantile Generation

- Function 1: generate quantiles based on group-specific characteristics. the groups could be at the panel observation level as well. 

In [16]:
# First Step, given groups, generate quantiles based on group characteristics
#
# Arguments:
#   df                : full (panel) data frame
#   vars.cts2quantile : names of the continuous variables to compute quantiles for
#   seq.quantiles     : probabilities handed to gen_quantiles()
#   vars.group_by     : names of the grouping variables
#   vars.arrange      : names of the variables used to sort before slicing
# Returns a list with
#   df.sliced.quantiles : one row per probability, a 'quant.perc' column plus one
#                         column of quantile values per variable
#   df.grp.L1           : the first row of every group, which the quantiles are
#                         computed from
#
# Example arguments:
# vars.cts2quantile <- c('wealthIdx', 'hgt0', 'wgt0')
# seq.quantiles <- c(0, 0.3333, 0.6666, 1.0)
# vars.group_by <- c('indi.id')
# vars.arrange <- c('indi.id', 'svymthRound')
# vars.continuous <- c('wealthIdx', 'hgt0', 'wgt0')
df_sliced_quantiles <- function(df, vars.cts2quantile, seq.quantiles, 
                                vars.group_by, vars.arrange) {
    
    # Slicing data: sort, then keep only the first row of each group
    df.grp.L1 <- df %>%
        group_by(!!!syms(vars.group_by)) %>%
        arrange(!!!syms(vars.arrange)) %>%
        slice(1L) %>%
        ungroup()
    
    # Quantiles based on sliced data, one small table per variable
    ls.quantiles <- lapply(vars.cts2quantile, gen_quantiles,
                           df=df.grp.L1, prob=seq.quantiles)

    # Merge the per-variable tables on the quantile label. The key is given
    # explicitly so the join does not depend on which column names happen to be
    # shared and does not print a 'Joining, by = ...' message at every step.
    df.sliced.quantiles <- ls.quantiles %>%
        reduce(function(df.left, df.right) {
            full_join(df.left, df.right, by='quant.perc')
        })
    
    list(df.sliced.quantiles=df.sliced.quantiles,
         df.grp.L1=df.grp.L1)
}

## Data Cutting

- Function 2: cut groups for full panel dataframe based on group-specific characteristics quantiles.

In [17]:
# Cutting Function, Cut Continuous Variables into Quantiles with labeling
#
# Arguments:
#   var                 : the vector to cut, passed as a bare column name; the
#                         name is captured with substitute() and used to pick
#                         the column of breaks with the same name
#   df.sliced.quantiles : table of quantile values, one column per variable
#                         (and normally a 'quant.perc' column of quantile names)
#   include.lowest      : forwarded to cut()
#   fan.labels          : if TRUE, append '; (<positions>) of <suffix>' to every
#                         factor level, see f_Q_label()
#   print               : if TRUE, print the breaks and the label suffixes
#   seq.quantiles       : probabilities the breaks were computed for; only used
#                         for the labels. If NULL (default) they are recovered
#                         from the 'quant.perc' column of df.sliced.quantiles.
# Returns the factor produced by cut().
f_cut <- function(var, df.sliced.quantiles, include.lowest=TRUE, fan.labels=TRUE, print=FALSE,
                  seq.quantiles=NULL) {
    
    # unparsed variable name, converted to a string so it is a plain `[[` index
    var.str <- as.character(substitute(var))
    if (length(var.str) != 1) {
        stop('f_cut: `var` must be a bare column name, got: ',
             paste(deparse(substitute(var)), collapse=''), call.=FALSE)
    }
    
    # Breaks
    arr.quantiles <- df.sliced.quantiles[[var.str]]
    if (is.null(arr.quantiles)) {
        stop('f_cut: no quantile column named "', var.str, '" in df.sliced.quantiles',
             call.=FALSE)
    }
    arr.sort.unique.quantiles <- sort(unique(arr.quantiles))
    # cut() reads a single number as "number of intervals", which would silently
    # produce equal-width bins instead of quantile bins
    if (length(arr.sort.unique.quantiles) < 2) {
        stop('f_cut: need at least two distinct quantile values for "', var.str,
             '", found ', length(arr.sort.unique.quantiles), call.=FALSE)
    }
    if (print) {
        print(arr.sort.unique.quantiles)
    }
    
    # Regular cutting With Standard Labels
    # TRUE, means the lowest group has closed bracket left and right 
    var.quantile <- cut(var, breaks=arr.sort.unique.quantiles, include.lowest=include.lowest)
    
    # Use my custom labels
    if (fan.labels) {
        if (is.null(seq.quantiles)) {
            # Recover the probabilities from the quantile names ('33.33%' -> 0.3333)
            # so the label matches the table that supplied the breaks
            arr.perc <- df.sliced.quantiles[['quant.perc']]
            seq.derived <- NULL
            if (!is.null(arr.perc)) {
                seq.derived <- suppressWarnings(
                    as.numeric(sub('%', '', as.character(arr.perc), fixed=TRUE)) / 100)
            }
            if (length(seq.derived) > 0 && !anyNA(seq.derived)) {
                seq.quantiles <- seq.derived
            } else {
                # Legacy fallback: a `seq.quantiles` variable visible from the
                # environment in which f_cut was defined
                seq.quantiles <- get('seq.quantiles', envir=parent.env(environment()))
            }
        }
        # One label per interval, keyed by the interval's lower break
        levels.suffix <- vapply(head(arr.sort.unique.quantiles, -1),
                                f_Q_label,
                                character(1),
                                arr.quantiles=arr.quantiles,
                                seq.quantiles=seq.quantiles)
        if (print) {
            print(levels.suffix)
        }
        levels(var.quantile) <- paste0(levels(var.quantile), '; ', levels.suffix)
    }
    
    # Return
    var.quantile
}

In [18]:
# Combo Quantile Function
# Computes quantile breaks from the first row of every group (see
# df_sliced_quantiles()) and then cuts every listed continuous variable of the
# full data frame into a quantile factor (see f_cut()).
#
# Arguments:
#   df                : full (panel) data frame
#   vars.cts2quantile : names of the continuous variables to cut (one or more)
#   seq.quantiles     : probabilities defining the quantile breaks
#   vars.group_by     : names of the grouping variables
#   vars.arrange      : names of the variables used to sort before slicing
# Returns a list with
#   df.with.cut.quant   : df plus one factor column per variable, named
#                         f_var_rename('<variable>_q', seq.quantiles)
#   df.sliced.quantiles : the table of quantile breaks
#   df.grp.L1           : the sliced data the breaks were computed from
#
# Example arguments:
# vars.cts2quantile <- c('wealthIdx', 'hgt0', 'wgt0')
# seq.quantiles <- c(0, 0.3333, 0.6666, 1.0)
# vars.group_by <- c('indi.id')
# vars.arrange <- c('indi.id', 'svymthRound')
# vars.continuous <- c('wealthIdx', 'hgt0', 'wgt0')
df_cut_by_sliced_quantiles <- function(df, vars.cts2quantile, seq.quantiles, 
                                       vars.group_by, vars.arrange) {
    
    # First Step Slicing
    df.sliced <- df_sliced_quantiles(df, vars.cts2quantile, seq.quantiles, vars.group_by, vars.arrange)

    # Second Step Generate Categorical Variables of Quantiles
    # Every new column is created directly under its final name. This keeps the
    # naming correct when only one variable is given and never renames
    # pre-existing columns whose names happen to contain '_q'.
    df.with.cut.quant <- df
    for (var.cts in vars.cts2quantile) {
        var.quantile <- f_var_rename(paste0(var.cts, '_q'), seq.quantiles=seq.quantiles)
        # f_cut() receives the column as a bare name because it reads the
        # variable name from its first argument
        df.with.cut.quant <- df.with.cut.quant %>%
            mutate(!!var.quantile := f_cut(!!sym(var.cts), df.sliced$df.sliced.quantiles,
                                           include.lowest=TRUE, fan.labels=TRUE))
    }

    # Return
    list(df.with.cut.quant=df.with.cut.quant,
         df.sliced.quantiles=df.sliced$df.sliced.quantiles,
         df.grp.L1=df.sliced$df.grp.L1)
    
}

## Use Program

### Load Data

In [19]:
# Library
# The functions above use the pipe, dplyr verbs, purrr::reduce and
# tibble::enframe; read_csv below comes from readr. All are attached by tidyverse.
library(tidyverse)

# Load Sample Data
# NOTE(review): setwd() points at an absolute, machine-specific path and changes
# the working directory for the rest of the session; adjust it before running
# elsewhere.
setwd('C:/Users/fan/R4Econ/_data/')
df <- read_csv('height_weight.csv')

Parsed with column specification:
cols(
  S.country = col_character(),
  vil.id = col_double(),
  indi.id = col_double(),
  sex = col_character(),
  svymthRound = col_double(),
  momEdu = col_double(),
  wealthIdx = col_double(),
  hgt = col_double(),
  wgt = col_double(),
  hgt0 = col_double(),
  wgt0 = col_double(),
  prot = col_double(),
  cal = col_double(),
  p.A.prot = col_double(),
  p.A.nProt = col_double()
)


## Line by Line

The idea of the function is to generate quantile levels first, and then use those to generate the categories based on quantiles. Rather than doing this in one step, it is done in two steps to make clear which quantiles are used for generating the quantile categories. A dataframe with these quantiles is also saved as a separate output of the function.

### Dataframe of Variables' Group-by Level Quantiles

Quantiles from Different Variables. Note that these variables are specific to the individual, not individual/month. So we need to first slice the data, so that we only get the first row of each individual.

Do this in several steps to clarify group_by level. No speed loss. 

In [20]:
# Selected Variables, many Percentiles
# Group at the individual level and sort by individual and survey round, so the
# slice inside df_sliced_quantiles() keeps each individual's first row
vars.group_by <- c('indi.id')
vars.arrange <- c('indi.id', 'svymthRound')
# Variables to compute quantiles for, and the quantile probabilities
vars.cts2quantile <- c('wealthIdx', 'hgt0', 'wgt0')
seq.quantiles <- c(0, 0.3333, 0.6666, 1.0)
df.sliced <- df_sliced_quantiles(df, vars.cts2quantile, seq.quantiles, vars.group_by, vars.arrange)
# Unpack the two list elements: the quantile table and the sliced data
df.sliced.quantiles <- df.sliced$df.sliced.quantiles
df.grp.L1 <- df.sliced$df.grp.L1

Joining, by = "quant.perc"
Joining, by = "quant.perc"


In [21]:
# Display the quantile table: one row per probability, one column per variable
df.sliced.quantiles

quant.perc,wealthIdx,hgt0,wgt0
0%,0.5,40.9,1410.2
33.33%,0.5,48.6,2885.528
66.66%,3.0,50.3,3221.898
100%,14.0,59.0,5180.9


In [22]:
# Quantiles all Variables
# Deciles (10% to 90%) of every column of df, computed on the sliced data.
# suppressMessages() hides the 'Joining, by = ...' messages of full_join.
# Character columns cannot be coerced by as.numeric() inside gen_quantiles(),
# hence the 'NAs introduced by coercion' warning and the empty columns.
suppressMessages(lapply(names(df), gen_quantiles, df=df.grp.L1, prob=seq(0.1,0.9,0.10)) %>% reduce(full_join))

"NAs introduced by coercion"

quant.perc,S.country,vil.id,indi.id,sex,svymthRound,momEdu,wealthIdx,hgt,wgt,hgt0,wgt0,prot,cal,p.A.prot,p.A.nProt
10%,,3,203.2,,0,0.5,0.5,46.3,1463.92,46.6,2508.83,0.5,0.5,29.3,0.5
20%,,4,405.4,,0,0.5,0.5,47.6,1959.0,47.7,2700.78,0.5,0.5,201.22,0.5
30%,,6,607.6,,0,0.5,0.5,48.3,2336.59,48.4,2830.69,0.5,0.5,727.55,5.39
40%,,8,809.8,,0,4.1,0.9,48.9,2751.72,48.9,2954.12,0.5,0.5,1021.64,45.46
50%,,9,1012.0,,0,7.6,1.9,49.4,3146.4,49.4,3052.0,0.5,0.5,1261.3,121.85
60%,,12,1214.2,,0,11.92,2.8,50.0,3496.64,50.0,3157.76,0.5,4.12,1577.08,228.06
70%,,14,1416.4,,0,16.0,3.1,50.7,3846.83,50.5,3282.13,0.7,8.3,2395.17,260.7
80%,,17,1618.6,,0,20.0,4.1,51.4,4313.78,51.1,3446.4,1.1,12.1,4285.28,293.06
90%,,26,1820.8,,0,23.78,6.0,52.4,4719.33,52.0,3698.51,1.5,16.2,9019.84,368.14


### Cut Quantile Categorical Variables

Using the quantiles we have generated, cut the continuous variables to generate categorical quantile variables in the full dataframe.

Note that we can only cut based on unique breaks, but sometimes quantile break-points are the same if some values are often observed, and also if there are too few observations with respect to quantile groups. 

To resolve this issue, we only look at unique quantiles. 

We need several support Functions:
1. support functions to generate suffix for quantile variables based on quantile cuts
2. support for labeling variables of resulting quantiles beyond bracketing

In [23]:
# Function Testing
# substitute() of a string literal returns the string, so this is a plain
# column lookup by name; it mirrors how f_cut() picks its column of breaks
arr.quantiles <- df.sliced.quantiles[[substitute('wealthIdx')]]
arr.quantiles
# Distinct break values in increasing order
arr.sort.unique.quantiles <- sort(unique(df.sliced.quantiles[[substitute('wealthIdx')]]))
arr.sort.unique.quantiles
# Label for the first and for the second distinct break
f_Q_label(arr.quantiles, arr.sort.unique.quantiles[1], seq.quantiles)
f_Q_label(arr.quantiles, arr.sort.unique.quantiles[2], seq.quantiles)
# Labels for every break except the last, the same call f_cut() makes
lapply(arr.sort.unique.quantiles[1:(length(arr.sort.unique.quantiles)-1)],
       f_Q_label,
       arr.quantiles=arr.quantiles,
       seq.quantiles=seq.quantiles)

In [24]:
# Generate Categorical Variables of Quantiles
# Same settings as in the slicing cell
vars.group_by <- c('indi.id')
vars.arrange <- c('indi.id', 'svymthRound')
vars.cts2quantile <- c('wealthIdx', 'hgt0', 'wgt0')
seq.quantiles <- c(0, 0.3333, 0.6666, 1.0)
df.cut <- df_cut_by_sliced_quantiles(df, vars.cts2quantile, seq.quantiles, vars.group_by, vars.arrange)
# Full data with the added quantile columns, and the sliced data
df.with.cut.quant <- df.cut$df.with.cut.quant
df.grp.L1 <- df.cut$df.grp.L1

Joining, by = "quant.perc"
Joining, by = "quant.perc"


In [25]:
# Set the notebook's (repr) row and column display limits, then show the result
options(repr.matrix.max.rows=50, repr.matrix.max.cols=20)
df.with.cut.quant

S.country,vil.id,indi.id,sex,svymthRound,momEdu,wealthIdx,hgt,wgt,hgt0,wgt0,prot,cal,p.A.prot,p.A.nProt,wealthIdx_Qs0e1n3,hgt0_Qs0e1n3,wgt0_Qs0e1n3
Cebu,1,1,Male,0,10.3,1.0,44.7,912.1,44.7,2038.4,0.7,0.5,3964.6,70.0,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,2,10.8,1.0,54.4,5892.8,44.7,2038.4,0.5,0.5,1974.1,213.4,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,4,8.2,1.0,60.4,5972.1,44.7,2038.4,0.5,12.5,1519.4,237.7,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,6,1.7,0.9,63.5,5667.2,44.7,2038.4,5.7,154.0,2088.7,221.2,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,8,9.3,0.9,65.9,7281.4,44.7,2038.4,2.1,70.8,2975.7,165.0,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,10,0.5,0.9,69.3,9756.9,44.7,2038.4,15.0,292.7,2647.4,145.9,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,12,0.5,1.0,69.0,7552.1,44.7,2038.4,12.9,342.6,1834.0,164.8,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,14,20.2,0.8,,,44.7,2038.4,,,,,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,16,2.4,1.0,,,44.7,2038.4,,,,,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"
Cebu,1,1,Male,18,12.0,1.2,,,44.7,2038.4,,,,,"[0.5,3]; (1,2) of Qs0e1n3","[40.9,48.6]; (1) of Qs0e1n3","[1.41e+03,2.89e+03]; (1) of Qs0e1n3"


### Individual Variables' Quantile Cuts Review Results

In [26]:
# Group By Results
# Counts the rows falling into each quantile category of one variable.
#   df      : data frame that already contains the quantile column
#   var.cts : name of the continuous variable the quantile column was cut from
# Keeps the id columns plus every column matching var.cts, groups by the
# quantile column and replaces each remaining column by the group's row count.
# NOTE(review): seq.quantiles is not an argument; it is looked up outside the
# function (the global set in the notebook cells) and must match the sequence
# used to build the quantile columns.
f.count <- function(df, var.cts) {
    # Name of the quantile column, e.g. 'wealthIdx_Qs0e1n3'
    var.quantile <- f_var_rename(paste0(var.cts,'_q'), seq.quantiles)
    df.selected <- select(df, S.country, indi.id, svymthRound,
                          matches(paste0(var.cts, collapse='|')))
    df.grouped <- group_by(df.selected, !!sym(var.quantile))
    summarise_all(df.grouped, funs(n=n()))
}

In [27]:
# Full Panel Results
# Row counts per quantile category over all panel rows, one table per variable
lapply(vars.cts2quantile, f.count, df=df.with.cut.quant)

wealthIdx_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,wealthIdx_n
"[0.5,3]; (1,2) of Qs0e1n3",23258,23258,23258,23258
"(3,14]; (3) of Qs0e1n3",11784,11784,11784,11784
,7,7,7,7

hgt0_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,hgt0_n
"[40.9,48.6]; (1) of Qs0e1n3",10476,10476,10476,10476
"(48.6,50.3]; (2) of Qs0e1n3",9724,9724,9724,9724
"(50.3,59]; (3) of Qs0e1n3",9597,9597,9597,9597
,5252,5252,5252,5252

wgt0_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,wgt0_n
"[1.41e+03,2.89e+03]; (1) of Qs0e1n3",10032,10032,10032,10032
"(2.89e+03,3.22e+03]; (2) of Qs0e1n3",9990,9990,9990,9990
"(3.22e+03,5.18e+03]; (3) of Qs0e1n3",9759,9759,9759,9759
,5268,5268,5268,5268


In [28]:
# Results Individual Slice
# Same counts, but on the first row of each group only (the sample the
# quantile breaks were computed from)
lapply(vars.cts2quantile, f.count, 
       df=(df.with.cut.quant %>% group_by(!!!syms(vars.group_by)) %>% arrange(!!!syms(vars.arrange)) %>% slice(1L)))

wealthIdx_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,wealthIdx_n
"[0.5,3]; (1,2) of Qs0e1n3",1366,1366,1366,1366
"(3,14]; (3) of Qs0e1n3",657,657,657,657

hgt0_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,hgt0_n
"[40.9,48.6]; (1) of Qs0e1n3",594,594,594,594
"(48.6,50.3]; (2) of Qs0e1n3",552,552,552,552
"(50.3,59]; (3) of Qs0e1n3",549,549,549,549
,328,328,328,328

wgt0_Qs0e1n3,S.country_n,indi.id_n,svymthRound_n,wgt0_n
"[1.41e+03,2.89e+03]; (1) of Qs0e1n3",565,565,565,565
"(2.89e+03,3.22e+03]; (2) of Qs0e1n3",564,564,564,564
"(3.22e+03,5.18e+03]; (3) of Qs0e1n3",565,565,565,565
,329,329,329,329


### Combine Groups