In [3]:
library(stringr)
library(ggplot2)

<table><tr><td bgcolor="green">Variables</td></tr></table>

In [4]:
# Regions to leave out of every analysis below ('' means "leave nothing out").
regions_missing <- c('HK(region)')

# Canonical region labels, in the row/column order used for all RDMs.
regions_order <- c('USA','UK','Australia','South Africa','Germany',
                   'Japan','Israel','CHN','HK(region)','France',
                   'Spain','Mexico','Chile','Portugal','Brazil',
                   'Russia','Egypt','Qatar','India')

# Drop the excluded regions with a logical mask. Indexing with `-which(...)`
# would return an EMPTY vector whenever none of the excluded names is present,
# because x[-integer(0)] selects nothing.
regions_order <- regions_order[
    !(regions_order %in% regions_missing[regions_missing != ''])
]

# Path stem of the outcome RDM; '_dist.csv' is appended when the file is read.
y_variable <- 'checked/Gender_equal'

<table><tr><td bgcolor="blue">Helper function</td></tr></table>

In [5]:
# Harmonise the region labels of a square region-by-region table and reorder it.
#
# df           : data frame whose row names are region labels. The columns are
#                assumed to follow the same region order as the rows, because
#                the cleaned row labels are also written to the column names.
# region_order : labels to keep, in output order. Defaults to the global
#                `regions_order`.
# Returns `df[region_order, region_order]` with harmonised row/column names.
fix_region <- function(df, region_order = regions_order) {
    regions <- rownames(df)

    # Literal (non-regex) rewrites applied in sequence to the first match in
    # each label. Order matters: 'China' must become 'CHN' before the
    # 'Hong Kong SAR, CHN' rule, and 'Russian Federation' must come before
    # 'Russian'. Matching literally keeps the '.' in 'Egypt, Arab Rep.' from
    # acting as a regex wildcard.
    literal_rules <- list(
        c('United States', 'USA'),
        c('United Kingdom', 'UK'),
        c('China', 'CHN'),
        c('South_africa', 'South Africa'),
        c('Hong Kong SAR, CHN', 'HK(region)'),
        c('Hong Kong', 'HK(region)'),
        c('Spanish', 'Spain'),
        c('Russian Federation', 'Russia'),
        c('Russian', 'Russia'),
        c('Egypt, Arab Rep.', 'Egypt')
    )
    for (rule in literal_rules) {
        regions <- sub(rule[1], rule[2], regions, fixed = TRUE)
    }

    # Expand a bare 'HK' to 'HK(region)'. The negative lookahead leaves labels
    # that already read 'HK(region)' alone, so the function is idempotent
    # instead of producing 'HK(region)(region)'.
    regions <- sub('HK(?!\\(region\\))', 'HK(region)', regions, perl = TRUE)

    rownames(df) <- regions
    colnames(df) <- regions

    df[region_order, region_order]
}

# Return the entries of `m` that lie strictly below the main diagonal, as a
# vector in column-major order.
lowerTriangle <- function(m){
  below_diagonal <- lower.tri(m)  # `diag` defaults to FALSE: diagonal excluded
  m[below_diagonal]
}

In [6]:
# Read the outcome RDM named by `y_variable` (the first CSV column supplies the
# row labels), then harmonise its region labels and ordering with fix_region().
y_rdm = fix_region(
    read.csv(paste0('../../rdm/',y_variable,'_dist.csv'),
             row.names=1, encoding = "UTF-8",check.names = FALSE)
)

# FAVEE-HPP rdm

In [7]:
root_path <- '../../../RSA_Regression/output_data/models_rdm/'

# Read one model RDM stored at '<root><folder>/<stem>_dissim_dist.csv' (the
# first CSV column supplies the row labels) and return it with rows and columns
# restricted to, and ordered by, `region_order`.
read_model_rdm <- function(folder, stem, root = root_path,
                           region_order = regions_order) {
    rdm <- read.csv(paste0(root, folder, '/', stem, '_dissim_dist.csv'),
                    row.names = 1, check.names = FALSE)
    rdm[region_order, region_order]
}

# Full feature set
full_rdm <- read_model_rdm('full_feature', 'raw33d')

# FAVEE
favee_rdm      <- read_model_rdm('dimensional', 'favee')
formality_rdm  <- read_model_rdm('dimensional', 'formality')
activeness_rdm <- read_model_rdm('dimensional', 'activeness')
valence_rdm    <- read_model_rdm('dimensional', 'valence')
exchange_rdm   <- read_model_rdm('dimensional', 'exchange')
equality_rdm   <- read_model_rdm('dimensional', 'equality')

# HPP
hpp_rdm     <- read_model_rdm('categorical', 'hpp')
hostile_rdm <- read_model_rdm('categorical', 'hostile')
private_rdm <- read_model_rdm('categorical', 'private')
public_rdm  <- read_model_rdm('categorical', 'public')

# Predict results

<table><tr><td bgcolor="blue">Helper function</td></tr></table>

In [8]:
# Look up objects by name and return them as an unnamed list, in the order given.
#
# variables_names : character vector of object names.
# envir           : environment where the lookup starts (enclosing
#                   environments are searched too). Defaults to the caller's
#                   environment, so this function's own locals can never
#                   shadow a requested name.
# Returns a list with one element per name; an empty input gives an empty list.
x_variables <- function(variables_names, envir = parent.frame()) {
    force(envir)  # resolve the caller's frame before entering lapply()
    # seq_along() is safe for zero-length input, unlike 1:length(x).
    lapply(seq_along(variables_names), function(idx) {
        get(variables_names[idx], envir = envir)
    })
}

In [9]:
# Permutation test for a standardized regression of the outcome RDM on one or
# more predictor RDMs.
#
# For each permutation the rows and columns of every predictor RDM are shuffled
# with the same region ordering, the model is refitted, and the F statistic and
# coefficient estimates are recorded. The p-values are the proportion of
# permuted values that are >= the observed value (one-sided).
#
# culture_regress_standard_model : `summary(lm(...))` of the observed model.
# nperm : number of permutations (default 10000).
# seed  : RNG seed set before permuting (default 2); NULL leaves the RNG alone.
#
# Reads these objects from the calling session:
#   variables_names (predictor names), variables (list of predictor RDMs),
#   regions_order (its length is the number of regions), y_rdm_array (outcome
#   lower-triangle vector) and lowerTriangle().
#
# Prints the observed F and R-squared, the number of permutations and the
# F-statistic p-value. Returns a one-row data frame of per-predictor p-values
# with columns named by `variables_names`.
permutation_result = function(culture_regress_standard_model,
                              nperm = 10000, seed = 2){
    n_vars <- length(variables_names)
    # Row 1 of the coefficient table is the intercept; predictors follow.
    coef_rows <- 1 + seq_len(n_vars)

    # Observed statistics. Column 1 of the coefficient table is the estimate
    # (beta), not the t value.
    fstats <- culture_regress_standard_model$fstatistic[[1]]
    obs_beta <- unname(culture_regress_standard_model$coefficients[coef_rows, 1])
    r2stats <- culture_regress_standard_model$r.squared
    print(paste('fstats:',fstats))
    print(paste('r2stats:',r2stats))

    # Preallocated storage for the permutation distributions.
    perm_f <- numeric(nperm)
    perm_beta <- matrix(NA_real_, nperm, n_vars,
                        dimnames = list(NULL, variables_names))

    if (!is.null(seed)) {
        set.seed(seed)
    }
    for (i in seq_len(nperm)){
      # Same shuffled region order for rows and columns of every predictor.
      psel <- sample(length(regions_order))
      x <- do.call(cbind,
                   lapply(variables, function(rdm) lowerTriangle(rdm[psel,psel])))
      srfit <- summary(lm(scale(y_rdm_array)~scale(x)))
      perm_f[i] <- srfit$fstatistic[1]
      perm_beta[i, ] <- srfit$coefficients[coef_rows, 1]
    }
    print(as.integer(nperm))

    # Whole-model p-value from the F statistic.
    permf_pvals <- mean(perm_f >= fstats)
    print(paste('permf_pvals',permf_pvals))

    # Per-predictor p-values from the coefficient estimates.
    permt_pvals <- vapply(seq_len(n_vars),
                          function(j) mean(perm_beta[, j] >= obs_beta[j]),
                          numeric(1))
    permt_pvals <- as.data.frame(t(permt_pvals))
    colnames(permt_pvals) <- variables_names
    permt_pvals
}

***

In [10]:
# Predictor sets to test, keyed by a label. Each value holds the name(s) of the
# RDM object(s) entered together as predictors in one regression.
variables_names_list <- list(
    full_rdm       = 'full_rdm',

    # FAVEE: combined, the five dimensions jointly, then each one alone
    favee_rdm      = 'favee_rdm',
    favee_separate = c('formality_rdm','activeness_rdm','valence_rdm',
                       'exchange_rdm','equality_rdm'),
    formality_rdm  = 'formality_rdm',
    activeness_rdm = 'activeness_rdm',
    valence_rdm    = 'valence_rdm',
    exchange_rdm   = 'exchange_rdm',
    equality_rdm   = 'equality_rdm',

    # HPP: combined, the three categories jointly, then each one alone
    hpp_rdm        = 'hpp_rdm',
    hpp_separate   = c('hostile_rdm','private_rdm','public_rdm'),
    hostile_rdm    = 'hostile_rdm',
    private_rdm    = 'private_rdm',
    public_rdm     = 'public_rdm'
)

In [11]:
# Outcome vector: strictly-lower-triangle entries of the outcome RDM.
# permutation_result() reads `y_rdm_array` from the session.
y_rdm_array = lowerTriangle(y_rdm)

# Fit and permutation-test one standardized regression per predictor set.
for (variables_label in names(variables_names_list)){
    print(variables_label)

    # `variables_names` and `variables` are deliberately session-level:
    # permutation_result() looks both up by these exact names.
    variables_names = variables_names_list[[variables_label]]
    variables = x_variables(variables_names)

    # Design matrix: one column per predictor RDM (lower-triangle entries).
    x <- do.call(cbind, lapply(variables, lowerTriangle))
    colnames(x) = variables_names

    # Observed model on z-scored outcome and predictors.
    culture_regress_standard <- lm(scale(y_rdm_array)~scale(x))
    culture_regress_standard_model <- summary(culture_regress_standard)
    print(culture_regress_standard_model)

    p_value = permutation_result(culture_regress_standard_model)
    print(p_value)
    print('###############################################################')
}

[1] "full_rdm"

Call:
lm(formula = scale(y_rdm_array) ~ scale(x))

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8509 -0.7711 -0.1265  0.8984  1.9115 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.326e-16  7.408e-02   0.000        1    
scale(x)    4.074e-01  7.432e-02   5.482 1.73e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9163 on 151 degrees of freedom
Multiple R-squared:  0.166,	Adjusted R-squared:  0.1605 
F-statistic: 30.05 on 1 and 151 DF,  p-value: 1.727e-07

[1] "fstats: 30.0505949961017"
[1] "r2stats: 0.165978990550949"
[1] 10000
[1] "permf_pvals 0.0084"
  full_rdm
1   0.0084
[1] "###############################################################"
[1] "favee_rdm"

Call:
lm(formula = scale(y_rdm_array) ~ scale(x))

Residuals:
    Min      1Q  Median      3Q     Max 
-1.6846 -0.7918 -0.1307  0.9128  1.9302 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(I

# Plot results

<table><tr><td bgcolor="blue">Helper function</td></tr></table>

In [10]:
# Scatter plot (with a fitted linear trend) of the outcome RDM against one
# predictor RDM, using the z-scored lower-triangle entries of both.
#
# x_rdm            : predictor RDM; its strictly-lower-triangle entries are used.
# x_label, y_label : column names given to the predictor / outcome, which also
#                    become the default axis titles.
# x_var, y_var     : optional bare column names to map to x / y. When omitted
#                    they are derived from `x_label` / `y_label`, so passing
#                    custom labels alone is enough.
# my.color.point, my.color.smooth : colours of the points and the fitted line.
#
# Reads `y_rdm_array` from the session, prints the summary of the simple
# regression, sets the `repr.plot.*` options (not restored afterwards), and
# returns the ggplot object.
Plot_results = function(x_rdm,x_label='Valence',y_label='Marriage',
                        x_var = Valence, y_var = Marriage,
                        my.color.point='#FFC000',my.color.smooth="#9966FF"){

    # Resolve the aesthetics before anything else touches the arguments.
    # If the caller did not pass x_var / y_var, fall back to the label so the
    # mapping always points at a column that exists in `df`.
    x_var <- if (missing(x_var)) as.name(x_label) else enquo(x_var)
    y_var <- if (missing(y_var)) as.name(y_label) else enquo(y_var)

    x_rdm_array = lowerTriangle(x_rdm)
    # Organize data: column 1 is the outcome, column 2 the predictor.
    df = data.frame(cbind(scale(y_rdm_array),scale(x_rdm_array)))
    colnames(df) = c(y_label,x_label)
    print(summary(lm(scale(df[1])~scale(df[2]))))

    # Plot results
    # NOTE(review): presumably the Jupyter (repr) figure size; it stays set
    # after the call because the plot is rendered once the function returns.
    options(repr.plot.width = 12,repr.plot.height = 8)
    ggplot(data=df, aes(x=!!x_var,y=!!y_var))+
    geom_point(alpha = 0.6,size=12,colour=my.color.point,stroke=3)+
    geom_smooth(method = 'lm', size=4,formula = y ~ x,color = my.color.smooth) +

    theme_classic() +
    theme(
        axis.text=element_text(size=35,color="black"),
        axis.title=element_text(size=40,color="black"),
        axis.line.x=element_line(linetype=1,color="black",size=2),
        axis.line.y=element_line(linetype=1,color="black",size=2),
        legend.position = 'none')
}