In [2]:
library(openxlsx)
library(tidyverse)
library(reshape)
library(mgsub) # mapping

library(ggplot2)
library(ggpubr)
library(RColorBrewer)
library(corrplot)
library(corrr)

In [3]:
# Regions included in the analysis. Entries are grouped by the language noted
# next to them; the order of this vector is the order in which the loops below
# iterate over regions and label the columns of the combined RDM matrix.
regions <- c(
    "USA", "UK", "Australia", "South Africa",  # English
    "Germany",                                 # German
    "Japan",                                   # Japanese
    "Israel",                                  # Hebrew
    "CHN", "HK(region)",                       # Chinese
    "France",                                  # French
    "Spain", "Mexico", "Chile",                # Spanish
    "Portugal", "Brazil",                      # Portuguese
    "Russia",                                  # Russian
    "Egypt", "Qatar",                          # Arabic
    "India"                                    # (no language label given)
)

# Full-feature model

In [6]:
# Build the full-feature model RDMs for every `num` in 5..158.
#
# For each `num`:
#   1. read every region's CSV from output_data/full_feature/<num>/
#      (first CSV column is used as row names),
#   2. reduce each region's table to the vector of pairwise Euclidean
#      distances between its rows,
#   3. bind those vectors column-wise, one column per entry of `regions`,
#   4. compute the Spearman correlation between columns and turn it into a
#      dissimilarity matrix (1 - correlation),
#   5. write the stacked distance vectors and the dissimilarity matrix to
#      output_data/regression/models_rdm/full_feature/<num>/.
#
# Relies on the file-level `regions` vector for region selection and order.
for (num in seq(5, 158)) {
    out_dir <- paste0('output_data/regression/models_rdm/full_feature/', num)
    # recursive = TRUE creates missing parent folders; showWarnings = FALSE
    # keeps re-runs quiet when the folder already exists.
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    file_path <- paste0('output_data/full_feature/', num, '/')
    raw_data_list <- list()
    for (csv_file in list.files(file_path, pattern = '\\.csv$')) {
        raw_df <- read.csv(paste0(file_path, csv_file), row.names = 1)

        # Strip only the literal trailing ".csv"; the previous
        # strsplit(file, '.csv') treated "." as "any character".
        region <- sub('\\.csv$', '', csv_file) %>%
            str_replace('HK', 'HK(region)') %>%
            str_replace('South_africa', 'South Africa')
        raw_data_list[[region]] <- raw_df
    }

    # Fail early with a readable message instead of passing NULL to dist().
    missing_regions <- setdiff(regions, names(raw_data_list))
    if (length(missing_regions) > 0) {
        stop('Missing full-feature data for num = ', num, ': ',
             paste(missing_regions, collapse = ', '), call. = FALSE)
    }

    # c() drops the "dist" attributes, leaving a plain numeric vector.
    rdm_list <- lapply(regions, function(region) {
        c(dist(raw_data_list[[region]], method = 'euclidean'))
    })
    names(rdm_list) <- regions

    # cbind() would silently recycle shorter vectors; refuse misaligned RDMs.
    if (length(unique(lengths(rdm_list))) != 1) {
        stop('Regions have RDM vectors of different lengths for num = ', num,
             call. = FALSE)
    }

    # combine all regions' results (one column per region, in `regions` order)
    combine_rdm <- do.call(cbind, rdm_list)
    colnames(combine_rdm) <- regions

    # calculate correlation between regions
    combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

    # The next three values are not used inside this loop; they are kept
    # because code outside this view may still read them after the loop.
    regions_label <- colnames(combine_rdm.cor)
    regions_selected <- rownames(combine_rdm.cor)
    raw33d_dissim_dist <- 1 - combine_rdm.cor
    model_rdm <- raw33d_dissim_dist[lower.tri(raw33d_dissim_dist, diag = FALSE)]

    write.csv(combine_rdm,
              file = paste0(out_dir, '/', 'raw33d_dissim_array.csv'))
    write.csv(raw33d_dissim_dist,
              file = paste0(out_dir, '/', 'raw33d_dissim_dist.csv'))
}

# Dimensional

In [7]:
# Build the dimensional model RDMs for every `num` in 5..158.
#
# For each `num`:
#   1. read every region's CSV from
#      output_data/dimensional_model/relationship_var_score/<num>/
#      (first CSV column is used as row names),
#   2. reduce each region's table to the vector of pairwise Euclidean
#      distances between its rows,
#   3. bind those vectors column-wise, one column per entry of `regions`,
#   4. compute the Spearman correlation between columns and turn it into a
#      dissimilarity matrix (1 - correlation),
#   5. write both matrices to output_data/regression/models_rdm/dimensional/<num>/.
#
# Relies on the file-level `regions` vector for region selection and order.
for (num in seq(5, 158)) {
    out_dir <- paste0('output_data/regression/models_rdm/dimensional/', num)
    # recursive = TRUE creates missing parent folders; showWarnings = FALSE
    # keeps re-runs quiet when the folder already exists.
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    file_path <- paste0('output_data/dimensional_model/relationship_var_score/',
                        num, '/')
    relationship_score_list <- list()
    for (csv_file in list.files(file_path, pattern = '\\.csv$')) {
        favee_df <- read.csv(paste0(file_path, csv_file), row.names = 1)

        # Strip only the literal trailing ".csv"; the previous
        # strsplit(file, '.csv') treated "." as "any character".
        region <- sub('\\.csv$', '', csv_file) %>%
            str_replace('HK', 'HK(region)') %>%
            str_replace('South_africa', 'South Africa')
        relationship_score_list[[region]] <- favee_df
    }

    # Fail early with a readable message instead of passing NULL to dist().
    missing_regions <- setdiff(regions, names(relationship_score_list))
    if (length(missing_regions) > 0) {
        stop('Missing dimensional scores for num = ', num, ': ',
             paste(missing_regions, collapse = ', '), call. = FALSE)
    }

    # c() drops the "dist" attributes, leaving a plain numeric vector.
    rdm_list <- lapply(regions, function(region) {
        c(dist(relationship_score_list[[region]], method = 'euclidean'))
    })
    names(rdm_list) <- regions

    # cbind() would silently recycle shorter vectors; refuse misaligned RDMs.
    if (length(unique(lengths(rdm_list))) != 1) {
        stop('Regions have RDM vectors of different lengths for num = ', num,
             call. = FALSE)
    }

    # combine all regions' results (one column per region, in `regions` order)
    combine_rdm <- do.call(cbind, rdm_list)
    colnames(combine_rdm) <- regions

    # calculate correlation between regions
    combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

    # transform correlation between regions into rdm for regression
    # (`regions_selected` and `model_rdm` are not used inside this loop; they
    # are kept because code outside this view may still read them.)
    regions_selected <- rownames(combine_rdm.cor)
    favee_dissim_dist <- 1 - combine_rdm.cor
    model_rdm <- favee_dissim_dist[lower.tri(favee_dissim_dist, diag = FALSE)]

    # NOTE: despite its name, favee_cor_array.csv receives the stacked
    # per-region distance vectors (`combine_rdm`), not the correlation matrix.
    # The file name is left unchanged so existing readers keep working.
    write.csv(combine_rdm,
              file = paste0(out_dir, '/', 'favee_cor_array.csv'))
    write.csv(favee_dissim_dist,
              file = paste0(out_dir, '/', 'favee_dissim_dist.csv'))
}

# Categorical (classification)

In [8]:
# Build the categorical model RDMs for every `num` in 5..158.
#
# For each `num`:
#   1. read every region's CSV from output_data/categorical_rdm/<num>/
#      (first CSV column is used as row names) and keep the values below the
#      diagonal as that region's RDM vector,
#   2. bind those vectors column-wise, one column per region file found,
#   3. compute the Spearman correlation between columns and turn it into a
#      dissimilarity matrix (1 - correlation),
#   4. write both matrices to output_data/regression/models_rdm/categorical/<num>/.
#
# Region names and column order come from the directory listing, not from the
# file-level `regions` vector.
# NOTE(review): confirm that downstream code aligns these outputs with the
# other models by region name rather than by column position.
for (num in seq(5, 158)) {
    out_dir <- paste0('output_data/regression/models_rdm/categorical/', num)
    # recursive = TRUE creates missing parent folders; showWarnings = FALSE
    # keeps re-runs quiet when the folder already exists.
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    file_path <- paste0('output_data/categorical_rdm/', num, '/')

    rdm_list <- list()
    for (csv_file in list.files(file_path, pattern = '\\.csv$')) {
        hpp_rdm_df <- read.csv(paste0(file_path, csv_file), row.names = 1)
        hpp_rdm_array <- hpp_rdm_df[lower.tri(hpp_rdm_df, diag = FALSE)]

        # Strip only the literal trailing "_raw_3k.csv"; the previous
        # strsplit() pattern treated "." as "any character".
        region <- sub('_raw_3k\\.csv$', '', csv_file) %>%
            str_replace('HK', 'HK(region)') %>%
            str_replace('South_africa', 'South Africa')
        rdm_list[[region]] <- hpp_rdm_array
    }

    # Correlating regions needs at least two of them.
    if (length(rdm_list) < 2) {
        stop('Need at least two region files in ', file_path,
             ', found ', length(rdm_list), call. = FALSE)
    }
    # cbind() would silently recycle shorter vectors; refuse misaligned RDMs.
    if (length(unique(lengths(rdm_list))) != 1) {
        stop('Regions have RDM vectors of different lengths for num = ', num,
             call. = FALSE)
    }

    # combine all regions' results
    # A local name is used here on purpose: assigning to `regions` would
    # overwrite the file-level vector that the other model loops depend on.
    region_names <- names(rdm_list)
    combine_rdm <- do.call(cbind, rdm_list)
    colnames(combine_rdm) <- region_names

    # calculate correlation between regions
    combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

    # transform correlation between regions into rdm for regression
    # (`regions_selected` and `model_rdm` are not used inside this loop; they
    # are kept because code outside this view may still read them.)
    regions_selected <- rownames(combine_rdm.cor)
    hpp_dissim_dist <- 1 - combine_rdm.cor
    model_rdm <- hpp_dissim_dist[lower.tri(hpp_dissim_dist, diag = FALSE)]

    # NOTE: despite its name, hpp_cor_array.csv receives the stacked
    # per-region RDM vectors (`combine_rdm`), not the correlation matrix.
    # The file name is left unchanged so existing readers keep working.
    write.csv(combine_rdm,
              file = paste0(out_dir, '/', 'hpp_cor_array.csv'))
    write.csv(hpp_dissim_dist,
              file = paste0(out_dir, '/', 'hpp_dissim_dist.csv'))
}