<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Full-feature-model" data-toc-modified-id="Full-feature-model-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Full-feature model</a></span></li><li><span><a href="#Dimensional" data-toc-modified-id="Dimensional-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dimensional</a></span><ul class="toc-item"><li><span><a href="#FAVEE" data-toc-modified-id="FAVEE-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>FAVEE</a></span></li><li><span><a href="#Formality" data-toc-modified-id="Formality-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Formality</a></span></li><li><span><a href="#Activeness" data-toc-modified-id="Activeness-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Activeness</a></span></li><li><span><a href="#Valence" data-toc-modified-id="Valence-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Valence</a></span></li><li><span><a href="#Exchange" data-toc-modified-id="Exchange-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Exchange</a></span></li><li><span><a href="#Equality" data-toc-modified-id="Equality-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>Equality</a></span></li></ul></li><li><span><a href="#Categorical(classification)" data-toc-modified-id="Categorical(classification)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Categorical(classification)</a></span><ul class="toc-item"><li><span><a href="#HPP" data-toc-modified-id="HPP-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>HPP</a></span></li><li><span><a href="#Hostile" data-toc-modified-id="Hostile-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Hostile</a></span></li><li><span><a href="#Public" data-toc-modified-id="Public-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Public</a></span></li><li><span><a href="#Private" data-toc-modified-id="Private-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Private</a></span></li></ul></li></ul></div>

In [2]:
options (warn = -1)
library(openxlsx)
library(tidyverse)
library(reshape)
library(mgsub) # mapping

library(ggplot2)
library(ggpubr)
library(RColorBrewer)
library(corrplot)
library(corrr)

In [3]:
# Regions included in the analysis, grouped by the language labels used in
# the original list. The order matters: `regions` is later used to name and
# order the columns of the region-by-region matrices.
regions_by_language <- list(
    English    = c('USA', 'UK', 'Australia', 'South Africa'),
    German     = 'Germany',
    Japanese   = 'Japan',
    Hebrew     = 'Israel',
    Chinese    = c('CHN', 'HK(region)'),
    French     = 'France',
    Spanish    = c('Spain', 'Mexico', 'Chile'),
    Portuguese = c('Portugal', 'Brazil'),
    Russian    = 'Russia',
    Arabic     = c('Egypt', 'Qatar'),
    # India carried no language label in the original list
    India      = 'India')

# Flatten to a plain, unnamed character vector in the order given above.
regions <- unlist(regions_by_language, use.names = FALSE)

# Full-feature model

In [4]:
# Full-feature model: build one distance vector per region from its
# `<region>_dim_rel_scaled.csv` table, correlate the regions on those vectors
# (Spearman) and store 1 - r as the region-by-region dissimilarity.
file_path <- '../DataClean_FAVEE_HPP/output_data/cleaning_results/'

# Every entry except the ReadMe is treated as a region folder.
# setdiff() is used instead of `x[-which(x == 'ReadMe.txt')]`: when the ReadMe
# is absent, which() returns integer(0) and `x[-integer(0)]` is EMPTY, so no
# region would be loaded at all.
region_dirs <- setdiff(dir(file_path), 'ReadMe.txt')

raw_data_list <- list()
for (region in region_dirs){
    # <file_path><region>/<region>_dim_rel_scaled.csv
    file <- paste0(file_path, region, '/', region, '_dim_rel_scaled.csv')
    raw_df <- read.csv(file, row.names = 1)

    # translate folder names into the labels used in `regions`
    region <- gsub('HK', 'HK(region)', region)
    region <- gsub('South_africa', 'South Africa', region)
    raw_data_list[[region]] <- raw_df
}

# Fail early, with a readable message, if a requested region was not loaded.
missing_regions <- setdiff(regions, names(raw_data_list))
if (length(missing_regions) > 0){
    stop('No cleaned data found for: ', paste(missing_regions, collapse = ', '))
}

# Per region: pairwise Euclidean distances between the rows of its table,
# flattened to a plain numeric vector (c() drops the 'dist' attributes).
rdm_list <- list()
for (region in regions){
    rdm_list[[region]] <- c(dist(raw_data_list[[region]], method = 'euclidean'))
}

# One column per region, in the order of `regions`. Binding the whole list at
# once avoids growing the matrix in a loop and the `3:length(regions)` index,
# which counts backwards when fewer than three regions are present.
combine_rdm <- do.call(cbind, unname(rdm_list[regions]))
colnames(combine_rdm) <- regions

# calculate correlation between regions
combine_rdm.cor <- cor(combine_rdm, method = 'spearman')
regions_label <- colnames(combine_rdm.cor)

regions_selected <- rownames(combine_rdm.cor)
raw33d_dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity matrix as a vector
model_rdm <- raw33d_dissim_dist[lower.tri(raw33d_dissim_dist, diag = FALSE)]

write.csv(combine_rdm.cor,
          file = 'output_data/models_rdm/full_feature/raw33d_cor.csv')
write.csv(combine_rdm,
          file = 'output_data/models_rdm/full_feature/raw33d_dissim_array.csv')
write.csv(raw33d_dissim_dist,
          file = 'output_data/models_rdm/full_feature/raw33d_dissim_dist.csv')

# Dimensional

## FAVEE

In [5]:
# FAVEE model: build one distance vector per region from its
# `<region>_scores_var_5c_33d.csv` score table, correlate the regions on those
# vectors (Spearman) and store 1 - r as the region-by-region dissimilarity.
file_path <- '../DataClean_FAVEE_HPP/output_data/pca_results/'

# `relationship_score_list` is kept for the later cells that label and
# sign-flip the individual dimensions.
relationship_score_list <- list()
for (region in dir(file_path)){
    # <file_path><region>/<region>_scores_var_5c_33d.csv
    file <- paste0(file_path, region, '/', region, '_scores_var_5c_33d.csv')
    favee_df <- read.csv(file, row.names = 1)

    # translate folder names into the labels used in `regions`
    region <- gsub('HK', 'HK(region)', region)
    region <- gsub('South_africa', 'South Africa', region)
    relationship_score_list[[region]] <- favee_df
}

# Fail early, with a readable message, if a requested region was not loaded.
missing_regions <- setdiff(regions, names(relationship_score_list))
if (length(missing_regions) > 0){
    stop('No score table found for: ', paste(missing_regions, collapse = ', '))
}

# Per region: pairwise Euclidean distances between the rows of its score
# table, flattened to a plain numeric vector.
rdm_list <- list()
for (region in regions){
    rdm_list[[region]] <- c(dist(relationship_score_list[[region]],
                                 method = 'euclidean'))
}

# One column per region, in the order of `regions`. Binding the whole list at
# once avoids growing the matrix in a loop and the `3:length(regions)` index,
# which counts backwards when fewer than three regions are present.
combine_rdm <- do.call(cbind, unname(rdm_list[regions]))
colnames(combine_rdm) <- regions

# calculate correlation between regions
combine_rdm.cor <- cor(combine_rdm, method = 'spearman')
regions_label <- colnames(combine_rdm.cor)

regions_selected <- rownames(combine_rdm.cor)
favee_dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity matrix as a vector
model_rdm <- favee_dissim_dist[lower.tri(favee_dissim_dist, diag = FALSE)]

write.csv(combine_rdm.cor,
          file = 'output_data/models_rdm/dimensional/favee_cor.csv')
write.csv(combine_rdm,
          file = 'output_data/models_rdm/dimensional/favee_dissim_array.csv')
write.csv(favee_dissim_dist,
          file = 'output_data/models_rdm/dimensional/favee_dissim_dist.csv')

Assign labels to the dimensions of each region's model.

In [6]:
# Read the per-region dimension labels (sheet 'Model_33d_5c').
label_sheet <- read.xlsx('../DataClean_FAVEE_HPP/input_data/dimensions_check.xlsx',
                         sheet = 'Model_33d_5c')
# `rename` here takes an old = new mapping; with reshape attached after
# tidyverse this resolves to reshape::rename.
label_sheet <- rename(label_sheet, c('South.Africa' = 'South Africa'))
# keep the first five rows and one column per region, in `regions` order
model_label_df <- label_sheet[1:5, regions]
# one character vector of labels per region
model_label_list <- as.list(model_label_df)

# Attach the labels to each region's score table and flip the sign of every
# dimension whose label carries a '(...)' suffix.
relationship_score_name_list <- list()
for (region in names(model_label_list)){
    # split every label at the first '(' : piece 1 is the dimension name,
    # a second piece only exists when the label had a '(' suffix
    label_parts <- strsplit(model_label_list[[region]], '(', fixed = TRUE)
    model_label <- vapply(label_parts, function(p) p[1], character(1),
                          USE.NAMES = FALSE)
    has_suffix <- vapply(label_parts, function(p) !is.na(p[2]), logical(1),
                         USE.NAMES = FALSE)
    # -1 for dimensions to be reversed, 1 otherwise
    reverse <- ifelse(has_suffix, -1, 1)

    region_df <- relationship_score_list[[region]]
    colnames(region_df) <- model_label
    # apply the sign to each column in turn
    for (col in seq_along(reverse)){
        region_df[col] <- region_df[col] * reverse[col]
    }
    relationship_score_name_list[[region]] <- region_df

    print(region)
    print(reverse)
}

[1] "USA"
[1] -1  1 -1  1  1
[1] "UK"
[1]  1 -1 -1 -1  1
[1] "Australia"
[1]  1  1 -1  1  1
[1] "South Africa"
[1] -1  1  1 -1 -1
[1] "Germany"
[1]  1  1 -1 -1  1
[1] "Japan"
[1] -1  1  1 -1  1
[1] "Israel"
[1] -1  1 -1 -1  1
[1] "CHN"
[1] -1  1  1  1 -1
[1] "HK(region)"
[1] -1  1  1  1  1
[1] "France"
[1]  1 -1 -1  1  1
[1] "Spain"
[1] -1  1 -1  1  1
[1] "Mexico"
[1] -1  1  1  1 -1
[1] "Chile"
[1] 1 1 1 1 1
[1] "Portugal"
[1]  1  1  1 -1 -1
[1] "Brazil"
[1] -1  1  1 -1 -1
[1] "Russia"
[1] -1 -1 -1 -1 -1
[1] "Egypt"
[1] -1  1 -1 -1 -1
[1] "Qatar"
[1] -1  1  1  1 -1
[1] "India"
[1]  1  1 -1 -1  1


## Formality

In [7]:
# Formality: for every region whose model has a 'Formality' column, compute the
# pairwise distances between rows on that single dimension and keep them as a
# plain vector (c() drops the 'dist' attributes).
rdm_list <- list()
region_count <- 0
for (region in regions){
    # extract specific dimension
    region_df <- relationship_score_name_list[[region]]
    if ('Formality' %in% colnames(region_df)){
        region_model <- dist(region_df[['Formality']], method = 'euclidean')
        region_count <- region_count + 1
        rdm_list[[region]] <- c(region_model)
    }
}
print('The number of regions than contain specific dimension:')
print(region_count)

# A cross-region correlation needs at least two regions.
if (region_count < 2){
    stop('Fewer than two regions contain the dimension Formality')
}

# combine all regions' results: one column per region that has the dimension.
# Binding the whole list at once replaces the `3:region_count` loop, which
# counts backwards (3, 2) when exactly two regions qualify and then appends a
# duplicate column.
combine_rdm <- do.call(cbind, unname(rdm_list))
colnames(combine_rdm) <- names(rdm_list)

# calculate correlation between regions
combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

# transform the correlation between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity matrix as a vector
model_rdm <- dissim_dist[lower.tri(dissim_dist, diag = FALSE)]

# NOTE: despite its name, the '*_cor_array.csv' file receives the per-region
# distance vectors (combine_rdm), not the correlation matrix.
write.csv(combine_rdm,
          file = 'output_data/models_rdm/dimensional/formality_cor_array.csv')
write.csv(dissim_dist,
          file = 'output_data/models_rdm/dimensional/formality_dissim_dist.csv')

[1] "The number of regions than contain specific dimension:"
[1] 19


## Activeness

In [8]:
# Activeness: for every region whose model has an 'Activeness' column, compute
# the pairwise distances between rows on that single dimension and keep them as
# a plain vector (c() drops the 'dist' attributes).
rdm_list <- list()
region_count <- 0
for (region in regions){
    # extract specific dimension
    region_df <- relationship_score_name_list[[region]]
    if ('Activeness' %in% colnames(region_df)){
        region_model <- dist(region_df[['Activeness']], method = 'euclidean')
        region_count <- region_count + 1
        rdm_list[[region]] <- c(region_model)
    }
}
print('The number of regions than contain specific dimension:')
print(region_count)

# A cross-region correlation needs at least two regions.
if (region_count < 2){
    stop('Fewer than two regions contain the dimension Activeness')
}

# combine all regions' results: one column per region that has the dimension.
# Binding the whole list at once replaces the `3:region_count` loop, which
# counts backwards (3, 2) when exactly two regions qualify and then appends a
# duplicate column.
combine_rdm <- do.call(cbind, unname(rdm_list))
colnames(combine_rdm) <- names(rdm_list)

# calculate correlation between regions
combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

# transform the correlation between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity matrix as a vector
model_rdm <- dissim_dist[lower.tri(dissim_dist, diag = FALSE)]

# NOTE: despite its name, the '*_cor_array.csv' file receives the per-region
# distance vectors (combine_rdm), not the correlation matrix.
write.csv(combine_rdm,
          file = 'output_data/models_rdm/dimensional/activeness_cor_array.csv')
write.csv(dissim_dist,
          file = 'output_data/models_rdm/dimensional/activeness_dissim_dist.csv')

[1] "The number of regions than contain specific dimension:"
[1] 19


## Valence

In [9]:
# Valence: for every region whose model has a 'Valence' column, compute the
# pairwise distances between rows on that single dimension and keep them as a
# plain vector (c() drops the 'dist' attributes).
rdm_list <- list()
region_count <- 0
for (region in regions){
    # extract specific dimension
    region_df <- relationship_score_name_list[[region]]
    if ('Valence' %in% colnames(region_df)){
        region_model <- dist(region_df[['Valence']], method = 'euclidean')
        region_count <- region_count + 1
        rdm_list[[region]] <- c(region_model)
    }
}
print('The number of regions than contain specific dimension:')
print(region_count)

# A cross-region correlation needs at least two regions.
if (region_count < 2){
    stop('Fewer than two regions contain the dimension Valence')
}

# combine all regions' results: one column per region that has the dimension.
# Binding the whole list at once replaces the `3:region_count` loop, which
# counts backwards (3, 2) when exactly two regions qualify and then appends a
# duplicate column.
combine_rdm <- do.call(cbind, unname(rdm_list))
colnames(combine_rdm) <- names(rdm_list)

# calculate correlation between regions
combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

# transform the correlation between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity matrix as a vector
model_rdm <- dissim_dist[lower.tri(dissim_dist, diag = FALSE)]

# NOTE: despite its name, the '*_cor_array.csv' file receives the per-region
# distance vectors (combine_rdm), not the correlation matrix.
write.csv(combine_rdm,
          file = 'output_data/models_rdm/dimensional/valence_cor_array.csv')
write.csv(dissim_dist,
          file = 'output_data/models_rdm/dimensional/valence_dissim_dist.csv')

[1] "The number of regions than contain specific dimension:"


[1] 19


## Exchange

In [10]:
# Exchange: for every region whose model has an 'Exchange' column, compute the
# pairwise distances between rows on that single dimension and keep them as a
# plain vector (c() drops the 'dist' attributes).
rdm_list <- list()
region_count <- 0
for (region in regions){
    # extract specific dimension
    region_df <- relationship_score_name_list[[region]]
    if ('Exchange' %in% colnames(region_df)){
        region_model <- dist(region_df[['Exchange']], method = 'euclidean')
        region_count <- region_count + 1
        rdm_list[[region]] <- c(region_model)
    }
}
print('The number of regions than contain specific dimension:')
print(region_count)

# A cross-region correlation needs at least two regions.
if (region_count < 2){
    stop('Fewer than two regions contain the dimension Exchange')
}

# combine all regions' results: one column per region that has the dimension.
# Binding the whole list at once replaces the `3:region_count` loop, which
# counts backwards (3, 2) when exactly two regions qualify and then appends a
# duplicate column.
combine_rdm <- do.call(cbind, unname(rdm_list))
colnames(combine_rdm) <- names(rdm_list)

# calculate correlation between regions
combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

# transform the correlation between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity matrix as a vector
model_rdm <- dissim_dist[lower.tri(dissim_dist, diag = FALSE)]

# NOTE: despite its name, the '*_cor_array.csv' file receives the per-region
# distance vectors (combine_rdm), not the correlation matrix.
write.csv(combine_rdm,
          file = 'output_data/models_rdm/dimensional/exchange_cor_array.csv')
write.csv(dissim_dist,
          file = 'output_data/models_rdm/dimensional/exchange_dissim_dist.csv')

[1] "The number of regions than contain specific dimension:"
[1] 19


## Equality

In [11]:
# Equality: for every region whose model has an 'Equality' column (a column
# labelled 'Socioemotional' is treated as 'Equality'), compute the pairwise
# distances between rows on that single dimension and keep them as a plain
# vector (c() drops the 'dist' attributes).
rdm_list <- list()
region_count <- 0
for (region in regions){
    # extract specific dimension
    region_df <- relationship_score_name_list[[region]]
    # identify 'Socioemotional' with 'Equality' (on a local copy of the table;
    # relationship_score_name_list itself is not modified)
    model_label <- colnames(region_df)
    model_label <- gsub('Socioemotional', 'Equality', model_label)
    colnames(region_df) <- model_label
    if ('Equality' %in% colnames(region_df)){
        region_model <- dist(region_df[['Equality']], method = 'euclidean')
        region_count <- region_count + 1
        rdm_list[[region]] <- c(region_model)
    }
}
print('The number of regions than contain specific dimension:')
print(region_count)

# A cross-region correlation needs at least two regions.
if (region_count < 2){
    stop('Fewer than two regions contain the dimension Equality')
}

# combine all regions' results: one column per region that has the dimension.
# Binding the whole list at once replaces the `3:region_count` loop, which
# counts backwards (3, 2) when exactly two regions qualify and then appends a
# duplicate column.
combine_rdm <- do.call(cbind, unname(rdm_list))
colnames(combine_rdm) <- names(rdm_list)

# calculate correlation between regions
combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

# transform the correlation between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity matrix as a vector
model_rdm <- dissim_dist[lower.tri(dissim_dist, diag = FALSE)]

# NOTE: despite its name, the '*_cor_array.csv' file receives the per-region
# distance vectors (combine_rdm), not the correlation matrix.
write.csv(combine_rdm,
          file = 'output_data/models_rdm/dimensional/equality_cor_array.csv')
write.csv(dissim_dist,
          file = 'output_data/models_rdm/dimensional/equality_dissim_dist.csv')

[1] "The number of regions than contain specific dimension:"


[1] 19


# Categorical(classification)

## HPP

In [13]:
# HPP model: read one precomputed RDM per region, flatten each to its
# strictly-lower triangle, correlate the regions on those vectors (Spearman)
# and store 1 - r as the region-by-region dissimilarity.
file_path <- '../DataClean_FAVEE_HPP/output_data/categorical_model/rdm_results/'

rdm_list_temp <- list()
for (file in dir(file_path)){
    hpp_rdm_df <- read.csv(paste0(file_path, file), row.names = 1)
    hpp_rdm_array <- hpp_rdm_df[lower.tri(hpp_rdm_df, diag = FALSE)]

    # '<region>_raw_3k.csv' -> region label used in `regions`.
    # The suffix is removed as a literal string; as a regular expression its
    # '.' would match any character.
    region <- sub('_raw_3k.csv', '', file, fixed = TRUE) %>%
        str_replace('HK', 'HK(region)') %>%
        str_replace('South_africa', 'South Africa')
    rdm_list_temp[[region]] <- hpp_rdm_array
}

# Re-order by `regions`; regions without a file are silently dropped because
# assigning NULL to a list element adds nothing.
rdm_list <- list()
for (region in regions){
    rdm_list[[region]] <- rdm_list_temp[[region]]
}

# A cross-region correlation needs at least two regions.
if (length(rdm_list) < 2){
    stop('Fewer than two regions have an HPP rdm file in ', file_path)
}

# combine all regions' results.
# NOTE: this overwrites the global `regions` with the regions actually found,
# so every later cell iterates over that (possibly shorter) set.
regions <- names(rdm_list)
# Binding the whole list at once replaces the `3:length(regions)` loop, which
# counts backwards when fewer than three regions are present.
combine_rdm <- do.call(cbind, unname(rdm_list))
colnames(combine_rdm) <- regions

# calculate correlation between regions
combine_rdm.cor <- cor(combine_rdm, method = 'spearman')

# transform the correlation between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
hpp_dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity matrix as a vector
model_rdm <- hpp_dissim_dist[lower.tri(hpp_dissim_dist, diag = FALSE)]

# NOTE: despite its name, 'hpp_cor_array.csv' receives the per-region
# flattened RDM vectors (combine_rdm), not the correlation matrix.
write.csv(combine_rdm.cor,
          file = 'output_data/models_rdm/categorical/hpp_cor.csv')
write.csv(combine_rdm,
          file = 'output_data/models_rdm/categorical/hpp_cor_array.csv')
write.csv(hpp_dissim_dist,
          file = 'output_data/models_rdm/categorical/hpp_dissim_dist.csv')

## Hostile

In [15]:
# Lookup sheet used to translate cluster ids into cluster labels: it is read
# through its 'cluster_id' column and one column per region (file-style name).
mapping <- read.xlsx(
    '../DataClean_FAVEE_HPP/output_data/categorical_model/categorical_results.xlsx',
    sheet = 'raw33d_dissim')

# For each region, collect the row names assigned to the 'Hostile' cluster.
file_path <- '../DataClean_FAVEE_HPP/output_data/categorical_model/raw_dissim/'
cluster_results <- list()
for (region in regions){
    # files and mapping columns use the short names 'HK' / 'South_africa'
    region <- gsub('South Africa', 'South_africa',
                   gsub('HK(region)', 'HK', region, fixed = TRUE))
    dissim_cluster_file <- paste0(region, '_raw_3k.csv')
    dissim_cluster_path <- paste0(file_path, dissim_cluster_file)

    # keep only the 'cl_k3' column; row names come from the first CSV column
    region_model <- read.csv(dissim_cluster_path, row.names = 1)['cl_k3']
    # replace each cluster id by this region's label for it
    region_model$cluster <- mgsub(region_model$cl_k3,
                                  mapping$cluster_id, mapping[[region]])

    # relationships attributed to the specific cluster in this region
    in_cluster <- region_model$cluster == 'Hostile'
    region_cluster <- rownames(region_model[in_cluster, ])

    # back to the label used in `regions`
    region <- gsub('South_africa', 'South Africa',
                   gsub('HK', 'HK(region)', region, fixed = TRUE))
    cluster_results[[region]] <- region_cluster
}

In [16]:
# Similarity between regions as the Jaccard index of their 'Hostile' cluster
# members (cluster_results), then 1 - similarity as the dissimilarity.
# The table is sized from `regions` instead of a hard-coded 19 x 19, so it
# still works when `regions` has been shortened by an earlier cell.
n_regions <- length(regions)
combine_rdm.cor <- data.frame(matrix(NA, n_regions, n_regions))
colnames(combine_rdm.cor) <- regions
rownames(combine_rdm.cor) <- regions

for (region_x in regions){
    region_x_cluster <- cluster_results[[region_x]]
    for (region_y in regions){
        region_y_cluster <- cluster_results[[region_y]]
        # Jaccard index = |intersection| / |union|.
        # If both clusters are empty the union is 0 and the result is NaN.
        union_num <- length(union(region_x_cluster, region_y_cluster))
        intersect_num <- length(intersect(region_x_cluster, region_y_cluster))
        combine_rdm.cor[region_x, region_y] <- intersect_num / union_num
    }
}

# transform the similarity between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity table as a vector
model_rdm <- dissim_dist[lower.tri(dissim_dist, diag = FALSE)]

write.csv(dissim_dist,
          file = 'output_data/models_rdm/categorical/hostile_dissim_dist.csv')

## Public

In [17]:
# For each region, collect the row names assigned to the 'Public' cluster.
# `mapping` (cluster id -> label lookup) is expected to exist already.
file_path <- '../DataClean_FAVEE_HPP/output_data/categorical_model/raw_dissim/'
cluster_results <- list()
for (region in regions){
    # files and mapping columns use the short names 'HK' / 'South_africa'
    region <- gsub('South Africa', 'South_africa',
                   gsub('HK(region)', 'HK', region, fixed = TRUE))
    dissim_cluster_file <- paste0(region, '_raw_3k.csv')
    dissim_cluster_path <- paste0(file_path, dissim_cluster_file)

    # keep only the 'cl_k3' column; row names come from the first CSV column
    region_model <- read.csv(dissim_cluster_path, row.names = 1)['cl_k3']
    # replace each cluster id by this region's label for it
    region_model$cluster <- mgsub(region_model$cl_k3,
                                  mapping$cluster_id, mapping[[region]])

    # relationships attributed to the specific cluster in this region
    in_cluster <- region_model$cluster == 'Public'
    region_cluster <- rownames(region_model[in_cluster, ])

    # back to the label used in `regions`
    region <- gsub('South_africa', 'South Africa',
                   gsub('HK', 'HK(region)', region, fixed = TRUE))
    cluster_results[[region]] <- region_cluster
}

In [18]:
# Similarity between regions as the Jaccard index of their 'Public' cluster
# members (cluster_results), then 1 - similarity as the dissimilarity.
# The table is sized from `regions` instead of a hard-coded 19 x 19, so it
# still works when `regions` has been shortened by an earlier cell.
n_regions <- length(regions)
combine_rdm.cor <- data.frame(matrix(NA, n_regions, n_regions))
colnames(combine_rdm.cor) <- regions
rownames(combine_rdm.cor) <- regions

for (region_x in regions){
    region_x_cluster <- cluster_results[[region_x]]
    for (region_y in regions){
        region_y_cluster <- cluster_results[[region_y]]
        # Jaccard index = |intersection| / |union|.
        # If both clusters are empty the union is 0 and the result is NaN.
        union_num <- length(union(region_x_cluster, region_y_cluster))
        intersect_num <- length(intersect(region_x_cluster, region_y_cluster))
        combine_rdm.cor[region_x, region_y] <- intersect_num / union_num
    }
}

# transform the similarity between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity table as a vector
model_rdm <- dissim_dist[lower.tri(dissim_dist, diag = FALSE)]

write.csv(dissim_dist,
          file = 'output_data/models_rdm/categorical/public_dissim_dist.csv')

## Private

In [20]:
# For each region, collect the row names assigned to the 'Private' cluster.
# `mapping` (cluster id -> label lookup) is expected to exist already.
file_path <- '../DataClean_FAVEE_HPP/output_data/categorical_model/raw_dissim/'
cluster_results <- list()
for (region in regions){
    # files and mapping columns use the short names 'HK' / 'South_africa'
    region <- gsub('South Africa', 'South_africa',
                   gsub('HK(region)', 'HK', region, fixed = TRUE))
    dissim_cluster_file <- paste0(region, '_raw_3k.csv')
    dissim_cluster_path <- paste0(file_path, dissim_cluster_file)

    # keep only the 'cl_k3' column; row names come from the first CSV column
    region_model <- read.csv(dissim_cluster_path, row.names = 1)['cl_k3']
    # replace each cluster id by this region's label for it
    region_model$cluster <- mgsub(region_model$cl_k3,
                                  mapping$cluster_id, mapping[[region]])

    # relationships attributed to the specific cluster in this region
    in_cluster <- region_model$cluster == 'Private'
    region_cluster <- rownames(region_model[in_cluster, ])

    # back to the label used in `regions`
    region <- gsub('South_africa', 'South Africa',
                   gsub('HK', 'HK(region)', region, fixed = TRUE))
    cluster_results[[region]] <- region_cluster
}

In [21]:
# Similarity between regions as the Jaccard index of their 'Private' cluster
# members (cluster_results), then 1 - similarity as the dissimilarity.
# The table is sized from `regions` instead of a hard-coded 19 x 19, so it
# still works when `regions` has been shortened by an earlier cell.
n_regions <- length(regions)
combine_rdm.cor <- data.frame(matrix(NA, n_regions, n_regions))
colnames(combine_rdm.cor) <- regions
rownames(combine_rdm.cor) <- regions

for (region_x in regions){
    region_x_cluster <- cluster_results[[region_x]]
    for (region_y in regions){
        region_y_cluster <- cluster_results[[region_y]]
        # Jaccard index = |intersection| / |union|.
        # If both clusters are empty the union is 0 and the result is NaN.
        union_num <- length(union(region_x_cluster, region_y_cluster))
        intersect_num <- length(intersect(region_x_cluster, region_y_cluster))
        combine_rdm.cor[region_x, region_y] <- intersect_num / union_num
    }
}

# transform the similarity between regions into an rdm for regression
regions_selected <- rownames(combine_rdm.cor)
dissim_dist <- 1 - combine_rdm.cor
# strictly-lower triangle of the dissimilarity table as a vector
model_rdm <- dissim_dist[lower.tri(dissim_dist, diag = FALSE)]

write.csv(dissim_dist,
          file = 'output_data/models_rdm/categorical/private_dissim_dist.csv')