In [2]:
library(psych)
library(stringr)
library(corrplot)
library(RColorBrewer)
library(tidyr)
library(gghalves) 
library(ggplot2)
library(doParallel)
library(foreach)
library(dplyr)

In [3]:
# Function to sample half elements from each list
sample_half <- function(x) {
  #set.seed(Sys.time())
  sample(x, size = ceiling(length(x) / 2))
  #sample(x, size = 5)
}

scaleN <- function(data){
  data_scaled <- scale(data)*sqrt(nrow(data)/(nrow(data)-1))
  return(as.data.frame(data_scaled))
}

convert_data <- function(data){
  data_list <- lapply(data, function(col) {
    lapply(col, function(cell) {
      as.numeric(unlist(str_extract_all(cell, "\\d+")))
    })
  })
  return(data_list)
}

list2df <- function(data_list){
  data_half_df = as.data.frame(do.call(cbind, data_list))
  data_half_df_mean <- as.data.frame(lapply(data_half_df, function(cell) sapply(cell, mean)))
  data_half_df_mean_scale <- scaleN(data_half_df_mean)
  rownames(data_half_df_mean_scale) = rownames(data)
  return(data_half_df_mean_scale)
}

split_half <- function(data_list){
  
  # Split each inner list into two halves
  split_data_list <- lapply(data_list, function(outer_col) {
    lapply(outer_col, function(inner_col) {
      selected_indices <- sample_half(1:length(inner_col))
      list(
        first_half = inner_col[selected_indices],
        second_half = inner_col[-selected_indices]
      )
    })
  })

  # Convert each half to a dataframe and process it
  process_data <- function(data_list, method){
    combined_data <- lapply(data_list, function(outer_col) {
      sapply(outer_col, function(inner_list, method) {
        mean(inner_list[[method]])
      }, method = method)
    })
    
    data_df = as.data.frame(do.call(cbind, combined_data))
    data_df_mean_scale <- scaleN(data_df)
    rownames(data_df_mean_scale) = rownames(data_df)
    return(data_df_mean_scale)
  }
  
  return(list(
    first_half_df = process_data(split_data_list, method = "first_half"),
    second_half_df = process_data(split_data_list, method = "second_half")
  ))
}

FAVEE_scores <- function(data){
  data.pca = prcomp(data)
  ncomp = 5
  
  loadings_none = data.pca$rotation[,1:ncomp]%*% diag(data.pca$sdev,ncomp,ncomp)
  scores_none = data.pca$x[,1:ncomp] 
  
  loadings_var = varimax(loadings_none)$loadings
  scores_var = scale(scores_none) %*% varimax(loadings_none)$rotmat
  
  return(scores_var)
}

dist_lower_vector <- function(data){
  dist_matrix = as.matrix(dist(data))
  lower_vector = dist_matrix[lower.tri(dist_matrix)]
  return(lower_vector)
}

frobenius_norm_regions <- function(region1, region2){
  data_region1 <- read.csv(paste0('dataframe_results/', region1, '.csv'), row.names = 1)
  data_region1_list <- convert_data(data_region1)
  data_region1_df <- list2df(data_region1_list)
  data_region1_FAVEE_scores <- FAVEE_scores(data_region1_df)
  data_region1_rdm_lower <- dist_lower_vector(data_region1_FAVEE_scores)
  
  data_region2 <- read.csv(paste0('dataframe_results/', region2, '.csv'), row.names = 1)
  data_region2_list <- convert_data(data_region2)
  data_region2_df <- list2df(data_region2_list)
  data_region2_FAVEE_scores <- FAVEE_scores(data_region2_df)
  data_region2_rdm_lower <- dist_lower_vector(data_region2_FAVEE_scores)
  
  frobenius_norm <- sqrt(sum((data_region1_rdm_lower - data_region2_rdm_lower)^2))
  return(frobenius_norm)
}


calculate_distances <- function(region, regions) {
  distances <- data.frame(region1 = character(0), region2 = character(0), distance = numeric(0))
  for (other_region in regions) {
    if (region != other_region) {
      distance <- frobenius_norm_regions(region, other_region)
      distances <- rbind(distances, data.frame(region1 = region, region2 = other_region, distance = distance))
    }
  }
  return(distances)
}

# Calculate the distribution the Frobenius norm for each region

In [4]:
n_cores <- detectCores() - 1
registerDoParallel(n_cores)

files <- list.files(path = 'dataframe_results/')
regions <- tools::file_path_sans_ext(files)
regions_frobenius <- list()

Sys.time()
for (region in regions) {
  data <- read.csv(paste0('dataframe_results/', region, '.csv'), row.names = 1)
  data_list <- convert_data(data)
  frobenius_norm_array <- numeric(1000)  
    
  results <- foreach(k = 1:1000, .combine = 'c') %dopar% {
    # split half
    split_data <- split_half(data_list)
    data_half1 <- split_data$first_half_df
    data_half2 <- split_data$second_half_df
    
    # compute FAVEE scores for each half
    data_half1_scores <- FAVEE_scores(data_half1)
    data_half2_scores <- FAVEE_scores(data_half2)

    # extract the lower triangular data
    data_half1_rdm_lower <- dist_lower_vector(data_half1_scores)
    data_half2_rdm_lower <- dist_lower_vector(data_half2_scores)
    
    # compute frobenius form
    sqrt(sum((data_half1_rdm_lower - data_half2_rdm_lower)^2))
  }
   frobenius_norm_array <- results
   regions_frobenius[[region]] <- frobenius_norm_array
   print(region)
}
Sys.time()


stopImplicitCluster()

[1] "2024-02-21 11:09:55 CST"

[1] "Australia"
[1] "Brazil"
[1] "Chile"
[1] "CHN"
[1] "Egypt"
[1] "France"
[1] "Germany"
[1] "HK(region)"
[1] "India"
[1] "Israel"
[1] "Japan"
[1] "Mexico"
[1] "Portugal"
[1] "Qatar"
[1] "Russia"
[1] "South_africa"
[1] "Spain"
[1] "UK"
[1] "USA"


[1] "2024-02-21 11:17:22 CST"

In [5]:
regions_frobenius_df <- do.call(rbind, lapply(names(regions_frobenius), function(region) {
  data.frame(regions = region, frobenius_norms = regions_frobenius[[region]])
}))

regions_frobenius_df <- regions_frobenius_df %>%
  mutate(Type = paste0("within"))

# Calculate the Frobenius norm between each region and the remaining n-1 regions

In [6]:
n_cores <- detectCores() - 1
registerDoParallel(n_cores)
files <- list.files(path = 'dataframe_results/')
regions <- tools::file_path_sans_ext(files)
region_world_distance <- list()

for(region in regions){
  # clean the data of the region
  data <- read.csv(paste0('dataframe_results/', region, '.csv'), row.names = 1)
  data_list <- convert_data(data)
  
  other_region_data_list <- lapply(data_list, function(dimension) {
  lapply(dimension, function(relation) {
    numeric(0)  
    })
  })
  
  # clean the data of the n-1 regions
  for (other_region in regions) {
    if (region != other_region) {
        other_region_data_sub <- read.csv(paste0('dataframe_results/', other_region, '.csv'), row.names = 1)
        other_region_data_sub_list <- convert_data(other_region_data_sub)
        
        for (dimension_index in 1:length(other_region_data_sub_list)) {
          for (relation_index in 1:length(other_region_data_sub_list[[dimension_index]])) {
            other_region_data_list[[dimension_index]][[relation_index]] <- c(other_region_data_list[[dimension_index]][[relation_index]], other_region_data_sub_list[[dimension_index]][[relation_index]])
          }
        }
    }
  }
  
  results <- foreach(k = 1:1000, .combine = 'c') %dopar% {
    # Randomly split the data of the region in half
    split_data <- split_half(data_list)
    data_half1 <- split_data$first_half_df
    
    # Randomly split the data of the n-1 regions in half
    split_data <- split_half(other_region_data_list)
    data_half2 <- split_data$first_half_df

    # compute FAVEE scores for each half
    data_half1_scores <- FAVEE_scores(data_half1)
    data_half2_scores <- FAVEE_scores(data_half2)

    # extract the lower triangular data
    data_half1_rdm_lower <- dist_lower_vector(data_half1_scores)
    data_half2_rdm_lower <- dist_lower_vector(data_half2_scores)
    
    # compute frobenius form
    sqrt(sum((data_half1_rdm_lower - data_half2_rdm_lower)^2))
  }
  region_world_distance[[region]] <- results
  
  print(region)

    
}

stopImplicitCluster()

[1] "Australia"
[1] "Brazil"
[1] "Chile"
[1] "CHN"
[1] "Egypt"
[1] "France"
[1] "Germany"
[1] "HK(region)"
[1] "India"
[1] "Israel"
[1] "Japan"
[1] "Mexico"
[1] "Portugal"
[1] "Qatar"
[1] "Russia"
[1] "South_africa"
[1] "Spain"
[1] "UK"
[1] "USA"


In [7]:
region_world_distance_df <- do.call(rbind, lapply(names(region_world_distance), function(region) {
  data.frame(regions = region, frobenius_norms = region_world_distance[[region]])
}))
region_world_distance_df <- region_world_distance_df %>%
  mutate(Type = paste0("world"))

In [8]:
color_mapping <- c(
  "world" = "#9DCD82",
  "within" = "#F8B62D"
)

# Perform bootstrap analysis and plot the results.

In [10]:
files <- list.files(path = 'dataframe_results/')
regions <- tools::file_path_sans_ext(files)
regions <- regions[regions != 'CHN_Trans']

results = rbind(regions_frobenius_df, region_world_distance_df)

# Initialize the dataframe to store results
bootstrap_results <- data.frame(region = character(),
                               proportion_positive = numeric(),
                               stringsAsFactors = FALSE)

# Number of bootstrap samples
num_samples <- 1000

for(region in regions){
  
  df_subset = results[results$regions == region, ]
  within_value = df_subset$frobenius_norms[df_subset$Type == 'within']
  between_value = df_subset$frobenius_norms[df_subset$Type == 'world']
  
  differences <- numeric(num_samples)

  for (k in 1:num_samples) {
    # Randomly sample one value from each group
    sample_within <- sample(within_value, 1000, replace = TRUE)
    sample_between <- sample(between_value, 1000, replace = TRUE)

    # Store the difference
    differences[k] <- sample_within - sample_between
  }

  # Compute the proportion of differences greater than zero
  proportion_positive <- mean(differences > 0)

  # Append the results to the bootstrap_results dataframe
  bootstrap_results <- rbind(bootstrap_results, data.frame(region = region,
                                                           proportion_positive = proportion_positive))
  
  plot <- ggplot(df_subset, aes(x=df_subset$region, y=frobenius_norms, fill=Type, color=Type)) +
  geom_half_violin(aes(x=df_subset$region, y=frobenius_norms, fill=Type, color=Type), position=position_nudge(x=0.15,y=0),
                   side='R', adjust=1.2, trim=T, color=NA, alpha=0.8) +
  geom_boxplot(aes(x=region, y=frobenius_norms, fill=Type, color=Type), width=0.25, fill="white", size = 3, outlier.shape = NA) +
  geom_point(aes(x = as.numeric(region)-0.1, y = frobenius_norms, color = Type),
             position = position_jitter(width =0.03), size = 0.2, shape = 20) +
  scale_color_manual(values=color_mapping[df_subset$Type]) +
  scale_fill_manual(values=color_mapping[df_subset$Type]) +
  coord_flip() +
  theme_minimal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text = element_text(size=30),
        axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        legend.position = 'none',
        axis.text.x = element_text(size=50, vjust=25)) +
  ylab('Frobenius Norm Value')

  ggsave(paste0("../output_graph/FAVEE_structure_reliability/", region, ".png"), plot, width=12, height=12, dpi=300)
}

bootstrap_results$adjusted_p <- p.adjust(bootstrap_results$proportion_positive, method = "bonferroni")

"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值长度的倍数"
"被替换的项目不是替换值