In [2]:
library(ggplot2)
library(corrplot)
library(RColorBrewer)

library(tidyverse)
library(reshape)#melt
library(openxlsx)
library(dplyr)#rename
library(mgsub)

# Functions

## Extract subset

In [3]:
subset_abstract = function(num){
    # Returns the relationship labels stored in the robustness table row whose
    # `times` equals 159 - num, after relabelling them with the label-mapping
    # table. As a side effect the labels are written to data/subset_<num>.csv.

    # pull the comma-separated relationship list for the requested row(s)
    loadings_tbl = read.csv('Robustness/output_data/robustness_loadings_data.csv',row.names=1)
    picked = filter(loadings_tbl, times==(159-num))
    rel_names = data.frame(old = trimws(unlist(strsplit(picked$'rel_list',", "))))

    # translate labels: patterns come from column `two`, replacements from column `one`
    label_map = read.csv('../../Study2/input_data/Rel_labels/labels_different_versions.csv',row.names=1)
    rel_names$'fix' = mgsub(rel_names$'old',label_map$'two',label_map$'one')

    # drop the " ," that trails this one label
    relabelled = str_replace(rel_names$'fix','Employer and Employee ,','Employer and Employee')

    write.csv(relabelled,paste0('data/subset_',num,'.csv'))
    relabelled
}

## full feature & dimensional model

In [4]:
# Run a PCA on `dim_rel_scaled`, keep the leading `ncomp` components and apply
# a varimax rotation to them.
#
# Args:
#   dim_rel_scaled: numeric matrix or data frame, handed to prcomp() unchanged
#                   (prcomp's default settings are used).
#   ncomp: number of leading components to keep and rotate. Defaults to 5, the
#          value that was previously hard-coded. Must be at least 2 (varimax
#          needs two columns to rotate) and no larger than the number of
#          components prcomp() returns.
#
# Returns:
#   A list with two data frames:
#     loadings     - varimax-rotated loadings, one row per input column
#     relationship - standardised component scores multiplied by the varimax
#                    rotation matrix, one row per input row
pca_var = function(dim_rel_scaled, ncomp = 5){
    dim_rel.pca = prcomp(dim_rel_scaled)

    n_available = length(dim_rel.pca$sdev)
    if (ncomp < 2 || ncomp > n_available){
        stop('ncomp must be between 2 and ', n_available, call. = FALSE)
    }
    keep = seq_len(ncomp)

    # unrotated loadings: eigenvectors scaled by the component standard deviations
    loadings_none = dim_rel.pca$rotation[,keep,drop=FALSE] %*%
        diag(dim_rel.pca$sdev[keep],ncomp,ncomp)
    scores_none = dim_rel.pca$x[,keep,drop=FALSE]

    # rotate once and reuse both the rotated loadings and the rotation matrix
    rotated = varimax(loadings_none)
    loadings_var = as.data.frame(rotated$loadings[,keep,drop=FALSE])
    scores_var = as.data.frame(scale(scores_none) %*% rotated$rotmat)

    list(loadings = loadings_var, relationship = scores_var)
}


# For every entry under the cleaning-results directory (treated as a region),
# take the rows of that region's scaled data selected by subset_abstract(num),
# run pca_var() on them, and write three CSVs per region:
#   output_data/full_feature/<num>/<region>.csv                          subset data
#   output_data/dimensional_model/loading_var_score/<num>/<region>.csv   rotated loadings
#   output_data/dimensional_model/relationship_var_score/<num>/<region>.csv rotated scores
#
# Args:
#   num: subset identifier; forwarded to subset_abstract() and used as the
#        name of the per-run output folders.
#
# Returns:
#   A list keyed by region name holding each region's rotated score data frame
#   (an empty list when the input directory has no entries).
full_dim_region = function(num){
    path = '../DataCleanPCA/output_data/cleaning_results/'
    full_dir = paste0('output_data/full_feature/',num)
    loading_dir = paste0('output_data/dimensional_model/loading_var_score/',num)
    score_dir = paste0('output_data/dimensional_model/relationship_var_score/',num)

    # recursive: parent folders may not exist yet;
    # showWarnings = FALSE: reruns should not warn about existing folders
    for (out_dir in c(full_dir,loading_dir,score_dir)){
        dir.create(out_dir,recursive=TRUE,showWarnings=FALSE)
    }

    # The subset depends only on num, so build it once instead of once per
    # region (subset_abstract re-reads its inputs and rewrites its CSV on
    # every call).
    subset = subset_abstract(num)

    pca_var_relationship_subset = list()
    for (region in dir(path)){
        # 1. import data: <path><region>/<region>_dim_rel_scaled.csv
        file = paste0(path,region,'/',region,'_dim_rel_scaled.csv')
        all_scaled = read.csv(file,encoding='UTF-8',row.names=1)

        # 2. keep only the rows whose row names are in the subset
        subset_scaled = all_scaled[subset,]
        write.csv(subset_scaled,paste0(full_dir,'/',region,'.csv'))

        # 3. pca with varimax rotation
        pca_results = pca_var(subset_scaled)
        loadings_var = pca_results$loadings
        scores_var = pca_results$relationship

        # save results
        write.csv(loadings_var,paste0(loading_dir,'/',region,'.csv'))
        write.csv(scores_var,paste0(score_dir,'/',region,'.csv'))
        pca_var_relationship_subset[[region]] = scores_var
    }

    return(pca_var_relationship_subset)
}

***

# Run

In [5]:
# Run the per-region pipeline for subset sizes 40 to 70, storing each result
# at index `num` of pca_rel_score.
pca_rel_score = NULL
for (num in 40:70) pca_rel_score[[num]] = full_dim_region(num)

In [6]:
# Run the per-region pipeline for the remaining subset sizes (5 to 39 and
# 71 to 158), storing each result at index `num` of pca_rel_score.
# NOTE(review): pca_rel_score is re-initialised here, so anything it already
# held in this session is dropped - confirm that is intended.
pca_rel_score = NULL
for (num in c(5:39,71:158)) pca_rel_score[[num]] = full_dim_region(num)