In [2]:
library(lsa) #cosine
library(mgsub)
library(openxlsx)

library(corrplot)
library(RColorBrewer)
library(ggplot2)
library(patchwork)

Helper function

In [3]:
# Pairwise cosine similarity between the rows of `raw_data`.
#
# raw_data: numeric matrix or data frame with one item per row
#           (n rows x d feature columns).
# Returns an n x n symmetric matrix whose [i, j] entry is the cosine
# similarity of rows i and j. The diagonal is set to 1 and both dimnames are
# rownames(raw_data). Works for any n, including a single row.
cosine_matrix <- function(raw_data) {
    labels <- rownames(raw_data)

    # Work on a plain double matrix so the computation is pure matrix algebra
    values <- as.matrix(raw_data)
    storage.mode(values) <- "double"

    # cos(i, j) = <row_i, row_j> / (|row_i| * |row_j|), for all pairs at once
    row_norms <- sqrt(rowSums(values^2))
    similarity <- tcrossprod(values) / outer(row_norms, row_norms)

    # set the diagonal to 1 (this also covers all-zero rows, whose
    # self-similarity would otherwise be 0 / 0)
    diag(similarity) <- 1
    dimnames(similarity) <- list(labels, labels)
    return(similarity)
}

# Entries of `m` strictly below the main diagonal (diagonal excluded),
# returned as a vector.
lowerTriangle <- function(m) {
  below_diagonal <- lower.tri(m, diag = FALSE)
  m[below_diagonal]
}

# Column-wise standardisation using the population standard deviation.
# scale() in R divides by the sample standard deviation (ddof = 1), whereas
# sklearn.preprocessing.StandardScaler() uses ddof = 0; multiplying by
# sqrt(n / (n - 1)), with n the number of rows, converts one into the other.
scaleN <- function(data) {
  n_obs <- nrow(data)
  ddof_correction <- sqrt(n_obs / (n_obs - 1))
  scale(data) * ddof_correction
}

# Human rating

In [5]:
# Modern set (258 relationships): read the human rating table, keep its row
# order for aligning the embedding data later, and store the row-by-row cosine
# similarity matrix.
human_258r_33d <- read.csv(
    file = 'human_rating_data/study2_modern/CHN_dim_rel_NLP_258r_33d_scaled_chi.csv',
    row.names = 1,
    encoding = 'gb18030'
)
rels_order_modern <- rownames(human_258r_33d)
human_258r_33d_sim <- cosine_matrix(human_258r_33d)
write.csv(
    x = human_258r_33d_sim,
    file = 'human_rating_data/human_model/human_33d_258r_cosine.csv'
)

In [6]:
# 120-relationship subset: the modern relationships listed in the
# ancient-modern mapping table, in the order given by that table.
# check.names = FALSE keeps the Chinese column header exactly as written in the
# file, so the lookup does not depend on how make.names() treats non-ASCII
# names in the current locale.
rels_order_ancient <- read.csv('human_rating_data/ancient_modern_map_120r.csv',
                               row.names = 1,
                               check.names = FALSE)[['对应的现代人际关系']]
if (is.null(rels_order_ancient)) {
    # Without this guard a missing column would silently give a 0 x 0 matrix.
    stop("column '对应的现代人际关系' not found in ancient_modern_map_120r.csv")
}
# Sub-matrix of the 258 x 258 similarities restricted to those relationships
human_120r_33d_sim <- human_258r_33d_sim[rels_order_ancient, rels_order_ancient]
write.csv(human_120r_33d_sim, 'human_rating_data/human_model/human_33d_120r_cosine.csv')

# Bert embedding

## modern

In [8]:
# Named lookup vector built from the label table: the values are the
# `Relationships` column (new name) and the names are the `query` column
# (old name), so name_mapping[old] gives the new label.
prompt_rels_map <- read.csv(file = '../1.GPT4_DESC/CHN_modern/labels_chinese.csv')
name_mapping <- prompt_rels_map$'Relationships'  # new name
names(name_mapping) <- prompt_rels_map$'query'   # old name

In [9]:
# For every file found in `input_path`: read the embedding table, relabel and
# reorder its rows, standardise the columns, and write the row-by-row cosine
# similarity matrix to `output` under the same file name.
#
# input_path: directory holding the embedding CSV files. File names are
#             appended with paste0(), so it has to end with "/".
# output:     destination directory prefix, built the same way.
# Reads the globals `name_mapping` (lookup applied to the `word` column) and
# `rels_order_modern` (row order of the result). Prints `input_path` once all
# files are processed.
rel_emb_sim_modern <- function(input_path, output) {
    for (csv_name in list.files(input_path)) {
        embeddings <- read.csv(paste0(input_path, csv_name),
                               row.names = 1, encoding = 'gb18030')

        # replace each `word` by its mapped label and use the labels as row names
        embeddings$word <- name_mapping[embeddings$word]
        rownames(embeddings) <- embeddings$word

        # drop the first column, then put the rows into the reference order
        embeddings <- embeddings[, -1][rels_order_modern, ]

        # scale by column (1024d per the original note), then cosine similarity
        similarity <- cosine_matrix(scaleN(embeddings))

        destination <- paste0(output, csv_name)
        dir.create(dirname(destination), recursive = TRUE, showWarnings = FALSE)
        write.csv(similarity, destination)
    }
    print(input_path)
}

In [10]:
# Run the modern similarity pipeline once per query folder found under
# bert_embedding_data/modern/, echoing the folder name before each run.
querys <- dir('bert_embedding_data/modern/')
for (query in querys) {
    print(query)
    input_path <- paste0('bert_embedding_data/modern/', query, '/')
    output <- paste0('sim_bert_embedding/modern/', query, '/')
    rel_emb_sim_modern(
        input_path = input_path,
        output = output
    )
}

[1] "q1.1_bert_embedding_data"
[1] "bert_embedding_data/modern/q1.1_bert_embedding_data/"
[1] "q1.2_bert_embedding_data"
[1] "bert_embedding_data/modern/q1.2_bert_embedding_data/"
[1] "q2.1_bert_embedding_data"
[1] "bert_embedding_data/modern/q2.1_bert_embedding_data/"
[1] "q2.2_bert_embedding_data"
[1] "bert_embedding_data/modern/q2.2_bert_embedding_data/"
[1] "q3.1_bert_embedding_data"
[1] "bert_embedding_data/modern/q3.1_bert_embedding_data/"
[1] "q3.2_bert_embedding_data"
[1] "bert_embedding_data/modern/q3.2_bert_embedding_data/"
[1] "q4.0_bert_embedding_data"
[1] "bert_embedding_data/modern/q4.0_bert_embedding_data/"


## ancient

In [11]:
# Rebuild the label table and lookup vector from the ancient-modern mapping
# file (headers kept verbatim via check.names = FALSE): the values are the
# '对应的现代人际关系' column (new name), the names are the `query` column (old name).
prompt_rels_map <- read.csv(
    file = 'human_rating_data/ancient_modern_map_120r.csv',
    row.names = 1,
    check.names = FALSE
)
name_mapping <- prompt_rels_map$'对应的现代人际关系'  # new name
names(name_mapping) <- prompt_rels_map$'query'      # old name

In [12]:
# For every file found in `input_path`: read the embedding table, keep the rows
# whose `word` appears in the mapping table, label them with the corresponding
# modern relationship, standardise the columns, and write the row-by-row cosine
# similarity matrix to `output` under the same file name.
#
# input_path: directory holding the embedding CSV files. File names are
#             appended with paste0(), so it has to end with "/".
# output:     destination directory prefix, built the same way.
# Reads the global `prompt_rels_map` (columns `query` and '对应的现代人际关系').
# Prints `input_path` once all files are processed.
rel_emb_sim_ancient <- function(input_path, output) {
    map_queries <- as.character(prompt_rels_map[['query']])
    modern_labels <- prompt_rels_map[['对应的现代人际关系']]

    files <- list.files(input_path)
    for (file in files) {
        file_path <- paste0(input_path, file)
        bert_data <- read.csv(file_path, row.names = 1, encoding = 'gb18030')

        # keep only the words that have an entry in the mapping table
        bert_data <- bert_data[bert_data[['word']] %in% map_queries, ]
        if (nrow(bert_data) != length(map_queries)) {
            stop("'", file_path, "' has ", nrow(bert_data),
                 " rows matching the mapping table, expected ",
                 length(map_queries), call. = FALSE)
        }

        # The labels below are assigned by position, so the rows must be in the
        # same order as the mapping table; realign them by `query` if not.
        words <- as.character(bert_data[['word']])
        if (!identical(words, map_queries)) {
            if (anyDuplicated(map_queries) > 0 || anyDuplicated(words) > 0) {
                stop("cannot align '", file_path,
                     "' with the mapping table: duplicated query words",
                     call. = FALSE)
            }
            bert_data <- bert_data[match(map_queries, words), ]
        }

        # change the suitable labels
        bert_data[['word']] <- modern_labels
        rownames(bert_data) <- bert_data[['word']]
        # drop the first column (assumes `word` is the first column)
        bert_data <- bert_data[, -1]

        # scale by column (768d per the original note)
        bert_data_scaled <- scaleN(bert_data)
        # calculate cosine similarity
        bert_data_cor <- cosine_matrix(bert_data_scaled)

        output_path <- paste0(output, file)
        dir.create(dirname(output_path), recursive = TRUE, showWarnings = FALSE)
        write.csv(bert_data_cor, output_path)
    }
    print(input_path)
}

Average the embeddings of '關' and '系' and save the result as the embedding for '關系'

In [13]:
# For each of the first two query folders, pair every file whose name starts
# with "關_" with its "系_" counterpart, average columns 2:769 of the two tables
# element-wise, and write the result next to them under the "關系_" prefix.
querys <- dir('bert_embedding_data/ancient/')
# head() instead of [1:2]: no NA folder name when fewer than two folders exist
for (query in head(querys, 2)) {
    input_path <- paste0('bert_embedding_data/ancient/', query, '/')
    files <- dir(input_path)
    files <- files[grep("^(關_)", files)]
    for (file in files) {
        # Anchored sub(): only the leading character is swapped, even if the
        # same character occurs again later in the file name.
        file_pair <- sub('^關', '系', file)
        output_file <- sub('^關', '關系', file)
        if (!file.exists(paste0(input_path, file_pair))) {
            stop("missing partner file for '", file, "': ",
                 paste0(input_path, file_pair), call. = FALSE)
        }
        embedding_results1 <- read.csv(paste0(input_path, file), row.names = 1)
        embedding_results2 <- read.csv(paste0(input_path, file_pair), row.names = 1)

        # start from the first table so column 1 is carried over unchanged
        embedding_results <- embedding_results1
        embedding_results[, 2:769] <-
            (embedding_results1[, 2:769] + embedding_results2[, 2:769]) / 2
        write.csv(embedding_results, paste0(input_path, output_file))
    }
}

In [14]:
# Run the ancient similarity pipeline once per query folder in `querys`,
# echoing the folder name before each run.
for (query in querys) {
    print(query)
    input_path <- paste0('bert_embedding_data/ancient/', query, '/')
    output <- paste0('sim_bert_embedding/ancient/', query, '/')
    rel_emb_sim_ancient(
        input_path = input_path,
        output = output
    )
}

[1] "q1.1_bert_embedding_data"
[1] "bert_embedding_data/ancient/q1.1_bert_embedding_data/"
[1] "q1.2_bert_embedding_data"
[1] "bert_embedding_data/ancient/q1.2_bert_embedding_data/"
[1] "q2.1_bert_embedding_data"
[1] "bert_embedding_data/ancient/q2.1_bert_embedding_data/"
[1] "q2.2_bert_embedding_data"
[1] "bert_embedding_data/ancient/q2.2_bert_embedding_data/"
[1] "q3.1_bert_embedding_data"
[1] "bert_embedding_data/ancient/q3.1_bert_embedding_data/"
[1] "q3.2_bert_embedding_data"
[1] "bert_embedding_data/ancient/q3.2_bert_embedding_data/"
[1] "q3.3_bert_embedding_data"
[1] "bert_embedding_data/ancient/q3.3_bert_embedding_data/"
[1] "q4.1_bert_embedding_data"
[1] "bert_embedding_data/ancient/q4.1_bert_embedding_data/"
[1] "q4.2_bert_embedding_data"
[1] "bert_embedding_data/ancient/q4.2_bert_embedding_data/"
[1] "q4.3_bert_embedding_data"
[1] "bert_embedding_data/ancient/q4.3_bert_embedding_data/"
