In [1]:
import glob
import os
import sys

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import euclidean_distances#transform dataframe into rdm
from scipy.stats import spearmanr #Loocv

import seaborn as sns
import matplotlib.pyplot as plt

#Clustering analysis
import umap.umap_ as umap
from sklearn.cluster import KMeans

sys.path.append('../../custom_function')
import clustering

In [2]:
#filter target subjects    
def Filter(string, substr):
    """Keep the entries of ``string`` that contain at least one entry of ``substr``.

    Parameters
    ----------
    string : iterable of str
        Candidate strings (in this notebook: file paths).
    substr : iterable of str
        Substrings to look for (in this notebook: participant ids).

    Returns
    -------
    list of str
        The matching candidates, in their original order.

    Notes
    -----
    Matching uses the ``in`` operator, i.e. plain substring containment, so
    an id such as ``'sub1'`` also matches a path containing ``'sub10'``.
    """
    matched = []
    for candidate in string:
        for needle in substr:
            if needle in candidate:
                matched.append(candidate)
                break  # one hit is enough; do not add the candidate twice
    return matched

In [3]:
def cluster_results(input_dissim_or_df,task = 'categorization',neighbors = 15, dist = 0.01, random_seed = 2022):
    """Cluster one participant's data with the pipeline selected by ``task``.

    Parameters
    ----------
    input_dissim_or_df : pandas.DataFrame
        Input table for one participant. For ``'categorization'`` it is handed
        to UMAP with ``metric='precomputed'`` (presumably a square
        dissimilarity matrix -- confirm against the input files). For the two
        dimension tasks it is used as a feature table.
    task : {'categorization', 'dimension_pca_3k', 'dimension_umap'}
        * ``'categorization'``   -- UMAP (precomputed metric) to 2 components,
          then ``clustering.create_cluster_models`` with KMeans and ks=[3, 6].
        * ``'dimension_pca_3k'`` -- KMeans with 3 clusters fitted directly on
          the input; labels are stored in a ``'cl_k3'`` column.
        * ``'dimension_umap'``   -- UMAP (default metric) to 2 components,
          then ``clustering.create_cluster_models`` with KMeans and ks=[3, 6].
    neighbors : int
        UMAP ``n_neighbors``. Only used when ``task == 'categorization'``.
    dist : float
        UMAP ``min_dist``. Only used when ``task == 'categorization'``.
    random_seed : int
        ``random_state`` passed to UMAP / KMeans.

    Returns
    -------
    pandas.DataFrame
        For ``'dimension_pca_3k'``: a copy of the input with the extra
        ``'cl_k3'`` column. For the UMAP tasks: whatever
        ``clustering.create_cluster_models`` returns for the 2-D embedding
        (columns ``'Dim1'``/``'Dim2'``, indexed like the input).

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported values.

    Notes
    -----
    ``'dimension_umap'`` deliberately keeps its fixed ``n_neighbors=10`` and
    ``min_dist=0.01`` and ignores ``neighbors``/``dist``, so existing callers
    that rely on the defaults keep getting the same embedding.
    """
    valid_tasks = ('categorization', 'dimension_pca_3k', 'dimension_umap')
    if task not in valid_tasks:
        # Fail early with a clear message instead of an UnboundLocalError at return.
        raise ValueError("Unknown task {!r}; expected one of {}".format(task, valid_tasks))

    if task == 'dimension_pca_3k':
        # Fit on the untouched input, then attach labels to a copy so the
        # caller's DataFrame is not modified as a side effect.
        fit_cluster = KMeans(n_clusters=3, random_state=random_seed).fit(input_dissim_or_df)
        cluster_results_df = input_dissim_or_df.copy()
        cluster_results_df['cl_k3'] = fit_cluster.labels_
        return cluster_results_df

    if task == 'categorization':
        reducer = umap.UMAP(n_neighbors=neighbors, min_dist=dist, n_components=2,
                            metric='precomputed',
                            random_state=random_seed)
    else:  # task == 'dimension_umap'
        reducer = umap.UMAP(n_neighbors=10, min_dist=0.01, n_components=2,
                            random_state=random_seed)

    embedding = pd.DataFrame(reducer.fit_transform(input_dissim_or_df),
                             columns=['Dim1', 'Dim2'],
                             index=input_dissim_or_df.index)

    cluster_results_df = clustering.create_cluster_models(data_type="df", data=embedding,
                                                          methods=['KMeans'], ks=[3, 6],
                                                          keep_orig=True)
    return cluster_results_df

# Import data  
- Dimension rating task: 
    - pca_var+KMeans = 3 clusters
    - UMAP(n_neighbors=10, min_dist=0.01) + KMeans = 3 clusters
- Categorization task: UMAP(n_neighbors=15,min_dist=0.01)+KMeans  
    - Explicit task: 6 clusters
    - Implicit task: 3 clusters

In [4]:
# Dimension rating task: one CSV per participant.
dim_pca_var_data_paths = glob.glob('../../output_data/individual/dimension/pca_results/scores_var/*.csv')
dim_raw_data_paths = glob.glob('../../output_data/individual/dimension/clean_results/dim_rel_scaled/*.csv')


# Categorization tasks: per-participant files for all participants.
ca_subj_data_paths_all = glob.glob('../../output_data/individual/category/Subject_Category_RDMs_revised/*.csv')
ma_subj_data_paths_all = glob.glob('../../output_data/individual/category/Subject_MA_RDMs_revised/*.csv')

# Participants who finished the dimension rating task, identified by the file
# name up to its first '.'. os.path.basename is used so this works with the
# platform's own path separator (splitting on '\\' only works on Windows).
sub_within_list = [os.path.basename(path).split('.')[0] for path in dim_pca_var_data_paths]
print('Found participants who finished dimension rating task: '+str(len(sub_within_list))+' participants.')

# Explicit task: keep only the participants found above.
# The message reports how many paths were actually matched, not how many were asked for.
ca_within_data_paths = Filter(ca_subj_data_paths_all, sub_within_list)
print('Explicit task:From {} subjects filter {} subjects.'.format(len(ca_subj_data_paths_all),len(ca_within_data_paths)))

# Implicit task: same filtering.
ma_within_data_paths = Filter(ma_subj_data_paths_all, sub_within_list)
print('Implicit task:From {} subjects filter {} subjects.'.format(len(ma_subj_data_paths_all),len(ma_within_data_paths)))

Found participants who finished dimension rating task: 14 participants.
Explicit task:From 60 subjects filter 14 subjects.
Implicit task:From 60 subjects filter 14 subjects.


# Generate clustering results individually

## dimensional survey

In [5]:
# Cluster each participant's PCA scores (task 'dimension_pca_3k') and write
# one result CSV per participant, named like the input file.
pca_3k_out_dir = '../../output_data/individual/dimension/cluster_results/pca_3k/'
os.makedirs(pca_3k_out_dir, exist_ok=True)  # to_csv does not create missing folders

for ind_dir in dim_pca_var_data_paths:
    ind_input = pd.read_csv(ind_dir, index_col=0)
    ind_cluster = cluster_results(input_dissim_or_df = ind_input,
                                  task = 'dimension_pca_3k',random_seed = 2022)

    # basename handles the platform's path separator; splitting on '\\' only works on Windows
    ind_csv = os.path.basename(ind_dir)
    ind_cluster.to_csv(pca_3k_out_dir+ind_csv)

In [6]:
# Cluster each participant's scaled dimension ratings (task 'dimension_umap')
# and write one result CSV per participant, named like the input file.
dim_umap_out_dir = '../../output_data/individual/dimension/cluster_results/umap/'
os.makedirs(dim_umap_out_dir, exist_ok=True)  # to_csv does not create missing folders

for ind_dir in dim_raw_data_paths:
    ind_input = pd.read_csv(ind_dir, index_col=0)
    ind_cluster = cluster_results(input_dissim_or_df = ind_input,
                                  task = 'dimension_umap',random_seed = 2022)

    # basename handles the platform's path separator; splitting on '\\' only works on Windows
    ind_csv = os.path.basename(ind_dir)
    ind_cluster.to_csv(dim_umap_out_dir+ind_csv)

## category task

In [8]:
# Explicit categorization task: cluster each selected participant's matrix
# (task 'categorization') and write one result CSV per participant.
exp_out_dir = '../../output_data/individual/category/cluster_results/exp/'
os.makedirs(exp_out_dir, exist_ok=True)  # to_csv does not create missing folders

for ind_dir in ca_within_data_paths:
    ind_input = pd.read_csv(ind_dir, index_col=0)
    ind_cluster = cluster_results(input_dissim_or_df = ind_input,
                                  task = 'categorization',neighbors = 15, dist = 0.01, random_seed = 2022)

    # basename handles the platform's path separator; splitting on '\\' only works on Windows
    ind_csv = os.path.basename(ind_dir)
    ind_cluster.to_csv(exp_out_dir+ind_csv)

  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inve

In [9]:
# Implicit categorization task: cluster each selected participant's matrix
# (task 'categorization') and write one result CSV per participant.
imp_out_dir = '../../output_data/individual/category/cluster_results/imp/'
os.makedirs(imp_out_dir, exist_ok=True)  # to_csv does not create missing folders

for ind_dir in ma_within_data_paths:
    ind_input = pd.read_csv(ind_dir, index_col=0)
    ind_cluster = cluster_results(input_dissim_or_df = ind_input,
                                  task = 'categorization',neighbors = 15, dist = 0.01, random_seed = 2022)

    # basename handles the platform's path separator; splitting on '\\' only works on Windows
    ind_csv = os.path.basename(ind_dir)
    ind_cluster.to_csv(imp_out_dir+ind_csv)

  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inverse_transform will be unavailable")
  warn("using precomputed metric; inve