In [1]:
import os, glob
import pandas as pd
from scipy.io import loadmat
import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA
from scipy.spatial.distance import squareform
import seaborn as sns

from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection

from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA

from scipy.stats import spearmanr

# Import data

In [2]:
# Lookup table linking Meadows participant IDs to the study's own IDs (described by the
# original author as the corresponding fMRI ID). The first CSV column becomes the index;
# later cells match on the 'meadows_id' column and read the study ID from the index label.
meadows_soc_id_key = pd.read_csv('../input_data/meadows_data/participants.csv',index_col=0)
# Create a list of paths for each subject's data: the .mat files feed the multi-arrangement
# cells and the .csv files feed the category-arrangement cells further down.
# NOTE(review): later cells pick files by list position and pull the Meadows ID out of each
# path with split('_')[num], so both the order returned by glob and the number of
# underscores in these directory names matter — confirm before moving or renaming folders.
ma_subj_data_paths = glob.glob('../input_data/meadows_data/meadows_subject_data/*.mat')
ca_subj_data_paths = glob.glob('../input_data/meadows_data/meadows_subject_data/*.csv')

Notably, there were two versions of the relationship labels. One version left out "Between" in some relationship labels, and its ordering of the relationships differed from the other version. Unless the labels and their order are unified to a single version, adding together the corresponding dissimilarity matrices would give wrong results.
The two versions are shown below:

In [3]:
# Position of the Meadows ID among the '_'-separated pieces of a data-file path;
# the right value depends on the specific file path.
num = 10
# Example subject for the first label version. The index was chosen arbitrarily,
# preferably from the first few files.
mat = loadmat(ma_subj_data_paths[30])

# Clean each stimulus label: remove double spaces, then trim both ends.
relationships = [raw_label.replace('  ', '').strip() for raw_label in mat['stimuli']]

# Map the Meadows ID found in the file path to the study ID and switch its 'soc' prefix to 'sub'.
id_matches = meadows_soc_id_key['meadows_id'] == ma_subj_data_paths[30].split('_')[num]
soc_id = meadows_soc_id_key[id_matches].index[0]
soc_id = soc_id.replace('soc', 'sub')

# Expand the 1-D matlab vector into a square distance matrix labelled on both axes.
dissim_v1 = pd.DataFrame(
    squareform(mat['rdmutv'][0]),
    index=relationships,
    columns=relationships,
)
dissim_v1.index

Index(['Wife - Husband', 'Nurse - Patient', 'Officer - Soldier', 'Neighbors',
       'Athletic trainer - Trainee', 'Police officer - Offender',
       'Principal - Teacher', 'Doctor - Nurse', 'Victim - Criminal',
       'Coach - Athlete',
       ...
       'Sorority sisters', 'Classmates', 'Club-member - Club-president',
       'Political opponents', 'Friends-with-benefits',
       'Foster-parent - Foster-child', 'Business rivals',
       'Manager - Assistant', 'Victim - Witness', 'Business partners'],
      dtype='object', length=159)

In [4]:
# Example subject for the second label version. The index was chosen arbitrarily,
# preferably from the last few files.
mat = loadmat(ma_subj_data_paths[56])

# Clean each stimulus label: remove double spaces, then trim both ends.
relationships = [raw_label.replace('  ', '').strip() for raw_label in mat['stimuli']]

# Map the Meadows ID found in the file path to the study ID and switch its 'soc' prefix to 'sub'.
id_matches = meadows_soc_id_key['meadows_id'] == ma_subj_data_paths[56].split('_')[num]
soc_id = meadows_soc_id_key[id_matches].index[0]
soc_id = soc_id.replace('soc', 'sub')

# Expand the 1-D matlab vector into a square distance matrix labelled on both axes.
dissim_v2 = pd.DataFrame(
    squareform(mat['rdmutv'][0]),
    index=relationships,
    columns=relationships,
)
dissim_v2.index

Index(['Parent - Child', 'Parent - Teenager', 'Wife - Husband',
       'Fiance - Fiancee', 'Siblings', 'Divorced spouses',
       'A person - Their in-laws', 'Cousins', 'Employer - Employee',
       'Interviewer - Job applicant',
       ...
       'Monarch - Minister', 'Confidants', 'Person - Significant other',
       'Companions', 'Playmates', 'Competitors in sports', 'Person - Crush',
       'Cohabitants', 'Long-distance-lovers', 'Brothers-in-arms'],
      dtype='object', length=159)

Because there were two versions of the relationship labels, the functions below are used to adjust the labels.

In [5]:
# In the categorization task there were two different versions of the relationship labels, as shown in '2.2 different relationship labels'.
# This function converts relationship labels from version 2 to version 1.
def rel_v2_to_v1(df):
    """Relabel a frame from version-2 relationship wording to version-1 wording.

    The same label mapping is applied to both the columns and the index.
    Labels without an entry in the mapping are left as they are, and the
    input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row and column labels are relationship names.

    Returns
    -------
    pandas.DataFrame
        A relabelled frame with the same values as ``df``.
    """
    # Matching is on whole labels, so e.g. 'Cousins' never touches 'Second-cousins'.
    v2_to_v1 = {
        'Diplomats': 'Between Diplomats',
        'Second-cousins': 'Between Second-cousins',
        'Co-workers': 'Between Co-workers',
        'Person - Significant other': 'Person - their Significant other',
        'Step-siblings': 'Between Step-siblings',
        'Twins': 'Between Twins',
        'Frenemies': 'Between Frenemies',
        'Alumni': 'Between Alumni',
        'Playmates': 'Between Playmates',
        'Half-siblings': 'Between Half-siblings',
        'Peers': 'Between Peers',
        'Companions': 'Between Companions',
        'Person - Social media follower': 'Person - their Social media follower',
        'Cousins': 'Between Cousins',
        'Cohabitants': 'Between Cohabitants',
        'Confidants': 'Between Confidants',
        'Siblings': 'Between Siblings',
    }
    return df.rename(columns=v2_to_v1, index=v2_to_v1)

In [6]:
#Match the labels in categorization task with that in dimension rating task.
def fix_df_labels(df):
    """Return a copy of ``df`` with normalised relationship labels on both axes.

    Each column and index label is lower-cased, then a fixed series of literal
    substring substitutions is applied, and finally a handful of whole labels
    are replaced by an alternative spelling. Per the notebook's comments, the
    goal is for the categorization-task labels to match those used in the
    dimension rating task.

    The input frame is left untouched. An earlier version of this function
    reassigned ``df.columns`` / ``df.index`` on the caller's object before
    copying, which silently relabelled the frame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row and column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` carrying the normalised labels.
    """
    # Literal substring substitutions, applied in this order after lower-casing.
    # Order matters: ' - ' must become ' and ' before the remaining '-' turn into spaces.
    substitutions = (
        ('–', 'and'),
        (' - ', ' and '),
        ('  ', ' '),
        ('between ', ''),
        ('-', ' '),
    )

    # Whole-label replacements applied after the substitutions above
    # (small typos and wording differences between the two label sets).
    whole_label_fixes = {
        'monarch and minister': 'a monarch and their minister',
        'person and crush': 'a person and their crush',
        'person and deceased spouse': 'a person and their deceased spouse',
        'person and family friends': 'a person and their family friends',
        'person and their significant other': 'a person and their significant other',
        'person and their social media follower': 'a person and their social media follower',
        'uncle and neice/nephew': 'uncle and niece/nephew',
        "man's wife and man's mistress": 'man wife and man mistress',
    }

    def _normalise(label):
        # Clean a single label; only exact matches get a whole-label replacement.
        label = label.lower()
        for old, new in substitutions:
            label = label.replace(old, new)
        return whole_label_fixes.get(label, label)

    fixed = df.copy()
    fixed.columns = [_normalise(label) for label in df.columns]
    fixed.index = [_normalise(label) for label in df.index]
    return fixed

# Multi-Arrangement Task Analysis

## Extract the list of participants who used the second label version (the version with relatively fewer participants)

In [7]:
# Reference label list: the stimuli of one subject taken to use version-1 labels.
# Index 1 is used because, per the original note, the file at index 0 is a null file.
mat_v1 = loadmat(ma_subj_data_paths[1])
relationship_ma_v1 = [raw_label.replace('  ', '').strip() for raw_label in mat_v1['stimuli']]
ma_labels_v1 = set(relationship_ma_v1)  # built once instead of on every iteration

# Collect the subjects whose labels are not all present in the reference list.
rel_ma_v2 = []
for p in ma_subj_data_paths:
    mat = loadmat(p)

    # Skip files whose Meadows ID has no entry in the ID key.
    matched_ids = meadows_soc_id_key[meadows_soc_id_key['meadows_id'] == p.split('_')[num]].index
    if matched_ids.empty:
        continue
    soc_id = matched_ids[0].replace('soc', 'sub')

    # Clean each stimulus label: remove double spaces, then trim both ends.
    relationships = [raw_label.replace('  ', '').strip() for raw_label in mat['stimuli']]

    # Test the difference for non-emptiness. any() would instead test whether the
    # elements are truthy, and so would miss a difference consisting of an empty label.
    unknown_labels = set(relationships) - ma_labels_v1
    if unknown_labels:
        print(len(relationships), end='')
        print(soc_id, end='')
        print(unknown_labels)
        rel_ma_v2.append(soc_id)
rel_ma_v2

159sub-762{'Peers', 'Companions', 'Confidants', 'Person - Significant other', 'Cohabitants', 'Half-siblings', 'Siblings', 'Person - Social media follower', 'Second-cousins', 'Co-workers', 'Diplomats', 'Twins', 'Playmates', 'Frenemies', 'Cousins', 'Alumni', 'Step-siblings'}
159sub-763{'Peers', 'Companions', 'Confidants', 'Person - Significant other', 'Cohabitants', 'Half-siblings', 'Siblings', 'Person - Social media follower', 'Second-cousins', 'Co-workers', 'Diplomats', 'Twins', 'Playmates', 'Frenemies', 'Cousins', 'Alumni', 'Step-siblings'}
159sub-765{'Peers', 'Companions', 'Confidants', 'Person - Significant other', 'Cohabitants', 'Half-siblings', 'Siblings', 'Person - Social media follower', 'Second-cousins', 'Co-workers', 'Diplomats', 'Twins', 'Playmates', 'Frenemies', 'Cousins', 'Alumni', 'Step-siblings'}
159sub-766{'Peers', 'Companions', 'Confidants', 'Person - Significant other', 'Cohabitants', 'Half-siblings', 'Siblings', 'Person - Social media follower', 'Second-cousins', 'Co-

['sub-762', 'sub-763', 'sub-765', 'sub-766', 'sub-502', 'sub-764', 'sub-767']

## Main transfer code

In [8]:
# Loop through the data file paths
for p in ma_subj_data_paths:
    mat = loadmat(p)
    
    # Extract each social relationship stimuli and clean them up
    relationships=[]
    for i in mat['stimuli']:
        temp_str = i.replace('  ','')
        temp_str = temp_str.strip()
        relationships.append(temp_str)
    
    # Check whether an soc ID match was found for the meadows ID.
    if meadows_soc_id_key[meadows_soc_id_key['meadows_id'] == p.split('_')[num]].index.empty:
        continue
    else:  # If match is found, this runs
        soc_id = meadows_soc_id_key[meadows_soc_id_key['meadows_id'] == p.split('_')[num]].index[0]
        soc_id = soc_id.replace('soc','sub')
        # Turn the 1-D matlab array into a 2-D distance matrix
        dissim = pd.DataFrame(squareform(mat['rdmutv'][0]), columns=relationships, index=relationships)

        ### Turn relationships labels in version 2 into version 1.
        if soc_id in rel_ma_v2:
            dissim = rel_v2_to_v1(dissim)
            dissim = dissim.loc[relationship_ma_v1,relationship_ma_v1]
        ###
        
        #Add the labels' adjustment
        dissim = fix_df_labels(dissim) 
        
        dissim.to_csv('../output_data/individual/category/Subject_MA_RDMs_revised/'+soc_id+'.csv')

# Category Arrangement

In [9]:
# Find the subjects whose CSV has no numeric index column. For those files the first
# cell of the first column is not 0, and (per the original note) dropping that column
# would lose a few relationships.
subj_no_list = []
meadows_id_no_list = []
for subj in ca_subj_data_paths:
    subj_cat_df = pd.read_csv(subj)

    # A leading 0 is taken to mean the first column is a row index: nothing to record.
    if subj_cat_df.iloc[0, 0] == 0:
        continue

    # Skip files whose Meadows ID has no entry in the ID key.
    meadows_id = subj.split('_')[num]
    key_rows = meadows_soc_id_key[meadows_soc_id_key['meadows_id'] == meadows_id]
    if key_rows.index.empty:
        continue

    soc_id = key_rows.index[0].replace('soc', 'sub')
    subj_no_list.append(soc_id)
    meadows_id_no_list.append(meadows_id)

print(len(subj_no_list))
subj_no_list

10


['sub-760',
 'sub-761',
 'sub-759',
 'sub-762',
 'sub-763',
 'sub-765',
 'sub-766',
 'sub-502',
 'sub-764',
 'sub-767']

As in the Multi-Arrangement Task, there were also two versions of the relationship labels.

In [10]:
# For every matched subject, list the distinct relationships found in their
# category-arrangement file, so the label versions can be compared afterwards.
subject_cat_relationships_frames = {}

for subj in ca_subj_data_paths:

    # Skip files whose Meadows ID has no entry in the ID key.
    key_rows = meadows_soc_id_key[meadows_soc_id_key['meadows_id'] == subj.split('_')[num]]
    if key_rows.index.empty:
        continue
    soc_id = key_rows.index[0].replace('soc', 'sub')

    # Some files start with a numeric index column and others do not;
    # drop the first column only where it is an index.
    if soc_id in subj_no_list:
        subj_cat_df = pd.read_csv(subj)
    else:
        subj_cat_df = pd.read_csv(subj).iloc[:, 1:]

    # Gather every non-missing cell, column by column.
    relationships = []
    for col in subj_cat_df.columns:
        relationships.extend(x for x in subj_cat_df[col].tolist() if not pd.isnull(x))

    # De-duplicate while keeping first-seen order. list(set(...)) gave an order that
    # changes between kernel sessions (string hashing is randomised), and this order
    # later becomes the row/column order of the exported category RDMs.
    relationships = list(dict.fromkeys(relationships))

    subject_cat_relationships_frames[soc_id] = relationships

In [11]:
# Subject IDs in the order they were added to the dictionary.
cat_subject_ids = list(subject_cat_relationships_frames)
# Reference label lists for the two versions, taken from two subjects by position.
# NOTE(review): positions 0 and 59 are presumably a version-1 and a version-2 subject
# respectively — verify whenever the set or order of input files changes.
relationship_ca_v1 = subject_cat_relationships_frames[cat_subject_ids[0]]
relationship_ca_v2 = subject_cat_relationships_frames[cat_subject_ids[59]]

In [12]:
# Subjects missing at least one version-1 label are treated as version-2 subjects.
# The set difference is tested for non-emptiness directly; any() would test whether
# its elements are truthy and so would miss a difference consisting of an empty label.
ca_labels_v1 = set(relationship_ca_v1)
subj_rel_v2 = [
    subj_id
    for subj_id, subj_labels in subject_cat_relationships_frames.items()
    if ca_labels_v1 - set(subj_labels)
]
subj_rel_v2

['sub-762', 'sub-763', 'sub-765', 'sub-766', 'sub-502', 'sub-764', 'sub-767']

## Main transfer code

In [14]:
# Build a binary dissimilarity matrix (0 = categorised together, 1 = not) for every
# matched subject from their category-arrangement file, unify the labels and save it.
subject_cat_frames = {}

for subj in ca_subj_data_paths:

    # Skip files whose Meadows ID has no entry in the ID key.
    key_rows = meadows_soc_id_key[meadows_soc_id_key['meadows_id'] == subj.split('_')[num]]
    if key_rows.index.empty:
        continue
    soc_id = key_rows.index[0].replace('soc', 'sub')

    ############ different format of dataframe
    # Some files start with a numeric index column and others do not;
    # drop the first column only where it is an index.
    if soc_id in subj_no_list:
        subj_cat_df = pd.read_csv(subj)
    else:
        subj_cat_df = pd.read_csv(subj).iloc[:, 1:]
    ############

    ############ different relationship labels
    # There were two versions of relationships.
    if soc_id in subj_rel_v2:
        relationships = relationship_ca_v2
    else:
        relationships = relationship_ca_v1
    ############

    # Work out once which columns (categories) contain each relationship.
    # This used to be recomputed for every (rel_a, rel_b) pair.
    column_values = {col: subj_cat_df[col].tolist() for col in subj_cat_df.columns}
    cols_containing = {
        rel: [col for col, values in column_values.items() if rel in values]
        for rel in relationships
    }

    # Filled cell by cell, as before, so the resulting frame and CSV are unchanged.
    rdm_cat = pd.DataFrame(columns=relationships)
    for rel_a in relationships:
        cols_a = cols_containing[rel_a]
        for rel_b in relationships:
            # Strict criterion: distance 0 only when rel_a appears in at least one
            # column and both relationships appear in exactly the same columns.
            # This equals the earlier rule "share a column, and if either one sits
            # in several columns the column lists must be identical".
            same_categories = bool(cols_a) and cols_a == cols_containing[rel_b]
            rdm_cat.loc[rel_a, rel_b] = 0 if same_categories else 1

    ### Turn relationships labels in version 2 into version 1.
    if soc_id in subj_rel_v2:
        rdm_cat = rel_v2_to_v1(rdm_cat)
        rdm_cat = rdm_cat.loc[relationship_ca_v1, relationship_ca_v1]
    ###

    # No fillna step: every cell of the matrix is assigned in the loop above.

    ############
    # Add the labels' adjustment
    rdm_cat = fix_df_labels(rdm_cat)
    ############
    subject_cat_frames[soc_id] = rdm_cat

    rdm_cat.to_csv('../output_data/individual/category/Subject_Category_RDMs_revised/'+ soc_id + '.csv')