# NOTICE: Before running, make sure the following datasets (.csv files) are in the same directory as this program (YokedDataSelector.ipynb):

1. The current participant's pretest decisions (.csv format preferred; the program prompts for this file's name)
2. SizeModelPretest.csv
3. MaterialModelPretest.csv
4. MassModelPretest.csv
5. RandomModelPretest.csv
6. ClusterYokesChoices.csv

In [22]:
import pandas as pd
import math
from decimal import *
import numpy as np
import scipy.stats

# Ask which pretest response file to load; typing the ".csv" extension is optional.
fileName = input("Please enter the name of the subject's pretest phase response file: ").strip()

# Append the extension only when the name does not already END with it.
# (A substring test would leave a name such as "pilot.csv_run2" without an
# extension, and would miss an upper-case ".CSV".)
if fileName.lower().endswith(".csv"):
    fileNameCSV = fileName
else:
    fileNameCSV = fileName + ".csv"

# read_csv already returns a DataFrame, so no extra wrapping is needed.
CurrentResponses = pd.read_csv(fileNameCSV)

#####################################################################################
### This section can be removed if the data arrives in horizontal format

# Column labels pre1 .. pre8, one per pretest trial.
pretest_labels = ["pre" + str(trial) for trial in range(1, 9)]
subChoices = pd.DataFrame(columns=pretest_labels)

# Copy the first eight 'resp.keys' entries into row 0, shifting each value down by one.
for row_number, label in enumerate(pretest_labels):
    subChoices.loc[0, label] = CurrentResponses.loc[row_number, 'resp.keys'] - 1

### The section above can be removed if the data arrives in horizontal format
#####################################################################################

# Operationalized task per our four theory models: one table per model,
# loaded from the CSV files that sit next to this program.
sizeModelPre, matModelPre, massModelPre, randomModelPre = (
    pd.read_csv(model_file)
    for model_file in (
        'SizeModelPretest.csv',
        'MaterialModelPretest.csv',
        'MassModelPretest.csv',
        'RandomModelPretest.csv',
    )
)

#####################################################################################

# Calculate likelihoods of children's choices per each model to obtain their theory distribution


def _pretest_likelihoods(model, choices):
    """Return a one-column DataFrame of the model's value for each chosen option.

    Row ``trial`` holds ``model.iloc[trial, choices[trial]]``: the table is
    indexed by position, so the chosen option selects a column by position.
    """
    # Built in one go from a list: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(
        [model.iloc[trial, choice] for trial, choice in enumerate(choices)]
    )


# Option chosen on each of the eight pretest trials (pre1 .. pre8), as plain ints
# so they can be used as positional column indices.
pretest_choices = [int(subChoices.loc[0, "pre" + str(trial + 1)]) for trial in range(8)]

# Per-trial likelihood of the participant's choices under each theory
sub_Size_Pre = _pretest_likelihoods(sizeModelPre, pretest_choices)
sub_Mat_Pre = _pretest_likelihoods(matModelPre, pretest_choices)
sub_Mass_Pre = _pretest_likelihoods(massModelPre, pretest_choices)

# take the product of participant's fit to the model; assuming independence due to no feedback
# (column 0 is the single column created by _pretest_likelihoods; the result is a plain float)
prob_sub_Size = abs(float(sub_Size_Pre[0].prod()))
prob_sub_Mat = abs(float(sub_Mat_Pre[0].prod()))
prob_sub_Mass = abs(float(sub_Mass_Pre[0].prod()))

# take the logs for simplification
# (math.log raises ValueError if a product is exactly zero)
size_log_like = math.log(prob_sub_Size)
mat_log_like = math.log(prob_sub_Mat)
mass_log_like = math.log(prob_sub_Mass)

# Pretest log-likelihoods table (for visualizations, etc): a single row, one column per theory.
# "Fitting" the Random Choice Model: probability 0.2 on each of the 8 trials
# (presumably five equally likely options -- confirm against the task design).
subPriors = pd.DataFrame({
    'Size': [size_log_like],
    'Material': [mat_log_like],
    'Mass': [mass_log_like],
    'Random': [math.log(0.2 ** 8)],
})

#####################################################################################

# Grab info about the yoked simulations (child representatives from Exp1)
clusterKids = pd.read_csv('ClusterYokesChoices.csv')

theory_columns = ['Size', 'Material', 'Mass', 'Random']

# Current participant's log-likelihood under each theory, as a float array.
dist_x = subPriors.loc[0, theory_columns].to_numpy(dtype=float)

# Wasserstein distance between the participant and EVERY cluster child
# (no hard-coded row count, no dynamically created compare_N variables).
# NOTE(review): wasserstein_distance treats its arguments as unordered samples,
# so it appears to ignore which theory each value belongs to -- confirm this is
# the intended similarity measure.
clusterKids['WD'] = [
    scipy.stats.wasserstein_distance(dist_x, dist_y)
    for dist_y in clusterKids[theory_columns].to_numpy(dtype=float)
]

# Participant ID(s) at the minimum distance; ties are all reported, comma-separated.
closest_ids = clusterKids.loc[clusterKids['WD'] == clusterKids['WD'].min(), 'ParticipantID']
closest_CK = ", ".join(str(pid) for pid in closest_ids)

# The space after "PID" is written explicitly instead of relying on
# version-dependent padding from Series.to_string().
print("The child with the closest priors to the current participant is PID " + closest_CK)

Please enter the name of the subject's pretest phase response file: SimulationPracticeData
The child with the closest priors to the current participant is PID 1187
