### Read Data

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
import itertools
from sklearn.model_selection import train_test_split
from scipy.stats import anderson_ksamp
from ast import literal_eval

# Get data source
data_dir = 'data_process/12-04/'

# Read raw simulated data
simulated_raw_data = data_dir + 'simulated_data_aa_wsr_rsr.csv'
# Read Human Data
human_data = data_dir + 'human_data_aa_wsr_rsr.csv'

# Convert to a dataframe
df_raw_sim = pd.read_csv(simulated_raw_data)
df_human = pd.read_csv(human_data)

# Identify the parameters that to be tuned later
WEIGHTS = 'weights'             # Ranges from 0-1
WALK_FACTORS = 'walk_factors'   # Ranges from 0-1
SIGN_FACTORS = 'sign_factors'   # Ranges from 0-1
params = {
    WEIGHT: None,
    WALK_FACTOR: None,
    SIGN_FACTOR: None,
}
# Some other variables from the simulated file
STEPS = 'steps'
SIGN_READ = 'sign_read'
STEP_WISE_WALKING_POSITIONS = 'step_wise_walking_positions'
EXP_WALKING_POSITIONS = 'walking_positions'
STEP_WISE_WALKING_SPEEDS = 'step_wise_walking_speeds'
EXP_WALKING_SPEEDS = 'walking_speeds'
WALKING_SPEED_RATIOS = 'walking_speed_ratios'
STEP_WISE_ATTENTIONS = 'step_wise_attentions'
EXP_ATTENTION = 'attention_allocation'
ATTENTION_ON_SIGN = 'attention_on_sign'
ATTENTION_ON_OHMD = 'attention_on_ohmd'
SIGN_POSITIONS = 'sign_positions'
STEP_WISE_READING_RATIOS = 'step_wise_reading_ratios'
READING_SPEED_RATIOS = 'reading_speed_ratios'
EXP_READING_RATIOS = 'reading_ratios'
RECTANGLE_PATH_LENGTH = 'rectangle_path_length'
EP_LEN = 'ep_len'

SIGN = 'Sign'
ZERO_DOT_ONE = 0.1
WALK_FACTORS = 'walk_factors'

# Determine the metrics
ATTENTION_ALLOCATION = 'attention_allocation'
WALKING_SPEED_RATIO = 'walking_speed_ratio'
READING_SPEED_RATIO = 'reading_speed_ratio'
metrics = [ATTENTION_ALLOCATION, WALKING_SPEED_RATIO, READING_SPEED_RATIO]

# Parameters that can be pre-defined
DEFAULT_WALK_FACTOR = 0.5
DEFAULT_SIGN_FACTOR = 0.5

# Number of signs in the experiment
NUM_SIGN_IN_EXP = 8

### Process the Raw Simulated Data

In [64]:
# Function to convert string representations of lists into actual lists
def literal_eval_col(col):
    return col.apply(literal_eval)
# def literal_eval_col(col):
#     return col.apply(lambda x: literal_eval(x) if not pd.isnull(x) else [])

# Round values
def round_nested_lists(column):
    rounded_column = []
    for row in column:
        if isinstance(row, list):
            rounded_row = [round(element, 1) if isinstance(element, float) else element for element in row]
            rounded_column.append(rounded_row)
        else:
            rounded_column.append(row)
    return rounded_column

# Process raw simulated data - main function - process data line by line
def process_raw_simulated_data(df_raw_sim):
    # Convert strings to lists in the whole df
    df_raw_sim.dropna(how='all', inplace=True)
    
#     df_raw_sim[STEP_WISE_WALKING_POSITIONS] = literal_eval_col(df_raw_sim[STEP_WISE_WALKING_POSITIONS])
#     df_raw_sim[STEP_WISE_WALKING_SPEEDS] = literal_eval_col(df_raw_sim[STEP_WISE_WALKING_SPEEDS])
#     df_raw_sim[STEP_WISE_ATTENTIONS] = literal_eval_col(df_raw_sim[STEP_WISE_ATTENTIONS])
#     df_raw_sim[SIGN_READ] = literal_eval_col(df_raw_sim[SIGN_READ])
#     df_raw_sim[STEP_WISE_READING_RATIOS] = literal_eval_col(df_raw_sim[STEP_WISE_READING_RATIOS])
    rectangle_path_length = df_raw_sim.at[1, 'rectangle_path_length']
    
    #####################################################################
    # Get rid of the rows whose len(sign_read) < num_sign_in_exp
    df_read_eight_signs = df_raw_sim[df_raw_sim[SIGN_READ].apply(lambda x: len(x) >= NUM_SIGN_IN_EXP)]
#     df_read_eight_signs = df_read_eight_signs[df_read_eight_signs[WALK_FACTORS] == DEFAULT_WALK_FACTOR]
    
    #####################################################################
    empty_lists = [[] for _ in range(len(df_read_eight_signs))]
    
    # Creating a new column initialized with empty lists
    df_read_eight_signs[EXP_ATTENTION] = [[] for _ in range(len(df_read_eight_signs))]
    df_read_eight_signs[EXP_WALKING_SPEEDS] = [[] for _ in range(len(df_read_eight_signs))]
    df_read_eight_signs[EXP_READING_RATIOS] = [[] for _ in range(len(df_read_eight_signs))]
    # Iterate over each row
    for index, row in df_read_eight_signs.iterrows():
        # Find the index where the value in 'step_wise_walking_positions' first exceeds 60
        index_exceeding_60 = next((i for i, val in enumerate(row[STEP_WISE_WALKING_POSITIONS]) if val > rectangle_path_length), None)
        
        # If such an index is found, slice 'step_wise_attention_allocation' up to that index
        if index_exceeding_60 is not None:
            df_read_eight_signs.at[index, EXP_ATTENTION] = row[STEP_WISE_ATTENTIONS][:index_exceeding_60]
            df_read_eight_signs.at[index, EXP_WALKING_SPEEDS] = row[STEP_WISE_WALKING_SPEEDS][:index_exceeding_60]
            df_read_eight_signs.at[index, EXP_READING_RATIOS] = row[STEP_WISE_READING_RATIOS][:index_exceeding_60]
        else:
            # If no value exceeds 60, the column remains an empty list for this row
            df_read_eight_signs.at[index, EXP_ATTENTION] = []
            df_read_eight_signs.at[index, EXP_WALKING_SPEEDS] = []
            df_read_eight_signs.at[index, EXP_READING_RATIOS] = []
    
    #####################################################################
    # Get indexes where the attention is on the sign, which are on the OHMD
    # Initialize the new columns with empty lists
    df_read_eight_signs[ATTENTION_ON_SIGN] = [[] for _ in range(len(df_read_eight_signs))]
    df_read_eight_signs[ATTENTION_ON_OHMD] = [[] for _ in range(len(df_read_eight_signs))]
    # Initialize the new column with empty lists
    df_read_eight_signs[READING_SPEED_RATIOS] = [[] for _ in range(len(df_read_eight_signs))]
    # Iterate over each row
    for index, row in df_read_eight_signs.iterrows():
        # Lists to store attention data
        attention_on_sign = []
        attention_on_ohmd = []
        reading_speed_ratios = []

        # Check each item in the 'EXP_ATTENTION' list
        for i, attention_item in enumerate(row[EXP_ATTENTION]):
            if attention_item != 'Sign':
                attention_on_ohmd.append(attention_item)
                # Append the corresponding item from 'EXP_READING_RATIOS'
                reading_speed_ratios.append(row[EXP_READING_RATIOS][i])
            else:
                attention_on_sign.append(attention_item)         

        # Assign the lists to the new columns
        df_read_eight_signs.at[index, ATTENTION_ON_SIGN] = attention_on_sign
        df_read_eight_signs.at[index, ATTENTION_ON_OHMD] = attention_on_ohmd
        df_read_eight_signs.at[index, READING_SPEED_RATIOS] = reading_speed_ratios
        
    #####################################################################
    df_read_eight_signs[WALKING_SPEED_RATIOS] = [[] for _ in range(len(df_read_eight_signs))]
    # Iterate over each row
    for index, row in df_read_eight_signs.iterrows():
        # Filter walking speeds greater than 0.1
        walking_speeds_above_threshold = [speed for speed in row[EXP_WALKING_SPEEDS] if speed > ZERO_DOT_ONE]
        # Assign the filtered speeds to the new column
        df_read_eight_signs.at[index, WALKING_SPEED_RATIOS] = walking_speeds_above_threshold
    
    #####################################################################
    df_read_eight_signs[ATTENTION_ALLOCATION] = 0
    df_read_eight_signs[WALKING_SPEED_RATIO] = 0
    df_read_eight_signs[READING_SPEED_RATIO] = 0
    
    # Iterate over each row to calculate the ratios
    for index, row in df_read_eight_signs.iterrows():
        # Calculate the attention allocation ratio
        if len(row[ATTENTION_ON_OHMD]) > 0:
            df_read_eight_signs.at[index, ATTENTION_ALLOCATION] = len(row[ATTENTION_ON_SIGN]) / (len(row[ATTENTION_ON_OHMD]) + len(row[ATTENTION_ON_SIGN]))
        else:
            df_read_eight_signs.at[index, ATTENTION_ALLOCATION] = float('inf')  # or some other value to indicate undefined

        # Calculate the walking speed ratio (average)
        if row[WALKING_SPEED_RATIOS]:
            df_read_eight_signs.at[index, WALKING_SPEED_RATIO] = sum(row[WALKING_SPEED_RATIOS]) / len(row[WALKING_SPEED_RATIOS])

        # Calculate the reading speed ratio (average)
        if row[READING_SPEED_RATIOS]:
            df_read_eight_signs.at[index, READING_SPEED_RATIO] = sum(row[READING_SPEED_RATIOS]) / len(row[READING_SPEED_RATIOS])
        
    #####################################################################
    # Write the DataFrame to a CSV file
    df_read_eight_signs.to_csv(data_dir+'processed_simulated_data.csv', index=False)

    

In [65]:
process_raw_simulated_data(df_raw_sim=df_raw_sim)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_read_eight_signs[EXP_ATTENTION] = [[] for _ in range(len(df_read_eight_signs))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_read_eight_signs[EXP_WALKING_SPEEDS] = [[] for _ in range(len(df_read_eight_signs))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_read_eight_signs[EXP_READING_RA

### Parameter Inference: Attention Allocation (s), Walking Speed Ratio (%), Reading Speed Ratio (%)

In [None]:
# Normalize a list
def normalize(lst):
    min_val = min(lst)
    max_val = max(lst)
    if min_val == max_val:  # Avoid division by zero
        return [0.5 for _ in lst]  # Return 0.5 (middle) if all values are same
    return [(x - min_val) / (max_val - min_val) for x in lst]

# Define the cost function
def compute_cost(sim_data, human_data):
    return sum(abs(sim - human) for sim, human in zip(sim_data, human_data))

# Find the best parameter
def find_best_params(df_sim, mean_train_aa, mean_train_wsr, mean_train_rsr):
#     # Filter by adding constraints
#     df_sim = df_sim[df_sim['xxxx'] == xxxx]
    
    unique_params = df_sim[['layout', 'steps', 'error']].drop_duplicates()
    min_cost = float('inf')
    best_params = None

    for index, row in unique_params.iterrows():
        filtered_df = df_sim[
            (df_sim[WEIGHTS] == row[WEGHTS]) &
            (df_sim[WALK_FACTOR] == row[WALK_FACTOR])
        ]
        
        # Get simulated data
        sim_aa = filtered_df[ATTENTION_ALLOCATION].values[0]
        sim_wsr = filtered_df[WALKING_SPEED_RATIOS].values[0]
        sim_rsr = filtered_df[READING_SPEED_RATIOS].values[0]
        
        # Get human data
        human_aa = 
        human_wsr
        human_rsr
        
        cost_aa = compute_cost(normalize(sim_aa), normalize(human_aa))
        cost_wsr = compute_cost(normalize(sim_wsr), normalize(human_wsr))
        cost_rsr = compute_cost(normalize(sim_rsr), normalize(human_rsr))
        
        total_cost = cost_aa + cost_wsr + cost_rsr

        if total_cost < min_cost:
            min_cost = total_cost
            best_params = row
    
    # Given the best parameters and the dataframe, return the simulated results
    sim_aa_best = []
    sim_wsr_best = []
    sim_rsr_best = []
    
    # Extract the simulated steps and error for each layout based on best_params
    filtered_df = df_sim[
        (df_sim[WEIGHTS] == best_params[WEIGHTS]) &
        (df_sim[WALK_FACTORS] == best_params[WALK_FACTORS])
    ]

    sim_aa_best.append(filtered_df[ATTENTION_ALLOCATION].values[0])
    sim_wsr_best.append(filtered_df[WALKING_SPEED_RATIO].values[0])
    sim_rsr_best.append(filtered_df[READING_SPEED_RATIO].values[0])

    return best_params, min_cost, sim_steps_best, sim_errors_best

In [None]:
%%time

num_iterations = 500
all_costs = []
all_best_params = []

# # Lists to store sim-to-real mapping ratios for each iteration
# sim_to_real_ratios_duration = []
# sim_to_real_ratios_error = []

# Initialize dictionaries for storing results
simulated_durations = {'L100': []}
simulated_errors = {'L100': []}
human_train_durations = {'L100': []}
human_train_errors = {'L100': []}
human_test_durations = {'L100': []}
human_test_errors = {'L100': []}
human_train_durations_details = {'L100': []}  # Collect detailed data in each iterations, not aggregated
human_train_errors_details = {'L100': []}
human_test_durations_details = {'L100': []}
human_test_errors_details = {'L100': []}

for i in range(num_iterations):
    # Number of participants
    num_participants = len(human_duration_data[0])

    # Generate list of indices based on participants
    indices = list(range(num_participants))

    # Split indices to ensure consistency
    train_indices, test_indices = train_test_split(indices, test_size=0.5, random_state=i)  
    # using i as the seed for reproducibility

    # Use these indices to split the human data consistently across participants
    def split_data_based_on_indices(data, train_indices, test_indices):
        train_set = [data[i] for i in train_indices]
        test_set = [data[i] for i in test_indices]
        return train_set, test_set

    # Split human duration data
    train_duration = [split_data_based_on_indices(condition, train_indices, test_indices) for condition in human_duration_data]
    train_duration = list(zip(*train_duration))[0]  # Extracting the training data

    test_duration = [split_data_based_on_indices(condition, train_indices, test_indices) for condition in human_duration_data]
    test_duration = list(zip(*test_duration))[1]  # Extracting the test data

    # Split human error data
    train_error = [split_data_based_on_indices(condition, train_indices, test_indices) for condition in human_error_data]
    train_error = list(zip(*train_error))[0]  # Extracting the training data

    test_error = [split_data_based_on_indices(condition, train_indices, test_indices) for condition in human_error_data]
    test_error = list(zip(*test_error))[1]  # Extracting the test data

    # Compute mean for the training and test sets
    mean_train_duration = [np.mean(data) for data in train_duration]
    mean_train_error = [np.mean(data) for data in train_error]
    mean_test_duration = [np.mean(data) for data in test_duration]
    mean_test_error = [np.mean(data) for data in test_error]

    # Using the function to find the best parameters
    best_params, _, sim_durations, sim_errors = find_best_params(df_simulations, mean_train_duration, mean_train_error)
    all_best_params.append(best_params)
    
    # Get sim to real mapping ratios for duration and error metrics respectively
    # Compute sim-to-real ratio for the current iteration
    sim_to_real_ratio_duration = sum(mean_train_duration) / sum(sim_durations)
    sim_to_real_ratio_error = sum(mean_train_error) / sum(sim_errors)
    
    # Store the computed ratios
    sim_to_real_ratios_duration.append(sim_to_real_ratio_duration)
    sim_to_real_ratios_error.append(sim_to_real_ratio_error)
    
    for label, sd, se, ted, tee, trd, tre in zip(
        ['L100'], 
        sim_durations, 
        sim_errors, 
        test_duration, 
        test_error, 
        train_duration, 
        train_error,
    ):
        simulated_durations[label].append(sd)
        simulated_errors[label].append(se)
        human_train_durations[label].append(np.mean(trd))
        human_train_errors[label].append(np.mean(tre))
        human_test_durations[label].append(np.mean(ted))
        human_test_errors[label].append(np.mean(tee))
        human_train_durations_details[label].append(trd)
        human_train_errors_details[label].append(tre)
        human_test_durations_details[label].append(ted)
        human_test_errors_details[label].append(tee)

### Reading Resumption Time Cost (s), Error Rate (%).
Done in the neighbor jupyter-notebook file.