# Experiment Analysis

## Imports

In [None]:
import os
import glob
import pandas as pd
import re
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Set notebook up to load files from Science repo
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Ensure that we re-load changes automagically
%load_ext autoreload
%autoreload 2

In [None]:
from science.agents import GridEnvironment, QLearningAgent_Bernoulli_greedy, QLearningAgent_Bernoulli_PbRL

In [None]:
# Number of feedback iterations replayed per experiment (loop bound of the PbRL replay below)
num_iters_per_experiment = 4
# Grid dimensions and step count handed to GridEnvironment in the PbRL replay below
grid_height = 5
grid_width = 10
num_steps = 10

## Load Data

In [None]:
# Directory that holds the pickled human-experiment sessions
base_dir = 'human_experiments'

# Gather every November and December session file
pkl_files = glob.glob(os.path.join(base_dir, 'nov*_*_*.pkl')) + glob.glob(os.path.join(base_dir, 'dec*_*_*.pkl'))

# Filename layout (matched against the lowercased name):
#   <month><day>_postirb<participant>_<experiment type>_<order>_exp<experiment number>.pkl
pattern = r'(nov|dec)(\d+)_postirb(\d+)_([a-z]+)_(\d+)_exp(\d+)\.pkl'

# One tuple per loaded file: metadata fields, raw data, per-trajectory deviations, final score
experiments_data = []

# NOTE(review): calculate_deviation is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to be run before this one or the loop raises NameError.
for file_path in pkl_files:
    # Extract the metadata from the filename using regex
    filename = os.path.basename(file_path)
    match = re.match(pattern, filename.lower())
    if match:
        month, day, participant, experiment_type, order, experiment_number = match.groups()
        # NOTE(review): unpickling executes arbitrary code; only load trusted files.
        data = pd.read_pickle(file_path)
        participant = int(participant)
        # `order` ends up in the 'AgentType' column. Cast it like the other numeric
        # fields so the integer column labels used by the summary table's rename match.
        order = int(order)
        experiment_number = int(experiment_number)
        # The first road entry is dropped before scoring
        # NOTE(review): presumably it is the start cell; confirm against the recorder.
        road = data['road'][1:]
        if experiment_type == 'rlhtf':
            # Score every recorded trajectory against the road
            deviations = [calculate_deviation(trajectory, road) for trajectory in data['trajectory']]
        else:
            # Other experiment types keep the rewards stored in the file
            deviations = data['reward']
        final_score = calculate_deviation(data['learned_trajectory'], road)

        # Append a tuple of the metadata and data to the experiments_data list
        experiments_data.append((month, day, participant, experiment_type, order, experiment_number, data, deviations, final_score))
    else:
        print(f"Filename {filename} did not match the pattern and was skipped.")
# Convert the list to a DataFrame (the filename's `order` field is stored as 'AgentType')
experiments_df = pd.DataFrame(experiments_data, columns=['Month', 'Day', 'participant', 'ExperimentType', 'AgentType', 'ExperimentNumber', 'Data', 'Deviation', 'Score'])

In [None]:
# Keep only complete sessions: one recorded trajectory entry per feedback iteration.
# Uses the shared constant so this filter and the replay loop cannot drift apart.
filtered_df = experiments_df[
    experiments_df['Data'].apply(lambda data: len(data['trajectory']) == num_iters_per_experiment)
]

## Compute Experiment Scores

In [None]:
def calculate_deviation(trajectory, road, max_score=10):
    """Score a trajectory by how many of its steps lie on the road.

    Args:
        trajectory: Iterable of steps (array-likes compared with ``np.array_equal``).
        road: Iterable of road cells in the same representation as the steps.
        max_score: Score of a trajectory with no off-road step. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        ``max_score`` minus the number of steps that match no road cell.
    """
    off_road_steps = sum(
        1
        for step in trajectory
        if not any(np.array_equal(step, road_step) for road_step in road)
    )
    return max_score - off_road_steps

In [None]:
def calculate_computed_scores(row):
    """Return the deviation scores for one experiment row.

    Args:
        row: A DataFrame row whose 'Data' entry is a mapping with the keys
            'road', 'trajectory' and 'learned_trajectory'.

    Returns:
        A list with one score per entry of ``data['trajectory']`` followed by
        the score of ``data['learned_trajectory']``.
    """
    # Read the payload from the row itself instead of re-looking it up in the
    # global experiments_df by label, so the function works on any frame.
    data_entry = row['Data']

    # NOTE(review): the full road is used here, whereas the loading and PbRL replay
    # cells score against road[1:]; confirm which one is intended.
    road = data_entry['road']
    trajectories = data_entry['trajectory']
    learned_trajectory = data_entry['learned_trajectory']

    # For 'pbrl' rows each trajectory entry is a pair of trajectories; those rows are
    # overwritten later in the notebook with scores from the replayed agent.
    scores = [calculate_deviation(traj, road) for traj in trajectories]
    scores.append(calculate_deviation(learned_trajectory, road))
    return scores

In [None]:
# Attach the per-row score lists on a fresh copy so the filtered view is never
# written to in place (avoids pandas' SettingWithCopyWarning)
filtered_df = filtered_df.assign(
    ComputedScore=filtered_df.apply(calculate_computed_scores, axis=1)
)

### Deviations for PbRL

In [None]:
# PbRL sessions whose scores still have to be recomputed by replaying the agent
to_complete = filtered_df.loc[filtered_df['ExperimentType'] == 'pbrl']

In [None]:
# Replay every PbRL session offline: start from a fresh agent, apply the recorded human
# preferences one iteration at a time, and score the agent's optimal trajectory after each.
# Column 0 is the score before any feedback, column k the score after the k-th iteration.
# The width is derived from the shared constant so it always matches the inner loop.
deviation_PbRL = np.zeros([len(to_complete), num_iters_per_experiment + 1])
for ee in range(len(to_complete)):
    session = to_complete['Data'].iloc[ee]  # hoisted: read once per session
    road = session['road'][1:]
    env = GridEnvironment(grid_width, grid_height, num_steps, [0,0])
    agent_Bernoulli = QLearningAgent_Bernoulli_PbRL(env, alpha_init = 0.5, beta_init = 0.5)
    trajectory = agent_Bernoulli.get_optimal_trajectory()
    deviation_PbRL[ee, 0] = calculate_deviation(trajectory, road)
    for ii in range(num_iters_per_experiment):
        # Each recorded entry holds the pair of trajectories shown to the participant
        traj1 = session['trajectory'][ii][0]
        traj2 = session['trajectory'][ii][1]
        human_feedback = session['human_feedback'][ii]

        # Update alpha and beta based on feedback, only where the two trajectories
        # are in different states at the same step
        for state1, state2 in zip(traj1, traj2):
            idx1 = agent_Bernoulli.state_to_index(state1)
            idx2 = agent_Bernoulli.state_to_index(state2)

            if idx1 != idx2:
                if human_feedback == 0: # Trajectory 1 is preferred
                    agent_Bernoulli.alpha[idx1] += agent_Bernoulli.scale
                    agent_Bernoulli.beta[idx2] += agent_Bernoulli.scale
                else: # Trajectory 2 is preferred
                    agent_Bernoulli.beta[idx1] += agent_Bernoulli.scale
                    agent_Bernoulli.alpha[idx2] += agent_Bernoulli.scale
        trajectory = agent_Bernoulli.get_optimal_trajectory()
        deviation_PbRL[ee, ii+1] = calculate_deviation(trajectory, road)

In [None]:
# Row labels of the PbRL sessions, in the same order as the rows of deviation_PbRL
pbrl_indices = filtered_df.loc[filtered_df['ExperimentType'] == 'pbrl'].index
# Replace the placeholder scores of those rows with the replayed agent's scores
for position, row_label in enumerate(pbrl_indices):
    replayed_scores = deviation_PbRL[position].tolist()
    filtered_df.at[row_label, 'ComputedScore'] = replayed_scores

## Experiment Count

In [None]:
# Count experiments per (ExperimentType, AgentType) pair
summary_table = (
    filtered_df
    .groupby(['ExperimentType', 'AgentType'])
    .size()  # Count occurrences
    .unstack(fill_value=0)  # One column per 'AgentType', missing combinations become 0
    # Build the label from the value itself: a fixed {1: ..., 2: ...} mapping silently
    # does nothing when the AgentType labels are strings such as '1' and '2'.
    .rename(columns=lambda agent_type: f'Count_AgentType_{agent_type}')
)

# Add total counts for each 'ExperimentType'
summary_table['Total'] = summary_table.sum(axis=1)

# Reset the index to turn it into a standard table
summary_table = summary_table.reset_index()

print(summary_table)

## Experiment Results

In [None]:
# Per-experiment score lists, split by experiment type
score_sentiment = filtered_df.loc[filtered_df['ExperimentType'] == 'sentiment', 'ComputedScore']
score_pbrl = filtered_df.loc[filtered_df['ExperimentType'] == 'pbrl', 'ComputedScore']
score_rlhtfnoinstruct = filtered_df.loc[filtered_df['ExperimentType'] == 'rlhtfnoinstruct', 'ComputedScore']
score_rlhtf = filtered_df.loc[filtered_df['ExperimentType'] == 'rlhtf', 'ComputedScore']

In [None]:
def _mean_and_ste(score_lists):
    """Return per-column mean and standard error of a Series of equal-length score lists.

    Args:
        score_lists: pandas Series whose values are lists of scores (one list per experiment).

    Returns:
        Tuple ``(mean, ste)`` of plain Python lists with one entry per list position.
        The standard error is the column standard deviation (pandas default, ddof=1)
        divided by the square root of the number of experiments.
    """
    per_step = pd.DataFrame(score_lists.values.tolist())
    mean = per_step.mean(axis=0).tolist()
    ste = (per_step.std(axis=0) / np.sqrt(len(per_step))).tolist()
    return mean, ste


# Same computation for every experiment type; previously four copy-pasted stanzas
score_sentiment_mean, score_sentiment_ste = _mean_and_ste(score_sentiment)
score_pbrl_mean, score_pbrl_ste = _mean_and_ste(score_pbrl)
score_rlhtfnoinstruct_mean, score_rlhtfnoinstruct_ste = _mean_and_ste(score_rlhtfnoinstruct)
score_rlhtf_mean, score_rlhtf_ste = _mean_and_ste(score_rlhtf)

In [None]:
# `pickle` is otherwise first imported in a later cell, so on a fresh kernel this cell
# used to fail with NameError; import it where it is first needed.
import pickle

# Rewards of the simulated "true feedback" baselines (trajectory-level and state-level).
# NOTE(review): the files are named *_greedy while the variables say UCB; confirm which
# agent produced them. Unpickling executes arbitrary code: only load trusted files.
with open('human_experiments/grid_true_trajectory_greedy.pickle', 'rb') as f:
    reward_true_UCB_trajectory = pickle.load(f)
with open('human_experiments/grid_true_state_greedy.pickle', 'rb') as f:
    reward_true_UCB_state = pickle.load(f)

In [None]:
# Per-column mean score of each experiment type; key order drives the bar order in the plot
means = dict(
    score_sentiment=score_sentiment_mean,
    score_pbrl=score_pbrl_mean,
    score_rlhtfnoinstruct=score_rlhtfnoinstruct_mean,
    score_rlhtf=score_rlhtf_mean,
)

# Matching standard errors, keyed exactly like `means`
stes = dict(
    score_sentiment=score_sentiment_ste,
    score_pbrl=score_pbrl_ste,
    score_rlhtfnoinstruct=score_rlhtfnoinstruct_ste,
    score_rlhtf=score_rlhtf_ste,
)

In [None]:
from scipy.stats import ttest_ind

# Human-feedback conditions: one row per experiment, one column per score position.
# Error is 10 - score; the last column is the final score of each experiment.
df_rlhtf = pd.DataFrame(score_rlhtf.values.tolist())
error_rlhtf = 10 - df_rlhtf
last_step_error_rlhtf = error_rlhtf.iloc[:, -1]

df_pbrl = pd.DataFrame(score_pbrl.values.tolist())
error_pbrl = 10 - df_pbrl
last_step_error_pbrl = error_pbrl.iloc[:, -1]

df_rlhtfnoinstruct = pd.DataFrame(score_rlhtfnoinstruct.values.tolist())
error_rlhtfnoinstruct = 10 - df_rlhtfnoinstruct
last_step_error_rlhtfnoinstruct = error_rlhtfnoinstruct.iloc[:, -1]

df_sentiment = pd.DataFrame(score_sentiment.values.tolist())
error_sentiment = 10 - df_sentiment
last_step_error_sentiment = error_sentiment.iloc[:, -1]

# Simulated baselines: error is the negated stored reward
# NOTE(review): assumes the pickled rewards are laid out as runs x steps; confirm.
df_true_state = pd.DataFrame(reward_true_UCB_state)
error_true_state = - df_true_state
last_step_error_true_state = error_true_state.iloc[:, -1]

df_true_trajectory = pd.DataFrame(reward_true_UCB_trajectory)
error_true_trajectory = - df_true_trajectory
last_step_error_true_trajectory = error_true_trajectory.iloc[:, -1]

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF vs. true state-level feedback
t_stat, p_value = ttest_ind(
    last_step_error_rlhtf,
    last_step_error_true_state,
    equal_var=False,
)
print(f"RLHTF & true state: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF vs. true trajectory-level feedback
t_stat, p_value = ttest_ind(
    last_step_error_rlhtf,
    last_step_error_true_trajectory,
    equal_var=False,
)
print(f"RLHTF & true traj: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF without instructions vs. true state-level feedback
t_stat, p_value = ttest_ind(
    last_step_error_rlhtfnoinstruct,
    last_step_error_true_state,
    equal_var=False,
)
print(f"RLHTFnoinstruct & true state: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF without instructions vs. true trajectory-level feedback
t_stat, p_value = ttest_ind(
    last_step_error_rlhtfnoinstruct,
    last_step_error_true_trajectory,
    equal_var=False,
)
print(f"RLHTF noinstruct& true traj: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: PbRL vs. true state-level feedback
t_stat, p_value = ttest_ind(
    last_step_error_pbrl,
    last_step_error_true_state,
    equal_var=False,
)
print(f"pbrl & true state: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: sentiment vs. true trajectory-level feedback
t_stat, p_value = ttest_ind(
    last_step_error_sentiment,
    last_step_error_true_trajectory,
    equal_var=False,
)
print(f"sentiment & true traj: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: sentiment vs. true state-level feedback
t_stat, p_value = ttest_ind(
    last_step_error_sentiment,
    last_step_error_true_state,
    equal_var=False,
)
print(f"sentiment & true state: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: the two simulated baselines against each other
t_stat, p_value = ttest_ind(
    last_step_error_true_trajectory,
    last_step_error_true_state,
    equal_var=False,
)
print(f"true trajectory & true state: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: PbRL vs. true trajectory-level feedback
t_stat, p_value = ttest_ind(
    last_step_error_pbrl,
    last_step_error_true_trajectory,
    equal_var=False,
)
print(f"pbrl & true traj: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF vs. PbRL
t_stat, p_value = ttest_ind(
    last_step_error_rlhtf,
    last_step_error_pbrl,
    equal_var=False,
)
print(f"RLHTF & PbRL: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: sentiment vs. PbRL
t_stat, p_value = ttest_ind(
    last_step_error_sentiment,
    last_step_error_pbrl,
    equal_var=False,
)
print(f"sentiment & PbRL: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF without instructions vs. PbRL
t_stat, p_value = ttest_ind(
    last_step_error_rlhtfnoinstruct,
    last_step_error_pbrl,
    equal_var=False,
)
print(f"RLHTF No Instructions & PbRL: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF without instructions vs. sentiment
t_stat, p_value = ttest_ind(
    last_step_error_rlhtfnoinstruct,
    last_step_error_sentiment,
    equal_var=False,
)
print(f"RLHTF No Instructions & sentiment: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF vs. sentiment
t_stat, p_value = ttest_ind(
    last_step_error_rlhtf,
    last_step_error_sentiment,
    equal_var=False,
)
print(f"RLHTF & sentiment: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Welch's t-test (unequal variances) on final-column error: RLHTF with vs. without instructions
t_stat, p_value = ttest_ind(
    last_step_error_rlhtf,
    last_step_error_rlhtfnoinstruct,
    equal_var=False,
)
print(f"RLHTF & RLHTF No Instructions: T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
import pickle

# Read the precomputed summary statistics of the two simulated baselines
with open('true_trajectory_vs_state.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# Unpack mean and standard error for the state-level baseline ...
deviation_mean_true_state, deviation_ste_true_state = (
    loaded_data['deviation_mean_true_state'],
    loaded_data['deviation_ste_true_state'],
)
# ... and for the trajectory-level baseline
deviation_mean_true_traj, deviation_ste_true_traj = (
    loaded_data['deviation_mean_true_traj'],
    loaded_data['deviation_ste_true_traj'],
)

In [None]:
# Grouped bar chart: mean error with standard-error bars for the two simulated
# baselines and the four human-feedback conditions, one group per score column.
x = np.arange(5)  # 5 score columns per experiment

fig, ax = plt.subplots(figsize=(10, 6))

# Width of the bars
bar_width = 0.15

# Offsets that centre the six bars of each group on its tick
positions = {
    'true_traj': x - 2.5 * bar_width,
    'true_state': x - 1.5 * bar_width,
    'score_sentiment': x - 0.5 * bar_width,
    'score_pbrl': x + 0.5 * bar_width,
    'score_rlhtfnoinstruct': x + 1.5 * bar_width,
    'score_rlhtf': x + 2.5 * bar_width
}

# The baseline means are negated before plotting
# NOTE(review): presumably they are stored as negative deviations; confirm against the file.
ax.bar(positions['true_traj'], -deviation_mean_true_traj,
       bar_width,
       label='True trajectory level',
       color='#d62728',
       alpha=0.8,
       yerr=deviation_ste_true_traj,
       capsize=5)

ax.bar(positions['true_state'], -deviation_mean_true_state,
       bar_width,
       label='True state level',
       color='#1f77b4',
       alpha=0.8,
       yerr=deviation_ste_true_state,
       capsize=5)

# Legend labels and colours, in the same order as the keys of `means`
legend_labels = ['Sentiment', 'PbRL', 'RLHTF no instructions (ours)', 'RLHTF with instructions (ours)']
colors=['#9467bd', '#2ca02c', '#7f7f7f', '#ff7f0e']

# Human-feedback conditions: convert mean score to mean error (10 - score)
for score_type, legend_label, color in zip(means, legend_labels, colors):
    adjusted_means = [10 - value for value in means[score_type]]
    ax.bar(positions[score_type], adjusted_means, bar_width, label=legend_label,
           color=color, alpha=0.8, yerr=stes[score_type], capsize=5)

# Axis labels, ticks and legend
ax.set_xlabel('Interaction', fontsize=16)
# The label previously contained the mojibake 'Â±' (UTF-8 decoded as Latin-1)
ax.set_ylabel('Average Error ± Standard Error', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels([f'{i}' for i in x], fontsize=14)
ax.legend(loc='lower left', fontsize=16)
ax.tick_params(axis='y', labelsize=12)

# Show grid for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Fit the layout only after every element that affects it has been added
plt.tight_layout()
plt.show()