# Human Feedback to Rewards

In [70]:
# Configure the OpenAI client with the key imported from the `config` module
# (the key itself is not written in this notebook).
from config import SECRET_KEY
import openai
openai.api_key = SECRET_KEY

In [71]:
import json
import os
import pandas as pd
import numpy as np
import torch

In [72]:
# Set notebook up to load files from Science repo:
# the parent directory of the working directory is appended to sys.path,
# guarded so that re-running this cell does not add it twice.
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Ensure that we re-load changes automagically
# (autoreload mode 2: imported modules are re-imported before each cell runs).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [73]:
from mujoco_to_GPT import get_landmarks, coor_to_letter, fill_missing_entries, get_human_feedback, map_coordinates_to_algebraic
from config import COOR_MAX, NUM_SEGMENTS, SYSTEM_PROMPT, FUNCTION_STRUCTURE

### Initializations 

In [74]:
# Model name and sampling temperature passed to the chat-completion request.
gpt_model = 'gpt-4-1106-preview'
temperature = 0.5 # Degree of randomness of GPT's output

In [75]:
# Landmark colour -> position label. The get_landmarks() call is disabled and
# a fixed mapping is used instead.
# NOTE(review): presumably these values were captured from an earlier
# get_landmarks() run and use the same labelling as map_coordinates_to_algebraic — confirm.
#landmarks = get_landmarks()
landmarks = {'yellow': 'c15', 'blue': 's17', 'white': 't3', 'orange': 'd2'}

In [147]:
# Directory and base name from which the trajectory, feedback and reward
# file names in this notebook are built.
video_directory = 'videos'
video_name = 'experiment_step5'

In [249]:
# Load the saved trajectory data and the companion "infoGPT" data for this video.
# NOTE(review): torch.load is pickle-based and can execute code embedded in the
# file; only load .pth files that come from a trusted source.
data_trajectories = torch.load(os.path.join(video_directory,f"{video_name}_data.pth"))
info_trajectories =  torch.load(os.path.join(video_directory,f"{video_name}_data_infoGPT.pth"))

In [250]:
# Feedback name derived from the video name, wrapped in a single-element list.
# NOTE(review): presumably get_human_feedback expects a list of names — confirm.
feedback_name = ['HumanFeedback_' + video_name]

In [251]:
# Fetch the human feedback; the result is later indexed by experiment number.
human_feedback = get_human_feedback(video_directory, feedback_name)

In [252]:
# Sanity check: one feedback entry per info trajectory, since both are indexed
# by the same experiment number when the rewards are requested.
# NOTE(review): data_trajectories is indexed the same way but its length is not checked here.
assert len(info_trajectories) == len(human_feedback)

### Ask ChatGPT for Rewards

In [253]:
# For every experiment, send the human feedback together with the trajectory
# information to the GPT model (forcing a call to the `getReward` function) and
# collect the reward entries built from its answer.
total_exp_num = len(human_feedback)
reward_data = []
for exp_num in range(total_exp_num):
    print("Experiment ", exp_num, "/", total_exp_num)
    exp_angle = info_trajectories[exp_num][1]
    exp_angular_velocity = info_trajectories[exp_num][2]
    # Original note: "Shift two because length is 13!"
    # NOTE(review): presumably elements 6:8 of the first step hold the target's
    # (x, y) position in a 13-element observation — confirm.
    exp_target_location = data_trajectories[exp_num][0][6:8].tolist()
    target_pos = map_coordinates_to_algebraic(exp_target_location)
    # Build a per-experiment copy with the target added as 'red' instead of
    # mutating the shared `landmarks` dict defined in an earlier cell; the
    # rendered prompt text is the same.
    exp_landmarks = {**landmarks, 'red': target_pos}
    user_prompt = f'''
          'feedback': '{human_feedback[exp_num]}',
          'landmarks': {exp_landmarks},
          'fingertip_position': {info_trajectories[exp_num][0]},
          'angle': {exp_angle},
          'angular_speed': {exp_angular_velocity}
        '''
    gptResponse = openai.chat.completions.create(
        model=gpt_model,
        temperature=temperature,
        messages=[{'role': 'system', 'content': SYSTEM_PROMPT},
                  {"role": "user", "content": user_prompt}],
        functions=[FUNCTION_STRUCTURE],
        function_call={"name": "getReward"})

    # The function-call arguments arrive as a JSON string.
    json_data = gptResponse.choices[0].message.function_call.arguments
    answer_data = json.loads(json_data)
    # Extend in place rather than rebuilding the whole list on every iteration.
    reward_data.extend(
        fill_missing_entries(answer_data['referred_steps'], exp_angle,
                             exp_angular_velocity, exp_target_location))

Experiment  0 / 2
Experiment  1 / 2


In [255]:
# Write the collected reward entries as JSON into the video directory,
# overwriting any existing rewards file for this video.
filepath = os.path.join(
    video_directory,
    f"{video_name}_data_rewards.json",
)
with open(filepath, mode='w') as json_file:
    json.dump(reward_data, fp=json_file)

In [256]:
# Display the collected reward entries for inspection.
reward_data

[{'fingertip_position': 'w9',
  'angle': [-12.5, -18.9],
  'angular_speed': [-7.8, -9.07],
  'reward': -1,
  'angular_velocity': [-89.13, -0.03],
  'target_location': [-0.07187984883785248, 0.047796741127967834]},
 {'fingertip_position': 'v6',
  'angle': [-21.5, -28.8],
  'angular_speed': [-7.89, -8.24],
  'reward': -1,
  'angular_velocity': [-66.52, 1.35],
  'target_location': [-0.07187984883785248, 0.047796741127967834]},
 {'fingertip_position': 't5',
  'angle': [-31.4, -35.9],
  'angular_speed': [-9.33, -4.11],
  'reward': -1,
  'angular_velocity': [-77.37, 0.05],
  'target_location': [-0.07187984883785248, 0.047796741127967834]},
 {'fingertip_position': 'q4',
  'angle': [-44.2, -38.7],
  'angular_speed': [-13.11, -0.81],
  'reward': -1,
  'angular_velocity': [-37.62, -7.08],
  'target_location': [-0.07187984883785248, 0.047796741127967834]},
 {'fingertip_position': 'n3',
  'angle': [-61.4, -40.7],
  'angular_speed': [-16.8, -2.74],
  'reward': -1,
  'angular_velocity': [-43.99, -7.