In [None]:
import gymnasium as gym
from stable_baselines3 import SAC, TD3, A2C
import os
from gymnasium import RewardWrapper
import numpy as np

from gymnasium.wrappers import ClipAction

from langchain.schema.messages import HumanMessage, AIMessage
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI

# Basic Function Definitions

In [None]:
# Output locations: saved model checkpoints and TensorBoard logs.
model_dir = "models"
log_dir = "logs"
for _out_dir in (model_dir, log_dir):
    # exist_ok keeps the cell safe to re-run.
    os.makedirs(_out_dir, exist_ok=True)

def train(env, sb3_algo):
    """Train a Stable-Baselines3 agent on ``env``, checkpointing forever.

    Args:
        env: Environment passed to the algorithm constructor.
        sb3_algo: Algorithm name; one of ``'SAC'``, ``'TD3'`` or ``'A2C'``.

    Returns:
        None. An unknown ``sb3_algo`` prints a message and returns at once.
        Otherwise the loop below never ends by itself and must be interrupted
        by the user; a checkpoint is written to ``model_dir`` after every
        25000-timestep chunk.
    """
    # Guard clause: reject anything that is not a supported algorithm name.
    if sb3_algo not in ('SAC', 'TD3', 'A2C'):
        print('Algorithm not found')
        return

    algo_cls = SAC if sb3_algo == 'SAC' else TD3 if sb3_algo == 'TD3' else A2C
    model = algo_cls('MlpPolicy', env, verbose=0, device='cuda', tensorboard_log=log_dir)

    TIMESTEPS = 25000
    trained_timesteps = 0
    while True:
        # reset_num_timesteps=False keeps one continuous timestep counter
        # across the successive learn() calls.
        model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False)
        trained_timesteps += TIMESTEPS
        model.save(f"{model_dir}/{sb3_algo}_{trained_timesteps}")
def test(env, sb3_algo, path_to_model):
    if sb3_algo == 'SAC':
        model = SAC.load(path_to_model, env=env)
    elif sb3_algo == 'TD3':
        model = TD3.load(path_to_model, env=env)
    elif sb3_algo == 'A2C':
        model = A2C.load(path_to_model, env=env)
    else:
        print('Algorithm not found')
        return

    obs = env.reset()
    done = False
    extra_steps = 500
    while True:
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        
        # Print average reward on that step
        if done:
            extra_steps -= 1
            if extra_steps < 0:
                break


# Reward function testing

# Environment Setup

In [None]:
# Import from gymnasium, the package used everywhere else in this notebook.
# The legacy `gym` package is a separate distribution and importing from it
# rebound the RewardWrapper name already imported from gymnasium above.
from gymnasium import RewardWrapper
import matplotlib.pyplot as plt
env_name = 'Humanoid-v4'
# exclude_current_positions_from_observation=False keeps the torso x/y
# position in the observation.
# NOTE(review): Gymnasium documents the render mode as lowercase "human";
# with 'Human' presumably no viewer opens. Left unchanged because training
# with a live viewer is much slower - confirm which behaviour is wanted.
env = gym.make(env_name, render_mode='Human', exclude_current_positions_from_observation=False)

# Shared bookkeeping read and updated by the reward functions below.
step = 0
past_rewards = []

past_torso_positions = []
# 1.4 is the initial "best height" a new torso height must beat.
past_max_torso_position = [1.4]
def jump_reward(reward):
    """Add a torso-height bonus to the environment reward.

    The torso height is entry 2 of ``env.state_vector()``. The bonus is
    ``100 * z`` when ``z`` beats the best height so far, otherwise ``5 * z``
    for negative ``z`` and ``20 * z`` for non-negative ``z``.

    Args:
        reward: Reward produced by the environment for this step.

    Returns:
        ``reward`` plus the height bonus.

    Side effects:
        Appends to ``past_torso_positions``, ``past_rewards`` and (on a new
        best) ``past_max_torso_position``; increments ``step``; prints a
        summary whenever ``step`` is a multiple of 1000.
    """
    global step, past_rewards, past_max_torso_position, past_torso_positions

    # Only the height is needed, so the state vector is read once (the unused
    # x/y reads were dropped).
    z_torso = env.state_vector()[2]
    past_torso_positions.append(z_torso)

    if z_torso > past_max_torso_position[-1]:
        # New best height: record it and pay the large bonus.
        past_max_torso_position.append(z_torso)
        salto = z_torso * 100
    elif z_torso < 0:
        salto = z_torso * 5
    else:
        salto = z_torso * 20

    reward = reward + salto
    past_rewards.append(reward)

    # Every 1000 steps, print a short summary of the last 1000 steps.
    if step % 1000 == 0:
        print(f"Max torso position on this iter: {max(past_torso_positions[-1000:])}")
        print(f"Average reward: {np.mean(past_rewards[-1000:])}")
        print(f"Max torso position: {past_max_torso_position[-1]}\n")

    step += 1
    return reward


def jump_reward_2(reward):
    """Height-only reward that discourages holding still.

    The incoming ``reward`` is ignored; the returned value is built purely
    from the torso height (entry 2 of ``env.state_vector()``) and its recent
    history.

    Args:
        reward: Environment reward (unused).

    Returns:
        The shaped reward for this step.
    """
    global step,past_max_torso_position, past_rewards, past_torso_positions
    step += 1
    height = env.state_vector()[2]
    past_torso_positions.append(height)
    history_len = len(past_torso_positions)

    bonus = 0
    if height < 0:
        bonus += height*5
    elif history_len > 10 and abs(past_torso_positions[-10] - height) > 0.2:
        # Height only pays off if it changed noticeably versus 10 entries back.
        bonus += height*20

    if height > past_max_torso_position[-1]:
        bonus += height*100
        past_max_torso_position.append(height)

    # Flat penalty when the torso has not risen by 0.3 versus 100 entries back.
    if history_len > 100 and height - past_torso_positions[-100] < 0.3:
        bonus -= 20

    if step % 1000 == 0:
        print(f"Max torso position on this iter: {max(past_torso_positions[-1000:])}")
        print(f"Average reward: {np.mean(past_rewards[-1000:])}")
        print(f"Max torso position: {past_max_torso_position[-1]}\n")

    past_rewards.append(bonus)
    return bonus

def jump_reward_3(reward):
    """Add a bonus for new best torso heights and a penalty for low ones.

    Args:
        reward: Reward produced by the environment for this step.

    Returns:
        ``reward`` plus the bonus: ``500 * z`` when the torso height ``z`` is
        positive and above the best height so far, else ``-5`` when more than
        10 heights are recorded and ``z < 1.4``, else ``0``.

    Side effects:
        Updates the shared history lists and ``step``. Every 1000 steps it
        prints a summary and rebinds ``past_torso_positions`` to a new empty
        list.
    """
    global step, past_max_torso_position, past_rewards, past_torso_positions
    step += 1
    height = env.state_vector()[2]

    # Low-height penalty, applied only once some history exists.
    bonus = -5 if (len(past_torso_positions) > 10 and height < 1.4) else 0

    # Gain over the best height so far (0 when there is no record yet).
    if past_max_torso_position:
        gain = height - past_max_torso_position[-1]
    else:
        gain = 0
    if gain > 0 and height > 0:
        bonus = height * 500

    # Record the current height and, when it is one, the new best.
    past_torso_positions.append(height)
    if height > past_max_torso_position[-1]:
        past_max_torso_position.append(height)

    reward += bonus
    past_rewards.append(reward)

    # Periodic logging; the per-iteration height history restarts afterwards.
    if step % 1000 == 0:
        print(f"Max torso position on this iter: {max(past_torso_positions)}")
        past_torso_positions = []
        print(f"Average reward: {np.mean(past_rewards[-1000:])}")
        print(f"Max torso position: {past_max_torso_position[-1]}\n")

    return reward
     

# All bookkeeping for jump_reward_4 lives in this one dict.
memory = {
    "past_z_coordinates": [1.38],
    "max_z_coordinates": [1.38],
    "steps": 0,
    "past_rewards": []
}


def jump_reward_4(reward):
    """Reward upward torso motion while penalising drift and standing still.

    The incoming ``reward`` is discarded and the result is built from scratch
    using entries 0, 1 and 2 of ``env.state_vector()`` (torso x, y, z).

    Rewards are now stored in and averaged from ``memory["past_rewards"]``.
    Previously they went to the unrelated global ``past_rewards`` list, so the
    list printed from ``memory`` was always empty.

    Args:
        reward: Environment reward (unused).

    Returns:
        The shaped reward for this step.
    """
    reward = 0

    memory["steps"] += 1
    state = env.state_vector()
    x_torso, y_torso, z_torso = state[0], state[1], state[2]
    # Height change relative to the previously recorded step.
    diff = z_torso - memory["past_z_coordinates"][-1]

    # Penalise leaving a 0.1-wide band around the origin in x or y.
    if abs(x_torso) > 0.1 or abs(y_torso) > 0.1:
        reward = reward - 10
    # Penalise staying within 1 of the mean of the last 10 recorded heights.
    if abs(np.mean(memory["past_z_coordinates"][-10:]) - z_torso) < 1:
        reward = reward - 10

    if diff > 0:
        reward += diff*100

    if z_torso > memory["max_z_coordinates"][-1]:
        memory["max_z_coordinates"].append(z_torso)
        reward += diff*300

    memory["past_z_coordinates"].append(z_torso)

    # Single-line live status, overwritten on every step.
    print(f"x_torso: {x_torso}, y_torso: {y_torso}, z_torso: {z_torso}, diff: {diff}, reward: {reward}", end="\r")
    if memory["steps"] % 1000 == 0:
        print(f"Max torso position on this iter: {max(memory['past_z_coordinates'][-1000:])}")
        print(f"Average reward: {np.mean(memory['past_rewards'][-1000:])}")
        print(f"Max torso position: {memory['max_z_coordinates'][-1]}\n")
        print(memory["past_rewards"][-1000:])

    memory["past_rewards"].append(reward)
    return reward


# Initialize global variables to keep track of past data
# State shared with jump_reward_5; reset by the function every 1000 steps.
step = 0
past_rewards = []
max_height_reached = -float('inf')  # Use negative infinity as starting point

def jump_reward_5(reward):
    """Reward torso height and penalise the torso orientation components.

    The incoming ``reward`` is discarded. The result is
    ``20 * z`` (for ``z > 0``, else 0) minus ``5 * sum(|o|)`` over the four
    torso orientation entries of ``env.state_vector()``.

    The orientation entries are indices 3..6 of the state vector. The earlier
    slice ``[1:5]`` also took the y and z *position*, so height itself was
    being penalised.

    NOTE(review): the penalty sums the absolute value of all four orientation
    components, so an orientation with any non-zero component is penalised -
    confirm this is the intended "upright" measure.

    Args:
        reward: Environment reward (unused).

    Returns:
        The shaped reward for this step.
    """
    global step, past_rewards, max_height_reached

    observation = env.state_vector()
    z_torso = observation[2]  # z-coordinate of the torso
    torso_orientations = observation[3:7]  # four orientation components of the torso

    # Weights of the two reward terms.
    JUMP_REWARD_WEIGHT = 20
    ORIENTATION_PENALTY_WEIGHT = 5

    # Reward for height; never negative.
    jump_reward = z_torso * JUMP_REWARD_WEIGHT if z_torso > 0 else 0

    # Track the maximum height reached in the current 1000-step window.
    if z_torso > max_height_reached:
        max_height_reached = z_torso

    # Penalty proportional to the summed magnitude of the orientation entries.
    orientation_penalty = sum(abs(o) for o in torso_orientations) * ORIENTATION_PENALTY_WEIGHT

    reward = jump_reward - orientation_penalty

    past_rewards.append(reward)
    step += 1

    # Every 1000 steps: report, then reset all bookkeeping.
    if step % 1000 == 0:
        print(f"Max torso position on this iter: {max_height_reached}")
        print(f"Average reward: {np.mean(past_rewards[-1000:])}")
        step = 0
        past_rewards.clear()
        max_height_reached = -float('inf')

    return reward

# State shared with jump_reward_6.
max_height_reached = -float('inf')  # Use negative infinity as starting point
steps = 0
past_rewards = []
def jump_reward_6(_):
    """Sparse reward: +300 for a new best torso height, otherwise -1.

    Args:
        _: Environment reward (ignored).

    Returns:
        300 when entry 2 of ``env.state_vector()`` exceeds the best height
        seen so far, else -1.

    Side effects:
        Updates ``max_height_reached``, ``steps`` and ``past_rewards`` and
        prints a summary on every 1000th call (including the first).
    """
    global max_height_reached, steps, past_rewards
    z_torso = env.state_vector()[2]
    reward = -1
    if z_torso > max_height_reached:
        max_height_reached = z_torso
        reward = 300

    if steps % 1000 == 0:
        # On the very first call there is no history yet; averaging an empty
        # list would print nan and raise a NumPy RuntimeWarning.
        if past_rewards:
            print(f"Average reward: {np.mean(past_rewards[-1000:])}")
        print(f"Max torso position: {max_height_reached}\n")

    steps += 1
    past_rewards.append(reward)

    return reward

# State shared with walk_reward.
step = 0
past_rewards = []
past_max_x_position = [-9999]  # sentinel lower than any x position the torso takes


def walk_reward(reward):
    """Add the torso's x position to the environment reward.

    Args:
        reward: Reward produced by the environment for this step.

    Returns:
        ``reward`` plus entry 0 of ``env.state_vector()``.

    Side effects:
        Records new furthest x positions, appends to ``past_rewards``,
        increments ``step`` and prints the furthest x every 1000 steps.
    """
    global step, past_rewards, past_max_x_position
    torso_x = env.state_vector()[0]

    # Keep a running record of the furthest x position reached.
    is_new_record = torso_x > past_max_x_position[-1]
    if is_new_record:
        past_max_x_position.append(torso_x)

    if step%1000 == 0:
        recent_records = past_max_x_position[-1000:]
        print(f"Max x position on this iter: {max(recent_records)}")

    # The displacement term is simply the current x coordinate.
    reward += torso_x
    past_rewards.append(reward)
    step += 1
    return reward
    
# Wrap the environment with TransformReward, passing walk_reward as the
# reward-transforming function.
# NOTE(review): this rebinds `env`, so re-running the cell wraps the already
# wrapped environment again - restart the kernel (or rebuild `env`) first.
env = gym.wrappers.TransformReward(env, walk_reward)
env.reset()
# Emits an empty line (also keeps the reset() result from being the cell's
# last expression).
print()

# LLM

In [None]:

# SECURITY: the API key must never be hard-coded in the notebook. It is read
# from the OPENAI_API_KEY environment variable and, if that is unset, asked
# for interactively without echoing. Any key that was previously committed
# here should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
# Two chat models: a deterministic GPT-3.5 and a more exploratory GPT-4.
gpt3 = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
gpt4 = ChatOpenAI(temperature=0.7, model_name="gpt-4-1106-preview")


input_template = """{0}"""
# System prompt for the reward-writing LLM.
# The table's index column must match env.state_vector() exactly: 47 entries,
# indices 0..46. The earlier numbering skipped 9, shifting every later row by
# one. The example code also referenced an undefined name and wrapped the
# wrong function; both are corrected below.
system_template = """
Given an objective, you write the code for the reward function of a humanoid robot. Here is the observation space available to you:
    | Num | Observation                                                                                                     | Min  | Max | Name (in corresponding XML file) | Joint | Unit                       |
| --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- |
| 0   | x-coordinate of the torso (centre)                                                                              | -Inf | Inf | root                             | free  | position (m)               |
| 1   | y-coordinate of the torso (centre)                                                                              | -Inf | Inf | root                             | free  | position (m)               |
| 2   | z-coordinate of the torso (centre)                                                                              | -Inf | Inf | root                             | free  | position (m)               |
| 3   | x-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
| 4   | y-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
| 5   | z-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
| 6   | w-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
| 7   | z-angle of the abdomen (in lower_waist)                                                                         | -Inf | Inf | abdomen_z                        | hinge | angle (rad)                |
| 8   | y-angle of the abdomen (in lower_waist)                                                                         | -Inf | Inf | abdomen_y                        | hinge | angle (rad)                |
| 9   | x-angle of the abdomen (in pelvis)                                                                              | -Inf | Inf | abdomen_x                        | hinge | angle (rad)                |
| 10  | x-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_x                      | hinge | angle (rad)                |
| 11  | z-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_z                      | hinge | angle (rad)                |
| 12  | y-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_y                      | hinge | angle (rad)                |
| 13  | angle between right hip and the right shin (in right_knee)                                                      | -Inf | Inf | right_knee                       | hinge | angle (rad)                |
| 14  | x-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_x                       | hinge | angle (rad)                |
| 15  | z-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_z                       | hinge | angle (rad)                |
| 16  | y-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_y                       | hinge | angle (rad)                |
| 17  | angle between left hip and the left shin (in left_knee)                                                         | -Inf | Inf | left_knee                        | hinge | angle (rad)                |
| 18  | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm)                                | -Inf | Inf | right_shoulder1                  | hinge | angle (rad)                |
| 19  | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm)                                | -Inf | Inf | right_shoulder2                  | hinge | angle (rad)                |
| 20  | angle between right upper arm and right_lower_arm                                                               | -Inf | Inf | right_elbow                      | hinge | angle (rad)                |
| 21  | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm)                                  | -Inf | Inf | left_shoulder1                   | hinge | angle (rad)                |
| 22  | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm)                                  | -Inf | Inf | left_shoulder2                   | hinge | angle (rad)                |
| 23  | angle between left upper arm and left_lower_arm                                                                 | -Inf | Inf | left_elbow                       | hinge | angle (rad)                |
| 24  | x-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
| 25  | y-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
| 26  | z-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
| 27  | x-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
| 28  | y-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
| 29  | z-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
| 30  | z-coordinate of angular velocity of the abdomen (in lower_waist)                                                | -Inf | Inf | abdomen_z                        | hinge | anglular velocity (rad/s)  |
| 31  | y-coordinate of angular velocity of the abdomen (in lower_waist)                                                | -Inf | Inf | abdomen_y                        | hinge | anglular velocity (rad/s)  |
| 32  | x-coordinate of angular velocity of the abdomen (in pelvis)                                                     | -Inf | Inf | abdomen_x                        | hinge | aanglular velocity (rad/s) |
| 33  | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_x                      | hinge | anglular velocity (rad/s)  |
| 34  | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_z                      | hinge | anglular velocity (rad/s)  |
| 35  | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_y                      | hinge | anglular velocity (rad/s)  |
| 36  | angular velocity of the angle between right hip and the right shin (in right_knee)                              | -Inf | Inf | right_knee                       | hinge | anglular velocity (rad/s)  |
| 37  | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_x                       | hinge | anglular velocity (rad/s)  |
| 38  | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_z                       | hinge | anglular velocity (rad/s)  |
| 39  | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_y                       | hinge | anglular velocity (rad/s)  |
| 40  | angular velocity of the angle between left hip and the left shin (in left_knee)                                 | -Inf | Inf | left_knee                        | hinge | anglular velocity (rad/s)  |
| 41  | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1                  | hinge | anglular velocity (rad/s)  |
| 42  | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2                  | hinge | anglular velocity (rad/s)  |
| 43  | angular velocity of the angle between right upper arm and right_lower_arm                                       | -Inf | Inf | right_elbow                      | hinge | anglular velocity (rad/s)  |
| 44  | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm)   | -Inf | Inf | left_shoulder1                   | hinge | anglular velocity (rad/s)  |
| 45  | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm)   | -Inf | Inf | left_shoulder2                   | hinge | anglular velocity (rad/s)  |
| 46  | angular velocitty of the angle between left upper arm and left_lower_arm                                        | -Inf | Inf | left_elbow                       | hinge | anglular velocity (rad/s)  |

Example code:
```
step = 0
past_rewards = []

past_max_torso_position = [-9999]
def new_reward(reward):
    global step, past_rewards, past_max_torso_position
    #x_coor_torso = env.state_vector()[0]
    #y_coor_torso = env.state_vector()[1]
    z_coor_torso = env.state_vector()[2]
    
    if z_coor_torso < 0:
        salto = z_coor_torso*5
    else:
        salto = z_coor_torso*20 
        
    if z_coor_torso > past_max_torso_position[-1]:
        past_max_torso_position.append(z_coor_torso)

    reward=reward + salto
    past_rewards.append(reward) 
    step += 1
    return reward
    
env = gym.wrappers.TransformReward(env, new_reward)
env.reset()
```  

EVERY EPISODE IS 1000 STEPS LONG, SO YOU CAN STORE DATA FROM PREVIOUS STEPS FOR THE NEXT STEP
Keep the same function prototype which is `new_reward(reward)`. And to get the observations from the table use `env.state_vector()[i]` for the i'th observation of the table.

"""

# Conversation history for the LLM chain. It is bound to `chat_memory`
# rather than `memory` so it does not overwrite the `memory` dict that
# jump_reward_4 reads its bookkeeping from.
chat_memory = ConversationBufferMemory(
    llm=gpt4,
    memory_key="chat_history",
    return_messages=True,
)

# Prompt layout: fixed system instructions, then the running chat history,
# then the user's latest request.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    MessagesPlaceholder(variable_name="chat_history"),
    HumanMessagePromptTemplate.from_template("{input}"),
]
prompt = ChatPromptTemplate.from_messages(messages=messages)

chain = LLMChain(
    llm=gpt4,
    prompt=prompt,
    verbose=True,
    memory=chat_memory,
)


user_prompt = input("> ")
#user_prompt = "Give me a reward function for the humanoid to jump"
output = chain.run({"input":user_prompt})
# Extract the generated code. Prefer a ```python fence, but also accept a bare
# ``` fence (the system prompt's own example uses one); indexing the split
# result directly raised IndexError when the reply had no ```python fence.
_before, fence, after_fence = output.partition("```python")
if not fence:
    _before, fence, after_fence = output.partition("```")
if not fence:
    raise ValueError("LLM response did not contain a fenced code block:\n" + output)
code = after_fence.split("```")[0]
# SECURITY: exec() runs model-generated code with full access to this kernel.
# Review `code` before executing it outside a throwaway environment.
exec(code)

# Training

In [None]:
# Entry point: train a new model and/or evaluate a previously saved one.
if __name__ == '__main__':
    # Run configuration.
    gymenv_name = 'Humanoid-v4'
    sb3_algo = 'SAC'
    path_to_model = './SAC_2500000.zip'

    # Mode switches: the two modes are independent of each other.
    train_model, test_model = True, False

    if train_model:
        train(env, sb3_algo)

    if test_model:
        # Only evaluate when the saved model file is actually present.
        if not os.path.isfile(path_to_model):
            print(f'{path_to_model} not found.')
        else:
            test(env, sb3_algo, path_to_model=path_to_model)



# Give feedback to the agent

In [None]:
# The chain was built with a conversation memory, so this follow-up prompt is
# sent together with the earlier exchange.
# NOTE(review): the reply is only stored in `output`; unlike the first LLM
# cell, no code is extracted from it or executed here - confirm whether that
# step is meant to be repeated.
user_prompt = input("> ")
output = chain.run({"input":user_prompt})