# Bipedal Walker - PPO

In [24]:
# Importing relevant functions

import gym 
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
import matplotlib.pyplot as plt
import numpy as np
import torch as th

In [25]:
def moving_average(values, window):
    """
    Applies moving average based on a time window

    :param values: 1-D sequence of numbers to smooth
    :param window: number of consecutive samples averaged per output point
    :return: array of length ``len(values) - window + 1`` holding the mean of
        every full window; empty when ``values`` is shorter than ``window``
    :raises ValueError: if ``window`` is smaller than 1
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    values = np.asarray(values, dtype=float)

    # np.convolve silently swaps its operands when the kernel is longer than
    # the signal, which would return meaningless values of the wrong length.
    # With no complete window available there is simply nothing to average.
    if values.size < window:
        return np.empty(0, dtype=float)

    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')


def _plot_curve(x, y, title):
    """
    Draws a single reward-vs-timesteps curve on the figure labelled `title`
    """
    plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title)
    plt.show()


def plot_results(log_folder, title='Learning Curve', window=50):
    """
    Plots results of training model

    Two plots are shown: the raw episode rewards, followed by the same
    rewards smoothed with a moving average.

    :param log_folder: folder containing the Monitor log(s) to load
    :param title: figure label and plot title used for both plots
    :param window: number of episodes averaged per smoothed point
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')

    # Plotting results before applying moving average
    _plot_curve(x, y, title)

    # With fewer logged episodes than the window there is no complete window
    # to average; smoothing would yield x/y of different lengths and make
    # plt.plot raise, so only the raw curve is shown in that case.
    if len(y) < window:
        return

    # Applying moving average function
    y = moving_average(y, window=window)

    # Truncate x so it lines up with the (shorter) smoothed y
    x = x[len(x) - len(y):]

    # Plotting results with moving average
    _plot_curve(x, y, title)

In [None]:
# Initializing empty lists that collect the results of every trained model
x_timestep_total = []
y_timestep_total = []
x_episode_total = []
y_episode_total = []
mean_reward_total = []
std_reward_total = []
count = 0
environment_name = "BipedalWalker-v3" # specifying environment name

# Specifying parameters for grid search
learning_rate = [0.00005, 0.0005, 0.005]
gamma = [0.99, 0.95]
batch_size = [32, 64, 128]

# Neural network
policy_kwargs = dict(activation_fn=th.nn.Tanh,
                     net_arch=[dict(pi=[100], vf=[75])])


# Looping over parameter values and training the models
# (i = learning rate, j = gamma, k = batch size)
for i in learning_rate:
    for j in gamma:
        for k in batch_size:

            print('Model',count)

            # Specifying log file environment; the directory is created up
            # front so the Monitor can always write its log there
            log_dir = "logs/model" + str(count) + "/"
            os.makedirs(log_dir, exist_ok=True)
            env = Monitor(gym.make(environment_name), log_dir)

            # Separate evaluation environment. Its Monitor has no log file,
            # so evaluation episodes are NOT appended to the training log in
            # log_dir and the learning curves contain training episodes only.
            eval_env = Monitor(gym.make(environment_name))

            # Applying parameters to PPO model
            model = PPO("MlpPolicy", env, verbose=0,
                        learning_rate = i,
                        gamma = j,
                        batch_size = k,
                        policy_kwargs = policy_kwargs,
                        seed = 42)

            # Training for 400000 timesteps
            model.learn(total_timesteps=400000)

            # Evaluating results on 1000 episodes
            mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=1000, render=False)

            # Extracting values for plotting (the log is loaded once and
            # indexed both by timesteps and by episodes)
            results = load_results(log_dir)
            x_timestep, y_timestep = ts2xy(results, 'timesteps')
            x_episode, y_episode = ts2xy(results, 'episodes')

            plot_results(log_dir)

            # Printing mean and std reward for each model
            print('Mean reward', mean_reward)
            print('Std dev. reward', std_reward)

            # Appending to array
            x_timestep_total.append(x_timestep)
            y_timestep_total.append(y_timestep)
            x_episode_total.append(x_episode)
            y_episode_total.append(y_episode)

            mean_reward_total.append(mean_reward)
            std_reward_total.append(std_reward)

            print('----------------')

            count +=1 
            eval_env.close()
            env.close()

Model 0


In [None]:
import numpy as np
from sklearn.preprocessing import normalize
from sklearn import preprocessing

# Converting the arrays to numpy arrays for processing.
# Every model completes a different number of episodes, so the per-model
# arrays generally differ in length. dtype=object keeps them as a collection
# of independent arrays; without it NumPy >= 1.24 raises ValueError when asked
# to build an array from sequences of unequal length.
x_timestep_total = np.array(x_timestep_total, dtype=object)
y_timestep_total = np.array(y_timestep_total, dtype=object)
x_episode_total = np.array(x_episode_total, dtype=object)
y_episode_total = np.array(y_episode_total, dtype=object)

# Applying scaler so all models are plotted on the same scale
min_max_scaler = preprocessing.MinMaxScaler()

# Training budget the x-axis is rescaled to; keep in sync with the value
# passed to model.learn(total_timesteps=...)
total_timesteps = 400000

# Plotting results
plt.figure(figsize=(8, 4))
plt.title('Reward vs Timestep')
for i in range(len(x_timestep_total)):
    # Scale this model's timesteps to [0, 1], then stretch to the budget
    x_timestep_total_norm = min_max_scaler.fit_transform(
        np.asarray(x_timestep_total[i], dtype=float).reshape(-1,1))
    x_timestep_total_norm = x_timestep_total_norm * total_timesteps
    plt.plot(x_timestep_total_norm, np.asarray(y_timestep_total[i], dtype=float))