In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:

import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from src.PPO.PPO import PPO, PPOContinuous
import pdb
import time

env_name = "BipedalWalker-v2"
env = gym.make(env_name)

env.seed(10)

state_size = env.observation_space.shape[0]
action_size =env.action_space.shape[0]

print(state_size)
print(action_size)

# PPO Settings
update_every = 4000
num_learn = 80
win_condition = 300

# Agent settings
hidden_size=64
epsilon=0.2
entropy_beta=0.01
gamma=0.99
lr=0.0003

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
agent = PPOContinuous(state_size, action_size, hidden_size=hidden_size, epsilon=epsilon, entropy_beta=entropy_beta, gamma=gamma, lr=lr)

In [None]:

scores_deque = deque(maxlen=100)
scores = []
average_scores = []
max_score = -np.Inf

def train(n_episodes=4000, max_t=700):
#   agent = PPO(state_size, action_size, hidden_size=hidden_size, epsilon=epsilon, entropy_beta=entropy_beta, gamma=gamma, lr=lr)
  steps = 0

  for episode in range(1, n_episodes+1):
    state = env.reset()
    score = 0
    
    for t in range(max_t):
      steps += 1

      actions_tensor, log_prob = agent.act(torch.FloatTensor(state).to(device))
      actions = actions_tensor.cpu().data.numpy().flatten()
      next_state, reward, done, _ = env.step(actions_tensor.cpu().numpy())

      agent.mem.add(torch.FloatTensor(state), actions, reward, log_prob, done)

      # Update 
      state = next_state
      score += reward

      if steps >= update_every:
        start_time = time.time()
        
        agent.learn(num_learn)
        agent.mem.clear()
        steps = 0

        print("\rLearning Time: {}".format(time.time()-start_time))

      if done:
        break
    
    # Book Keeping
    scores_deque.append(score)
    scores.append(score)
    average_scores.append(np.mean(scores_deque))
      

    print("\rEpisode {}	Average Score: {:.2f}	Score: {:.2f}".format(episode, np.mean(scores_deque), score), end="")

    if episode % 100 == 0:
      print("\rEpisode {}	Average Score: {:.2f}".format(episode, np.mean(scores_deque)))   
      torch.save(agent.model.state_dict(), "walker_ppo_model_{}.pth".format(episode))
      torch.save(agent.model_old.state_dict(), "walker_ppo_model_old_{}.pth".format(episode))
    
    if np.mean(scores_deque) > win_condition:
      print("\rEnvironment Solved in {} episodes!".format(episode))
      break


  return scores, average_scores

scores, average_scores = train()

Episode 7	Average Score: -57.31	Score: -101.06

  loss = -torch.min(surrogate_1, surrogate_2) + 0.5*F.mse_loss(values, discounted_returns) - self.entropy_beta*entropy


Learning Time: 173.14997100830078
Learning Time: 37.81682467460632	Score: -51.081
Learning Time: 37.67802619934082	Score: -40.688
Learning Time: 37.010924100875854Score: -102.19
Learning Time: 37.54847598075867	Score: -37.142
Learning Time: 37.223000049591064Score: -122.36
Learning Time: 37.405943870544434Score: -31.210
Learning Time: 37.213414907455444Score: -32.988
Learning Time: 37.33086395263672	Score: -116.63
Learning Time: 37.73776102066046	Score: -50.057
Learning Time: 37.38165807723999	Score: -32.945
Episode 100	Average Score: -69.29	Score: -58.84
Learning Time: 38.490575075149536	Score: -32.424
Learning Time: 86.510607957839976	Score: -34.394
Learning Time: 88.148627996444701	Score: -42.167
Learning Time: 44.283063650131226	Score: -104.17
Learning Time: 48.121879100799564	Score: -45.157
Episode 147	Average Score: -72.44	Score: -45.921

In [None]:
plt.plot(scores)
plt.plot(average_scores)

In [None]:
for episode in range(0):
    state = env.reset()
    score = 0
    input()

    for t in range(30000):
        actions_tensor, log_prob = agent.act(torch.FloatTensor(state))
        actions = actions_tensor.cpu().data.numpy().flatten()
        next_state, reward, done, _ = env.step(actions_tensor)

        
        score += reward
        
        env.render()
        
        if done:
            break;
        
        state = next_state

    print(score)
env.close()

In [None]:
Learning Time: 40.54673504829407
Learning Time: 41.96533799171448	Score: -50.15
Learning Time: 40.52155685424805	Score: -32.68
Learning Time: 39.72044110298157	Score: -35.29
Learning Time: 39.37555003166199	Score: -34.61
Learning Time: 39.28577804565433	Score: -32.512
Learning Time: 39.48905801773071	Score: -41.78
Learning Time: 37.73234200477635	Score: -53.93
Learning Time: 37.45758390426636	Score: -100.15
Learning Time: 37.49617815017761	Score: -35.22
Learning Time: 37.77951502799988	Score: -34.89
Learning Time: 37.48916983604431	Score: -40.78
Learning Time: 37.69424891471863	Score: -35.52
Learning Time: 38.07879686355591	Score: -102.34
Episode 100	Average Score: -51.22	Score: -102.81
Learning Time: 37.465858221054083	Score: -100.50
Learning Time: 37.470584869384766	Score: -100.71
Learning Time: 37.296790838241581	Score: -100.70
Learning Time: 58.151881933212287	Score: -102.24
Learning Time: 58.688025951385556	Score: -41.10
Learning Time: 61.566057920455933	Score: -40.929
Learning Time: 50.702331066131590	Score: -101.60
Learning Time: 37.473351955413823	Score: -43.493
Episode 200	Average Score: -90.31	Score: -116.74
Learning Time: 37.415262937545776	Score: -102.61
Learning Time: 37.678329229354865	Score: -106.37
Learning Time: 36.9206130504608158	Score: -101.46
Learning Time: 36.7081129550933847	Score: -101.11
Episode 300	Average Score: -107.58	Score: -127.65
Learning Time: 36.6818459033966061	Score: -100.49
Learning Time: 36.6596653461456322	Score: -100.17
Learning Time: 36.7583160400390674	Score: -110.83
Learning Time: 36.5183839797973635	Score: -102.69
Learning Time: 36.7789819240570147	Score: -135.37
Episode 400	Average Score: -107.91	Score: -99.783
Learning Time: 36.9678218364715682	Score: -66.369
Learning Time: 36.5511198043823242	Score: -121.94
Learning Time: 36.6014571189880467	Score: -103.22
Learning Time: 36.9071187973022464	Score: -100.30
Episode 500	Average Score: -108.07	Score: -106.43
Learning Time: 43.9723660945892351	Score: -117.30
Learning Time: 40.5132012367248553	Score: -125.65
Learning Time: 42.3397359848022463	Score: -104.43
Learning Time: 43.1013031005859400	Score: -100.21
Learning Time: 49.8743638992309674	Score: -109.04
Episode 600	Average Score: -105.86	Score: -113.37
Learning Time: 47.3777740001678587	Score: -100.14
Learning Time: 50.6292541027069152	Score: -100.24
Learning Time: 45.6308989524841393	Score: -100.59
Learning Time: 41.7782220840454160	Score: -102.72
Learning Time: 39.8999428749084507	Score: -103.85
Episode 700	Average Score: -97.33	Score: -126.92
Learning Time: 40.285196065902712	Score: -101.06
Learning Time: 40.153511047363284	Score: -131.01
Learning Time: 41.997211933135986	Score: -43.998
Learning Time: 40.493303060531616	Score: -46.248
Learning Time: 40.311293840408325	Score: -52.013
Learning Time: 40.479155063629151	Score: -103.25
Learning Time: 40.168627023696996	Score: -104.30
Episode 800	Average Score: -96.21	Score: -47.271
Learning Time: 40.477483987808234	Score: -43.73
Learning Time: 40.945652008056645	Score: -52.607
Learning Time: 40.066672801971436	Score: -41.097
Learning Time: 38.698684930801396	Score: -43.758
Learning Time: 39.487704753875735	Score: -49.63
Learning Time: 40.026069164276129	Score: -103.65
Learning Time: 39.121330261230478	Score: -103.18
Learning Time: 39.961763143539431	Score: -121.05
Learning Time: 39.563637018203735	Score: -47.603
Learning Time: 40.388886213302616	Score: -43.777
Episode 900	Average Score: -85.55	Score: -45.70
Learning Time: 38.532398939132698	Score: -40.347
Learning Time: 37.279074907302856	Score: -59.321
Learning Time: 37.005691289901738	Score: -49.743
Learning Time: 37.948124885559089	Score: -36.23
Learning Time: 39.795126199722290	Score: -45.60
Learning Time: 40.372703075408936	Score: -33.18
Learning Time: 37.719485044479377	Score: -37.318
Learning Time: 40.048268079757694	Score: -35.95
Learning Time: 38.895526170730599	Score: -32.90
Learning Time: 39.228843927383429	Score: -36.23
Learning Time: 40.090528011322029	Score: -35.04
Learning Time: 40.450932025909424	Score: -36.85
Learning Time: 40.717709064483648	Score: -33.64
Learning Time: 40.037404060363772	Score: -37.16
Learning Time: 40.125184059143066	Score: -33.95
Learning Time: 39.859297037124634	Score: -30.23
Episode 1000	Average Score: -42.72	Score: -38.51
Learning Time: 39.3126468658447334	Score: -112.81
Learning Time: 43.1875679492950440	Score: -43.952
Learning Time: 43.7944908142089846	Score: -42.03
Learning Time: 45.3535907268524204	Score: -37.260
Learning Time: 43.1859469413757391	Score: -32.99
Learning Time: 40.1571178436279300	Score: -35.44
Learning Time: 40.4242286682128906	Score: -34.655
Learning Time: 40.7080061435699464	Score: -37.61
Learning Time: 40.3114900588989266	Score: -35.05
Learning Time: 40.4351949691772465	Score: -44.15
Learning Time: 40.4176392555236854	Score: -35.94
Learning Time: 40.2799270153045653	Score: -37.88
Learning Time: 40.6512277126312269	Score: -36.84
Learning Time: 38.4469420909881631	Score: -28.376
Learning Time: 38.8761558532714842	Score: -29.18
Learning Time: 38.2418050765991280	Score: -24.89
Learning Time: 38.1750431060791.78	Score: -33.66
Episode 1100	Average Score: -39.76	Score: -36.17
Learning Time: 38.1902880668640143	Score: -41.48
Learning Time: 38.6219561100006188	Score: -32.64
Learning Time: 38.3393511772155761	Score: -37.25
Learning Time: 38.5834698677063.37	Score: -28.70
Learning Time: 38.3533101081848146	Score: -112.73
Learning Time: 38.0739839076995852	Score: -129.32
Learning Time: 38.1755278110504156	Score: -28.88
Learning Time: 38.4516592025756845	Score: -29.71
Learning Time: 39.0045912265777634	Score: -30.619
Learning Time: 55.2141561508178761	Score: -30.441
Learning Time: 45.9561002254486145	Score: -29.213
Learning Time: 48.2231998443603539	Score: -30.91
Learning Time: 41.7471559047699.00	Score: -32.92
Learning Time: 41.6580412387847968	Score: -23.79
Learning Time: 42.2896039485931422	Score: -16.27
Episode 1200	Average Score: -41.73	Score: -32.78
Learning Time: 54.279879093170166
Learning Time: 42.2410078048706057	Score: -24.96
Learning Time: 42.0439379215240567	Score: -26.68
Learning Time: 42.6554787158966067	Score: -16.73
Learning Time: 45.6630799770355282	Score: -27.09
Learning Time: 41.9235117435455361	Score: -17.77
Learning Time: 42.0443558692932146	Score: -20.91
Learning Time: 42.2250380516052258	Score: -23.95
Learning Time: 47.1960237026214685	Score: -23.75
Learning Time: 43.4167301654815746	Score: -34.70
Learning Time: 41.8681437969207768	Score: -27.97
Learning Time: 47.9095478057861319	Score: -18.41
Learning Time: 41.6986820697784404	Score: -28.15
Learning Time: 49.3256709575653180	Score: -18.83
Learning Time: 42.3512620925903395	Score: -25.61
Learning Time: 41.9399950504303.14	Score: -44.35
Learning Time: 42.0089368820190424	Score: -31.94
Learning Time: 41.7282967567443850	Score: -30.816
Episode 1300	Average Score: -26.92	Score: -24.74
Learning Time: 42.9301068782806490	Score: -25.28
Learning Time: 42.1512911319732757	Score: -18.79
Learning Time: 42.9341220855712908	Score: -11.85
Learning Time: 42.3626670837402343	Score: -13.46
Learning Time: 43.1760270595550545	Score: -14.05
Learning Time: 39.0333838462829650	Score: -23.025
Learning Time: 39.2813110351562569	Score: -23.63
Learning Time: 40.1567578315734861	Score: -25.58
Learning Time: 43.0799899101257304	Score: -113.61
Learning Time: 47.1953339576721210	Score: -42.52
Learning Time: 43.6321978569030766	Score: -30.29
Learning Time: 43.7692868709564291	Score: -16.636
Learning Time: 50.6364340782165599	Score: -105.12
Learning Time: 47.9819059371948241	Score: -108.53
Episode 1400	Average Score: -38.10	Score: -108.70
Learning Time: 45.7070763111114508	Score: -18.28
Learning Time: 47.9037268161773704	Score: -15.655
Learning Time: 47.9066970348358153	Score: -15.75
Learning Time: 43.8121950626373354	Score: -24.359
Learning Time: 46.6754729747772230	Score: -12.442
Learning Time: 48.5181260108947751	Score: -21.611
Learning Time: 45.6160356998443606	Score: -111.23
Learning Time: 46.6777288913726827	Score: -121.98
Learning Time: 42.7118852138519319	Score: -116.59
Learning Time: 46.6691117286682165	Score: -28.622
Learning Time: 45.9012007713317923	Score: -36.36
Learning Time: 47.1364438533782961	Score: -30.17
Learning Time: 47.7135169506073.48	Score: -24.95
Learning Time: 50.4145550727844248	Score: -17.46
Learning Time: 47.3316240310668956	Score: -30.86
Episode 1500	Average Score: -37.64	Score: -18.61
Learning Time: 48.9877791404724162	Score: -16.32
Learning Time: 51.1212730407714845	Score: -29.79
Learning Time: 44.1742072105407785	Score: -22.40
Learning Time: 42.8843109607696506	Score: -19.83
Learning Time: 42.4952540397644040	Score: -21.16
Learning Time: 44.3001699447631846	Score: -20.202
Learning Time: 44.1330888271331879	Score: -25.75
Learning Time: 44.8928189277648926	Score: -30.13
Learning Time: 46.4897408485412676	Score: -28.37
Learning Time: 55.8327958583831826	Score: -109.42
Learning Time: 46.6438789367675867	Score: -33.81
Learning Time: 46.9498980045318645	Score: -113.82
Learning Time: 51.0569839477539066	Score: -28.11
Learning Time: 42.1068120002746637	Score: -38.530
Learning Time: 44.2449059486389160	Score: -125.17
Episode 1600	Average Score: -40.73	Score: -23.792
Learning Time: 46.5267801284790048	Score: -103.94
Learning Time: 46.0171959400177.01	Score: -14.02
Learning Time: 46.5032598972320567	Score: -30.075
Learning Time: 46.0360748767852804	Score: -33.885
Learning Time: 45.3479828834533731	Score: -22.92
Learning Time: 42.8594799041748058	Score: -30.40
Learning Time: 44.9049100875854580	Score: -23.28
Learning Time: 50.5000438690185554	Score: -40.65
Learning Time: 50.0993680953979503	Score: -27.12
Learning Time: 48.6950767040252787	Score: -24.37
Learning Time: 46.4828779697418267	Score: -31.60
Learning Time: 51.4060847759246884	Score: -25.74
Learning Time: 55.5760419368743960	Score: -10.560
Learning Time: 42.8585369586944689	Score: -26.373
Learning Time: 42.4714128971099855	Score: -39.32
Learning Time: 42.3689308166503936	Score: -24.106
Episode 1700	Average Score: -36.37	Score: -13.89
Learning Time: 46.1764481067657528	Score: -17.33
Learning Time: 46.0451016426086484	Score: -15.049
Learning Time: 46.5364708900451661	Score: -17.685
Learning Time: 46.2100329399108904	Score: -17.90
Learning Time: 42.5851671695709208	Score: -105.59
Learning Time: 45.3486309051513750	Score: -12.64
Learning Time: 48.3145380020141681	Score: -16.790
Learning Time: 43.3543767929077150	Score: -23.17
Learning Time: 51.0337831974029545	Score: -24.25
Learning Time: 44.0231702327728337	Score: -11.26
Learning Time: 49.8830740451812741	Score: -105.96
Learning Time: 44.0449199676513745	Score: -24.35
Learning Time: 46.9313490390777644	Score: -110.25
Learning Time: 46.9195659160614.32	Score: -20.162
Learning Time: 45.7335951328277618	Score: -24.059
Episode 1800	Average Score: -38.20	Score: -119.77
Learning Time: 45.1481337547302254	Score: -110.49
Learning Time: 44.2870111465454140	Score: -24.869
Learning Time: 46.7266621589660645	Score: -17.593
Learning Time: 45.1171488761901867	Score: -24.778
Learning Time: 42.3740322589874312	Score: -20.05
Learning Time: 40.1253099441528398	Score: -24.892
Learning Time: 39.8999710083007884	Score: -103.34
Learning Time: 39.5641746520996166	Score: -27.726
Learning Time: 44.6625883579254153	Score: -30.299
Learning Time: 40.6032531261444133	Score: -27.692
Learning Time: 39.2227330207824761	Score: -29.22
Episode 1900	Average Score: -59.90	Score: -111.90
Learning Time: 39.2748320102691658	Score: -35.452
Learning Time: 38.8785018920898449	Score: -23.337
Learning Time: 38.7682087421417240	Score: -31.315
Learning Time: 39.2044599056243951	Score: -103.58
Learning Time: 39.0119411945343.74	Score: -103.85
Learning Time: 38.9267399311065745	Score: -36.870
Learning Time: 37.5925519466400157	Score: -33.664
Learning Time: 37.7142670154571523	Score: -30.215
Learning Time: 37.9283688068389928	Score: -31.495
Learning Time: 37.9681451320648256	Score: -21.54
Learning Time: 37.7515597343444869	Score: -22.71
Learning Time: 38.2299361228942942	Score: -104.41
Learning Time: 53.2224099636077953	Score: -109.46
Learning Time: 1782.70367169380194	Score: -21.94
Episode 2000	Average Score: -48.73	Score: -28.547
Learning Time: 40.1072757244110108	Score: -115.36
Learning Time: 39.8173630237579354	Score: -28.811
Learning Time: 39.0033209323883069	Score: -29.15
Learning Time: 38.7237589359283459	Score: -35.722
Learning Time: 40.9716911315918.72	Score: -33.50
Learning Time: 38.9425621032714840	Score: -36.300
Learning Time: 38.5508911609649661	Score: -38.866
Learning Time: 38.7329928874969529	Score: -108.92
Learning Time: 38.4791398048400951	Score: -32.397
Learning Time: 38.9325821399688756	Score: -110.71
Learning Time: 38.3454651832580639	Score: -54.637
Learning Time: 38.3532819747924805	Score: -59.573
Episode 2100	Average Score: -65.38	Score: -49.498
Learning Time: 38.4015991687774663	Score: -47.63
Learning Time: 37.0621423721313536	Score: -55.291
Learning Time: 37.3237171173095754	Score: -66.309
Learning Time: 36.9170858860015967	Score: -61.239
Learning Time: 37.0461139678955159	Score: -72.121
Learning Time: 37.6828618049621667	Score: -60.106
Learning Time: 38.8642320632934682	Score: -67.406
Learning Time: 37.3779289722442639	Score: -66.653
Learning Time: 37.2195560932159484	Score: -62.954
Learning Time: 36.7853131294250589	Score: -53.538
Learning Time: 37.0135233402252219	Score: -109.16
Episode 2200	Average Score: -82.47	Score: -45.004
Learning Time: 37.131163120269775
Learning Time: 38.5603919029235847	Score: -121.66
Learning Time: 38.0560498237609866	Score: -62.626
Learning Time: 39.2211518287658778	Score: -50.847
Learning Time: 39.4723792076110848	Score: -109.20
Learning Time: 38.3620851039886531	Score: -109.77
Learning Time: 38.2402420043945392	Score: -106.02
Learning Time: 38.2057130336761557	Score: -129.22
Learning Time: 38.1684286594390962	Score: -54.908
Episode 2300	Average Score: -90.15	Score: -56.921
Learning Time: 38.32826018333435
Learning Time: 37.9962320327758855	Score: -50.290
Learning Time: 38.1408498287200930	Score: -45.573
Learning Time: 37.9218039512634350	Score: -55.913
Learning Time: 38.8516297340393116	Score: -46.63
Learning Time: 38.4694867134094243	Score: -134.72
Learning Time: 38.0232138633728.26	Score: -102.87
Learning Time: 38.4803090095525.60	Score: -110.81
Learning Time: 38.0516288280487060	Score: -27.235
Learning Time: 37.8877630233764652	Score: -40.397
Learning Time: 37.8675010204315256	Score: -47.857
Learning Time: 38.1522800922393870	Score: -103.62
Episode 2400	Average Score: -74.44	Score: -130.62
Learning Time: 38.1433091163635254	Score: -104.51
Learning Time: 38.4205458164215127	Score: -109.41
Learning Time: 38.5601353645324717	Score: -102.33
Learning Time: 38.5608069896698.89	Score: -55.043
Learning Time: 38.2737209796905508	Score: -38.90
Learning Time: 38.8787438869476391	Score: -42.629
Learning Time: 38.4177069664001544	Score: -31.080
Learning Time: 39.5383489131927567	Score: -29.319
Learning Time: 38.8329219818115204	Score: -109.38
Learning Time: 39.6512629985809315	Score: -38.142
Learning Time: 39.6943120956420925	Score: -28.777
Learning Time: 38.4932961463928233	Score: -44.205
Episode 2500	Average Score: -71.13	Score: -104.94
Learning Time: 38.7969405651092511	Score: -14.491
Learning Time: 38.6863617897033721	Score: -49.108
Episode 2519	Average Score: -63.25	Score: -114.65