In [26]:
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Intel(R) Core(TM) i5-2410M CPU @ 2.30GHz
stepping	: 7
microcode	: 0x1b
cpu MHz		: 2693.745
cache size	: 3072 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx lahf_lm epb pti tpr_shadow vnmi flexpriority ept vpid xsaveopt dtherm ida arat pln pts
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs itlb_multihit
bogomips	: 4589.46
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physica

# Setup

Install the dependencies:
```sh
pip install gym
pip install gym[atari]
```

In [None]:
#!pip install gym
#!pip install gym[atari]

---

# Useful Resources
* [Manual of the game](https://www.gamesdatabase.org/Media/SYSTEM/Atari_2600/Manual/formated/Freeway_-_1981_-_Zellers.pdf)
* [Freeway Disassembly](http://www.bjars.com/disassemblies.html)
* [Atari Ram Annotations](https://github.com/mila-iqia/atari-representation-learning/blob/master/atariari/benchmark/ram_annotations.py)
* [Freeway Benchmarks](https://paperswithcode.com/sota/atari-games-on-atari-2600-freeway)

---

# Description of the problem

TODO

---

# Imports

In [None]:
import sys
sys.path.append('../')  # Enable importing from `src` folder

In [None]:
%matplotlib inline
from collections import defaultdict
from typing import List

import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

import gym

import src.agents as agents
import src.episode as episode
import src.environment as environment
import src.aux_plots as aux_plots

In [None]:
def print_result(i, scores, total_reward, score):
    """Print a one-line progress report, but only on every 10th run.

    Args:
        i: Index of the run that just finished.
        scores: Scores of all runs so far (may be empty).
        total_reward: Accumulated reward of the run that just finished.
        score: Score of the run that just finished.
    """
    if i % 10 != 0:
        return

    recent = scores[-10:]  # the ten most recent runs (or fewer, early on)
    # An empty history has no mean; report NaN instead of dividing by zero.
    mean_all = sum(scores) / len(scores) if scores else float("nan")
    mean_recent = sum(recent) / len(recent) if recent else float("nan")

    print(f"Run [{i:4}] - Total reward: {total_reward:7.2f} Mean scores: {mean_all:.2f} Mean scores[-10:]: {mean_recent:5.2f} Score: {score:2} ")

---

# Environment

We will be using the OpenAI Gym framework in this study. (TODO: expand this section.)

In [None]:
# Build the Freeway environment once to inspect its action/observation spaces.
env, initial_state = environment.get_env()

print(f"Action Space: {env.action_space}")
print(f"Observation Space: {env.observation_space}")

Action Space: Discrete(3)
Observation Space: Box(0, 255, (128,), uint8)


The agent in this game has three possible actions:

* 0: Stay
* 1: Move forward
* 2: Move back

TODO: Talk a bit about the observation space of 128 bytes of RAM...

---

# Representing the state of the game

TODO: explain why we must reduce the state space

```
      14  # Chicken Y
    , 16  # Chicken Lane Collide
    , 18  # Chicken Collision flag (with the bottom car)
    , 22  # Car X Direction
    , 23, 24, 25, 26, 27, 28, 29, 30, 31, 32  # Z Car Patterns
    , 33, 34, 35, 36, 37, 38, 39, 40, 41, 42  # Car Motion Timers
    , 43, 44, 45, 46, 47, 48, 49, 50, 51, 52  # Car Motions
    , 87, 88  # Car Shape Ptr
    # TODO: test if this makes any difference
    , 89, 90  # Chicken Shape Ptr
    # TODO: test if this makes any difference
    , 106, 107  # Chicken Sounds
    , 108, 109, 110, 111, 112, 113, 114, 115, 116, 117  # Car X Coords
```

In [None]:
# RAM addresses kept as the learning state (see the annotated list above).
RAM_mask = [
    14,                # Chicken Y
    16,                # Chicken Lane Collide
    *range(108, 118),  # Car X Coords (RAM bytes 108..117)
]

In [None]:
def reduce_state(ob):
    """Coarsen the RAM observation to shrink the state space.

    The observation is modified IN PLACE and the same object is returned,
    so calling this twice on one observation reduces it twice.

    Args:
        ob: Indexable RAM observation; bytes 14, 16 and 108..117 are rewritten.

    Returns:
        The same `ob` object, after reduction.
    """
    # Where we were hit doesn't matter: 255 becomes 0, any other value becomes 1.
    ob[16] = 0 if ob[16] == 255 else 1

    # Chicken y-position, in buckets of 3.
    ob[14] = ob[14] // 3

    # Car x-positions. The chicken sits at x-position ~49, so cars outside
    # [19, 79] are far away and collapse to 0; nearby cars are bucketed by 3.
    for addr in range(108, 118):
        car_x = ob[addr]
        ob[addr] = car_x // 3 if 19 <= car_x <= 79 else 0

    return ob

---

# Reward Policy

In [None]:
def reward_policy(reward, ob, action):
    """Map the raw environment reward to the shaped reward used for learning.

    The shaped values are read from attributes set on this function
    (REWARD_IF_CROSS, REWARD_IF_COLISION, REWARD_IF_STILL, REWARD_IF_FW,
    REWARD_IF_BW); only the attribute of the branch taken is accessed.

    Args:
        reward: Raw reward from the environment (1 means the chicken crossed).
        ob: Reduced observation; ob[16] == 1 flags a collision.
        action: Action taken (0 = stay, 1 = forward, 2 = backward).

    Returns:
        The shaped reward, or `reward` unchanged if no rule applies.
    """
    # Precedence: crossing beats collision, collision beats the action cost.
    if reward == 1:
        return reward_policy.REWARD_IF_CROSS

    if ob[16] == 1:  # Collision!
        return reward_policy.REWARD_IF_COLISION

    if action == 0:  # Don't move
        return reward_policy.REWARD_IF_STILL

    if action == 1:  # Move forward
        return reward_policy.REWARD_IF_FW

    if action == 2:  # Move backward
        return reward_policy.REWARD_IF_BW

    return reward

---

# Baseline: 1 action (only move forward):

In [None]:
# Play 50 runs with the baseline agent (forward only, per the heading above).
baseline_scores = environment.run(
    agents.Baseline,
    render=False,
    n_runs=50,
    verbose=False,
)

# Persist one score per line so the baseline can be reloaded later.
with open("baseline_scores.txt", "w") as f:
    for item in baseline_scores:
        print(item, file=f)

In [None]:
# with open("baseline_scores.txt") as f:
#     baseline_scores = [int(x) for x in  f.read().splitlines()]

In [None]:
# Average score over all baseline runs
baseline_mean_score = sum(baseline_scores) / len(baseline_scores)
print(f"Baseline mean score: {baseline_mean_score}")

Baseline mean score: 21.7


---

# Q-Learning

## Changing hyper parameters: number of actions

### - 2 actions (move forward or stay):

In [54]:
# Agent hyper-parameters for this experiment (passed to the agent constructor).
GAMMA = 0.99
AVAILABLE_ACTIONS = 2
N0 = 2.5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 7000

In [55]:
# Fresh environment and an untrained Q-Learning agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.QLearning(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

# Per-episode histories filled by the training loop.
scores_2act, total_rewards_2act = [], []

In [56]:
%%time

for i in range(n_runs):
    #render = i % 100 == 0
    render = 0

    game_over = False
    # Start a new game. The learning state is the reduced RAM restricted to
    # RAM_mask, serialized to bytes (presumably so the agent can use it as a key).
    state = env.reset()
    state = reduce_state(state)[RAM_mask].data.tobytes()  # Select useful bytes
    action = agent.act(state)

    score = 0
    total_reward = 0

    while not game_over:
        if render:
            time.sleep(0.015)
            env.render()

        old_state = state
        ob, reward, game_over, _ = env.step(action)

        # reward_policy() treats a raw reward of 1 as a crossing. Record that
        # before shaping so the score does not depend on REWARD_IF_CROSS being
        # different from every other shaped reward value.
        crossed = reward == 1

        ob = reduce_state(ob)
        reward = reward_policy(reward, ob, action)

        total_reward += reward

        if crossed:
            score += 1

        state = ob[RAM_mask].data.tobytes()

        agent.update_Q(old_state, state, action, reward)

        action = agent.act(state)  # Next action

    scores_2act.append(score)
    total_rewards_2act.append(total_reward)

    # Progress report every 100 episodes.
    if i % 100 == 0:
        print_result(i, scores_2act, total_reward, score)

Run [   0] - Total reward: 2908.00 Mean scores: 10.00 Means Scores[:-10]: 10.00 Score: 10 
CPU times: user 1min 53s, sys: 32 ms, total: 1min 53s
Wall time: 1min 53s


In [57]:
# Dump one score per line so the run can be reloaded without retraining.
with open("QL/QL_scores_2act.txt", "w") as f:
    for item in scores_2act:
        print(item, file=f)

# Same format for the accumulated shaped reward of each episode.
with open("QL/QL_total_rewards_2act.txt", "w") as f:
    for item in total_rewards_2act:
        print(item, file=f)

In [None]:
# with open("QL/QL_scores_2act.txt") as f:
#    scores_2act = [int(x) for x in  f.read().splitlines()]

# with open("QL/QL_total_rewards_2act.txt") as f:
#    total_rewards_2act = [int(x) for x in  f.read().splitlines()]

### - 3 actions (move forward, stay or move backward):

In [None]:
# Agent hyper-parameters for this experiment: all three actions are available.
GAMMA = 0.99
AVAILABLE_ACTIONS = 3
N0 = 2.5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# REWARD_IF_BW is needed here because action 2 (move back) is now available.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 
reward_policy.REWARD_IF_BW = -9

# Number of episodes to train for.
n_runs = 3000

In [None]:
# Fresh environment and an untrained Q-Learning agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.QLearning(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

# Per-episode histories filled by the training loop.
scores_3act, total_rewards_3act = [], []

In [None]:
%%time

for i in range(n_runs):
    #render = i % 100 == 0
    render = 0

    game_over = False
    # Start a new game. The learning state is the reduced RAM restricted to
    # RAM_mask, serialized to bytes (presumably so the agent can use it as a key).
    state = env.reset()
    state = reduce_state(state)[RAM_mask].data.tobytes()  # Select useful bytes
    action = agent.act(state)

    score = 0
    total_reward = 0

    while not game_over:
        if render:
            time.sleep(0.015)
            env.render()

        old_state = state
        ob, reward, game_over, _ = env.step(action)

        # reward_policy() treats a raw reward of 1 as a crossing. Record that
        # before shaping so the score does not depend on REWARD_IF_CROSS being
        # different from every other shaped reward value.
        crossed = reward == 1

        ob = reduce_state(ob)
        reward = reward_policy(reward, ob, action)

        total_reward += reward

        if crossed:
            score += 1

        state = ob[RAM_mask].data.tobytes()

        agent.update_Q(old_state, state, action, reward)

        action = agent.act(state)  # Next action

    scores_3act.append(score)
    total_rewards_3act.append(total_reward)

    # Progress report every 100 episodes.
    if i % 100 == 0:
        print_result(i, scores_3act, total_reward, score)

Run [   0] - Total reward: -9239.00 Mean scores: 0.00 Means Scores[:-10]:  0.00 Score:  0 
CPU times: user 1min 54s, sys: 80 ms, total: 1min 54s
Wall time: 1min 54s


In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("QL/QL_scores_3act.txt", "w") as f:
    for item in scores_3act:
        print(item, file=f)

# Same format for the accumulated shaped reward of each episode.
with open("QL/QL_total_rewards_3act.txt", "w") as f:
    for item in total_rewards_3act:
        print(item, file=f)

In [None]:
# with open("QL/QL_scores_3act.txt") as f:
#    scores_3act = [int(x) for x in  f.read().splitlines()]

# with open("QL/QL_total_rewards_3act.txt") as f:
#    total_rewards_3act = [int(x) for x in  f.read().splitlines()]

### - Results with different number of actions:

In [None]:
import importlib
importlib.reload(aux_plots)

<module 'src.aux_plots' from '/home/aline/Documents/GitHub/FreewayGame/aline.almeida/src/aux_plots.py'>

In [None]:
# Flat reference line: the baseline mean repeated once per plotted episode.
baseline = [baseline_mean_score] * 3000
aux_plots.plot_3scores(
    scores_3act[:3000],
    scores_2act[:3000],
    baseline[:3000],
    "3 actions (BW, FW or STAY)",
    "2 actions (FW or STAY)",
    "Baseline mean score (FW only)",
)

---

## Changing hyper parameters: Reward values

### - Sparse reward: $+1$ for crossing the street, $-1$ for a collision

In [None]:
# Agent hyper-parameters for this experiment (passed to the agent constructor).
GAMMA = 0.99
AVAILABLE_ACTIONS = 2
N0 = 2.5

# Sparse shaped rewards: only crossing (+1) and collisions (-1) are non-zero.
# reward_policy() reads these as attributes on itself.
reward_policy.REWARD_IF_CROSS = 1
reward_policy.REWARD_IF_COLISION = -1
reward_policy.REWARD_IF_STILL = 0
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 3000

In [None]:
# Fresh environment and an untrained Q-Learning agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.QLearning(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

# Per-episode histories filled by the training loop.
scores_2act_sparseR, total_rewards_2act_sparseR = [], []

In [None]:
%%time

for i in range(n_runs):
    #render = i % 100 == 0
    render = 0

    game_over = False
    # Start a new game. The learning state is the reduced RAM restricted to
    # RAM_mask, serialized to bytes (presumably so the agent can use it as a key).
    state = env.reset()
    state = reduce_state(state)[RAM_mask].data.tobytes()  # Select useful bytes
    action = agent.act(state)

    score = 0
    total_reward = 0

    while not game_over:
        if render:
            time.sleep(0.015)
            env.render()

        old_state = state
        ob, reward, game_over, _ = env.step(action)

        # reward_policy() treats a raw reward of 1 as a crossing. Record that
        # before shaping so the score does not depend on REWARD_IF_CROSS being
        # different from every other shaped reward value.
        crossed = reward == 1

        ob = reduce_state(ob)
        reward = reward_policy(reward, ob, action)

        total_reward += reward

        if crossed:
            score += 1

        state = ob[RAM_mask].data.tobytes()

        agent.update_Q(old_state, state, action, reward)

        action = agent.act(state)  # Next action

    scores_2act_sparseR.append(score)
    total_rewards_2act_sparseR.append(total_reward)

    # Progress report every 100 episodes.
    if i % 100 == 0:
        print_result(i, scores_2act_sparseR, total_reward, score)

Run [   0] - Total reward:  -58.00 Mean scores: 11.00 Means Scores[:-10]: 11.00 Score: 11 
CPU times: user 1min 58s, sys: 104 ms, total: 1min 59s
Wall time: 1min 59s


In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("QL/QL_scores_2act_sparseR.txt", "w") as f:
    for item in scores_2act_sparseR:
        print(item, file=f)

# Same format for the accumulated shaped reward of each episode.
with open("QL/QL_total_rewards_2act_sparseR.txt", "w") as f:
    for item in total_rewards_2act_sparseR:
        print(item, file=f)

In [None]:
# with open("QL/QL_scores_2act_sparseR.txt") as f:
#    scores_2act_sparseR = [int(x) for x in  f.read().splitlines()]

# with open("QL/QL_total_rewards_2act_sparseR.txt") as f:
#    total_rewards_2act_sparseR = [int(x) for x in  f.read().splitlines()]

### - Results with sparse and dense rewards:

In [None]:
# Flat reference line: the baseline mean repeated once per plotted episode.
baseline = [baseline_mean_score] * 3000
aux_plots.plot_3scores(
    scores_2act_sparseR[:3000],
    scores_2act[:3000],
    baseline[:3000],
    "Sparse Reward",
    "Dense Reward",
    "Baseline mean score",
)

---

## Changing hyper parameters: Learning rate

### - $N0$ = 0.001

In [None]:
# Agent hyper-parameters for this experiment; N0 is the value being varied.
GAMMA = 0.99
AVAILABLE_ACTIONS = 2
N0 =  0.001

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 7000

In [None]:
# Fresh environment and an untrained Q-Learning agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.QLearning(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

# Per-episode histories filled by the training loop.
scores_2act_N0_0, total_rewards_2act_N0_0 = [], []

In [None]:
%%time

for i in range(n_runs):
    #render = i % 100 == 0
    render = 0

    game_over = False
    # Start a new game. The learning state is the reduced RAM restricted to
    # RAM_mask, serialized to bytes (presumably so the agent can use it as a key).
    state = env.reset()
    state = reduce_state(state)[RAM_mask].data.tobytes()  # Select useful bytes
    action = agent.act(state)

    score = 0
    total_reward = 0

    while not game_over:
        if render:
            time.sleep(0.015)
            env.render()

        old_state = state
        ob, reward, game_over, _ = env.step(action)

        # reward_policy() treats a raw reward of 1 as a crossing. Record that
        # before shaping so the score does not depend on REWARD_IF_CROSS being
        # different from every other shaped reward value.
        crossed = reward == 1

        ob = reduce_state(ob)
        reward = reward_policy(reward, ob, action)

        total_reward += reward

        if crossed:
            score += 1

        state = ob[RAM_mask].data.tobytes()

        agent.update_Q(old_state, state, action, reward)

        action = agent.act(state)  # Next action

    scores_2act_N0_0.append(score)
    total_rewards_2act_N0_0.append(total_reward)

    # Progress report every 100 episodes.
    if i % 100 == 0:
        print_result(i, scores_2act_N0_0, total_reward, score)

Run [   0] - Total reward: 4732.00 Mean scores: 13.00 Means Scores[:-10]: 13.00 Score: 13 
CPU times: user 1min 57s, sys: 60 ms, total: 1min 57s
Wall time: 1min 57s


In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("QL/QL_scores_2act_N0_0.txt", "w") as f:
    for item in scores_2act_N0_0:
        print(item, file=f)

# Same format for the accumulated shaped reward of each episode.
with open("QL/QL_total_rewards_2act_N0_0.txt", "w") as f:
    for item in total_rewards_2act_N0_0:
        print(item, file=f)

In [None]:
# with open("QL/QL_scores_2act_N0_0.txt") as f:
#    scores_2act_N0_0 = [int(x) for x in  f.read().splitlines()]

# with open("QL/QL_total_rewards_2act_N0_0.txt") as f:
#    total_rewards_2act_N0_0 = [int(x) for x in  f.read().splitlines()]

### - $N0$ = 5

In [None]:
# Agent hyper-parameters for this experiment; N0 is the value being varied.
GAMMA = 0.99
AVAILABLE_ACTIONS = 2
N0 =  5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 7000

In [None]:
# Fresh environment and an untrained Q-Learning agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.QLearning(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

# Per-episode histories filled by the training loop.
scores_2act_N0_5, total_rewards_2act_N0_5 = [], []

In [None]:
%%time

for i in range(n_runs):
    #render = i % 100 == 0
    render = 0

    game_over = False
    # Start a new game. The learning state is the reduced RAM restricted to
    # RAM_mask, serialized to bytes (presumably so the agent can use it as a key).
    state = env.reset()
    state = reduce_state(state)[RAM_mask].data.tobytes()  # Select useful bytes
    action = agent.act(state)

    score = 0
    total_reward = 0

    while not game_over:
        if render:
            time.sleep(0.015)
            env.render()

        old_state = state
        ob, reward, game_over, _ = env.step(action)

        # reward_policy() treats a raw reward of 1 as a crossing. Record that
        # before shaping so the score does not depend on REWARD_IF_CROSS being
        # different from every other shaped reward value.
        crossed = reward == 1

        ob = reduce_state(ob)
        reward = reward_policy(reward, ob, action)

        total_reward += reward

        if crossed:
            score += 1

        state = ob[RAM_mask].data.tobytes()

        agent.update_Q(old_state, state, action, reward)

        action = agent.act(state)  # Next action

    scores_2act_N0_5.append(score)
    total_rewards_2act_N0_5.append(total_reward)

    # Progress report every 100 episodes.
    if i % 100 == 0:
        print_result(i, scores_2act_N0_5, total_reward, score)

Run [   0] - Total reward: 4057.00 Mean scores: 12.00 Means Scores[:-10]: 12.00 Score: 12 
CPU times: user 2min 1s, sys: 72 ms, total: 2min 1s
Wall time: 2min 1s


In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("QL/QL_scores_2act_N0_5.txt", "w") as f:
    for item in scores_2act_N0_5:
        print(item, file=f)

# Same format for the accumulated shaped reward of each episode.
with open("QL/QL_total_rewards_2act_N0_5.txt", "w") as f:
    for item in total_rewards_2act_N0_5:
        print(item, file=f)

In [None]:
# with open("QL/QL_scores_2act_N0_5.txt") as f:
#    scores_2act_N0_5 = [int(x) for x in  f.read().splitlines()]

# with open("QL/QL_total_rewards_2act_N0_5.txt") as f:
#    total_rewards_2act_N0_5 = [int(x) for x in  f.read().splitlines()]

### - Results with different $N0$ values:

In [None]:
# Compare the three N0 settings. The first series is the N0 = 0.001 run, whose
# scores are stored in `scores_2act_N0_0`.
aux_plots.plot_3scores(
    scores_2act_N0_0,
    scores_2act,
    scores_2act_N0_5,
    "N0=0.001",
    "N0=2.5",
    "N0=5.0",
)

---

## Changing hyper parameters: Discount factor

### - $ɣ$ = 0.90

In [None]:
# Agent hyper-parameters for this experiment; GAMMA is the value being varied.
GAMMA = 0.90
AVAILABLE_ACTIONS = 2
N0 =  2.5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 3000

In [None]:
# Fresh environment and an untrained Q-Learning agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.QLearning(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

# Per-episode histories filled by the training loop.
scores_2act_gamma_0p9, total_rewards_2act_gamma_0p9 = [], []

In [None]:
%%time

for i in range(n_runs):
    #render = i % 100 == 0
    render = 0

    game_over = False
    # Start a new game. The learning state is the reduced RAM restricted to
    # RAM_mask, serialized to bytes (presumably so the agent can use it as a key).
    state = env.reset()
    state = reduce_state(state)[RAM_mask].data.tobytes()  # Select useful bytes
    action = agent.act(state)

    score = 0
    total_reward = 0

    while not game_over:
        if render:
            time.sleep(0.015)
            env.render()

        old_state = state
        ob, reward, game_over, _ = env.step(action)

        # reward_policy() treats a raw reward of 1 as a crossing. Record that
        # before shaping so the score does not depend on REWARD_IF_CROSS being
        # different from every other shaped reward value.
        crossed = reward == 1

        ob = reduce_state(ob)
        reward = reward_policy(reward, ob, action)

        total_reward += reward

        if crossed:
            score += 1

        state = ob[RAM_mask].data.tobytes()

        agent.update_Q(old_state, state, action, reward)

        action = agent.act(state)  # Next action

    scores_2act_gamma_0p9.append(score)
    total_rewards_2act_gamma_0p9.append(total_reward)

    # Progress report every 100 episodes.
    if i % 100 == 0:
        print_result(i, scores_2act_gamma_0p9, total_reward, score)

Run [   0] - Total reward: 3505.00 Mean scores: 11.00 Means Scores[:-10]: 11.00 Score: 11 
CPU times: user 2min 7s, sys: 124 ms, total: 2min 7s
Wall time: 2min 7s


In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("QL/QL_scores_2act_gamma_0p9.txt", "w") as f:
    for item in scores_2act_gamma_0p9:
        print(item, file=f)

# Same format for the accumulated shaped reward of each episode.
with open("QL/QL_total_rewards_2act_gamma_0p9.txt", "w") as f:
    for item in total_rewards_2act_gamma_0p9:
        print(item, file=f)

In [None]:
# with open("QL/QL_scores_2act_gamma_0p9.txt") as f:
#    scores_2act_gamma_0p9 = [int(x) for x in  f.read().splitlines()]

# with open("QL/QL_total_rewards_2act_gamma_0p9.txt") as f:
#    total_rewards_2act_gamma_0p9 = [int(x) for x in  f.read().splitlines()]

### - $ɣ$ = 0.75

In [None]:
# Agent hyper-parameters for this experiment; GAMMA is the value being varied.
GAMMA = 0.75
AVAILABLE_ACTIONS = 2
N0 =  2.5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 3000

In [None]:
# Fresh environment and an untrained Q-Learning agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.QLearning(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

# Per-episode histories filled by the training loop.
scores_2act_gamma_0p75, total_rewards_2act_gamma_0p75 = [], []

In [None]:
%%time

for i in range(n_runs):
    #render = i % 100 == 0
    render = 0

    game_over = False
    # Start a new game. The learning state is the reduced RAM restricted to
    # RAM_mask, serialized to bytes (presumably so the agent can use it as a key).
    state = env.reset()
    state = reduce_state(state)[RAM_mask].data.tobytes()  # Select useful bytes
    action = agent.act(state)

    score = 0
    total_reward = 0

    while not game_over:
        if render:
            time.sleep(0.015)
            env.render()

        old_state = state
        ob, reward, game_over, _ = env.step(action)

        # reward_policy() treats a raw reward of 1 as a crossing. Record that
        # before shaping so the score does not depend on REWARD_IF_CROSS being
        # different from every other shaped reward value.
        crossed = reward == 1

        ob = reduce_state(ob)
        reward = reward_policy(reward, ob, action)

        total_reward += reward

        if crossed:
            score += 1

        state = ob[RAM_mask].data.tobytes()

        agent.update_Q(old_state, state, action, reward)

        action = agent.act(state)  # Next action

    scores_2act_gamma_0p75.append(score)
    total_rewards_2act_gamma_0p75.append(total_reward)

    # Progress report every 100 episodes.
    if i % 100 == 0:
        print_result(i, scores_2act_gamma_0p75, total_reward, score)

Run [   0] - Total reward: 4733.00 Mean scores: 13.00 Means Scores[:-10]: 13.00 Score: 13 
CPU times: user 2min 4s, sys: 104 ms, total: 2min 4s
Wall time: 2min 4s


In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("QL/QL_scores_2act_gamma_0p75.txt", "w") as f:
    for item in scores_2act_gamma_0p75:
        print(item, file=f)

# Same format for the accumulated shaped reward of each episode.
with open("QL/QL_total_rewards_2act_gamma_0p75.txt", "w") as f:
    for item in total_rewards_2act_gamma_0p75:
        print(item, file=f)

In [None]:
# with open("QL/QL_scores_2act_gamma_0p75.txt") as f:
#    scores_2act_gamma_0p75 = [int(x) for x in  f.read().splitlines()]

# with open("QL/QL_total_rewards_2act_gamma_0p75.txt") as f:
#    total_rewards_2act_gamma_0p75 = [int(x) for x in  f.read().splitlines()]

### - Results with different $ɣ$ values:

In [None]:
# Compare discount factors over the first 3000 episodes of each run.
aux_plots.plot_3scores(
    scores_2act_gamma_0p9[:3000],
    scores_2act[:3000],
    scores_2act_gamma_0p75[:3000],
    "gamma=0.90",
    "gamma=0.99",
    "gamma=0.75",
)

---

# Monte Carlo Control

## Changing hyper parameters: number of actions

### - 2 actions (move forward or stay):

In [49]:
# Agent hyper-parameters for this experiment (passed to the agent constructor).
GAMMA = 0.99
AVAILABLE_ACTIONS = 2
N0 = 2.5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 7000

In [50]:
# Fresh environment and an untrained Monte Carlo control agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.MonteCarloControl(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

def MonteCarloES(RAM_mask: List[int], render: bool=False):
    """Generate one episode with the module-level `env`/`agent` and learn from it.

    Args:
        RAM_mask: RAM indices forwarded to `episode.generate_episode`.
        render: Forwarded to `episode.generate_episode`.

    Returns:
        Whatever `agent.update_policy` returns; the training loop unpacks it
        as `(score, total_reward)`.
    """
    return agent.update_policy(
        episode.generate_episode(
            env,
            reduce_state=reduce_state,
            reward_policy=reward_policy,
            agent=agent,
            RAM_mask=RAM_mask,
            render=render,
        )
    )

# Per-episode histories filled by the training loop.
scores_2act, total_rewards_2act = [], []

In [51]:
%%time
# Single timed episode. Note that it also calls agent.update_policy(), so the
# agent has already learned from one episode that is not recorded in the
# score/reward histories.
MonteCarloES(RAM_mask=RAM_mask, render=False)

CPU times: user 3.74 s, sys: 4 ms, total: 3.75 s
Wall time: 3.74 s


(13, 4551)

In [52]:
%%time

for i in range(n_runs):
    # Rendering is off; use `render = i % 100 == 0` to watch every 100th episode.
    render = 0

    # Play one full episode and let the agent learn from it.
    score, total_reward = MonteCarloES(RAM_mask=RAM_mask, render=render)
    total_rewards_2act.append(total_reward)
    scores_2act.append(score)

    # Progress report every 100 episodes.
    if not i % 100:
        print_result(i, scores_2act, total_reward, score)

Run [   0] - Total reward: 5797.00 Mean scores: 15.00 Means Scores[:-10]: 15.00 Score: 15 
CPU times: user 2min 6s, sys: 76 ms, total: 2min 6s
Wall time: 2min 6s


In [53]:
# Dump one score per line so the run can be reloaded without retraining.
with open("MC/MC_scores_2act.txt", "w") as f:
    for item in scores_2act:
        print(item, file=f)

# Same format for the total reward of each episode.
with open("MC/MC_total_rewards_2act.txt", "w") as f:
    for item in total_rewards_2act:
        print(item, file=f)

In [None]:
# with open("MC/MC_scores_2act.txt") as f:
#    scores_2act = [int(x) for x in  f.read().splitlines()]

# with open("MC/MC_total_rewards_2act.txt") as f:
#    total_rewards_2act = [int(x) for x in  f.read().splitlines()]

### - 3 actions (move forward, stay or move backward):

In [40]:
# Agent hyper-parameters for this experiment: all three actions are available.
GAMMA = 0.99
AVAILABLE_ACTIONS = 3
N0 = 2.5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# REWARD_IF_BW is needed here because action 2 (move back) is now available.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 
reward_policy.REWARD_IF_BW = -9

# Number of episodes to train for.
n_runs = 3000

In [45]:
# Fresh environment and an untrained Monte Carlo control agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.MonteCarloControl(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

def MonteCarloES(RAM_mask: List[int], render: bool=False):
    """Generate one episode with the module-level `env`/`agent` and learn from it.

    Args:
        RAM_mask: RAM indices forwarded to `episode.generate_episode`.
        render: Forwarded to `episode.generate_episode`.

    Returns:
        Whatever `agent.update_policy` returns; the training loop unpacks it
        as `(score, total_reward)`.
    """
    return agent.update_policy(
        episode.generate_episode(
            env,
            reduce_state=reduce_state,
            reward_policy=reward_policy,
            agent=agent,
            RAM_mask=RAM_mask,
            render=render,
        )
    )

# Per-episode histories filled by the training loop.
scores_3act, total_rewards_3act = [], []

In [46]:
%%time

for i in range(n_runs):
    # Rendering is off; use `render = i % 100 == 0` to watch every 100th episode.
    render = 0

    # Play one full episode and let the agent learn from it.
    score, total_reward = MonteCarloES(RAM_mask=RAM_mask, render=render)
    total_rewards_3act.append(total_reward)
    scores_3act.append(score)

    # Progress report every 100 episodes.
    if not i % 100:
        print_result(i, scores_3act, total_reward, score)

Run [   0] - Total reward: -9550.00 Mean scores: 0.00 Means Scores[:-10]:  0.00 Score:  0 
CPU times: user 2min, sys: 27.9 ms, total: 2min
Wall time: 2min


In [43]:
# Dump one score per line so the run can be reloaded without retraining.
with open("MC/MC_scores_3act.txt", "w") as f:
    for item in scores_3act:
        print(item, file=f)

# Same format for the total reward of each episode.
with open("MC/MC_total_rewards_3act.txt", "w") as f:
    for item in total_rewards_3act:
        print(item, file=f)

In [None]:
# with open("MC/MC_scores_3act.txt") as f:
#    scores_3act = [int(x) for x in  f.read().splitlines()]

# with open("MC/MC_total_rewards_3act.txt") as f:
#    total_rewards_3act = [int(x) for x in  f.read().splitlines()]

### - Results with different number of actions:

In [None]:
# Flat reference line: the baseline mean repeated once per plotted episode.
baseline = [baseline_mean_score] * 3000
aux_plots.plot_3scores(
    scores_3act[:3000],
    scores_2act[:3000],
    baseline[:3000],
    "3 actions (BW, FW or STAY)",
    "2 actions (FW or STAY)",
    "Baseline mean score (FW only)",
)

---

## Changing hyper parameters: Reward values

### - Sparse reward: $+1$ for crossing the street, $-1$ for a collision

In [None]:
# Agent hyper-parameters for this experiment (passed to the agent constructor).
GAMMA = 0.99
AVAILABLE_ACTIONS = 2
N0 = 2.5

# Sparse shaped rewards: only crossing (+1) and collisions (-1) are non-zero.
# reward_policy() reads these as attributes on itself.
reward_policy.REWARD_IF_CROSS = 1
reward_policy.REWARD_IF_COLISION = -1
reward_policy.REWARD_IF_STILL = 0
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 3000

In [None]:
# Fresh environment and an untrained Monte Carlo control agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.MonteCarloControl(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

def MonteCarloES(RAM_mask: List[int], render: bool=False):
    """Generate one episode with the module-level `env`/`agent` and learn from it.

    Args:
        RAM_mask: RAM indices forwarded to `episode.generate_episode`.
        render: Forwarded to `episode.generate_episode`.

    Returns:
        Whatever `agent.update_policy` returns; the training loop unpacks it
        as `(score, total_reward)`.
    """
    return agent.update_policy(
        episode.generate_episode(
            env,
            reduce_state=reduce_state,
            reward_policy=reward_policy,
            agent=agent,
            RAM_mask=RAM_mask,
            render=render,
        )
    )

# Per-episode histories filled by the training loop.
scores_2act_sparseR, total_rewards_2act_sparseR = [], []

In [None]:
%%time

for i in range(n_runs):
    # Rendering is off; use `render = i % 100 == 0` to watch every 100th episode.
    render = 0

    # Play one full episode and let the agent learn from it.
    score, total_reward = MonteCarloES(RAM_mask=RAM_mask, render=render)
    total_rewards_2act_sparseR.append(total_reward)
    scores_2act_sparseR.append(score)

    # Progress report every 100 episodes.
    if not i % 100:
        print_result(i, scores_2act_sparseR, total_reward, score)

In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("MC/MC_scores_2act_sparseR.txt", "w") as f:
    for item in scores_2act_sparseR:
        print(item, file=f)

# Same format for the total reward of each episode.
with open("MC/MC_total_rewards_2act_sparseR.txt", "w") as f:
    for item in total_rewards_2act_sparseR:
        print(item, file=f)

In [None]:
# with open("MC/MC_scores_2act_sparseR.txt") as f:
#    scores_2act_sparseR = [int(x) for x in  f.read().splitlines()]

# with open("MC/MC_total_rewards_2act_sparseR.txt") as f:
#    total_rewards_2act_sparseR = [int(x) for x in  f.read().splitlines()]

### - Results with sparse and dense rewards:

In [None]:
# Flat reference line: the baseline mean repeated once per plotted episode.
baseline = [baseline_mean_score] * 3000
aux_plots.plot_3scores(
    scores_2act_sparseR[:3000],
    scores_2act[:3000],
    baseline[:3000],
    "Sparse Reward",
    "Dense Reward",
    "Baseline mean score",
)

---

## Changing hyper parameters: Learning rate

### - $N0$ = 0.001

In [None]:
# Agent hyper-parameters for this experiment; N0 is the value being varied.
GAMMA = 0.99
AVAILABLE_ACTIONS = 2
N0 =  0.001

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 7000

In [None]:
# Fresh environment and an untrained Monte Carlo control agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.MonteCarloControl(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

def MonteCarloES(RAM_mask: List[int], render: bool=False):
    """Generate one episode with the module-level `env`/`agent` and learn from it.

    Args:
        RAM_mask: RAM indices forwarded to `episode.generate_episode`.
        render: Forwarded to `episode.generate_episode`.

    Returns:
        Whatever `agent.update_policy` returns; the training loop unpacks it
        as `(score, total_reward)`.
    """
    return agent.update_policy(
        episode.generate_episode(
            env,
            reduce_state=reduce_state,
            reward_policy=reward_policy,
            agent=agent,
            RAM_mask=RAM_mask,
            render=render,
        )
    )

# Per-episode histories filled by the training loop.
scores_2act_N0_0, total_rewards_2act_N0_0 = [], []

In [None]:
%%time

for i in range(n_runs):
    # Rendering is off; use `render = i % 100 == 0` to watch every 100th episode.
    render = 0

    # Play one full episode and let the agent learn from it.
    score, total_reward = MonteCarloES(RAM_mask=RAM_mask, render=render)
    total_rewards_2act_N0_0.append(total_reward)
    scores_2act_N0_0.append(score)

    # Progress report every 100 episodes.
    if not i % 100:
        print_result(i, scores_2act_N0_0, total_reward, score)

In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("MC/MC_scores_2act_N0_0.txt", "w") as f:
    for item in scores_2act_N0_0:
        print(item, file=f)

# Same format for the total reward of each episode.
with open("MC/MC_total_rewards_2act_N0_0.txt", "w") as f:
    for item in total_rewards_2act_N0_0:
        print(item, file=f)

In [None]:
# with open("MC/MC_scores_2act_N0_0.txt") as f:
#    scores_2act_N0_0 = [int(x) for x in  f.read().splitlines()]

# with open("MC/MC_total_rewards_2act_N0_0.txt") as f:
#    total_rewards_2act_N0_0 = [int(x) for x in  f.read().splitlines()]

### - $N0$ = 5

In [None]:
# Agent hyper-parameters for this experiment; N0 is the value being varied.
GAMMA = 0.99
AVAILABLE_ACTIONS = 2
N0 =  5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 7000

In [None]:
# Fresh environment and an untrained Monte Carlo control agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.MonteCarloControl(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

def MonteCarloES(RAM_mask: List[int], render: bool=False):
    """Generate one episode with the module-level `env`/`agent` and learn from it.

    Args:
        RAM_mask: RAM indices forwarded to `episode.generate_episode`.
        render: Forwarded to `episode.generate_episode`.

    Returns:
        Whatever `agent.update_policy` returns; the training loop unpacks it
        as `(score, total_reward)`.
    """
    return agent.update_policy(
        episode.generate_episode(
            env,
            reduce_state=reduce_state,
            reward_policy=reward_policy,
            agent=agent,
            RAM_mask=RAM_mask,
            render=render,
        )
    )

# Per-episode histories filled by the training loop.
scores_2act_N0_5, total_rewards_2act_N0_5 = [], []

In [None]:
%%time

for i in range(n_runs):
    # Rendering is off; use `render = i % 100 == 0` to watch every 100th episode.
    render = 0

    # Play one full episode and let the agent learn from it.
    score, total_reward = MonteCarloES(RAM_mask=RAM_mask, render=render)
    total_rewards_2act_N0_5.append(total_reward)
    scores_2act_N0_5.append(score)

    # Progress report every 100 episodes.
    if not i % 100:
        print_result(i, scores_2act_N0_5, total_reward, score)

In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("MC/MC_scores_2act_N0_5.txt", "w") as f:
    for item in scores_2act_N0_5:
        print(item, file=f)

# Same format for the total reward of each episode.
with open("MC/MC_total_rewards_2act_N0_5.txt", "w") as f:
    for item in total_rewards_2act_N0_5:
        print(item, file=f)

In [None]:
# with open("MC/MC_scores_2act_N0_5.txt") as f:
#    scores_2act_N0_5 = [int(x) for x in  f.read().splitlines()]

# with open("MC/MC_total_rewards_2act_N0_5.txt") as f:
#    total_rewards_2act_N0_5 = [int(x) for x in  f.read().splitlines()]

### - Results with different $N0$ values:

In [None]:
# Compare the three N0 settings. The first series was trained with N0 = 0.001,
# so its legend entry states that value rather than "0.0".
aux_plots.plot_3scores(
    scores_2act_N0_0,
    scores_2act,
    scores_2act_N0_5,
    "N0=0.001",
    "N0=2.5",
    "N0=5.0",
)

---

## Changing hyper parameters: Discount factor

### - $ɣ$ = 0.90

In [None]:
# Agent hyper-parameters for this experiment; GAMMA is the value being varied.
GAMMA = 0.90
AVAILABLE_ACTIONS = 2
N0 =  2.5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 3000

In [None]:
# Fresh environment and an untrained Monte Carlo control agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.MonteCarloControl(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

def MonteCarloES(RAM_mask: List[int], render: bool=False):
    """Generate one episode with the module-level `env`/`agent` and learn from it.

    Args:
        RAM_mask: RAM indices forwarded to `episode.generate_episode`.
        render: Forwarded to `episode.generate_episode`.

    Returns:
        Whatever `agent.update_policy` returns; the training loop unpacks it
        as `(score, total_reward)`.
    """
    return agent.update_policy(
        episode.generate_episode(
            env,
            reduce_state=reduce_state,
            reward_policy=reward_policy,
            agent=agent,
            RAM_mask=RAM_mask,
            render=render,
        )
    )

# Per-episode histories filled by the training loop.
scores_2act_gamma_0p9, total_rewards_2act_gamma_0p9 = [], []

In [None]:
%%time

for i in range(n_runs):
    # Rendering is off; use `render = i % 100 == 0` to watch every 100th episode.
    render = 0

    # Play one full episode and let the agent learn from it.
    score, total_reward = MonteCarloES(RAM_mask=RAM_mask, render=render)
    total_rewards_2act_gamma_0p9.append(total_reward)
    scores_2act_gamma_0p9.append(score)

    # Progress report every 100 episodes.
    if not i % 100:
        print_result(i, scores_2act_gamma_0p9, total_reward, score)

In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("MC/MC_scores_2act_gamma_0p9.txt", "w") as f:
    for item in scores_2act_gamma_0p9:
        print(item, file=f)

# Same format for the total reward of each episode.
with open("MC/MC_total_rewards_2act_gamma_0p9.txt", "w") as f:
    for item in total_rewards_2act_gamma_0p9:
        print(item, file=f)

In [None]:
# with open("MC/MC_scores_2act_gamma_0p9.txt") as f:
#    scores_2act_gamma_0p9 = [int(x) for x in  f.read().splitlines()]

# with open("MC/MC_total_rewards_2act_gamma_0p9.txt") as f:
#    total_rewards_2act_gamma_0p9 = [int(x) for x in  f.read().splitlines()]

### - $ɣ$ = 0.75

In [None]:
# Agent hyper-parameters for this experiment; GAMMA is the value being varied.
GAMMA = 0.75
AVAILABLE_ACTIONS = 2
N0 =  2.5

# Shaped reward values; reward_policy() reads them as attributes on itself.
# NOTE(review): REWARD_IF_BW is not set here — presumably action 2 is never
# chosen when AVAILABLE_ACTIONS = 2; confirm in src/agents.py.
reward_policy.REWARD_IF_CROSS = 500
reward_policy.REWARD_IF_COLISION = -10
reward_policy.REWARD_IF_STILL = -1
reward_policy.REWARD_IF_FW = 0 

# Number of episodes to train for.
n_runs = 3000

In [None]:
# Fresh environment and an untrained Monte Carlo control agent for this experiment.
env, initial_state = environment.get_env()
agent = agents.MonteCarloControl(
    gamma=GAMMA,
    available_actions=AVAILABLE_ACTIONS,
    N0=N0,
)

def MonteCarloES(RAM_mask: List[int], render: bool=False):
    """Generate one episode with the module-level `env`/`agent` and learn from it.

    Args:
        RAM_mask: RAM indices forwarded to `episode.generate_episode`.
        render: Forwarded to `episode.generate_episode`.

    Returns:
        Whatever `agent.update_policy` returns; the training loop unpacks it
        as `(score, total_reward)`.
    """
    return agent.update_policy(
        episode.generate_episode(
            env,
            reduce_state=reduce_state,
            reward_policy=reward_policy,
            agent=agent,
            RAM_mask=RAM_mask,
            render=render,
        )
    )

# Per-episode histories filled by the training loop.
scores_2act_gamma_0p75, total_rewards_2act_gamma_0p75 = [], []

In [None]:
%%time

for i in range(n_runs):
    # Rendering is off; use `render = i % 100 == 0` to watch every 100th episode.
    render = 0

    # Play one full episode and let the agent learn from it.
    score, total_reward = MonteCarloES(RAM_mask=RAM_mask, render=render)
    total_rewards_2act_gamma_0p75.append(total_reward)
    scores_2act_gamma_0p75.append(score)

    # Progress report every 100 episodes.
    if not i % 100:
        print_result(i, scores_2act_gamma_0p75, total_reward, score)

In [None]:
# Dump one score per line so the run can be reloaded without retraining.
with open("MC/MC_scores_2act_gamma_0p75.txt", "w") as f:
    for item in scores_2act_gamma_0p75:
        print(item, file=f)

# Same format for the total reward of each episode.
with open("MC/MC_total_rewards_2act_gamma_0p75.txt", "w") as f:
    for item in total_rewards_2act_gamma_0p75:
        print(item, file=f)

In [None]:
# with open("MC/MC_scores_2act_gamma_0p75.txt") as f:
#    scores_2act_gamma_0p75 = [int(x) for x in  f.read().splitlines()]

# with open("MC/MC_total_rewards_2act_gamma_0p75.txt") as f:
#    total_rewards_2act_gamma_0p75 = [int(x) for x in  f.read().splitlines()]

### - Results with different $ɣ$ values:

In [None]:
# Compare discount factors over the first 3000 episodes of each run.
aux_plots.plot_3scores(
    scores_2act_gamma_0p9[:3000],
    scores_2act[:3000],
    scores_2act_gamma_0p75[:3000],
    "gamma=0.90",
    "gamma=0.99",
    "gamma=0.75",
)