### Anjali Pugalia MDS202107
### Ankush Dey MDS202108
### Rititrupa Dey MDS202136

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import imageio
import os
from IPython.display import HTML

def save_frames_as_gif(frames, path='./', filename='gym_animation.gif'):
    """Takes a list of frames (each frame can be generated with the `env.render()` function from OpenAI gym)
    and converts it into GIF, and saves it to the specified location.
    Code adapted from this gist: https://gist.github.com/botforge/64cbb71780e6208172bbf03cd9293553
    Args:
        frames (list): A list of frames generated with the env.render() function
        path (str, optional): The folder in which to save the generated GIF. Defaults to './'.
        filename (str, optional): The target filename. Defaults to 'gym_animation.gif'.
    """
    imageio.mimwrite(os.path.join(path, filename), frames, fps=15)

In [3]:
env = gym.make("CartPole-v1" , render_mode="rgb_array")


In [4]:
def create_bins_and_q_table():

    numBins = 20
    obsSpaceSize = len(env.observation_space.high)

    # Get the size of each bucket
    bins = [
        np.linspace(-4.8, 4.8, numBins),
        np.linspace(-4, 4, numBins),
        np.linspace(-.418, .418, numBins),
        np.linspace(-4, 4, numBins)
    ]

    qTable = np.random.uniform(low=-2, high=0, size=([numBins] * obsSpaceSize + [env.action_space.n]))

    return bins, obsSpaceSize, qTable


In [5]:
def get_discrete_state(state, bins, obsSpaceSize):
    stateIndex = []
    for i in range(obsSpaceSize):
        stateIndex.append(np.digitize(state[i], bins[i]) - 1) # -1 will turn bin into index
    return tuple(stateIndex)


In [6]:
frames = []
LEARNING_RATE = 0.05
DISCOUNT = 0.90
RUNS = 8000  # Number of iterations run
SHOW_EVERY = 2000  
UPDATE_EVERY = 250  

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = RUNS // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

bins, obsSpaceSize, qTable = create_bins_and_q_table()

previousCnt = []  # array of all scores over runs
metrics = {'ep': [], 'avg': [], 'min': [], 'max': []}  # metrics recorded for graph

for run in range(RUNS + 1):
    state1 = env.reset()
    discreteState = get_discrete_state(state1[0], bins, obsSpaceSize)
    done = False  # has the enviroment finished?
    cnt = 0  # how may movements cart has made

    while not done:
        if run == 8000: #//render
            frame = env.render()
            frames.append(frame) # if running RL comment this out

        cnt += 1
        # Get action from Q table
        if np.random.random() > epsilon:
            action = np.argmax(qTable[discreteState])
		# Get random action
        else:
            action = np.random.randint(0, env.action_space.n)
        newState, reward, done, _, extra = env.step(action)  # perform action on enviroment

        newDiscreteState = get_discrete_state(newState, bins, obsSpaceSize)

        maxFutureQ = np.max(qTable[newDiscreteState])  # estimate of optiomal future value
        currentQ = qTable[discreteState + (action, )]  # old value

		# pole fell over / went out of bounds, negative reward
		# if done and cnt < 200:
		# 	reward = -375

		# formula to caculate all Q values
        newQ = (1 - LEARNING_RATE) * currentQ + LEARNING_RATE * (reward + DISCOUNT * maxFutureQ)
        qTable[discreteState + (action, )] = newQ  # Update qTable with new Q value

        discreteState = newDiscreteState

    previousCnt.append(cnt)

	# Decaying is being done every run if run number is within decaying range
    if END_EPSILON_DECAYING >= run >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

	# Add new metrics for graph
    if run % UPDATE_EVERY == 0:
        latestRuns = previousCnt[-UPDATE_EVERY:]
        averageCnt = sum(latestRuns) / len(latestRuns)
        metrics['ep'].append(run)
        metrics['avg'].append(averageCnt)
        metrics['min'].append(min(latestRuns))
        metrics['max'].append(max(latestRuns))
        print("Run:", run, "Average:", averageCnt, "Min:", min(latestRuns), "Max:", max(latestRuns))

#print("Run:", run, "Average:", averageCnt, "Min:", min(latestRuns), "Max:", max(latestRuns))
env.close()
save_frames_as_gif(frames, path='./', filename='random_agent11.gif')

Run: 0 Average: 15.0 Min: 15 Max: 15
Run: 250 Average: 21.708 Min: 8 Max: 82
Run: 500 Average: 25.748 Min: 8 Max: 94
Run: 750 Average: 25.74 Min: 8 Max: 91
Run: 1000 Average: 29.148 Min: 9 Max: 90
Run: 1250 Average: 33.424 Min: 10 Max: 110
Run: 1500 Average: 37.312 Min: 10 Max: 138
Run: 1750 Average: 42.504 Min: 9 Max: 165
Run: 2000 Average: 53.008 Min: 10 Max: 146
Run: 2250 Average: 55.032 Min: 11 Max: 154
Run: 2500 Average: 65.52 Min: 12 Max: 271
Run: 2750 Average: 87.96 Min: 13 Max: 244
Run: 3000 Average: 100.472 Min: 20 Max: 454
Run: 3250 Average: 107.516 Min: 13 Max: 595
Run: 3500 Average: 95.528 Min: 20 Max: 296
Run: 3750 Average: 118.564 Min: 38 Max: 790
Run: 4000 Average: 245.676 Min: 33 Max: 13184
Run: 4250 Average: 110.256 Min: 48 Max: 200
Run: 4500 Average: 112.888 Min: 57 Max: 204
Run: 4750 Average: 110.976 Min: 43 Max: 192
Run: 5000 Average: 112.732 Min: 53 Max: 183
Run: 5250 Average: 114.276 Min: 58 Max: 181
Run: 5500 Average: 113.584 Min: 49 Max: 185
Run: 5750 Average: 1

In [7]:
len(frames)

125

In [8]:
HTML('<img src="./random_agent11.gif">')

In [9]:
print("The Updated Q-Table shows the optimal q values after running")
qTable[-1][-1]

The Updated Q-Table shows the optimal q values after running


array([[[-6.20741571e-01, -8.36379676e-01],
        [-1.32251550e+00, -5.49759246e-01],
        [-5.64592448e-01, -1.20696397e+00],
        [-1.36468021e+00, -3.27526945e-01],
        [-1.08359427e+00, -1.31333995e+00],
        [-4.77734711e-01, -8.15983863e-01],
        [-1.56418595e+00, -4.56863288e-01],
        [-3.20273090e-01, -3.30669346e-01],
        [-9.45208484e-01, -1.00903190e+00],
        [-9.70846287e-01, -1.20344645e-01],
        [-1.61211470e+00, -1.06674501e+00],
        [-1.84286899e+00, -1.55402684e+00],
        [-1.73164549e+00, -1.22494071e+00],
        [-7.38675694e-01, -1.23495314e+00],
        [-1.23770598e+00, -1.46935663e-01],
        [-1.75010252e+00, -9.26970169e-01],
        [-1.15608055e+00, -7.93774696e-01],
        [-4.90209031e-02, -1.90114413e+00],
        [-1.23803546e+00, -3.06782457e-01],
        [-1.10813877e+00, -1.42768459e-02]],

       [[-5.11322329e-01, -4.99029828e-01],
        [-4.84051388e-02, -6.85289022e-01],
        [-1.19404273e+00, -9.7

## Second part

In [10]:
LEARNING_RATE = 0.1

DISCOUNT = 0.95
RUNS = 8000  # Number of iterations run
SHOW_EVERY = 2000  
UPDATE_EVERY = 250  

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = RUNS // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

bins, obsSpaceSize, qTable = create_bins_and_q_table()

previousCnt = []  # array of all scores over runs
metrics = {'ep': [], 'avg': [], 'min': [], 'max': []}  # metrics recorded for graph


for run in range(RUNS+1):
    state1 = env.reset()
    discreteState = get_discrete_state(state1[0], bins, obsSpaceSize)
    done = False  # has the enviroment finished?
    cnt = 0  # how may movements cart has made

    while not done:
       
        if run == 8000:
            frame = env.render()
            frames.append(frame)  # if running RL comment this out


        cnt += 1
		# Get action from Q table
        if np.random.random() > epsilon:
            action = np.argmax(qTable[discreteState])
		# Get random action
        else:
            action = np.random.randint(0, env.action_space.n)
        newState, reward, done, _ , extra = (env.step(action)) # perform action on enviroment
        if (newState[2] > 0.104 and newState[2] < .209) or (newState[2] < -0.104 and newState[2] > -.209):
            reward = 2
        else:
            reward = 0
        if (abs(newState[2]) > 0.20944 or abs(newState[0]) > 2.4) or cnt >= 500:
            break 
    
    # reward = reward_mod(newState)
    
    
    # add condition for reward here 
    

        newDiscreteState = get_discrete_state(newState, bins, obsSpaceSize)
    
        maxFutureQ = np.max(qTable[newDiscreteState])  # estimate of optiomal future value
        currentQ = qTable[discreteState + (action, )]  # old value
    

		# pole fell over / went out of bounds, negative reward
        

		# formula to calculate all Q values
        newQ = (1 - LEARNING_RATE) * currentQ + LEARNING_RATE * (reward + DISCOUNT * maxFutureQ)
        qTable[discreteState + (action, )] = newQ  # Update qTable with new Q value

        discreteState = newDiscreteState

    previousCnt.append(cnt)
  

	# Decaying is being done every run if run number is within decaying range
    if END_EPSILON_DECAYING >= run >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

	# Add new metrics for graph
    if run % UPDATE_EVERY == 0:
        latestRuns = previousCnt[-UPDATE_EVERY:]
        averageCnt = sum(latestRuns) / len(latestRuns)
        metrics['ep'].append(run)
        metrics['avg'].append(averageCnt)
        metrics['min'].append(min(latestRuns))
        metrics['max'].append(max(latestRuns))
        print("Run:", run, "Average:", averageCnt, "Min:", min(latestRuns), "Max:", max(latestRuns))

#print("Run:", run, "Average:", averageCnt, "Min:", min(latestRuns), "Max:", max(latestRuns))
env.close()
save_frames_as_gif(frames, path='./', filename='random_agent12.gif')

Run: 0 Average: 20.0 Min: 20 Max: 20
Run: 250 Average: 20.828 Min: 9 Max: 71
Run: 500 Average: 19.344 Min: 8 Max: 67
Run: 750 Average: 19.968 Min: 8 Max: 54
Run: 1000 Average: 18.58 Min: 9 Max: 65
Run: 1250 Average: 20.748 Min: 9 Max: 65
Run: 1500 Average: 20.16 Min: 9 Max: 62
Run: 1750 Average: 20.704 Min: 9 Max: 92
Run: 2000 Average: 20.82 Min: 9 Max: 67
Run: 2250 Average: 24.564 Min: 9 Max: 106
Run: 2500 Average: 30.464 Min: 9 Max: 100
Run: 2750 Average: 31.66 Min: 9 Max: 95
Run: 3000 Average: 37.688 Min: 9 Max: 117
Run: 3250 Average: 47.732 Min: 9 Max: 115
Run: 3500 Average: 51.068 Min: 9 Max: 106
Run: 3750 Average: 57.84 Min: 11 Max: 111
Run: 4000 Average: 67.348 Min: 21 Max: 104
Run: 4250 Average: 64.688 Min: 35 Max: 88
Run: 4500 Average: 66.764 Min: 35 Max: 90
Run: 4750 Average: 68.9 Min: 35 Max: 91
Run: 5000 Average: 65.404 Min: 35 Max: 106
Run: 5250 Average: 67.928 Min: 35 Max: 89
Run: 5500 Average: 68.684 Min: 35 Max: 104
Run: 5750 Average: 66.764 Min: 35 Max: 88
Run: 6000 Av

In [11]:
HTML('<img src="./random_agent12.gif">')

In [12]:
print("The Updated Q-Table shows the optimal q values after running")
qTable[-1][-1]

The Updated Q-Table shows the optimal q values after running


array([[[-0.91174789, -0.81806955],
        [-1.44730952, -0.53024561],
        [-0.90999712, -1.35222101],
        [-0.80003105, -1.39046972],
        [-0.16621903, -0.82316347],
        [-1.94182429, -0.87202294],
        [-0.76857238, -0.98058728],
        [-1.18106035, -1.65807264],
        [-0.07478846, -0.88958709],
        [-0.81637823, -0.713458  ],
        [-1.4385511 , -0.06945146],
        [-0.68544282, -1.66397286],
        [-1.70460696, -1.38488906],
        [-1.04612356, -1.57530755],
        [-1.36034006, -1.29675356],
        [-0.84637053, -1.54748808],
        [-0.42026908, -0.72716943],
        [-1.72918025, -1.03524873],
        [-1.38989174, -1.36791417],
        [-1.87508642, -1.77763339]],

       [[-0.07915671, -1.31640454],
        [-0.45516375, -1.78384579],
        [-0.65034228, -0.20891379],
        [-0.6627779 , -0.81214574],
        [-0.01834734, -1.98880729],
        [-0.67815359, -1.86036265],
        [-1.0113034 , -0.07670732],
        [-1.1324417 , -1.0