In [None]:
import gym
import numpy as np
import random
import csv
# Module-level Gym environment shared by the training code below
# (learn_one_episode reads it as a global for reset() and step()).
# NOTE(review): the "-ram" id presumably yields the emulator RAM as the
# observation — confirm against the installed gym/ALE version.
env = gym.make("Boxing-ram-v0")

In [None]:
def build_state(features, width=3):
    """Encode a sequence of non-negative integer features as one integer key.

    Every feature is truncated with ``int()``, written in decimal and
    left-padded with zeros to exactly ``width`` digits before the pieces are
    concatenated.  The fixed width is what makes the encoding unambiguous:
    without padding, ``[1, 23]`` and ``[12, 3]`` would both collapse to
    ``123`` and share a single Q-table entry.

    Args:
        features: iterable of numbers; each must satisfy
            ``0 <= int(feature) < 10 ** width``.
        width: number of decimal digits reserved per feature.  The default
            of 3 covers values 0..999.

    Returns:
        int: a key that is unique among feature vectors of the same length.

    Raises:
        ValueError: if ``width`` is smaller than 1, ``features`` is empty, or
            a feature falls outside the representable range.
    """
    if width < 1:
        raise ValueError("width must be at least 1")

    limit = 10 ** width
    chunks = []
    for feature in features:
        value = int(feature)
        # Out-of-range values would either break int() parsing (negatives)
        # or overflow their slot and re-introduce collisions.
        if not 0 <= value < limit:
            raise ValueError(
                "feature {!r} does not fit in {} decimal digits".format(feature, width)
            )
        chunks.append(str(value).zfill(width))

    if not chunks:
        raise ValueError("features must not be empty")

    return int("".join(chunks))


In [None]:
class QLearn:
    """Tabular Q-learning agent whose table is a dict keyed by (state, action).

    Attributes:
        q: dict mapping ``(state, action)`` tuples to their learned value.
        epsilon: probability that ``chooseAction`` perturbs the Q-values
            with random noise before picking the best one.
        alpha: step size used when moving a stored value towards a target.
        gamma: factor applied to the best next-state value in ``learn``.
        actions: sequence of actions the agent can select.
    """

    def __init__(self, actions, epsilon, alpha, gamma):
        self.actions = actions
        self.epsilon = epsilon  # exploration probability
        self.alpha = alpha  # learning rate (step size)
        self.gamma = gamma  # discount factor
        self.q = {}

    def getQ(self, state, action):
        """Return the stored value for ``(state, action)``, or 0.0 if unseen."""
        key = (state, action)
        return self.q[key] if key in self.q else 0.0

    def learnQ(self, state, action, reward, value):
        """Update the table entry for ``(state, action)``.

        A pair seen for the first time is initialised to ``reward``;
        otherwise the stored value moves towards ``value``:

            Q(s, a) <- Q(s, a) + alpha * (value - Q(s, a))
        """
        key = (state, action)
        previous = self.q.get(key)
        if previous is not None:
            self.q[key] = previous + self.alpha * (value - previous)
        else:
            self.q[key] = reward

    def chooseAction(self, state, return_q=False):
        """Pick an action for ``state``.

        With probability ``epsilon`` every Q-value is shifted by uniform
        noise drawn from ``[-mag / 2, mag / 2)``, where ``mag`` is the
        largest absolute Q-value for this state.  The action with the
        highest (possibly perturbed) value wins; ties are broken uniformly
        at random.

        Args:
            state: key of the state to act in.
            return_q: when true, also return the list of values the choice
                was based on (including any exploration noise).

        Returns:
            The chosen action, or ``(action, values)`` if ``return_q``.
        """
        scores = [self.getQ(state, candidate) for candidate in self.actions]
        top = max(scores)

        explore = random.random() < self.epsilon
        if explore:
            spread = max(abs(min(scores)), abs(top))
            half_spread = .5 * spread
            # One random draw per action, in action order.
            scores = [score + random.random() * spread - half_spread
                      for score in scores]
            top = max(scores)

        if scores.count(top) > 1:
            # Several actions share the maximum: pick one of them at random.
            tied = [idx for idx, score in enumerate(scores) if score == top]
            chosen = random.choice(tied)
        else:
            chosen = scores.index(top)

        picked = self.actions[chosen]
        return (picked, scores) if return_q else picked

    def learn(self, state1, action1, reward, state2):
        """Apply one Q-learning update for the transition state1 -> state2.

        The target is ``reward + gamma * max_a Q(state2, a)``.
        """
        best_next = max(self.getQ(state2, candidate) for candidate in self.actions)
        target = reward + self.gamma * best_next
        self.learnQ(state1, action1, reward, target)

In [None]:
import csv

def csv_log(episode, g, path='./log.csv'):
    """Append one ``[episode, g]`` row to a CSV log file.

    Args:
        episode: value written to the first column.
        g: value written to the second column.
        path: file to append to; it is created if it does not exist.
            Defaults to ``./log.csv``.
    """
    # `with` closes the handle even if writing fails; newline='' lets the csv
    # module control line endings itself, as its documentation requires.
    with open(path, mode='a', newline='') as log_file:
        csv.writer(log_file).writerow([episode, g])

In [None]:
def learn_one_episode(Q, episode):
    """Run one episode in the module-level ``env`` while training ``Q``.

    At each step the agent picks an action for the encoded observation, the
    environment is advanced, and ``Q.learn`` is called with the encoded
    current and *next* observations.  When the episode ends, the summed
    reward is appended to the CSV log and printed.

    Args:
        Q: agent providing ``chooseAction(state)`` and
            ``learn(state1, action1, reward, state2)``.
        episode: episode index, used only for logging and printing.
    """
    done = False
    G = 0
    state_key = build_state(env.reset())
    while not done:
        action = Q.chooseAction(state_key)
        observation, reward, done, _info = env.step(action)
        # The update must bootstrap from the state reached *after* the
        # action, not from the state the action was taken in.
        next_key = build_state(observation)
        Q.learn(state_key, action, reward, next_key)
        G += reward
        state_key = next_key
    csv_log(episode, G)
    print('Episode {} Total Reward: {}'.format(episode, G))

In [None]:
# Training runs in consecutive phases; each row is the (epsilon, alpha, gamma)
# triple applied to the agent for the whole phase.
TRAINING_SCHEDULE = [
    (0.4, 0.618, 0.9),
    (0.3, 0.518, 0.8),
    (0.2, 0.418, 0.7),
    (0.1, 0.318, 0.6),
]
EPISODES_PER_PHASE = 1000

Q = QLearn(list(range(0, 18)), *TRAINING_SCHEDULE[0])

# The episode counter keeps increasing across phases so that every row
# appended to the log carries a unique episode number.
episode = 0
for epsilon, alpha, gamma in TRAINING_SCHEDULE:
    Q.epsilon, Q.alpha, Q.gamma = epsilon, alpha, gamma
    for _ in range(EPISODES_PER_PHASE):
        episode += 1
        learn_one_episode(Q, episode)

Episode 1 Total Reward: -2.0
Episode 2 Total Reward: 1.0
Episode 3 Total Reward: 10.0
Episode 4 Total Reward: 10.0
Episode 5 Total Reward: -3.0
Episode 6 Total Reward: 13.0
Episode 7 Total Reward: 6.0
Episode 8 Total Reward: 4.0
Episode 9 Total Reward: 1.0
Episode 10 Total Reward: -2.0
Episode 11 Total Reward: -5.0
Episode 12 Total Reward: 1.0
Episode 13 Total Reward: 5.0
Episode 14 Total Reward: -2.0
Episode 15 Total Reward: -2.0
Episode 16 Total Reward: 1.0
Episode 17 Total Reward: 2.0
Episode 18 Total Reward: 1.0
Episode 19 Total Reward: 1.0
Episode 20 Total Reward: 0.0
Episode 21 Total Reward: -1.0
Episode 22 Total Reward: 4.0
Episode 23 Total Reward: -2.0
Episode 24 Total Reward: 1.0
Episode 25 Total Reward: 1.0
Episode 26 Total Reward: 4.0
Episode 27 Total Reward: -5.0
Episode 28 Total Reward: 5.0
Episode 29 Total Reward: -9.0
Episode 30 Total Reward: 0.0
