# Q-Learning

* Q Table은 초기에 0값 (np.zeros) 로 시작합니다. 
* Q Table의 row는 states가 되고, column은 actions로 간주

### FrozenLake v0

```
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)
```

Actions 

* 왼쪽으로 이동: 0
* 아래로 이동: 1
* 오른쪽으로 이동: 2
* 위로 이동: 3


### Bellman Equation 



## $$ Q(s, a) = r + \gamma \max_{a^{\prime}} Q(s^{\prime}, a^{\prime}) $$ 

# Code Implementation

### Load Dependencies

In [51]:
import gym
import numpy as np
from time import sleep
from IPython.display import clear_output

In [170]:
class QLearning(object):
    """Tabular Q-learning agent for a gym environment with discrete spaces.

    The Q table has one row per state and one column per action and starts
    out filled with zeros.
    """

    def __init__(self, game, epoch = 20000, lr=0.85, gamma=0.90):
        """Create the environment and an all-zero Q table.

        Args:
            game: Environment id handed to ``gym.make``.
            epoch: Number of training episodes that ``train`` runs.
            lr: Learning rate applied to every Q update.
            gamma: Discount factor for the best next-state value.
        """
        self.env = gym.make(game)
        self.epoch = epoch
        self.lr = lr
        self.gamma = gamma
        self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n))

    def train(self):
        """Run ``self.epoch`` episodes, updating ``self.Q`` in place.

        Relies on ``env.reset()`` returning the state and ``env.step()``
        returning a ``(state, reward, done, info)`` 4-tuple.
        """
        env, Q, lr, gamma = self.env, self.Q, self.lr, self.gamma
        n_actions = env.action_space.n
        for i in range(1, self.epoch + 1):
            state = env.reset()

            while True:
                # Greedy choice plus Gaussian noise that shrinks as the
                # episode index grows.
                noise = np.random.randn(1, n_actions) * (1. / (i + 1))
                action = np.argmax(Q[state, :] + noise)
                next_state, reward, done, _ = env.step(action)

                # Move Q(s, a) towards r + gamma * max_a' Q(s', a').
                target = reward + gamma * np.max(Q[next_state, :])
                Q[state, action] += lr * (target - Q[state, action])
                if done:
                    break

                state = next_state

            # Progress report every 1000 episodes.
            if i % 1000 == 0:
                self.render()
                print('epoch: %s' % i)
                print(Q)

    def play(self):
        """Play one episode with the current Q table and print the result.

        Actions are picked greedily from ``self.Q`` with a small amount of
        Gaussian noise (scale 0.01).
        """
        # Use the instance's table; a bare ``Q`` is not defined in this scope.
        env, Q = self.env, self.Q
        n_actions = env.action_space.n
        score = 0
        count = 0
        state = env.reset()
        while True:
            noise = np.random.randn(1, n_actions) * 0.01
            action = np.argmax(Q[state, :] + noise)
            state, reward, done, _ = env.step(action)

            self.render()
            count += 1
            score += reward
            if done:
                self.render()
                break
        print('Total Score: %s Move Count: %s' % (score, count))

    def render(self):
        """Clear the notebook output, draw the environment, pause 0.1 s."""
        clear_output(True)
        self.env.render()
        sleep(0.1)

# Build the agent on FrozenLake-v0, train it, then play one episode.
q = QLearning('FrozenLake-v0')
q.train()
q.play()

SFFF
FHF[41mH[0m
FFFH
HFFG
  (Right)
Total Score: 0.0 Move Count: 18


In [155]:

env = gym.make('FrozenLake-v0')
# Start from an all-zero Q table: one row per state, one column per action.
Q = np.zeros([env.observation_space.n, env.action_space.n])
# Learning parameters: learning rate, discount factor, number of episodes.
lr = .85
y = .99
num_episodes = 2000
# Total reward collected in each episode.
rList = []
for i in range(num_episodes):
    # Reset the environment and get the first observation.
    s = env.reset()
    rAll = 0
    # Q-table learning, capped at 99 steps per episode.
    for j in range(1, 100):
        # Pick the greedy action, with noise that shrinks as episodes go by.
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        # Apply the action and observe the new state and reward.
        s1, r, d, _ = env.step(a)
        # Move Q(s, a) towards r + y * max_a' Q(s1, a').
        Q[s, a] += lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)

print(Q)

[2016-11-19 22:43:54,665] Making new env: FrozenLake-v0


[[  7.88759455e-01   6.54992123e-03   8.41997691e-03   5.37595561e-03]
 [  7.16261270e-03   1.18899485e-03   5.84362665e-03   5.15822614e-01]
 [  6.04489565e-03   4.55799800e-03   1.21974078e-03   6.54431115e-01]
 [  6.51110492e-04   4.13315422e-03   4.46765414e-03   3.89867052e-01]
 [  8.78078560e-01   1.19218779e-03   1.12876108e-03   1.33154362e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  1.27943330e-03   1.57400011e-05   8.31065691e-02   2.47979630e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  1.13719353e-03   7.91424504e-04   1.48208782e-04   9.55378413e-01]
 [  3.18288318e-04   9.78880595e-01   4.16395414e-04   0.00000000e+00]
 [  9.57102244e-01   3.90364280e-04   5.96811765e-04   1.89703368e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   2.38438588e-03   9.89929312e-01   2.83349481e-03]
 [  0.