## Introduction:

This notebook implements the Q-learning algorithm on the Left-Right problem, a case study in the batch-mode reinforcement learning paper linked below (page 14).

Paper Link ==> http://www.jmlr.org/papers/v6/ernst05a.html

Guided by this tutorial ==> http://mnemstudio.org/path-finding-q-learning-tutorial.htm

and also this one ==> https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0


## Experimental Setup:

num of simulations = 100,000

s = rand(0, 10)     selected at random 

a = [-2, 2] ==> -2: left, 2: right

r = [100, 0, 50] ==> 0: the next state stays inside the interval, 50: it exits on the left (below 0), 100: it exits on the right (above 10)

num of episodes = 300 

In [229]:
import numpy as np 
import matplotlib.pyplot as plt
from sklearn import linear_model
import tensorflow as tf
import pandas as pd
%matplotlib inline

import warnings

## Variables Declaration:

__Episodes:__

We'll call each exploration an episode.  
Each episode consists of the agent moving from the initial state to the goal state.  
Each time the agent arrives at the goal state, the program moves on to the next episode.  
Hence, the number of episodes is effectively the number of runs: each episode is equivalent to one training session.


In [230]:
# Problem constants.
u = [-2, 2]     # available actions: -2 moves left, 2 moves right
gamma = 0.75    # discount factor
N = 10          # number of sampled transitions
j = 0           # counter; not read by any code visible in this notebook

# Per-sample buffers, one row per sampled transition, all starting at zero.
s_current = np.zeros((N, 1))
s_next = np.zeros((N, 1))
next_state = np.zeros((N, 1))
r = np.zeros((N, 1))
reward = np.zeros((N, 1))
# Plain Python list holding one length-1 array per sample.
action = [row for row in np.zeros((N, 1))]

## Q_table
Q = np.zeros((N, 2))

## Predicted Q
Q_ = np.zeros((N, 2))

In [231]:
## Creating r matrix
# Sample N independent one-step transitions: (state, action, reward, next state).
for i in range(N):  # range runs on Python 2 and 3; xrange exists only on Python 2
    x = np.random.randint(10)   # start state: integer drawn from 0..9
    a = np.random.randint(2)    # index into u: 0 -> left, 1 -> right

    # Apply the move plus uniform noise from [0, 1).
    s_next[i] = x + u[a] + np.random.rand()

    # The reward depends only on which side (if any) the move left the interval.
    # NOTE(review): the exit reward is scaled by gamma**(i-1), where i is the
    # sample index rather than a time step, so sample 0 receives more than the
    # nominal 100/50. The setup text lists plain rewards 100/0/50 -- confirm
    # whether this scaling is intended.
    if s_next[i] > 10:
        r[i] = np.power(gamma, (i-1))*100
    elif s_next[i] < 0:
        r[i] = np.power(gamma, (i-1))*50
    else:
        r[i] = 0

    # Record the transition once; every branch stores the same fields.
    reward[i] = r[i]
    s_current[i] = x
    action[i] = u[a]
    next_state[i] = s_next[i]

In [232]:
s_next

array([[  4.2590229 ],
       [  5.74077382],
       [ 10.44190831],
       [  5.63336002],
       [ -0.21762443],
       [  1.29527138],
       [  7.20509082],
       [ 10.71434782],
       [  7.44950013],
       [  5.41970504]])

In [233]:
np.shape(s_current), np.shape(action), np.shape(r), np.shape(next_state)

((10, 1), (10,), (10, 1), (10, 1))

In [234]:
tuplesMx = np.column_stack((s_current, action, r, next_state))
tuplesMx

array([[  2.        ,   2.        ,   0.        ,   4.2590229 ],
       [  7.        ,  -2.        ,   0.        ,   5.74077382],
       [  8.        ,   2.        ,  75.        ,  10.44190831],
       [  7.        ,  -2.        ,   0.        ,   5.63336002],
       [  1.        ,  -2.        ,  21.09375   ,  -0.21762443],
       [  3.        ,  -2.        ,   0.        ,   1.29527138],
       [  9.        ,  -2.        ,   0.        ,   7.20509082],
       [  8.        ,   2.        ,  17.79785156,  10.71434782],
       [  9.        ,  -2.        ,   0.        ,   7.44950013],
       [  3.        ,   2.        ,   0.        ,   5.41970504]])

In [235]:
# Pair every sampled next state with each of the two actions, giving one
# (next_state, action) input matrix per action. A constant column is built
# directly; drawing it with a random choice from a one-element list and
# round-tripping through a DataFrame produced exactly the same values.
s1_a1 = np.column_stack((next_state, np.full(N, u[0])))  # next state with the left action
s2_a2 = np.column_stack((next_state, np.full(N, u[1])))  # next state with the right action

## ANN:

In [236]:
# Hidden-layer widths.
n_nodes_hl1 = 15
n_nodes_hl2 = 25  # not referenced by the code visible in this notebook
# Dimensions of the Q table: number of rows, then number of columns.
NUM_STATES, NUM_ACTIONS = np.shape(Q)

In [237]:
## init weights .. 

def init_weights(shape):
    """Create a TensorFlow variable of the given shape.

    The initial values are drawn from a normal distribution with a standard
    deviation of 1.
    """
    initial_values = tf.random_normal(shape, stddev=1)
    return tf.Variable(initial_values)

In [238]:
def model(X, w_h, w_o,bias_I,bias_h):
    """Forward pass of a network with a single ReLU hidden layer.

    X is multiplied by w_h and shifted by bias_I before the ReLU; the hidden
    activations are then multiplied by w_o and shifted by bias_h. The result
    is returned as-is, with no activation applied to the output layer.
    """
    hidden_pre_activation = tf.matmul(X, w_h) + bias_I
    hidden = tf.nn.relu(hidden_pre_activation)
    return tf.matmul(hidden, w_o) + bias_h

In [239]:
def model_training(Curr_state_current_action, output, n_epochs=200, learning_rate=0.05):
    """Fit the one-hidden-layer network to (input, target) pairs and return its weights.

    Parameters
    ----------
    Curr_state_current_action : array-like, shape (n_samples, n_features)
        Network inputs, one row per sample.
    output : array-like holding n_samples values
        Regression targets. They are reshaped to a column so that they line up
        with the (n_samples, 1) network output.
    n_epochs : int, optional
        Number of optimisation steps; every step uses all samples.
    learning_rate : float, optional
        Optimiser step size.

    Returns
    -------
    tuple of np.ndarray
        (w_h, w_o, bias_I, bias_h) holding the trained values, in the order
        model() expects them. Plain arrays are returned so the result does not
        depend on a TensorFlow session staying open or on variables keeping
        their values when a caller runs an initialiser.
    """
    inputs = np.asarray(Curr_state_current_action).astype(np.float32)
    # A 1-D target would broadcast against the (n, 1) prediction into an
    # (n, n) matrix and distort the cost, so force a column vector.
    targets = np.asarray(output, dtype=np.float32).reshape(-1, 1)

    # Build in a private graph so repeated calls do not keep adding variables
    # to the default graph.
    with tf.Graph().as_default():
        w_h = init_weights([inputs.shape[1], n_nodes_hl1])
        w_o = init_weights([n_nodes_hl1, 1])
        bias_I = init_weights([n_nodes_hl1])
        bias_h = init_weights([1])

        py_x = model(inputs, w_h, w_o, bias_I, bias_h)
        cost = tf.reduce_mean(tf.square(py_x - targets))  # mean squared error

        # NOTE(review): Adam replaces plain gradient descent here. Now that the
        # optimisation step is actually executed, a fixed 0.05 gradient-descent
        # step looks prone to diverging on unscaled states and rewards --
        # confirm the choice of optimiser and step size against a real run.
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost)

        # The context manager closes the session when training is done.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for _ in range(n_epochs):
                sess.run(train_op)  # without this call nothing is ever trained
            return tuple(sess.run([w_h, w_o, bias_I, bias_h]))

In [None]:
## Prediction phase: 
def model_predict(s1_a1, s2_a2, output):
    """Train one network per action and return their predictions side by side.

    Parameters
    ----------
    s1_a1, s2_a2 : np.ndarray, shape (n_samples, 2)
        Network inputs for the first and for the second action.
    output : np.ndarray, shape (n_samples, 2)
        Targets; column 0 trains the first network, column 1 the second.

    Returns
    -------
    np.ndarray, shape (n_samples, 2)
        Column k holds the k-th network's prediction for each input row,
        which is the same layout as the zero-initialised Q_ table.
    """
    predict_ops = []
    for column, inputs in enumerate((s1_a1, s2_a2)):
        inputs = inputs.astype(np.float32)
        w_h, w_o, bias_I, bias_h = model_training(inputs, output[:, column])
        predict_ops.append(model(inputs, w_h, w_o, bias_I, bias_h))

    # The context manager closes the session once the predictions are fetched.
    with tf.Session() as sess:
        # Initialise any tf.Variable objects in the default graph before
        # evaluating the prediction ops.
        sess.run(tf.global_variables_initializer())
        predictions = sess.run(predict_ops)

    # Each prediction is (n_samples, 1). Stacking them as columns gives
    # (n_samples, 2); transposing the nested list would give (1, n_samples, 2).
    return np.column_stack(predictions)

In [None]:
# NOTE(review): gamma is reset to 0.3 here although the data-generation cell
# used 0.75 -- confirm which discount factor is intended.
gamma = 0.3

# Column of Q that each sample's action refers to (position of the action in u).
# Indexing Q with the raw action values would instead address rows 2 and -2.
action_column = np.array([u.index(a) for a in action])
sample_rows = np.arange(N)

for _ in range(N):  # range runs on Python 2 and 3; xrange exists only on Python 2
    # Best predicted value over both actions at every sample's next state.
    # Q_ is viewed as (N, n_actions) whatever singleton axes it carries.
    best_next_value = np.amax(np.reshape(Q_, (N, -1)), axis=1)
    # Q-learning target, written only for the action each sample actually took.
    Q[sample_rows, action_column] = reward[:, 0] + gamma * best_next_value
    Q_ = model_predict(s1_a1, s2_a2, Q)
Q

In [None]:
# Draw one line per column of Q against the row index, using the Axes
# object for every call.
f, ax = plt.subplots()
ax.plot(Q)
ax.set_title('Q value at each state in both actions')
ax.set_xlabel("State Number")
ax.set_ylabel("Q(s,a)")
ax.legend(['L', 'R'])