# 2 TD learning with function approximation (3 pts.)
Consider the MDP (S, A, p, r, γ), where  
&emsp;&emsp;• S = {1, 2, 3, 4, 5, 6, 7} is the state space;  
&emsp;&emsp;• A = {A, B} is the action space;  
&emsp;&emsp;• The transition probabilities are summarized in the matrices  

![image.info](./pictures/p-matrices.png)

&emsp;&emsp;• r(s, a) = 0 for all pairs (s, a) ∈ S × A.  
&emsp;&emsp;• γ = 0.99

The q-function computed by the two algorithms should be:
&emsp;&emsp;qw(x, a) = sum(k=1, 15)(φk(x, a)wk

In [10]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from fractions import Fraction

import tqdm

In [158]:
# TD learning with function approximation

class TDLearning(object):
    
    def __init__(self, algorithm='q-learning', steps=500, alpha=0.01, gamma=0.99):
        
        # States are [1, 2, 3, 4, 5, 6, 7]
        self.state_space = np.arange(1, 8)
        
        # Initial state
        self.current_state = 1
        
        # Action space is [A, B]
        self.action_space = ['A', 'B']
        
        # Probability matrix A
        self.p_a = np.zeros((7, 7))
        self.p_a[:, 6] = 1
        
        # Probability matrix B
        self.p_b = np.zeros((7, 7))
        self.p_b[:, :6] = Fraction(1, 6)
        self.p_b[:, 6] = 0
        
        # Reward values are 0 for all action pairs (s, a)
        self.reward = 0
        
        # Initialize number of timesteps, step-size and discount factor
        self.steps = steps
        self.alpha = alpha
        self.gamma = gamma
        
        # Initialize feature vector A - rows: states, columns: features
        self.f_a = np.zeros((7, 15))
        for i in np.arange(7):
            self.f_a[i, i] = 2
        self.f_a[6, 6] = 1
        self.f_a[:, 7] = 2
        self.f_a[6, 7] = 1
        
        # Initialize feature vector B
        self.f_b = np.zeros((7, 15))
        for i in np.arange(7):
            self.f_b[i, i + 8] = 1
        
        # Initialize weights
        self.weights = np.zeros((15, 1)) + 1
        self.weights[6] = 10
        
        # Probability of taking actions
        self.use_a = Fraction(1, 7)
        self.use_b = Fraction(6, 7)
        
        
    # Policy to follow - translation of p_a and p_b into conditions
    def use_policy(self, state):
        if np.random.rand() < self.use_a:
            return 7, 0
        else:
            if state < 7:
                return np.random.randint(1, 7), 1
            else:
                return 7, 1
            
    
    def calculate_q(self, state, action):
        if action == 0:
            temp_q = [self.f_a[state-1, i] * self.weights[i][0] for i in np.arange(15)]
            return np.sum(temp_q)
        elif action == 1:
            temp_q = [self.f_b[state-1, i] * self.weights[i][0] for i in np.arange(15)]
            return np.sum(temp_q)
        else:
            return
        
        
    def get_features(self, state, action):
        if action == 0:
            return self.f_a[state-1, :]
        elif action == 1:
            return self.f_b[state-1, :]
        return
    
    
    # Q-Learning iteration loop
    def qlearning_iteration(self):
        
        # Run through N episodes
        for i in tqdm.tnrange(self.steps):
            
            # Choose action using policy
            next_state, action = self.use_policy(self.current_state)
            
            # Get current Q values
            current_q = self.calculate_q(self.current_state, action)
            
            # Get the Q values at next step
            q_alternative = np.max([self.calculate_q(next_state, i) for i in np.arange(2)])
            
            # Immediate reward R
            new_reward = 0
            
            # TD-target
            temp_target = new_reward + self.gamma*q_alternative - current_q
            #print(self.alpha * temp_target * self.get_features(self.current_state, action))
            
            # Weight update
            weight_update = self.alpha * temp_target * self.get_features(self.current_state, action)
            weight_update = np.reshape(weight_update, (15, 1))
            
            # Assign the weights
            temp_weights = self.weights + weight_update
            self.weights = temp_weights
            
            # Do the action and take next one
            self.current_state = next_state
            
        print(self.weights)

In [159]:
td_test = TDLearning()
td_test.qlearning_iteration()

  for i in tqdm.tnrange(self.steps):


  0%|          | 0/500 [00:00<?, ?it/s]

[[ 1.1378    ]
 [ 1.        ]
 [ 1.        ]
 [ 1.        ]
 [ 1.        ]
 [ 1.        ]
 [ 9.92806569]
 [ 1.06586569]
 [ 1.058904  ]
 [ 1.        ]
 [ 1.058904  ]
 [ 1.08791496]
 [ 1.0296    ]
 [ 1.058904  ]
 [10.7829402 ]]


In [None]:
 # For each step of the episode
while True: 

    # Choose action with epsilon-greedy
    action = self.epsilon_greedy(self.current_row, self.current_col)

    # Get new state S
    new_row, new_col = self.do_action(action, self.current_row, self.current_col)

    # Immediate reward R
    new_reward = self.rewards[new_row, new_col]

    # Current Q
    current_q = self.get_q(action, self.current_row, self.current_col)

    # Q(S', a)
    q_alternative = np.max([self.get_q(a, new_row, new_col) for a in self.actions])

    # Q'
    temp_q = current_q + self.alpha*(new_reward + self.gamma*q_alternative - current_q)

    # Set the new Q value
    self.set_q(action, self.current_row, self.current_col, temp_q)

    # Update action and state values
    self.current_row, self.current_col = self.do_action(action, self.current_row, self.current_col)

    episode_timestep += 1
    episode_total_reward += new_reward
    episode_rewards.append(new_reward)

    if (self.current_row == self.goal_row) & (self.current_col == self.goal_col):
        break