# Calculating Rewards

In [2]:
state = (0, 0)
action = 'right'
reward_table = {
                ((0, 0),  'right' ):  -1,
                ((0, 1),  'down'  ):  10,
                }

In [3]:
reward = reward_table.get((state,action),0)

In [4]:
print(f"Reward for action:{action} from state {state}:{reward}")

Reward for action:right from state (0, 0):-1


# Discounted Reward

In [5]:
rewards = [0,0,10]
gamma = 0.9
discounted_reward = sum([r*(gamma**i) for i,r in enumerate(rewards)])

In [6]:
print(f"Discounted rewards: {discounted_reward:.2f}")

Discounted rewards: 8.10


# Calculating Optimal Quantities

In [7]:
import numpy as np

In [30]:
states = [1, 0, 2]
actions = ['left', 'right']
rewards = {2: 1}
gamma =0.9
V = np.zeros(len(states))

In [35]:
for _ in range(20):
  for s in states:
    action_values = []
    for a in actions:
      if a == 'right':
        next_state = min(s +1, 2)
      else:
        next_state = max(s - 1, 0)
      r = rewards.get(next_state, 0)
      action_values.append(r + gamma * V[next_state])
    V[s] = max(action_values)

In [34]:
print(f"Optimal state values: {V[:]}")

Optimal state values: [8.61847958 9.57608842 9.57608842]


# Implementing Q-learning

In [36]:
import random

In [44]:
states = [1, 0, 2]
actions = ['left', 'right']
Q = {(s,a): 0.0 for s in states for a in actions}
alpha    =  0.1
gamma    =  0.9
episodes =  100

In [45]:
def next_state(s,a):
  return min(s+1,2) if a == 'right' else max(s-1,0)

In [46]:
for _ in range(episodes):
  s = random.choice(states)
  a = random.choice(actions)
  s_next= next_state(s, a)
  r=1 if s_next-2 else 0
  best_next = max(Q[(s_next, a_next)] for a_next in actions)
  Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])


In [47]:
print("Learned Q-values:")
for key in Q:
  print(f"{key}: {Q[key]:.2f}")

Learned Q-values:
(1, 'left'): 1.54
(1, 'right'): 0.77
(0, 'left'): 1.82
(0, 'right'): 1.74
(2, 'left'): 1.67
(2, 'right'): 0.80


#  Setting up an Optimal Action

In [49]:
optimal_policy = {}
for s in states:
  best_action = max(actions, key=lambda a: Q[(s, a)])
  optimal_policy[s] = best_action

In [50]:
print("Optimal policy:", optimal_policy)

Optimal policy: {1: 'left', 0: 'left', 2: 'left'}
