Libraries

In [1]:
import pandas as pd
import numpy as np
import random
import ast

Utility functions

In [2]:
def choose_best_action(table,state):
  for action in table.columns:
    if table.loc[state,action] == max(table.loc[state]):
      return action

############################################################

def epsilon_greedy_action(table,state,epsilon = 0.1):
  seed = random.uniform(0,1)
  if seed < epsilon:
    return random.sample(set(Qtable.columns),1)[0]
  else:
    return choose_best_action(table,state)

############################################################

def state_transition(state,action):
    newstate = ast.literal_eval(state)
    x = newstate[0]
    y = newstate[1]

    if action == '+SR':
      return '[{},{}]'.format(x,max(y//2,250))

    elif action == '=SR':
      return state

    elif action == '-SR':
      return '[{},{}]'.format(x,min(y*2,1000))

    else:
      print('Wrong action')
      return state

############################################################

def update_state_based_on_quality(state,curr_obs,prev_obs,th=0.2):

   newstate = ast.literal_eval(state)
   x = newstate[0]
   y = newstate[1]

   quality_error = np.abs(curr_obs - prev_obs)
   if quality_error > th:
     q = 0
   elif quality_error <= th and quality_error >= th/2:
     q = 1
   else:
     q = 2

   return '[{},{}]'.format(q,y) 

############################################################

def compute_reward(state):

   SR_0 = 250
   list_state = ast.literal_eval(state)
   q = list_state[0]
   SR = list_state[1]

   if q == 0:
     k = -1
   elif q == 1:
     k = 1.5
   else:
     k = 1

   return k*(SR//SR_0)

############################################################

def update_prev_Qtable_cell(Qtable,state,action,learning_rate=0.9,discount_factor=0.2):

  future_state = state_transition(state,action)

  contr1 = (1-learning_rate)*Qtable.loc[state,action]

  contr2 = learning_rate*compute_reward(future_state)

  contr3 = learning_rate*discount_factor*max([ Qtable.loc[future_state,future_action] for future_action in actions])

  return contr1 + contr2 + contr3


Quality state:

- 0 --> low quality
- 1 --> perfect quality
- 2 --> too high quality 

In [3]:
states = [
          '[0,250]','[1,250]','[2,250]',
          '[0,500]','[1,500]','[2,500]',
          '[0,1000]','[1,1000]','[2,1000]'
          ]

      
actions = ['+SR','=SR','-SR']

Qtable = pd.DataFrame(data=0, index=states, columns=actions)

Qtable

Unnamed: 0,+SR,=SR,-SR
"[0,250]",0,0,0
"[1,250]",0,0,0
"[2,250]",0,0,0
"[0,500]",0,0,0
"[1,500]",0,0,0
"[2,500]",0,0,0
"[0,1000]",0,0,0
"[1,1000]",0,0,0
"[2,1000]",0,0,0


In [4]:
a = '[low,250]'
c = 23

In [5]:
print(ast.literal_eval('[0,250]'))

[0, 250]


In [6]:
 state_transition('[0,500]','-SR')

'[0,1000]'

In [7]:
for i in range(10):
  print(epsilon_greedy_action(Qtable,'[0,500]'))

+SR
+SR
+SR
=SR
+SR
=SR
+SR
+SR
+SR
+SR


In [8]:
Qtable.loc['[2,500]','+SR']

0

In [9]:
compute_reward('[2,500]')

2

Training

In [10]:
def train_on_measurement(curr_state, Qtable,curr_obs,prev_obs, epsilon=0.1,learning_rate=0.9,discount_factor=0.2,th=0.2 ):

  newstate = update_state_based_on_quality(curr_state,curr_obs,prev_obs,th)

  chosen_action = epsilon_greedy_action(Qtable,newstate,epsilon)

  Qtable.loc[newstate,chosen_action] = update_prev_Qtable_cell(Qtable,newstate,chosen_action,learning_rate,discount_factor)

  final_state = state_transition(newstate,chosen_action)

  return Qtable, final_state






In [11]:
Qtable

Unnamed: 0,+SR,=SR,-SR
"[0,250]",0,0,0
"[1,250]",0,0,0
"[2,250]",0,0,0
"[0,500]",0,0,0
"[1,500]",0,0,0
"[2,500]",0,0,0
"[0,1000]",0,0,0
"[1,1000]",0,0,0
"[2,1000]",0,0,0


In [12]:
initial_state = '[2,500]'
curr_obs = 0.4
prev_obs = 0.35

new_Qtable, final_state = train_on_measurement(initial_state, Qtable,curr_obs,prev_obs)

In [13]:
new_Qtable

Unnamed: 0,+SR,=SR,-SR
"[0,250]",0.0,0,0
"[1,250]",0.0,0,0
"[2,250]",0.0,0,0
"[0,500]",0.0,0,0
"[1,500]",0.0,0,0
"[2,500]",0.9,0,0
"[0,1000]",0.0,0,0
"[1,1000]",0.0,0,0
"[2,1000]",0.0,0,0


In [14]:
final_state

'[2,250]'