# Time Dependent Q-Learning

In [7]:
import gym 
import operator
import itertools
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import deque
from random import randint

In [8]:
env = gym.make('CartPole-v1')
env._max_episode_steps = 5000
number_of_games = 10000000
ACTION_SPACE = env.action_space.n #number of possible actions
OBSERVATION_SPACE = env.observation_space.shape[0] #number of observable variables
STATES_IN_INTERVAL = 11

In [None]:
def create_state_intervals():
    intervals = np.zeros((OBSERVATION_SPACE, STATES_IN_INTERVAL))
    intervals[0] = np.linspace(-4.8, 4.8, STATES_IN_INTERVAL)
    intervals[1] = np.linspace(-3.5, 3.5, STATES_IN_INTERVAL)
    intervals[2] = np.linspace(-0.42, 0.42, STATES_IN_INTERVAL)
    intervals[3] = np.linspace(-4, 4, STATES_IN_INTERVAL)
    return intervals

In [None]:
def discretize_observation(observation):
    discrete_observation = np.array([np.digitize(observation[index], INTERVALS[index])-1 for index in range(OBSERVATION_SPACE)])
    # if some value is under the lower border ignore it and give it min value
    discrete_observation = [0 if x<0 else x for x in discrete_observation]
    return discrete_observation

In [None]:
def get_all_possible_states():
    digits = len(str(STATES_IN_INTERVAL))
    state_indices = [str(state_index).zfill(digits) for state_index in range(STATES_IN_INTERVAL)] # all encodings for a single observation variable
    states = [state_indices for i in range(OBSERVATION_SPACE)] # for each observation variable a list of its encodings
    states = list(itertools.product(*states)) # get all permutation of all state encodings (->list of tuples)
    states = [''.join(x) for x in states] # join tuples to a single string
    return states

In [None]:
def observation_to_state(observation):
    discrete_observation = discretize_observation(observation)
    digits = len(str(STATES_IN_INTERVAL))
    
    state = ''
    for state_id in discrete_observation:
        if len(str(state_id)) < digits:
            state += str(state_id).zfill(digits)
        else:
            state += str(state_id)
    return state

In [None]:
def init_q_table(states, actions):
    q_table = dict()
    for state in states:
        q_table[state] = dict()
        for action in actions:
            q_table[state][action] = np.random.randint(10)
    return q_table

In [None]:
def get_action(q_table, state, alpha):
    action = 0 if q_table[state][0] > q_table[state][1] else 1
    if(random.random() < 0.1):
        action += 1 
        action %= 2
        
    return action

In [None]:
def max_a(q_table, next_state):
    return max(q_table[next_state][k] for k in q_table[next_state].keys())

In [None]:
def update_q_table(q_table, state, action, next_state, reward, alpha, gamma):
    q_s_a = q_table[state][action]
    q_table[state][action] = q_s_a + alpha * (reward + gamma * max_a(q_table, next_state) - q_s_a)
    return q_table

In [None]:
INTERVALS = create_state_intervals()

for i in range(1):
    print('EPISODE:', i)
            
    last100_rewards = deque(maxlen=100) # fifo queue
    game_max = []
    game_mean = []
    solved = False
    
    q_table = init_q_table(get_all_possible_states(), [0, 1])
    
    
    alpha = 0.1
    gamma = 0.9
          
    for game in range(number_of_games):
        
        overall_reward, done = 0, False
        observation = env.reset()
        state = observation_to_state(observation)
        
        if alpha > 0.01:
            alpha *= 0.99         
        while not done:
            if game % 1000 == 0: env.render()
                       
            action = get_action(q_table, state, alpha)

            observation, reward, done, _ = env.step(action)
            
            next_state = observation_to_state(observation)
            overall_reward += reward

            if done: reward = -5000 # punish if agent dies
                
            update_q_table(q_table, state, action, next_state, reward, alpha, gamma)

            state = next_state

        if game % 100 == 0 and game != 0:
            print('Episode:', game,  
                  'Mean-Reward:', np.mean(last100_rewards), 
                  'Max-Reward:', max(last100_rewards),
                  'Alpha:', alpha                     
                 )
            game_max.append(max(last100_rewards))
            game_mean.append(np.mean(last100_rewards))
            
        if (np.mean(last100_rewards) >= 195) and not solved: 
            print('TASK COMPLETED LAST 100 GAMES HAD AN AVERAGE SCORE >=195 ON GAME', game)
            print(last100_rewards)
            solved = True
                       
        
        last100_rewards.append(overall_reward) 

EPISODE: 0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Episode: 100 Mean-Reward: 47.75 Max-Reward: 233.0 Alpha: 0.03623720178604972
Episode: 200 Mean-Reward: 122.08 Max-Reward: 316.0 Alpha: 0.013263987810938228
Episode: 300 Mean-Reward: 143.22 Max-Reward: 321.0 Alpha: 0.009910481551887479
Episode: 400 Mean-Reward: 136.38 Max-Reward: 361.0 Alpha: 0.009910481551887479
Episode: 500 Mean-Reward: 148.01 Max-Reward: 303.0 Alpha: 0.009910481551887479
Episode: 600 Mean-Reward: 134.71 Max-Reward: 358.0 Alpha: 0.009910481551887479
Episode: 700 Mean-Reward: 142.4 Max-Reward: 465.0 Alpha: 0.009910481551887479
Episode: 800 Mean-Reward: 168.41 Max-Reward: 450.0 Alpha: 0.009910481551887479
Episode: 900 Mean-Reward: 175.32 Max-Reward: 519.0 Alpha: 0.009910481551887479
Episode: 1000 Mean-Reward: 185.49 Max-Reward: 458.0 Alpha: 0.009910481551887479
Episode: 1100 Mean-Reward: 176.32 Max-Reward: 353.0 Alpha: 0.009910481551887479
Episode: 1200 Mean-Reward: 182.55 Max-Reward: 527.0 Alpha: 0.009910481551887479
Episode: 1300 Mean-Reward: 189.73 Max-Reward: 609.0 

Episode: 9400 Mean-Reward: 234.95 Max-Reward: 870.0 Alpha: 0.009910481551887479
Episode: 9500 Mean-Reward: 325.33 Max-Reward: 776.0 Alpha: 0.009910481551887479
Episode: 9600 Mean-Reward: 286.7 Max-Reward: 688.0 Alpha: 0.009910481551887479
Episode: 9700 Mean-Reward: 302.05 Max-Reward: 784.0 Alpha: 0.009910481551887479
Episode: 9800 Mean-Reward: 273.95 Max-Reward: 708.0 Alpha: 0.009910481551887479
Episode: 9900 Mean-Reward: 369.32 Max-Reward: 1181.0 Alpha: 0.009910481551887479
Episode: 10000 Mean-Reward: 294.37 Max-Reward: 920.0 Alpha: 0.009910481551887479
Episode: 10100 Mean-Reward: 262.53 Max-Reward: 777.0 Alpha: 0.009910481551887479
Episode: 10200 Mean-Reward: 282.18 Max-Reward: 721.0 Alpha: 0.009910481551887479
Episode: 10300 Mean-Reward: 319.58 Max-Reward: 835.0 Alpha: 0.009910481551887479
Episode: 10400 Mean-Reward: 290.65 Max-Reward: 796.0 Alpha: 0.009910481551887479
Episode: 10500 Mean-Reward: 284.18 Max-Reward: 636.0 Alpha: 0.009910481551887479
Episode: 10600 Mean-Reward: 354.51

Episode: 19500 Mean-Reward: 385.17 Max-Reward: 1044.0 Alpha: 0.009910481551887479
Episode: 19600 Mean-Reward: 408.39 Max-Reward: 1039.0 Alpha: 0.009910481551887479
Episode: 19700 Mean-Reward: 346.32 Max-Reward: 970.0 Alpha: 0.009910481551887479
Episode: 19800 Mean-Reward: 401.12 Max-Reward: 1308.0 Alpha: 0.009910481551887479
Episode: 19900 Mean-Reward: 412.69 Max-Reward: 1210.0 Alpha: 0.009910481551887479
Episode: 20000 Mean-Reward: 406.33 Max-Reward: 986.0 Alpha: 0.009910481551887479
Episode: 20100 Mean-Reward: 440.86 Max-Reward: 1253.0 Alpha: 0.009910481551887479
Episode: 20200 Mean-Reward: 383.47 Max-Reward: 960.0 Alpha: 0.009910481551887479
Episode: 20300 Mean-Reward: 411.01 Max-Reward: 1328.0 Alpha: 0.009910481551887479
