# Time Dependent Q-Learning

In [1]:
import gym 
import operator
import itertools
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import deque
from random import randint

In [2]:
env = gym.make('CartPole-v1')
env._max_episode_steps = 5000
number_of_games = 10000000
ACTION_SPACE = env.action_space.n #number of possible actions
OBSERVATION_SPACE = env.observation_space.shape[0] #number of observable variables
STATES_IN_INTERVAL = 11

In [3]:
def create_state_intervals():
    intervals = np.zeros((OBSERVATION_SPACE, STATES_IN_INTERVAL))
    intervals[0] = np.linspace(-4.8, 4.8, STATES_IN_INTERVAL)
    intervals[1] = np.linspace(-3.5, 3.5, STATES_IN_INTERVAL)
    intervals[2] = np.linspace(-0.42, 0.42, STATES_IN_INTERVAL)
    intervals[3] = np.linspace(-4, 4, STATES_IN_INTERVAL)
    return intervals

In [4]:
def discretize_observation(observation):
    discrete_observation = np.array([np.digitize(observation[index], INTERVALS[index])-1 for index in range(OBSERVATION_SPACE)])
    # if some value is under the lower border ignore it and give it min value
    discrete_observation = [0 if x<0 else x for x in discrete_observation]
    return discrete_observation

In [5]:
def get_all_possible_states():
    digits = len(str(STATES_IN_INTERVAL))
    state_indices = [str(state_index).zfill(digits) for state_index in range(STATES_IN_INTERVAL)] # all encodings for a single observation variable
    states = [state_indices for i in range(OBSERVATION_SPACE)] # for each observation variable a list of its encodings
    states = list(itertools.product(*states)) # get all permutation of all state encodings (->list of tuples)
    states = [''.join(x) for x in states] # join tuples to a single string
    return states

In [6]:
def observation_to_state(observation):
    discrete_observation = discretize_observation(observation)
    digits = len(str(STATES_IN_INTERVAL))
    
    state = ''
    for state_id in discrete_observation:
        if len(str(state_id)) < digits:
            state += str(state_id).zfill(digits)
        else:
            state += str(state_id)
    return state

In [None]:
def init_q_table(states, actions):
    q_table = dict()
    for state in states:
        q_table[state] = dict()
        for action in actions:
            q_table[state][action] = np.random.randint(10)
    return q_table

In [None]:
def get_action(q_table, state, alpha):
    action = 0 if q_table[state][0] > q_table[state][1] else 1
    if(random.random() < alpha):
        action += 1 
        action %= 2
        
    return action

In [None]:
def max_a(q_table, next_state):
    return max(q_table[next_state][k] for k in q_table[next_state].keys())

In [None]:
def update_q_table(q_table, state, action, next_state, reward, alpha, gamma):
    q_s_a = q_table[state][action]
    q_table[state][action] = q_s_a + alpha * (reward + gamma * max_a(q_table, next_state) - q_s_a)
    return q_table

In [None]:
INTERVALS = create_state_intervals()

for i in range(1):
    print('EPISODE:', i)
            
    last100_rewards = deque(maxlen=100) # fifo queue
    game_max = []
    game_mean = []
    solved = False
    
    q_table = init_q_table(get_all_possible_states(), [0, 1])
    
    
    alpha = 1
    gamma = 0.9
          
    for game in range(number_of_games):
        
        overall_reward, done = 0, False
        observation = env.reset()
        state = observation_to_state(observation)
        
        if alpha > 0.01:
            alpha *= 0.999
            
        while not done:
            if game % 1000 == 0: env.render()
                       
            action = get_action(q_table, state, alpha)

            observation, reward, done, _ = env.step(action)
            
            next_state = observation_to_state(observation)
            overall_reward += reward

            if done: reward = -5000 # punish if agent dies
                
            update_q_table(q_table, state, action, next_state, reward, alpha, gamma)

            state = next_state

        if game % 100 == 0 and game != 0:
            print('Episode:', game,  
                  'Mean-Reward:', np.mean(last100_rewards), 
                  'Max-Reward:', max(last100_rewards),
                  'Alpha:', alpha                     
                 )
            game_max.append(max(last100_rewards))
            game_mean.append(np.mean(last100_rewards))
            
        if (np.mean(last100_rewards) >= 195) and not solved: 
            print('TASK COMPLETED LAST 100 GAMES HAD AN AVERAGE SCORE >=195 ON GAME', game)
            print(last100_rewards)
            solved = True
                       
        
        last100_rewards.append(overall_reward) 

EPISODE: 0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Episode: 100 Mean-Reward: 17.47 Max-Reward: 64.0 Alpha: 0.9038873549665959
Episode: 200 Mean-Reward: 14.23 Max-Reward: 50.0 Alpha: 0.8178301806491574
Episode: 300 Mean-Reward: 14.01 Max-Reward: 42.0 Alpha: 0.7399663251239436
Episode: 400 Mean-Reward: 14.66 Max-Reward: 44.0 Alpha: 0.6695157201007336
Episode: 500 Mean-Reward: 16.53 Max-Reward: 34.0 Alpha: 0.6057725659163237
Episode: 600 Mean-Reward: 18.28 Max-Reward: 50.0 Alpha: 0.548098260578011
Episode: 700 Mean-Reward: 20.05 Max-Reward: 56.0 Alpha: 0.4959150020176678
Episode: 800 Mean-Reward: 24.72 Max-Reward: 95.0 Alpha: 0.44869999946146477
Episode: 900 Mean-Reward: 26.06 Max-Reward: 73.0 Alpha: 0.4059802359226587
Episode: 1000 Mean-Reward: 33.04 Max-Reward: 193.0 Alpha: 0.36732772934619257
Episode: 1100 Mean-Reward: 39.8 Max-Reward: 152.0 Alpha: 0.33235524492954527
Episode: 1200 Mean-Reward: 47.81 Max-Reward: 175.0 Alpha: 0.3007124156643058
Episode: 1300 Mean-Reward: 51.52 Max-Reward: 171.0 Alpha: 0.2720822322326576
Episode: 1400 Me

Episode: 9500 Mean-Reward: 361.44 Max-Reward: 849.0 Alpha: 0.009998671593271896
Episode: 9600 Mean-Reward: 342.21 Max-Reward: 747.0 Alpha: 0.009998671593271896
Episode: 9700 Mean-Reward: 344.87 Max-Reward: 674.0 Alpha: 0.009998671593271896
Episode: 9800 Mean-Reward: 382.44 Max-Reward: 654.0 Alpha: 0.009998671593271896
Episode: 9900 Mean-Reward: 367.27 Max-Reward: 663.0 Alpha: 0.009998671593271896
Episode: 10000 Mean-Reward: 364.12 Max-Reward: 737.0 Alpha: 0.009998671593271896
Episode: 10100 Mean-Reward: 362.8 Max-Reward: 601.0 Alpha: 0.009998671593271896
Episode: 10200 Mean-Reward: 392.11 Max-Reward: 772.0 Alpha: 0.009998671593271896
Episode: 10300 Mean-Reward: 376.66 Max-Reward: 885.0 Alpha: 0.009998671593271896
Episode: 10400 Mean-Reward: 337.08 Max-Reward: 748.0 Alpha: 0.009998671593271896
Episode: 10500 Mean-Reward: 352.09 Max-Reward: 679.0 Alpha: 0.009998671593271896
Episode: 10600 Mean-Reward: 389.69 Max-Reward: 1089.0 Alpha: 0.009998671593271896
Episode: 10700 Mean-Reward: 346.2

Episode: 19700 Mean-Reward: 454.68 Max-Reward: 889.0 Alpha: 0.009998671593271896
Episode: 19800 Mean-Reward: 490.57 Max-Reward: 867.0 Alpha: 0.009998671593271896
Episode: 19900 Mean-Reward: 501.04 Max-Reward: 918.0 Alpha: 0.009998671593271896
Episode: 20000 Mean-Reward: 539.16 Max-Reward: 1104.0 Alpha: 0.009998671593271896
Episode: 20100 Mean-Reward: 499.82 Max-Reward: 1290.0 Alpha: 0.009998671593271896
Episode: 20200 Mean-Reward: 445.84 Max-Reward: 746.0 Alpha: 0.009998671593271896
Episode: 20300 Mean-Reward: 509.95 Max-Reward: 935.0 Alpha: 0.009998671593271896
Episode: 20400 Mean-Reward: 495.92 Max-Reward: 985.0 Alpha: 0.009998671593271896
Episode: 20500 Mean-Reward: 499.67 Max-Reward: 832.0 Alpha: 0.009998671593271896
Episode: 20600 Mean-Reward: 510.15 Max-Reward: 875.0 Alpha: 0.009998671593271896
Episode: 20700 Mean-Reward: 512.72 Max-Reward: 856.0 Alpha: 0.009998671593271896
Episode: 20800 Mean-Reward: 480.54 Max-Reward: 903.0 Alpha: 0.009998671593271896
Episode: 20900 Mean-Reward