In [1]:
import gym
import numpy as np
import random
from IPython.display import clear_output
import pandas as pd
from time import sleep
import os

In [2]:
import time


class Q_learn:

    def __init__(self,env):
        self.env=env  


    def buildQ_leanTable(self):
        self.q_table = np.zeros([self.env.observation_space.n,self.env.action_space.n])
        return self.q_table


    def train(self,epoch=100001,alpha = 0.1,gamma = 0.6,epsilon = 0.1):
        self.buildQ_leanTable()
        for i in range(1, epoch):
            state = self.env.reset()
            epochs, reward, = 0, 0
            done = False

            while not done:
                if random.uniform(0, 1) < epsilon:
                    action = self.env.action_space.sample() # Explore action space
                else:
                    action = np.argmax(self.q_table[state]) # Exploit learned values

                next_state, reward, done, info = self.env.step(action) 
                
                old_value = self.q_table[state, action]
                next_max = np.max(self.q_table[next_state])
                
                new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
                self.q_table[state, action] = new_value

                state = next_state
                epochs += 1
                
                # clear_output(wait=True)
                # self.env.render()

            if i % 100 == 0:
                clear_output(wait=True)
                print(f"Episode: {i}")

        print("Training finished.\n")


    def trainTune(self,alpha = 0.5,gamma = 0.8,epsilon = 0.6):
        self.buildQ_leanTable()
        for i in range(1, 100001):
            state = self.env.reset()
            epochs, reward, = 0, 0
            done = False

            while not done:
                if random.uniform(0, 1) < epsilon:
                    action = self.env.action_space.sample() # Explore action space
                else:
                    action = np.argmax(self.q_table[state]) # Exploit learned values

                next_state, reward, done, info = self.env.step(action) 
                
                old_value = self.q_table[state, action]
                next_max = np.max(self.q_table[next_state])
                
                new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
                self.q_table[state, action] = new_value

                state = next_state
                epochs += 1
                
            if i % 100 == 0:
                # clear_output(wait=True)
                print(f"Episode: {i}")
            if i % 1000 == 0:
                epsilon-=0.0058
                gamma-=0.0078
                alpha-=0.0048
                print(epsilon,gamma,alpha,sep='\n')

        print("Training finished.\n")


    def trainGraidSearch(self,hyper):
        self.buildQ_leanTable()
        df=pd.DataFrame(columns=['epoch','alpha','gama','epsilon','average Time'])
        epochF=False
        alphaF=False
        gammaF=False
        epsilonF=False
        try:
          epochs=hyper['epochs']
          epochF=True
        except:
          epochs=[100001]
        try:
          alphas=hyper['alphas']
          alphaF=False
        except:
          alphas=[0.1]
        try:
          gammas=hyper['gammas']
          gammaF=True
        except:
          gammas=[0.6]
        try:
          epsilons=hyper['epsilons']
          epsilonF=True
        except:
          epsilons=[0.1]
        first=True
        best_parameter=[]
        for epoch in epochs:
          for alpha in alphas:
            for gamma in gammas:
              for epsilon in epsilons:
                self.train(epoch,alpha,gamma,epsilon)
                average=self.evaloute()
                df.loc[len(df.index)]=[epoch,alpha,gamma,epsilon,average]
                display(df)
        best_parmeter=df.loc[df['average Time'].idxmin()]
        return best_parmeter 


    def evaluate(self,episodes = 100):
      total_epochs= 0
      for _ in range(episodes):
          state = self.env.reset()
          epochs, reward = 0, 0
          done = False
          
          while not done:
              action = np.argmax(self.q_table[state])
              state, reward, done, info = self.env.step(action)
              epochs += 1
          total_epochs += epochs
      average=total_epochs / episodes
      print(f"Results after {episodes} episodes:")
      print(f"Average timesteps per episode: {average}")
      return average

    def play(self,episode=100):
      state = self.env.reset()
      done = False
      sequence = []

      while not done:
          # Choose the action with the highest value in the current state
          if np.max(self.q_table[state]) > 0:
            action = np.argmax(self.q_table[state])

          # If there's no best action (only zeros), take a random one
          else:
            action = self.env.action_space.sample()
          
          # Add the action to the sequence
          sequence.append(action)

          # Implement this action and move the agent in the desired direction
          new_state, reward, done, info = self.env.step(action)

          # Update our current state
          state = new_state

          # Update the render
          clear_output(wait=True)
          self.env.render()
          time.sleep(1)

      print(f"Sequence = {sequence}")

In [43]:
from gym.envs.toy_text.frozen_lake import generate_random_map
random_map = generate_random_map(size=4, p=0.3)
env3 = gym.make("FrozenLake-v1", desc=random_map, is_slippery=True)
env3.reset()
env3.render()


[41mS[0mFHH
HFHF
HFFF
FHHG


In [44]:
print("Action Space {}".format(env3.action_space))
print("State Space {}".format(env3.observation_space))

Action Space Discrete(4)
State Space Discrete(16)


In [45]:
RL_model3=Q_learn(env3)
RL_model3.train()



In [47]:
RL_model3.evaluate()

Results after 100 episodes:
Average timesteps per episode: 3.01


3.01

In [36]:
hyper={
    'epochs':[100001],
    'alphas':[0.9,0.8,0.5,0.3],
    'gammas':[0.9,0.8,0.5,0.2],
    'epsilons':[0.7,0.4,0.1,0.06]
}
best_param=RL_model3.trainGraidSearch(hyper)
print("best parameters is :",best_param)

Episode: 100000
Training finished.

Results after 100 episodes:
Average timesteps per episode: 2.98


Unnamed: 0,epoch,alpha,gama,epsilon,average Time
0,100001.0,0.9,0.9,0.70,7.74
1,100001.0,0.9,0.9,0.40,3.13
2,100001.0,0.9,0.9,0.10,7.55
3,100001.0,0.9,0.9,0.06,3.05
4,100001.0,0.9,0.8,0.70,9.13
...,...,...,...,...,...
59,100001.0,0.3,0.5,0.06,8.84
60,100001.0,0.3,0.2,0.70,2.96
61,100001.0,0.3,0.2,0.40,3.20
62,100001.0,0.3,0.2,0.10,2.95


best parameters is : epoch           100001.00
alpha                0.30
gama                 0.50
epsilon              0.70
average Time         2.55
Name: 56, dtype: float64


In [37]:
RL_model3.trainTune(alpha=0.30, gamma=0.50, epsilon=0.70)
RL_model3.evaloute()

Episode: 100000
0.11999999999999786
-0.27999999999999914
-0.1800000000000002
Training finished.

Results after 100 episodes:
Average timesteps per episode: 3.31


3.31

In [49]:
RL_model3.play()

  (Left)
SFHH
[41mH[0mFHF
HFFF
FHHG
Sequence = [3, 0, 3, 1, 1, 0, 0, 0]
