بِسْمِ اللهِ

In [1]:
import numpy as np
import random as rnd 

In [2]:
# Define a custom grid world environment.
class GridWorld:
    """Square grid environment with a fixed start cell and goal cell.

    States are ``(x, y)`` tuples indexing ``grid[x, y]``. The agent starts
    at ``(0, 0)`` and an episode is finished once it stands on
    ``(size - 1, size - 1)``.

    Actions:
        0 -- up    (x - 1)
        1 -- down  (x + 1)
        2 -- left  (y - 1)
        3 -- right (y + 1)

    A move that would leave the grid is clamped, i.e. the agent stays on
    the edge cell.
    """

    def __init__(self, size):
        """Create a ``size`` x ``size`` grid world.

        Args:
            size: Number of cells along each side of the grid.

        Raises:
            ValueError: If ``size`` is smaller than 1. Without this check a
                size of 0 would yield a goal of ``(-1, -1)``, which lies
                outside the grid.
        """
        if size < 1:
            raise ValueError("size must be at least 1")
        self.size = size
        self.grid = np.zeros((size, size))
        self.start_state = (0, 0)
        # NOTE: obstacle support (a blocked cell at the grid centre) is not
        # enabled, so the grid array stays all zeros.
        self.goal_state = (size - 1, size - 1)
        self.current_state = self.start_state

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.current_state = self.start_state
        return self.current_state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Args:
            action: One of 0 (up), 1 (down), 2 (left), 3 (right).

        Returns:
            A tuple of the new ``(x, y)`` state, the reward (1 when the new
            state is the goal, otherwise 0) and a bool that is True when
            the new state is the goal.

        Raises:
            ValueError: If ``action`` is not one of the four known actions;
                the current state is left untouched in that case.
        """
        x_coord, y_coord = self.current_state
        last_index = self.size - 1

        if action == 0:  # up
            x_coord = max(0, x_coord - 1)
        elif action == 1:  # down
            x_coord = min(last_index, x_coord + 1)
        elif action == 2:  # left
            y_coord = max(0, y_coord - 1)
        elif action == 3:  # right
            y_coord = min(last_index, y_coord + 1)
        else:
            raise ValueError("Invalid Action")

        self.current_state = (x_coord, y_coord)

        # Reward and termination both depend on the same goal test, so
        # evaluate it once.
        reached_goal = self.current_state == self.goal_state
        reward = 1 if reached_goal else 0
        return self.current_state, reward, reached_goal



In [3]:
# q-learning algorithm
def q_learning(env, episodes, alpha, gamma, epsilon, max_steps=None):
    """Train a tabular Q-function on ``env`` with epsilon-greedy Q-learning.

    Args:
        env: Environment exposing ``size``, ``reset()`` returning an
            ``(x, y)`` state, and ``step(action)`` returning
            ``(next_state, reward, done)``. Four actions (0..3) are assumed.
        episodes: Number of episodes to run.
        alpha: Learning rate used in the Q-update.
        gamma: Discount factor applied to the next state's best value.
        epsilon: Probability of picking a uniformly random action.
        max_steps: Optional cap on the number of steps per episode.
            ``None`` (the default) means an episode only stops when the
            environment reports ``done``.

    Returns:
        The learned Q-table, an array of shape ``(env.size, env.size, 4)``.
    """
    # Progress is printed once every `log_interval` episodes.
    log_interval = 100
    n_actions = 4

    # Q-table initialised to zeros: one value per (x, y, action).
    Q = np.zeros((env.size, env.size, n_actions))

    for episode in range(episodes):
        state = env.reset()
        done = False
        steps_taken = 0

        while not done and (max_steps is None or steps_taken < max_steps):
            if np.random.random() < epsilon:
                # Exploration: uniformly random action.
                action = np.random.randint(0, n_actions)
            else:
                # Exploitation. Ties are broken at random: np.argmax would
                # always return the first maximal index, so an untrained
                # (all-equal) row would always yield action 0 and the agent
                # would keep pushing against the same wall.
                action_values = Q[state[0], state[1]]
                best_actions = np.flatnonzero(action_values == action_values.max())
                action = np.random.choice(best_actions)

            next_state, reward, done = env.step(action)

            # A terminal transition has no future return, so do not
            # bootstrap from the next state's values in that case.
            if done:
                target = reward
            else:
                target = reward + gamma * np.max(Q[next_state[0], next_state[1]])

            Q[state[0], state[1], action] += alpha * (target - Q[state[0], state[1], action])

            state = next_state
            steps_taken += 1

        if episode % log_interval == 0:
            print(f"Episode {episode} completed")

    return Q
         
 

In [None]:
# Build a 5x5 grid world for the agent to learn in.
env = GridWorld(5)

# Hyperparameters: learning rate, discount factor, exploration rate,
# and the number of training episodes.
alpha, gamma, epsilon = 0.1, 0.9, 0.1
episodes = 1000

# Train the agent, then display the learned Q-table.
Q_table_final = q_learning(env, episodes, alpha, gamma, epsilon)
print(Q_table_final)