# **Part 1: Refactor the code to be modular**

In [None]:
#installation Libraries
!pip install cmake 'gym[atari]' scipy
import gym
import sys
from time import sleep
import random
from IPython.display import clear_output
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#choose environment and render it
def Set_Environment(environment_name):
    """Function Set_Environment
       Input : environment_name (the id string handed to gym.make)
       Output : create the environment, render it once as a quick test
                and return the environment
    """
    created_env = gym.make(environment_name)
    created_env.render()
    return created_env

In [None]:
#choose environment and render it
env=Set_Environment("Taxi-v3")

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |B: |
+---------+



In [None]:
# Change the environment state just to check it.
# The state is assigned on the unwrapped environment: gym.make() hands back a
# wrapper object, and a plain `env.s = 301` only creates a new attribute on
# that wrapper, leaving the rendered state unchanged.
env.unwrapped.s = 301
env.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |B: |
+---------+



In [None]:
# Reset the environment (per the original note, this picks a random start state) and render it
env.reset() 
env.render()
# Print the environment's action space and state space
# (the recorded output for Taxi-v3 shows Discrete(6) actions and Discrete(500) states)
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [None]:
def print_frames(frames):
    """
     Function : print_frames
     Input : frames (sequence of dicts with the keys 'frame', 'state',
             'action' and 'reward')
     Output : nothing is returned; every frame is printed in turn, clearing
              the previous output first and pausing 0.1 s after each one
    """
    # Timesteps are reported 1-based, so start the counter at 1.
    for timestep, snapshot in enumerate(frames, start=1):
        clear_output(wait=True)
        print(snapshot['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {snapshot['state']}")
        print(f"Action: {snapshot['action']}")
        print(f"Reward: {snapshot['reward']}")
        sleep(.1)

In [None]:

def Brute_force_approach():
    """
    Function Brute force approach
    Drives the module-level ``env`` with randomly sampled actions, starting
    from the illustration state 328, until the environment reports ``done``.
    Output : return frames to print them, print Timesteps and Penalties
    """
    # Start a fresh episode first so that any per-episode bookkeeping kept by
    # the environment (e.g. a step limit) does not carry over from earlier use.
    env.reset()
    # Assign the state on the unwrapped environment: gym.make() hands back a
    # wrapper, and a plain `env.s = 328` would only add an attribute to that
    # wrapper without moving the underlying environment.
    env.unwrapped.s = 328  # set environment to illustration's state

    epochs = 0
    penalties = 0
    frames = []  # for animation
    done = False

    while not done:
        action = env.action_space.sample()
        state, reward, done, _ = env.step(action)

        # A reward of -10 is what this notebook counts as a penalty.
        if reward == -10:
            penalties += 1

        # Put each rendered frame into dict for animation
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward,
        })

        epochs += 1

    print("Timesteps taken: {}".format(epochs))
    print("Penalties incurred: {}".format(penalties))
    return frames

In [None]:
frames = Brute_force_approach()

Timesteps taken: 200
Penalties incurred: 66


In [None]:
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)

Timestep: 200
State: 214
Action: 1
Reward: -1


In [None]:

def TrainingAgentFixedHyperParametersQLearning(q_table,env,alpha,gamma,epsilon,trianingTimes):
    """Function Training Agent with Fixed HyperParameters
     Runs tabular Q-learning with an epsilon-greedy policy.
     Input : q_table (2-D array indexed [state, action]; it is updated IN PLACE),
             env, alpha (weight given to the new estimate), gamma (weight of the
             next state's best value), epsilon (probability of sampling a random
             action) and trianingTimes (number of episodes to run)
     Output : print the episode number every 100 episodes and return env and the
              same q_table object after training
    """
    for episode in range(1, trianingTimes + 1):
        state = env.reset()
        done = False
        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values
            next_state, reward, done, _ = env.step(action)

            # Blend the old estimate with the observed reward plus the
            # discounted best value of the state we landed in.
            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])
            q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

            state = next_state
        # Progress log: overwrite the previous line every 100 episodes.
        if episode % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {episode}")
    print("Training finished.\n")
    return env, q_table

In [None]:
# Evaluation
def EvaluationWithQLearning(env,episodes,q_table):
    """Function : Evaluate agent's performance after Q-learning
       Every episode is played greedily: the action with the highest value in
       q_table for the current state is always taken (no exploration).
       Input : env, episodes (number of evaluation episodes, must be > 0
               because the averages divide by it), q_table
       output : print episodes, timesteps and penalties and return
                episodes, average timesteps per episode,
                average penalties per episode, average reward per episode
    """
    total_epochs, total_penalties = 0, 0
    rewards = 0
    for _ in range(episodes):
        state = env.reset()
        epochs, penalties = 0, 0
        done = False

        while not done:
            action = np.argmax(q_table[state])
            state, reward, done, _info = env.step(action)
            rewards += reward
            # A reward of -10 is what this notebook counts as a penalty.
            if reward == -10:
                penalties += 1

            epochs += 1

        total_penalties += penalties
        total_epochs += epochs

    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {total_epochs / episodes}")
    print(f"Average penalties per episode: {total_penalties / episodes}")
    return episodes,(total_epochs / episodes),(total_penalties / episodes),(rewards / episodes)

In [None]:
# Initialize the q table
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [None]:
# Intialize Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

In [None]:
#call TrainingAgentFixedHyperParametersQLearning function with 100000 training episodes
envAfterTraining,q_tableAfterTraining = TrainingAgentFixedHyperParametersQLearning(q_table,env,alpha,gamma,epsilon,100000)

Episode: 100000
Training finished.



In [None]:
#evaluate the agent trained for 100000 episodes over 1000 evaluation episodes
EvaluationWithQLearning(envAfterTraining,1000,q_tableAfterTraining)

Results after 1000 episodes:
Average timesteps per episode: 13.073
Average penalties per episode: 0.0


(1000, 13.073, 0.0)

In [None]:
# Switch to the FrozenLake-v0 environment to check that the functions above are modular
env=Set_Environment("FrozenLake-v0") 

# Initialize hyperparameters (passed below as alpha, gamma and epsilon to the training function)
alpha = 0.1
gamma = 0.6
epsilon = 0.1
# Initialize the q table: one row per state, one column per action, all zeros
q_table = np.zeros([env.observation_space.n, env.action_space.n])


[41mS[0mFFF
FHFH
FFFH
HFFG


In [None]:
# frames = Brute_force_approach()

In [None]:
#call TrainingAgentFixedHyperParametersQLearning function with 100000 training episodes

envAfterTraining,q_tableAfterTraining = TrainingAgentFixedHyperParametersQLearning(q_table,env,alpha,gamma,epsilon,100000)

Episode: 100000
Training finished.



In [None]:
#evaluate the agent trained for 100000 episodes over 1000 evaluation episodes
EvaluationWithQLearning(envAfterTraining,1000,q_tableAfterTraining)

Results after 1000 episodes:
Average timesteps per episode: 9.277
Average penalties per episode: 0.0


(1000, 9.277, 0.0)

# **Decreasing the hyperparameters continuously to show the effect of the decay**

In [None]:

def TrainingAgentDescreasingHyperParametersQLearning(q_table,env,trianingTimes):
    """Function Training Agent with Descreasing HyperParameters
     alpha, gamma and epsilon all share one value that starts at 0.9 and moves
     one step down a 1001-point schedule towards 0.7 every 100 episodes; once
     the schedule is used up the final value (0.7) is kept.
     Input : q_table (2-D array indexed [state, action]; it is updated IN PLACE),
             env and trianingTimes (number of episodes to run)
     Output : print the schedule index whenever it advances and the episode
              number every 100 episodes; return env and qtable after training
    """
    descreasing_parameters = np.linspace(0.9, 0.7, 1001)
    last_index = len(descreasing_parameters) - 1
    descreasing_counter = -1
    for i in range(1, trianingTimes + 1):
        # Advance the schedule on the first episode and on every 100th episode.
        if i % 100 == 0 or i == 1:
            descreasing_counter += 1
            print(descreasing_counter)
            # Clamp the index: runs of 100100 or more episodes would otherwise
            # walk past the end of the schedule and raise IndexError.
            current_value = descreasing_parameters[min(descreasing_counter, last_index)]
            alpha = gamma = epsilon = current_value
        state = env.reset()
        done = False
        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values
            next_state, reward, done, _ = env.step(action)

            # Blend the old estimate with the observed reward plus the
            # discounted best value of the state we landed in.
            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])
            q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

            state = next_state
        # Progress log: overwrite the previous output every 100 episodes.
        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")
    print("Training finished.\n")
    return env, q_table

In [None]:
#Change the environment to Taxi-v3
env=Set_Environment("Taxi-v3") 

# Initialize the q table
q_table = np.zeros([env.observation_space.n, env.action_space.n])

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+



In [None]:
envAfterTraining,q_tableAfterTraining = TrainingAgentDescreasingHyperParametersQLearning(q_table,env,100000)

Episode: 100000
Training finished.



In [None]:
#evaluate the agent trained for 100000 episodes over 1000 evaluation episodes
EvaluationWithQLearning(envAfterTraining,1000,q_tableAfterTraining)

Results after 1000 episodes:
Average timesteps per episode: 12.994
Average penalties per episode: 0.0


(1000, 12.994, 0.0)

In [None]:
#Change the environment to Taxi-v3 again and reset it 
env=Set_Environment("Taxi-v3") 

# Initialize the q table
q_table = np.zeros([env.observation_space.n, env.action_space.n])

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [None]:
def TrainingAgentFixedHyperParametersQLearningWithoutLog(q_table,env,alpha,gamma,epsilon,trianingTimes):
    """Function Training Agent with Fixed HyperParameters Without screen Log
     Runs tabular Q-learning with an epsilon-greedy policy and prints nothing
     per episode (only the final "Training finished." line).
     Input : q_table (2-D array indexed [state, action]; it is updated IN PLACE),
             env, alpha (weight given to the new estimate), gamma (weight of the
             next state's best value), epsilon (probability of sampling a random
             action) and trianingTimes (number of episodes to run)
     Output : return env and the same q_table object after training
    """
    for _ in range(trianingTimes):
        state = env.reset()
        done = False

        while not done:
            explore = random.uniform(0, 1) < epsilon
            if explore:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values

            next_state, reward, done, _info = env.step(action)

            # Blend the old estimate with the observed reward plus the
            # discounted best value of the state we landed in.
            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])
            q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

            state = next_state

    print("Training finished.\n")
    return env, q_table

In [None]:
# Initialize the hyperparameter grids: 3 evenly spaced values from 0.1 to 0.9 each
alpha = np.linspace(0.1,0.9,3)
gamma = np.linspace(0.1,0.9,3)
epsilon = np.linspace(0.1,0.9,3)
# Best combination found so far, together with its evaluation metrics
alphaval = 0
gammaval = 0
epsilonval = 0
episodesVal = 0
timestepsVal = 0
penaltiesVal = 0
rewardsVal = 0
# Score of a combination = average reward per timestep. Higher is better, so
# the running best starts at -infinity and is replaced by any larger score.
EvaluationnValue = float("-inf")
for i in alpha:
    for j in gamma:
        for k in epsilon:

            print("\n***************\nat alpha {} gamma {} epsilon {}".format(i,j,k))
            # Start every combination from an untrained table; the training
            # function updates the table in place, so reusing one table would
            # let later combinations inherit earlier training.
            q_table = np.zeros([env.observation_space.n, env.action_space.n])
            # Train for 100000 episodes with alpha=i, gamma=j, epsilon=k
            envAfterTraining,q_tableAfterTraining = TrainingAgentFixedHyperParametersQLearningWithoutLog(q_table,env,i,j,k,100000)
            # Evaluate the trained agent over 1000 episodes
            episodes,timesteps,penalties,rewards=EvaluationWithQLearning(envAfterTraining,1000,q_tableAfterTraining)
            print("episodes {} timesteps {} penalties {}".format(episodes,timesteps,penalties))
            # Score this combination with its OWN timesteps (not the previous best's)
            score = rewards/timesteps
            if score > EvaluationnValue:
                EvaluationnValue = score
                alphaval = i
                gammaval = j
                epsilonval = k
                episodesVal = episodes
                penaltiesVal = penalties
                timestepsVal = timesteps
                rewardsVal = rewards
print("\n**************************\n")
print("Best Parameters Combinations alpha {}, gamma {}, epsilon {}".format(alphaval,gammaval,epsilonval))
print(f"Results after {episodesVal} episodes:")
print(f"Average timesteps per episode: {timestepsVal}")
print(f"Average penalties per episode: {penaltiesVal}")
# Report the best combination's reward and score, not the last one evaluated
print(f"Average rewards per episode: {rewardsVal}")
print(f"Overall Evaluation: {EvaluationnValue}")




***************
at alpha 0.1 gamma 0.1 epsilon 0.1
Training finished.

Results after 1000 episodes:
Average timesteps per episode: 13.846
Average penalties per episode: 0.0
episodes 1000 timesteps 13.846 penalties 0.0

***************
at alpha 0.1 gamma 0.1 epsilon 0.5
Training finished.

Results after 1000 episodes:
Average timesteps per episode: 13.031
Average penalties per episode: 0.0
episodes 1000 timesteps 13.031 penalties 0.0

***************
at alpha 0.1 gamma 0.1 epsilon 0.9
Training finished.

Results after 1000 episodes:
Average timesteps per episode: 12.984
Average penalties per episode: 0.0
episodes 1000 timesteps 12.984 penalties 0.0

***************
at alpha 0.1 gamma 0.5 epsilon 0.1
Training finished.

Results after 1000 episodes:
Average timesteps per episode: 12.938
Average penalties per episode: 0.0
episodes 1000 timesteps 12.938 penalties 0.0

***************
at alpha 0.1 gamma 0.5 epsilon 0.5
Training finished.

Results after 1000 episodes:
Average timesteps per e