In [None]:

# Reinforcement learning is a type of machine learning where an agent learns to make sequential decisions in
# an environment to maximize a cumulative reward. The agent interacts with the environment, 
# receives feedback in the form of rewards or punishments, 
# and uses this feedback to learn the optimal actions to take in different situations.

# In reinforcement learning, the agent learns through trial and error.
# It explores the environment by taking actions and receives feedback in the form of rewards
# or penalties based on the outcomes of those actions. The agent's goal is to learn a policy, 
# which is a mapping from states to actions, that maximizes the long-term cumulative reward.

In [18]:
import random 

#create Environment class
class MyEnvironment:
  """Toy reinforcement-learning environment with a fixed step budget.

  The environment hands out a constant observation, offers two discrete
  actions and pays a random reward for every action taken. The episode is
  over once the step budget has been used up.
  """

  def __init__(self,max_steps=20):
    """Create an environment whose episode lasts ``max_steps`` actions.

    Args:
      max_steps: number of actions the agent may take before the episode
        ends. Defaults to 20.
    """
    #number of steps the agent can still take to gain rewards
    self.remaining_steps=max_steps

  def get_observation(self):
    """Return the current observation as a list of three floats.

    This simple implementation keeps no state besides the step counter, so
    the same placeholder values are returned on every call. A real
    environment would return values describing its current state.
    """
    return [1.0,2.0,1.0]

  def get_actions(self):
    """Return the actions the agent may choose from: -1 or 1."""
    return [-1,1]

  def check_is_done(self)->bool:
    """Return True when no steps remain, i.e. the agent must stop acting."""
    #<= instead of == so that a budget of zero or below still ends the episode
    return self.remaining_steps<=0

  def action(self,int):
    """Perform one action, consume one step and return the reward.

    Args:
      int: the action chosen by the agent. It is currently ignored. The
        name shadows the builtin but is kept so existing callers still work.

    Returns:
      The reward for this step. As this is a simple implementation it is
      just a random float in [0.0, 1.0) and does not depend on the action.

    Raises:
      Exception: if the episode is already finished ("Game over").
    """
    if self.check_is_done():
      raise Exception("Game over")
    #steps can still be taken => decrement the remaining number of steps
    self.remaining_steps-=1
    return random.random()


In [19]:

# The agent implements a policy; this one simply picks a random action at every step.

class myAgent:
  """Agent that acts at random and keeps a running sum of its rewards."""

  def __init__(self):
    #nothing has been earned before the first step
    self.total_rewards=0.0

  def step(self,ob:MyEnvironment):
    """Take a single step in the environment ``ob``.

    Reads the observation and the available actions (printing both), picks
    one action at random, performs it and adds the returned reward to
    ``self.total_rewards``. The running total is printed afterwards.
    """
    observation=ob.get_observation()
    print(observation)
    available_actions=ob.get_actions()
    print(available_actions)
    #no learning yet: the action is picked at random from the available ones
    chosen_action=random.choice(available_actions)
    reward=ob.action(chosen_action)
    self.total_rewards=self.total_rewards+reward
    print("Total rewards so far= %.3f "%(self.total_rewards,))

In [20]:

if __name__=='__main__':
  #one environment and one agent for a single episode
  obj=MyEnvironment()
  agent=myAgent()
  step_number=0

  #keep stepping until the environment reports that the episode is over
  while True:
    if obj.check_is_done():
      break
    step_number+=1
    agent.step(obj)

  print("Total reward is %.3f "%agent.total_rewards)
  #the output differs on every run because the rewards are random numbers
     

[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 0.895 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 1.116 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 2.014 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 3.006 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 3.625 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 4.489 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 5.340 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 6.047 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 6.139 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 6.734 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 7.068 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 7.813 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 8.322 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 9.220 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 9.976 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 10.697 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 11.212 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so far= 12.055 
[1.0, 2.0, 1.0]
[-1, 1]
Total rewards so fa