In [1]:

import gym
import numpy as np

class RecruitmentEnv(gym.Env):
    """Toy recruitment pipeline modelled as an episodic decision problem.

    One episode follows a single candidate through ``NUM_STAGES`` interview
    stages. The candidate has one uniformly sampled score in [0, 1] per stage.
    At every step the agent either rejects the candidate (action 0) or
    advances them (action 1). The episode ends on a rejection or when the
    candidate is advanced at the final stage (an offer is extended).

    Observation: ``{'stage': int, 'performance': float32 array of shape (NUM_STAGES,)}``.
    The API follows the classic Gym convention: ``reset()`` returns the
    observation and ``step()`` returns ``(obs, reward, done, info)``.
    """

    NUM_STAGES = 4        # number of recruitment stages (valid stage ids: 0..3)
    GOOD_THRESHOLD = 0.7  # a stage score strictly above this counts as "good"
    TIME_PENALTY = 40     # extra cost per stage spent advancing a weak candidate

    def __init__(self):
        super(RecruitmentEnv, self).__init__()

        self.action_space = gym.spaces.Discrete(2)  # 0 = reject, 1 = advance
        self.observation_space = gym.spaces.Dict({
            'stage': gym.spaces.Discrete(self.NUM_STAGES),  # Stage of recruitment
            'performance': gym.spaces.Box(
                low=0, high=1, shape=(self.NUM_STAGES,), dtype=np.float32
            ),  # Candidate performance, one score per stage
        })

        self.info = {}
        self._start_episode()

    def _start_episode(self):
        """Sample a fresh candidate and rewind the pipeline to stage 0."""
        # Cast to float32 so observations actually belong to the declared Box space.
        self.cand = np.random.uniform(size=self.NUM_STAGES).astype(np.float32)
        self.state = {'stage': 0, 'performance': self.cand}
        self.done = False

    def _observation(self):
        """Return a copy of the state so callers never alias the internal dict."""
        return {
            'stage': self.state['stage'],
            'performance': self.state['performance'].copy(),
        }

    def step(self, action):
        """Apply ``action`` (0 = reject, 1 = advance) to the current stage.

        Returns:
            tuple: ``(observation, reward, done, info)``. ``reward`` is a plain
            float and ``info`` is always an empty dict.
        """
        stage = self.state['stage']
        score = self.state['performance'][stage]
        is_good = score > self.GOOD_THRESHOLD

        if action == 0:  # Reject the candidate
            # Rejecting a good candidate is penalised, rejecting a weak one is rewarded.
            reward = -700 if is_good else 800
            self.done = True
        else:  # Proceed to the next stage
            if stage == self.NUM_STAGES - 1:
                # The final stage always terminates the episode. A score exactly
                # at the threshold is treated as "not good", matching the reject
                # branch, instead of slipping through and pushing the stage
                # index outside the observation space.
                reward = 500 if is_good else 300
                self.done = True
            elif is_good:
                reward = 1050  # Reward for moving on with a good candidate
                self.state['stage'] += 1
            else:
                # Penalise moving on with a weak candidate; cost grows with the stage.
                reward = -600 - self.TIME_PENALTY * (1 + stage)
                self.state['stage'] += 1
            # Shaping term based on the score at the stage that was just decided.
            reward += 2 * (score * 1000 - 300)

        return self._observation(), float(reward), self.done, {}

    def reset(self):
        """Start a new episode with a new random candidate and return the observation."""
        self._start_episode()
        return self._observation()

    def render(self, mode='human'):
        """Print the current stage and the candidate's scores."""
        # The state is a dict, so it must be indexed by key rather than by position.
        print(f"Step: {self.state['stage']}, Scores: {self.state['performance']}")


In [157]:
from stable_baselines3 import DQN


# Training budget, kept as a named constant so it is easy to tune.
TOTAL_TIMESTEPS = 100_000

# The environment exposes a Dict observation space, hence "MultiInputPolicy".
env = RecruitmentEnv()
model = DQN(policy="MultiInputPolicy", env=env)

# Train the agent; the cell's displayed value is the trained model.
model.learn(total_timesteps=TOTAL_TIMESTEPS)


<stable_baselines3.dqn.dqn.DQN at 0x7fc66abbc910>

In [241]:

# Walk one randomly sampled candidate through the pipeline with the trained
# policy and report the stage at which they are rejected.
x = np.random.uniform(size=4)

step = 0
obs = {'stage': step, 'performance': x}
done = False

# Valid stage ids are 0 .. len(x) - 1, so the policy is never queried with a
# stage outside the observation space it was trained on.
while not done:
    print(obs)
    # deterministic=True disables DQN's epsilon-greedy exploration, so the
    # printed decision is the learned greedy action rather than a random one.
    action, _state = model.predict(obs, deterministic=True)
    if action == 0:
        done = True  # rejected at the current stage
    else:
        step += 1
        if step < len(x):
            obs['stage'] = step
        else:
            done = True  # advanced past the final stage: offer extended

print(step)  # Rejection stage, or len(x) when the candidate receives an offer


{'stage': 0, 'performance': array([0.04582057, 0.51589669, 0.45747965, 0.36368137])}
0
