## Predicting Candidate Rejection Step in a Recruitment Process using Reinforcement Learning

#### The environment :
Two actions (Advance or Reject) and a state that consists of the stage of recruitment and the candidate's performance
the step method is the effect of the action and gives the reward
* The environment represents the recruitment process and has discrete action space (Advance or Reject) and a tuple observation space.
* The observation space includes the current stage of recruitment (discrete) and the candidate's performance.
* The ``step``  method takes an action, updates the state based on the action
* The `reset `method resets the environment to its initial state. 
* The `render` method displays the current state.



In [2]:

import gym
import numpy as np

class RecruitmentEnv(gym.Env):
    def __init__(self,cand):
        super(RecruitmentEnv, self).__init__()

        self.action_space = gym.spaces.Discrete(2)  # Advance or Reject
        self.observation_space = gym.spaces.Tuple((
            gym.spaces.Discrete(4),  # Stage of recruitment
            gym.spaces.Box(low=0, high=1, shape=(4,))  # Candidate performance
        ))

        self.state=(0,cand)
        self.info={}
        self.done=False
        self.cand=cand




    def step(self, action):

        good_threshold = 0.7
        # bad_threshold = 0.3 # i can add probabilities between the good and bad thresholds
        time_penalty = 40

        # Get the current candidate's score
        rew=0
        stage, performance = self.state
        
        if action == 0:  # Reject the candidate
            if performance[stage] > good_threshold: #good candidate
                rew= -700 # Penalize for rejecting a good candidate
            else:
                rew= 600   # Reward for rejecting a bad candidate
            
            self.done=True
            
        else:  # Proceed to the next stage
            if (stage == 3) & (performance[stage] > good_threshold ):
                rew= 500               # Highly reward for successfully extending an offer
                
                self.done=True
            elif (stage == 3) & (performance[stage] < good_threshold ):
                rew=300
                self.done=True
            elif performance[stage] > good_threshold:
                rew= 1050                # Reward for moving on with a good candidate
                self.state=(stage+1,performance)
            else:
                rew= -400 - time_penalty * (1+stage)  # Penalize for moving on with a bad candidate
                self.state=(stage+1,performance)
            rew+=1*(performance[stage]*1000-300)
        return self.state, rew, self.done, {}
    
    def reset(self):
        self.state=(0,self.cand)
        self.done=False
        return self.state


    # def render(self):
    #     # Render the environment to the screen
    #     print(f'Step: {self.state[0]}, Scores: {self.state[1]}')


#### The agent:
* The Q-learning agent is initialized with hyperparameters such as learning rate, discount factor, exploration rate, and exploration decay rate.
* The agent maintains a Q-table to store the expected cumulative rewards for each state-action pair.
* The `get_action` method selects an action based on the epsilon-greedy strategy (explore or exploit).
* The `train` method updates the Q-value based on the observed reward and the maximum Q-value of the next state.
* The exploration rate decays over time.

In [3]:
import numpy as np

class QLearningAgent:
    def __init__(self, env, learning_rate=0.5, discount_factor=0.95, exploration_rate=0.7, exploration_decay_rate=0.99):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay_rate = exploration_decay_rate

        # Initialize Q-table to 0
        self.q_table = np.zeros((5, 2))

    def get_action(self, state):   # Epsilon-Greedy Strategy
        if np.random.uniform(0, 1) < self.exploration_rate:
            return self.env.action_space.sample()  # Explore
        else:
            return np.argmax(self.q_table[state])  # Exploit

    def train(self,state,action,reward,next_state):
        old_value=self.q_table[state, action]
        next_max=np.max(self.q_table[next_state])

        new_value = (1 - self.learning_rate) * old_value + self.learning_rate * (reward + self.discount_factor * next_max)
        self.q_table[state, action] = new_value

        # Decay exploration rate
        if self.exploration_rate>0.01:
            self.exploration_rate=self.exploration_decay_rate

#### Training loop:
* A set of candidates with random performance values is generated.
* An environment is created for each candidate.
* Q-learning agents are initialized for each environment.
* The ``training`` function runs episodes of training for each agent on its corresponding environment.
 - The update rule is given by:
     $ Q(s, a) \leftarrow (1 - \alpha) \cdot Q(s, a) + \alpha \cdot \left(r + \gamma \cdot \max_{a'} Q(s', a')\right) $
     where:
     - $Q(s, a)$ is the Q-value for state $s$ and action $a$.
     - $\alpha$ is the learning rate, determines to what extent newly acquired information overrides old information
     - $\gamma$ is the discount factor, determining the importance of future rewards.
     - $r$ is the immediate reward received for taking action $a$ in state $s$.
     - $s'$ is the next state after taking action $a$.


``find_action`` :*The idea is to look for the closest known candidates to our new candidate and do a **majority vote** the find the next action.*

In [4]:

import matplotlib.pyplot as plt
tol=100000
cands=[np.random.uniform(size=4) for _ in range(tol)]
envs=[RecruitmentEnv(cand) for cand in cands]
agents=[(i,QLearningAgent(envs[i])) for i in range(tol)]


def training(i,agent):


    for episode in range(1000):  
        state = envs[i].reset()
        
        done=False
        while not done:
            
            action = agent.get_action(state[0])
            
            next_state, reward, done, info = envs[i].step(action)
            # envs[i].render() 
            agent.train(state[0], action, reward, next_state[0])

            state = next_state

        # print()







In [5]:
# Train the agents 
a=[training(i, agent) for i, agent in agents]

In [6]:
def find_action(x,step, cands, agents):
    # Calculate the distances
    distances =np.array([np.linalg.norm(arr - x) for arr in cands])

    # Find the indices of the 5 smallest distances
    closest_indices =np.argsort(distances)[:101]

    # Get the actions for the closest indices
    actions =[agents[i][1].get_action(step) for i in closest_indices]

    # Count the occurrences of each action
    counts =np.bincount(actions)

    # Return the most frequent element
    most_frequent_element = np.argmax(counts)

    return most_frequent_element

x = np.random.uniform(size=4)
print(x)
find_action(x,0,cands,agents) #  First action

[0.96806977 0.14672314 0.35679317 0.58816522]


1

#### Predict rejection step

In [7]:
def rejection_step(x, cands, agents,log=False):
    step=0
    done=False
    if log:
        print(x)
    while not done:
        action=find_action(x,step,cands,agents)
        if log:
            print(action , end=" ")
        if action==0:
            done=True
            break

        step+=1
    return step



In [9]:

x=np.array([0.9,0.9,0.9,0.9])
rej=rejection_step(x,cands,agents,log=True)
if rej > 3: print('Offer') # 0==profile matching
else: print(f'rejection step:{rej}')

[0.9 0.9 0.9 0.9]
1 1 1 1 0 Offer


In [94]:
x = np.random.uniform(size=4)

rej=rejection_step(x,cands,agents,log=True)
if rej > 3: print('Offer')
else: print(f'rejection step:{rej}')

[0.89677377 0.07774106 0.77534968 0.7285238 ]
1 0 rejection step:1


In [8]:
x = np.random.uniform(size=4)
# x=np.array([0.9,0.9,0.9,0.9])
rej=rejection_step(x,cands,agents,log=True)
if rej > 3: print('Offer')
else: print(f'rejection step:{rej}')

[0.44218574 0.32549752 0.38347218 0.44549719]
0 rejection step:0
