In [None]:
import numpy as np
from prettytable import PrettyTable


# Expected ("correct") action index for each state index 0-8.
# State names are taken from ProjectManagementEnv.state_names and action names
# from the action_labels tables (0 Initiate Task, 1 Reallocate Resources,
# 2 Review Progress, 3 Adjust Strategy, 4 Define New Strategy,
# 5 Communicate Changes).
# NOTE(review): confirm the pairings for states 2, 3 and 6 are the intended
# ones; only the index values below are used by the program.
correct_actions_sequence = {
    0: 0,  # Empty -> Initiate Task
    1: 1,  # Resource Shortage -> Reallocate Resources
    2: 4,  # Ongoing Task -> Define New Strategy
    3: 5,  # Pending Task -> Communicate Changes
    4: 2,  # Progress Review -> Review Progress
    5: 3,  # Strategy Adjustment -> Adjust Strategy
    6: 6,  # Define Strategy -> NOTE(review): action 6 is outside the 6 actions (0-5), so it can never match
    7: 5,  # Communicate -> Communicate Changes
    8: 0,  # Task Completed -> Initiate Task
}

def custom_reward(current_state, next_state):
    """Return the reward for moving from ``current_state`` to ``next_state``.

    Each known state (0-8) has a desirability score; the reward is derived
    from the change in desirability caused by the transition:

    * improvement  -> the full positive change,
    * regression   -> half of the (negative) change, i.e. a dampened penalty,
    * no change    -> 0.0.

    Moving from a known state into the "Unknown Problem" state (9) yields a
    flat penalty.  Any other transition involving a state without a
    desirability score yields the neutral default of 0.0.

    Args:
        current_state: Index of the state being left.
        next_state: Index of the state being entered.

    Returns:
        The reward as a float.
    """
    state_desirability = {
        0: 1,  # Empty
        1: 0.5,  # Resource Shortage
        2: 1.5,  # Ongoing Task
        3: 0.8,  # Pending Task
        4: 1.2,  # Progress Review
        5: 1.5,  # Strategy Adjustment
        6: 1.8,  # Define Strategy
        7: 1,  # Communicate
        8: 2,  # Task Completed
    }

    unknown_problem_state = 9
    unknown_problem_penalty = -0.5  # Adjust this penalty value as needed
    regression_weight = 0.5  # Regressions are penalised at half strength

    if current_state not in state_desirability:
        return 0.0  # Default reward if the current state has no score

    # State 9 has no desirability score, so it must be handled before the
    # table lookup; nested inside the "known state" branch it can never fire.
    if next_state == unknown_problem_state:
        return unknown_problem_penalty

    if next_state not in state_desirability:
        return 0.0  # Default reward if the next state has no score

    desirability_change = (
        state_desirability[next_state] - state_desirability[current_state]
    )

    if desirability_change < 0:
        # A positive weight keeps the reward negative: moving to a less
        # desirable state must never be rewarded.
        return desirability_change * regression_weight
    return float(desirability_change)

class ProjectManagementEnv:
    """Square grid world in which an agent walks towards an end cell.

    Cells are numbered row-major (``index = row * grid_size + col``); the
    agent's current cell index is ``state`` and its coordinates are
    ``agent_position``.  ``grid`` holds a randomly drawn state code (a key
    of ``state_names``) for every cell.

    Args:
        grid_size: Side length of the grid.
        initial_state: Cell index the agent starts from (and returns to on
            ``reset``).
        end_cell: ``(row, col)`` of the terminal cell.  ``step`` keeps the
            agent inside the grid, so an ``end_cell`` outside the grid can
            never be reached; the default ``(6, 6)`` only lies inside grids
            with ``grid_size >= 7``.
    """

    # Movement codes accepted by step(), as (row delta, column delta).
    _MOVES = {'u': (-1, 0), 'd': (1, 0), 'l': (0, -1), 'r': (0, 1)}
    # Movement implied by an action index when step() gets no movement code;
    # actions without an entry leave the agent where it is.
    _ACTION_MOVES = {0: 'u', 1: 'd', 2: 'l', 3: 'r'}

    def __init__(self, grid_size=5, initial_state=0, end_cell=(6, 6)):
        self.grid_size = grid_size
        self.num_states = self.grid_size ** 2
        self.num_actions = 6
        self.max_episodes = 100

        self.episode = 0
        self.initial_state = initial_state
        self.state = initial_state

        self.state_names = {
            0: "Empty",
            1: "Resource Shortage",
            2: "Ongoing Task",
            3: "Pending Task",
            4: "Progress Review",
            5: "Strategy Adjustment",
            6: "Define Strategy",
            7: "Communicate",
            8: "Task Completed"
        }
        self.agent_position = divmod(initial_state, self.grid_size)
        # Stored as a tuple so it compares equal to agent_position even when
        # the caller passes a list.
        self.end_cell = tuple(end_cell)

        self.place_states_randomly()  # Needs state_names to be defined first

    def place_states_randomly(self):
        """Fill ``grid`` with state codes drawn uniformly, with replacement."""
        unique_states = list(self.state_names.keys())
        self.grid = np.random.choice(
            unique_states, self.grid_size * self.grid_size, replace=True
        ).reshape(self.grid_size, self.grid_size)

    def reset(self):
        """Return the agent to the initial cell and return that cell index.

        Both ``state`` and ``agent_position`` are restored so that they
        describe the same cell at the start of every episode.
        """
        self.state = self.initial_state
        self.agent_position = divmod(self.initial_state, self.grid_size)
        return self.state

    def step(self, action, movement=None):
        """Move the agent one cell and return ``(next_state, reward, done)``.

        Args:
            action: Action index chosen by the agent.  Only used to derive
                the movement when ``movement`` is ``None`` (0 up, 1 down,
                2 left, 3 right, anything else stays in place).
            movement: ``'u'``, ``'d'``, ``'l'`` or ``'r'``; any other
                non-``None`` value leaves the agent in place.  Moves that
                would leave the grid are clamped to the border.

        Returns:
            The new cell index, the reward from ``custom_reward`` for the
            transition, and whether the agent now stands on ``end_cell``.
        """
        if movement is None:
            movement = self._ACTION_MOVES.get(action)

        d_row, d_col = self._MOVES.get(movement, (0, 0))
        last = self.grid_size - 1
        row = min(last, max(0, self.agent_position[0] + d_row))
        col = min(last, max(0, self.agent_position[1] + d_col))
        next_state = row * self.grid_size + col

        previous_state = self.state
        self.agent_position = (row, col)
        # NOTE(review): this stores the previous cell index in a grid that
        # otherwise holds state codes 0-8 — confirm this marking is intended.
        self.grid[row, col] = previous_state

        reward = custom_reward(previous_state, next_state)

        # Advance the state so the next reward is measured from this cell.
        self.state = next_state

        done = self.agent_position == self.end_cell
        return next_state, reward, done

    def set_max_episodes(self, max_episodes):
        """Set the number of episodes the training loop should run."""
        self.max_episodes = max_episodes

    def render(self, action):
        """Print the grid, the label of ``action`` and the current state name.

        The agent's cell is shown as ``P``; other cells show their grid
        value modulo 10, or ``X`` when that value is not a known state code.
        """
        action_labels = {
            0: "Initiate Task",
            1: "Reallocate Resources",
            2: "Review Progress",
            3: "Adjust Strategy",
            4: "Define New Strategy",
            5: "Communicate Changes"
        }

        table = PrettyTable()
        table.field_names = [""] + [str(col) for col in range(self.grid_size)]

        for row in range(self.grid_size):
            row_data = [str(row)]
            for col in range(self.grid_size):
                state = self.grid[row, col] % 10
                if (row, col) == self.agent_position:
                    row_data.append("P")
                elif state in self.state_names:
                    row_data.append(str(state))
                else:
                    row_data.append("X")  # Display "X" for obstacles (Unknown Problem states)
            table.add_row(row_data)

        print(table)
        print(f"Agent's Chosen Action: {action_labels[action]}")

        if self.state in self.state_names:
            current_state_name = self.state_names[self.state]
            print(f"Current State: {current_state_name}")
        else:
            print("Current State: Unknown Problem")




class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Args:
        num_states: Number of rows of the Q-table.
        num_actions: Number of columns of the Q-table.
        initial_learning_rate: Learning rate used for the first update.
        discount_factor: Weight of the best next-state value in the update.
        exploration_prob: Probability of picking a random action.
        learning_rate_decay_rate: Factor applied to the learning rate after
            every update.
        min_learning_rate: Lower bound for the decayed learning rate.
    """

    def __init__(self, num_states, num_actions, initial_learning_rate=0.1, discount_factor=0.9, exploration_prob=0.5,
                 learning_rate_decay_rate=0.95, min_learning_rate=0.01):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = initial_learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.learning_rate_decay_rate = learning_rate_decay_rate
        self.min_learning_rate = min_learning_rate

        self.q_table = np.zeros((num_states, num_actions))

    def select_action(self, state):
        """Return a random action with probability ``exploration_prob``,
        otherwise the action with the highest Q-value for ``state``."""
        if np.random.uniform(0, 1) < self.exploration_prob:
            return np.random.choice(self.num_actions)  # Explore
        return np.argmax(self.q_table[state, :])  # Exploit

    def update_q_value(self, state, action, reward, next_state, next_state_valid):
        """Apply one Q-learning update and decay the learning rate.

        Args:
            state: State in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward received for the transition.
            next_state: State reached by the transition.
            next_state_valid: ``False`` marks the transition as leading to
                an "Unknown Problem" state; the Q-value is then set to
                negative infinity so the action is never chosen greedily.
        """
        if not next_state_valid:
            self.q_table[state, action] = float('-inf')
        elif self.q_table[state, action] != float('-inf'):
            # An entry already at -inf stays there: updating it would
            # compute -inf + inf and turn it into NaN.
            max_next_action_value = np.max(self.q_table[next_state, :])
            self.q_table[state, action] += self.learning_rate * (
                reward + self.discount_factor * max_next_action_value - self.q_table[state, action]
            )

        self.decay_learning_rate()  # Decay learning rate here

    def decay_learning_rate(self):
        """Multiply the learning rate by the decay rate, bounded below by
        ``min_learning_rate``."""
        self.learning_rate = max(self.learning_rate * self.learning_rate_decay_rate, self.min_learning_rate)

    def get_user_feedback(self, reward):
        """Show ``reward`` and prompt until the user enters a rating 1-5.

        Returns:
            The rating as an int between 1 and 5.
        """
        print("Agent's Action Outcome:")
        print(f"Reward: {reward}")
        while True:
            rating = input("Please rate the agent's action (1 to 5, where 5 is excellent): ")
            try:
                value = int(rating)
            except ValueError:
                value = None
            if value is not None and 1 <= value <= 5:
                return value
            print("Invalid rating. Please enter a whole number from 1 to 5.")

import os

end_cell_position = (6, 6)
env = ProjectManagementEnv(grid_size=7, end_cell=end_cell_position)
env.set_max_episodes(1)
agent = QLearningAgent(num_states=env.num_states, num_actions=env.num_actions,
                           initial_learning_rate=0.1, discount_factor=0.95, exploration_prob=0.7)
agent.env = env

action_labels = [
    "Initiate Task", "Reallocate Resources", "Review Progress",
    "Adjust Strategy", "Define New Strategy", "Communicate Changes"
]

# Movement code for each action that moves the agent (0 up, 1 down, 2 left,
# 3 right); every other action leaves the agent where it is.
ACTION_MOVEMENTS = {0: 'u', 1: 'd', 2: 'l', 3: 'r'}


def next_state_for_action(environment, action):
    """Return the cell index reached from the agent's position by ``action``.

    Moves that would leave the grid are clamped to the border; actions other
    than 0-3 return the index of the agent's current cell.
    """
    row, col = environment.agent_position
    last = environment.grid_size - 1
    if action == 0:  # Up
        row = max(0, row - 1)
    elif action == 1:  # Down
        row = min(last, row + 1)
    elif action == 2:  # Left
        col = max(0, col - 1)
    elif action == 3:  # Right
        col = min(last, col + 1)
    return row * environment.grid_size + col


for episode in range(env.max_episodes):
    env.episode = episode
    state = env.reset()
    # Keep the agent's coordinates in step with the state returned by reset.
    env.agent_position = (state // env.grid_size, state % env.grid_size)
    done = False
    total_reward = 0
    correct_actions_count = 0  # Count of correct actions taken by the agent
    consecutive_correct_actions = 0  # Count of consecutive correct actions

    while not done:
        action = agent.select_action(state)
        env.render(action)

        # Calculate the next state based on the selected action
        next_state = next_state_for_action(env, action)

        # Determine if the next state is valid (not an "Unknown Problem" state)
        next_state_valid = next_state not in env.state_names or env.state_names[next_state] != "Unknown Problem"

        reward = custom_reward(state, next_state)

        # Get user feedback for the agent's action
        user_rating = agent.get_user_feedback(reward)

        # Update Q-values; the user's rating is the learning signal.
        # update_q_value decays the learning rate itself, so it is not
        # decayed a second time here.
        agent.update_q_value(state, action, user_rating, next_state, next_state_valid)

        # NOTE(review): a good rating and a matching action each extend the
        # streak, so one step can add 2 — confirm this is intended.
        if user_rating >= 4:
            consecutive_correct_actions += 1
        else:
            consecutive_correct_actions = 0

        # Check the action against the state it was chosen in (``state`` has
        # not been advanced yet), as the evaluation loop does.
        if state in correct_actions_sequence and action == correct_actions_sequence[state]:
            correct_actions_count += 1
            consecutive_correct_actions += 1
        else:
            consecutive_correct_actions = 0

            print(f"Correct Actions Count: {correct_actions_count}")

        # Update current state and agent's position in the grid
        env.state = next_state
        env.agent_position = (next_state // env.grid_size, next_state % env.grid_size)

        total_reward += reward
        state = next_state

        if env.agent_position == env.end_cell:
            done = True
            print("Agent has reached the end cell. Episode completed.")

        # Clear the terminal for a cleaner display
        os.system('clear')

        # Check if the agent has taken the correct actions for each state in the correct sequence
        if consecutive_correct_actions >= len(correct_actions_sequence):
            print("Agent has taken the correct actions for each state. Agent wins!")
            break

    print(f"Episode {episode + 1} - Total Reward: {total_reward}")
    print(f"Correct Actions Count: {correct_actions_count}")
    print("-" * 40)  # Print a separator line between episodes


# Evaluation parameters
num_evaluation_episodes = 10
# Safety cap so an episode always ends even if the agent never reaches the
# end cell.
max_evaluation_steps = 1000

# Initialize evaluation environment
evaluation_env = ProjectManagementEnv(grid_size=7, end_cell=end_cell_position)
evaluation_env.set_max_episodes(num_evaluation_episodes)

# List to store evaluation results
evaluation_results = []

# Run evaluation episodes
for episode in range(num_evaluation_episodes):
    state = evaluation_env.reset()
    # Start every episode from the cell that matches the reset state.
    evaluation_env.agent_position = (
        state // evaluation_env.grid_size, state % evaluation_env.grid_size
    )
    done = False
    total_reward = 0
    correct_actions_count = 0
    steps_taken = 0

    while not done and steps_taken < max_evaluation_steps:
        action = agent.select_action(state)

        # Move according to the selected action, using the same
        # action-to-direction mapping as during training.
        next_state, reward, done = evaluation_env.step(
            action, movement=ACTION_MOVEMENTS.get(action)
        )
        # Keep the environment's state in step with the agent's cell so the
        # next reward is measured from the right state.
        evaluation_env.state = next_state

        # Check if the action taken by the agent is correct (same as during training)
        if state in correct_actions_sequence and action == correct_actions_sequence[state]:
            correct_actions_count += 1

        total_reward += reward
        state = next_state
        steps_taken += 1

    # Store evaluation results for this episode
    evaluation_results.append({
        "Episode": episode + 1,
        "Total Reward": total_reward,
        "Correct Actions Count": correct_actions_count
    })

# Print evaluation results
print("\nEvaluation Results:")
for result in evaluation_results:
    print(result)

# Calculate average performance metrics
average_total_reward = np.mean([result["Total Reward"] for result in evaluation_results])
average_correct_actions = np.mean([result["Correct Actions Count"] for result in evaluation_results])

print("\nAverage Performance Metrics:")
print(f"Average Total Reward: {average_total_reward}")
print(f"Average Correct Actions Count: {average_correct_actions}")

+---+---+---+---+---+---+---+---+
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
+---+---+---+---+---+---+---+---+
| 0 | P | 5 | 8 | 1 | 6 | 8 | 8 |
| 1 | 4 | 4 | 6 | 7 | 3 | 8 | 1 |
| 2 | 8 | 6 | 3 | 8 | 5 | 2 | 0 |
| 3 | 8 | 8 | 4 | 6 | 8 | 0 | 1 |
| 4 | 2 | 3 | 0 | 7 | 3 | 4 | 1 |
| 5 | 0 | 6 | 1 | 8 | 2 | 0 | 6 |
| 6 | 3 | 2 | 0 | 2 | 2 | 0 | 6 |
+---+---+---+---+---+---+---+---+
Agent's Chosen Action: Initiate Task
Current State: Empty
Agent's Action Outcome:
Reward: 0.0
Please rate the agent's action (1 to 5, where 5 is excellent): 5
+---+---+---+---+---+---+---+---+
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
+---+---+---+---+---+---+---+---+
| 0 | P | 5 | 8 | 1 | 6 | 8 | 8 |
| 1 | 4 | 4 | 6 | 7 | 3 | 8 | 1 |
| 2 | 8 | 6 | 3 | 8 | 5 | 2 | 0 |
| 3 | 8 | 8 | 4 | 6 | 8 | 0 | 1 |
| 4 | 2 | 3 | 0 | 7 | 3 | 4 | 1 |
| 5 | 0 | 6 | 1 | 8 | 2 | 0 | 6 |
| 6 | 3 | 2 | 0 | 2 | 2 | 0 | 6 |
+---+---+---+---+---+---+---+---+
Agent's Chosen Action: Communicate Changes
Current State: Empty
Agent's Action Outcome:
Rewar

KeyboardInterrupt: ignored