In [None]:
# Supervised learning in RL - HoaDNt@fe.edu.vn
import numpy as np

class GridWorld:
    """Deterministic 3x3 grid environment with a single rewarding goal cell.

    States are ``(row, col)`` tuples and actions are the integer codes
    0 (Up), 1 (Down), 2 (Left) and 3 (Right).
    """

    def __init__(self):
        """Configure the grid dimensions, action count, start and goal cells."""
        self.start_state = (0, 0)
        self.goal_state = (2, 2)
        self.grid_size = (3, 3)
        self.num_actions = 4  # Up, Down, Left, Right

    def step(self, state, action):
        """Move one cell from ``state`` in the direction encoded by ``action``.

        A move that would cross the grid border is clamped to the border, and
        any action code other than 0-3 leaves the position as it was.

        Returns:
            ``(next_state, reward)`` where ``reward`` is 1 if ``next_state``
            equals the goal cell and 0 otherwise.
        """
        r, c = state
        if action == 0:  # Up: row index shrinks, never below 0
            r = max(0, r - 1)
        elif action == 1:  # Down: row index grows, never past the last row
            r = min(self.grid_size[0] - 1, r + 1)
        elif action == 2:  # Left: column index shrinks, never below 0
            c = max(0, c - 1)
        elif action == 3:  # Right: column index grows, never past the last column
            c = min(self.grid_size[1] - 1, c + 1)
        landed = (r, c)
        # Only arriving at (or staying on) the goal cell pays out.
        return landed, (1 if landed == self.goal_state else 0)

def generate_training_data(grid_world, num_samples):
    """Sample random (state, action) pairs to use as a supervised dataset.

    Each sample draws a row, a column and an action code independently and
    uniformly via ``np.random.randint``, in that order, so results are
    reproducible under ``np.random.seed``.

    The environment is not stepped: the transition result was never used to
    build the dataset, so computing it was dead work.

    Args:
        grid_world: Object exposing ``grid_size`` (indexable, rows then
            columns) and ``num_actions``.
        num_samples: Number of (state, action) pairs to draw.

    Returns:
        ``(X, y)`` where ``X`` is a float array of shape ``(num_samples, 2)``
        holding ``(row, col)`` states and ``y`` is a float array of shape
        ``(num_samples,)`` holding the sampled action codes.
    """
    X = np.zeros((num_samples, 2))  # State features
    y = np.zeros((num_samples,))    # Actions
    if num_samples <= 0:
        return X, y

    n_rows = grid_world.grid_size[0]
    n_cols = grid_world.grid_size[1]
    n_actions = grid_world.num_actions
    for i in range(num_samples):
        # Keep the row -> col -> action draw order so seeded runs are stable.
        X[i, 0] = np.random.randint(n_rows)
        X[i, 1] = np.random.randint(n_cols)
        y[i] = np.random.randint(n_actions)
    return X, y

# Build the environment that states and actions are sampled from.
grid_world = GridWorld()

# Draw a dataset of (state, action) pairs to fit the classifier on.
num_samples = 1000
X_train, y_train = generate_training_data(
    grid_world=grid_world,
    num_samples=num_samples,
)

# Fit a decision tree that maps a (row, col) state to an action code.
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier().fit(X_train, y_train)

def evaluate_policy(grid_world, model, max_steps=1000):
    """Roll out the model's policy from the start state and sum the rewards.

    At every step the model predicts an action for the current state, the
    environment applies it, and the reward is accumulated. The rollout ends
    when the goal state is reached or when ``max_steps`` actions have been
    taken, whichever comes first.

    Args:
        grid_world: Environment exposing ``start_state``, ``goal_state`` and
            ``step(state, action) -> (next_state, reward)``.
        model: Object whose ``predict([state])`` returns a sequence with the
            chosen action as its first element.
        max_steps: Upper bound on the number of actions taken. A policy that
            never reaches the goal would otherwise loop forever. Pass
            ``None`` to disable the bound.

    Returns:
        The sum of rewards collected during the rollout.
    """
    total_reward = 0
    state = grid_world.start_state
    steps_taken = 0
    while state != grid_world.goal_state:
        # Stop instead of spinning when the policy cycles or gets stuck.
        if max_steps is not None and steps_taken >= max_steps:
            break
        # Predict action based on current state
        action = model.predict([state])[0]
        # Take action and observe next state and reward
        state, reward = grid_world.step(state, action)
        total_reward += reward
        steps_taken += 1
    return total_reward

# Roll out the fitted model from the start state and report the reward it collects.
total_reward = evaluate_policy(grid_world=grid_world, model=model)
print("Total reward obtained by learned policy:", total_reward)
