In [None]:
import gymnasium as gym
import numpy as np
import cv2

def preprocess_observation(observation):
    """
    Turn an RGB frame into a normalized edge-magnitude map.

    The frame is converted to grayscale, differentiated with 5x5 Sobel
    kernels along both image axes, and the per-pixel gradient magnitude is
    min-max scaled to the range 0-1 as a float32 array.
    """
    grayscale = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)

    # One Sobel pass per axis: (dx=1, dy=0) for horizontal change,
    # (dx=0, dy=1) for vertical change, both computed as cv2.CV_64F.
    grad_x, grad_y = (
        cv2.Sobel(grayscale, cv2.CV_64F, dx, dy, ksize=5)
        for dx, dy in ((1, 0), (0, 1))
    )

    # Euclidean norm of the two directional responses.
    magnitude = np.sqrt(np.square(grad_x) + np.square(grad_y))

    # Rescale so downstream thresholds do not depend on absolute gradient size.
    return cv2.normalize(
        magnitude,
        None,
        alpha=0,
        beta=1,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_32F,
    )

def simple_policy(edges, threshold=0.06, rng=None):
    """
    Pick a CarRacing action from a normalized edge map (placeholder heuristic).

    The mean edge strength of the lower half of the image is compared with
    ``threshold``:

    * above the threshold  -> drive straight with full acceleration;
    * at or below it       -> steer randomly with half acceleration.

    Parameters
    ----------
    edges : 2-D array of edge magnitudes (rows first), as returned by
        ``preprocess_observation``.
    threshold : mean edge strength above which the car drives straight.
        The default (0.06) is an arbitrary starting point; tune it against
        real frames.
    rng : optional ``numpy.random.Generator`` used for the random steering
        value. When omitted, the global ``np.random`` state is used, so
        ``np.random.seed`` still controls reproducibility.

    Returns
    -------
    list
        ``[steering, gas, brake]``.
    """
    # Only the lower half of the frame is inspected.
    lower_half_edges = edges[edges.shape[0] // 2:, :]

    # An empty slice would make np.mean return NaN (with a RuntimeWarning);
    # treat it as "no edges" instead, which selects the same branch.
    edge_strength = np.mean(lower_half_edges) if lower_half_edges.size else 0.0

    if edge_strength > threshold:
        return [0, 1, 0]  # Straight with full acceleration

    # Weak edge response: probe with a random steering angle at half throttle.
    draw_uniform = np.random.uniform if rng is None else rng.uniform
    return [draw_uniform(-1, 1), 0.5, 0]

# Initialize environment
env = gym.make("CarRacing-v2", domain_randomize=False, render_mode="human")

episodes = 3
steps = 100

# try/finally guarantees the render window is released even if a step raises.
try:
    for _episode in range(episodes):
        observation, info = env.reset(options={"randomize": False})

        for _step in range(steps):
            # Preprocess the observation to get edge information
            edges = preprocess_observation(observation)

            # Decide on an action based on edges. The policy returns a plain
            # list; hand the environment a float32 array instead of a
            # mixed-type list.
            action = np.asarray(simple_policy(edges), dtype=np.float32)

            # With render_mode="human" the frame is drawn as part of step(),
            # so no separate env.render() call is made here.
            observation, reward, terminated, truncated, info = env.step(action)

            if terminated or truncated:
                break
finally:
    env.close()


In [None]:
import gymnasium as gym
import numpy as np

# --- Learning hyperparameters -------------------------------------------
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor

# --- Simulation budget ---------------------------------------------------
episodes = 20  # number of games to play
steps = 20     # upper bound on decisions within a single game

# --- Tabular state/action space -----------------------------------------
# The state index layout used by observation_to_state is
# player-sum offset * 10 * 2 + dealer-card offset * 2 + usable-ace flag.
num_actions = 2  # Actions: Stick (0), Hit (1)
num_states = 10 * 10 * 2  # Adjusted ranges

# One row per state, one column per action, all starting at zero.
Q_matrix = np.zeros(shape=(num_states, num_actions))

# --- Environment -----------------------------------------------------------
env = gym.make('Blackjack-v1', natural=False, sab=False, render_mode="human")

def update_q_matrix(state_index, action, reward, next_state_index, alpha, gamma,
                    terminated=False, q_matrix=None):
    """
    Apply one tabular Q-learning update in place.

    Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * (reward + gamma * max_a' Q[s', a'])

    Parameters
    ----------
    state_index : row of the state the action was taken in.
    action : column of the action that was taken.
    reward : reward received for the transition.
    next_state_index : row of the state that was reached.
    alpha : learning rate.
    gamma : discount factor.
    terminated : when True the transition ended the episode, so no future
        value is bootstrapped from ``next_state_index``. Defaults to False,
        which keeps the bootstrapping behaviour for existing callers.
    q_matrix : table to update. Defaults to the module-level ``Q_matrix``.

    Returns
    -------
    None. The table is modified in place.
    """
    table = Q_matrix if q_matrix is None else q_matrix

    # The Q-learning target uses the best (maximum) next-state value;
    # exploration belongs in action selection, not in the update target.
    future_value = 0.0 if terminated else np.max(table[next_state_index])
    td_target = reward + gamma * future_value

    table[state_index, action] = (1 - alpha) * table[state_index, action] + alpha * td_target


def observation_to_state(observation):
    """
    Map a Blackjack observation to a row index in the range 0-199.

    Parameters
    ----------
    observation : ``(player_sum, dealer_card, usable_ace)`` triple.

    Returns
    -------
    int
        ``player_offset * 20 + dealer_offset * 2 + usable_ace`` where
        ``player_offset = player_sum - 12`` and ``dealer_offset =
        dealer_card - 1``, clamped so the result always addresses a valid
        row of a 200-row table.

    Notes
    -----
    * Player sums below 12 share the rows of sum 12. Without this clamp the
      index would be negative and silently wrap around to unrelated rows.
    * Any index past the end of the table (e.g. a bust hand) maps to the
      last row, 199. This includes index 200 itself, which a ``> 200``
      check would let through.
    """
    player_sum, dealer_card, usable_ace = observation

    player_offset = max(player_sum - 12, 0)
    dealer_offset = dealer_card - 1

    # Calculate the state index using base conversion
    state_index = player_offset * 10 * 2 + dealer_offset * 2 + int(usable_ace)

    # Clamp into the valid row range [0, 199].
    return min(max(state_index, 0), 199)

win_count = 0
loss_count = 0
draw_count = 0

# Epsilon-greedy behaviour policy: mostly take the best-known action, but
# pick a uniformly random one with probability `epsilon`. Always taking the
# lowest-valued action would repeat whatever has performed worst so far
# rather than explore.
epsilon = 0.1
rng = np.random.default_rng(0)  # fixed seed so action choices are repeatable

# try/finally guarantees the environment is closed even if a step raises.
try:
    for episode in range(episodes):
        observation, info = env.reset(options={"randomize": False})  # Start a new episode
        state_index = observation_to_state(observation)

        for step in range(steps):
            if rng.random() < epsilon:
                action = int(rng.integers(num_actions))  # explore
            else:
                action = int(np.argmax(Q_matrix[state_index]))  # exploit

            # Take the action in the environment
            next_observation, reward, terminated, truncated, _ = env.step(action)
            # Get the state index for the new observation
            next_state_index = observation_to_state(next_observation)

            # Update the Q-matrix with the new knowledge
            update_q_matrix(state_index, action, reward, next_state_index, alpha, gamma)

            # With render_mode="human" the frame is drawn as part of step(),
            # so no separate env.render() call is made here.
            print(f"Observation: {next_observation}, Action: {action}, Reward: {reward}, Terminated: {terminated}")

            state_index = next_state_index  # Update the state index for the next step

            if terminated:
                if reward == 1:
                    win_count += 1
                elif reward == -1:
                    loss_count += 1
                elif reward == 0:
                    draw_count += 1
                break  # Exit the loop because the episode has ended

            if truncated:
                # Episode was cut short without a game outcome; nothing to tally.
                break
finally:
    # Close the environment
    env.close()

results_tuple = (win_count, loss_count, draw_count)
print(f"Results - Wins: {win_count}, Losses: {loss_count}, Draws: {draw_count}")
print(f"Results tuple: {results_tuple}")
