In [6]:
import random

STATES = [-2, -1, 0, 1, 2]  # temperature error
A_OFF, A_ON = 0, 1

def step(e, a):
    """
    Toy dynamics:
    - If ON: error tends to decrease by 1 (toward 0)
    - If OFF: error tends to increase by 1 (away from 0)
    plus small noise.
    Động lực đồ chơi:
    - Bật: sai lệch có xu hướng giảm 1 (tiến về 0)
    - Tắt: sai lệch có xu hướng tăng 1 (xa 0 hơn)
    cộng nhiễu nhỏ.
    """
    noise = random.choice([-1, 0, 0, 0, 1])  # mostly 0
    if a == A_ON:
        e2 = e - 1 + noise
    else:
        e2 = e + 1 + noise
    e2 = max(-2, min(2, e2))
    r = -abs(e2)  # reward based on next state comfort
    return e2, r

def policy_threshold(e):
    """
    Simple policy:
    - If too warm/cold (|e|>=1), turn ON to correct
    - If perfect (e==0), turn OFF
    Chính sách đơn giản:
    - Nếu lệch (|e|>=1) thì bật để kéo về 0
    - Nếu e==0 thì tắt
    """
    return A_OFF if e == 0 else A_ON

def rollout_return(gamma, T=200, seed=0):
    random.seed(seed)
    e = random.choice(STATES)
    G = 0.0
    pow_g = 1.0
    for _ in range(T):
        a = policy_threshold(e)
        e, r = step(e, a)
        G += pow_g * r
        pow_g *= gamma
    return G

gammas = [0.0, 0.5, 0.9, 0.99]
for g in gammas:
    avg = sum(rollout_return(g, T=200, seed=s) for s in range(20)) / 20
    print(f"gamma={g:>4}  avg discounted return over 200 steps ≈ {avg:.3f}")


gamma= 0.0  avg discounted return over 200 steps ≈ -1.150
gamma= 0.5  avg discounted return over 200 steps ≈ -2.315
gamma= 0.9  avg discounted return over 200 steps ≈ -14.416
gamma=0.99  avg discounted return over 200 steps ≈ -162.549


In [7]:
import gymnasium as gym  # Import Gymnasium
import numpy as np

# 1. Create the environment
env = gym.make('CartPole-v1')

# 2. Initialize the agent's parameters
state, _ = env.reset()  # The reset method now returns a tuple (state, info)
done = False  # Indicator for episode completion
total_reward = 0  # Variable to track total reward during an episode

# 3. Define Q-learning parameters (for simplicity, we use a basic agent here)
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1  # Exploration rate
num_episodes = 10  # Total episodes to run

# Initialize Q-table (simple state-action space)
# CartPole has 4 continuous state variables, so here we'll discretize it into a smaller grid
state_space_bins = [6, 12, 12, 6]  # Discretizing each dimension into bins
q_table = np.random.uniform(low=-1, high=1, size=(state_space_bins[0], state_space_bins[1], state_space_bins[2], state_space_bins[3], env.action_space.n))

# Function to discretize continuous state space
def discretize_state(state):
    """Discretizes the continuous state to a discrete index in the Q-table"""
    state_bins = []
    for i in range(len(state)):
        # Clip the value of each dimension within the allowed bin range
        bin_idx = np.digitize(state[i], np.linspace(-1, 1, state_space_bins[i]))
        # Ensure that the index is within the bounds of the state space bins
        state_bins.append(min(bin_idx, state_space_bins[i] - 1))  # Clip to avoid out-of-bounds indexing
    return tuple(state_bins)

# 4. Training loop (run episodes)
for episode in range(num_episodes):
    state, _ = env.reset()  # Reset the environment and extract the state
    state = discretize_state(state)  # Discretize the continuous state
    done = False
    total_reward = 0
    
    while not done:
        # 5. Agent selects an action using epsilon-greedy strategy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore: random action
        else:
            action = np.argmax(q_table[state])  # Exploit: choose the best action based on Q-table
        
        # 6. Take the action and observe the result
        next_state, reward, done, _, info = env.step(action)
        next_state = discretize_state(next_state)
        
        # 7. Update Q-value based on the agent's experience
        q_table[state + (action,)] = q_table[state + (action,)] + learning_rate * (reward + discount_factor * np.max(q_table[next_state]) - q_table[state + (action,)])
        
        # Update the state and total reward
        state = next_state
        total_reward += reward
    
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")
    
env.close()


Episode 1: Total Reward = 40.0
Episode 2: Total Reward = 21.0
Episode 3: Total Reward = 16.0
Episode 4: Total Reward = 14.0
Episode 5: Total Reward = 12.0
Episode 6: Total Reward = 13.0
Episode 7: Total Reward = 19.0
Episode 8: Total Reward = 17.0
Episode 9: Total Reward = 17.0
Episode 10: Total Reward = 23.0


In [12]:
import gymnasium as gym  # Import Gymnasium
import numpy as np
import pandas as pd

# Load the dataset from Kaggle (assuming CSV format)
df = pd.read_csv('path_to_your_kaggle_dataset.csv')  # Replace with the actual dataset path

# Print the first few rows to check the structure
print(df.head(15))

# Extract relevant features from the dataset
# Assume the dataset has columns like 'feature1', 'feature2', etc., that we can use
features = df[['feature1', 'feature2', 'feature3']].values  # Example features

# 1. Create the environment
env = gym.make('CartPole-v1')

# 2. Initialize the agent's parameters
state, _ = env.reset()  # The reset method now returns a tuple (state, info)
done = False  # Indicator for episode completion
total_reward = 0  # Variable to track total reward during an episode

# 3. Define Q-learning parameters (for simplicity, we use a basic agent here)
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1  # Exploration rate
num_episodes = 10  # Total episodes to run

# Initialize Q-table (simple state-action space)
# CartPole has 4 continuous state variables, so here we'll discretize it into a smaller grid
state_space_bins = [6, 12, 12, 6]  # Discretizing each dimension into bins
q_table = np.random.uniform(low=-1, high=1, size=(state_space_bins[0], state_space_bins[1], state_space_bins[2], state_space_bins[3], env.action_space.n))

# Function to discretize continuous state space
def discretize_state(state):
    """Discretizes the continuous state to a discrete index in the Q-table"""
    state_bins = []
    for i in range(len(state)):
        # Clip the value of each dimension within the allowed bin range
        bin_idx = np.digitize(state[i], np.linspace(-1, 1, state_space_bins[i]))
        # Ensure that the index is within the bounds of the state space bins
        state_bins.append(min(bin_idx, state_space_bins[i] - 1))  # Clip to avoid out-of-bounds indexing
    return tuple(state_bins)

# 4. Training loop (run episodes)
for episode in range(num_episodes):
    state, _ = env.reset()  # Reset the environment and extract the state
    state = discretize_state(state)  # Discretize the continuous state
    done = False
    total_reward = 0
    
    while not done:
        # 5. Agent selects an action using epsilon-greedy strategy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore: random action
        else:
            action = np.argmax(q_table[state])  # Exploit: choose the best action based on Q-table
        
        # 6. Take the action and observe the result
        next_state, reward, done, _, info = env.step(action)
        next_state = discretize_state(next_state)
        
        # 7. Update Q-value based on the agent's experience
        q_table[state + (action,)] = q_table[state + (action,)] + learning_rate * (reward + discount_factor * np.max(q_table[next_state]) - q_table[state + (action,)])
        
        # Update the state and total reward
        state = next_state
        total_reward += reward
    
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

env.close()


    feature1  feature2  feature3
0          1         5         5
1          2         4         4
2          3         3         3
3          4         2         2
4          5         1         1
5          1         5         5
6          2         4         4
7          3         3         3
8          4         2         2
9          5         1         1
10         7         3         3
11         8         4         4
12         9         5         5
13        10         6         6
14        11         7         7
Episode 1: Total Reward = 27.0
Episode 2: Total Reward = 11.0
Episode 3: Total Reward = 38.0
Episode 4: Total Reward = 35.0
Episode 5: Total Reward = 18.0
Episode 6: Total Reward = 11.0
Episode 7: Total Reward = 13.0
Episode 8: Total Reward = 15.0
Episode 9: Total Reward = 23.0
Episode 10: Total Reward = 11.0
