In [None]:
##dependencies

import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from tqdm import tqdm

In [None]:
# Build the Gymnasium environment registered under the id 'CartPole-v1'.
# This single instance is passed to every sarsa(...) training run further down.
envt = gym.make('CartPole-v1')

In [None]:

seed = 122
rg = np.random.RandomState(seed)

# Epsilon greedy
def choose_action_epsilon(Q, state, epsilon, rg=rg):
    """Select an action for ``state`` using an epsilon-greedy policy.

    Parameters
    ----------
    Q : mapping
        Maps a state key to a 1-D sequence of action values.
    state : hashable
        Key used to look up the action values in ``Q``.
    epsilon : float
        Probability of taking a uniformly random action.
    rg : numpy.random.RandomState, optional
        Random generator to draw from. The default is the module-level
        generator captured when this function was defined; pass an explicit
        generator to get per-run reproducibility.

    Returns
    -------
    int
        Index of the chosen action, in ``range(len(Q[state]))``.
    """
    action_values = Q[state]
    # Explore when the state has no information yet (all values are zero) or
    # with probability epsilon. The short-circuit is deliberate: rg.rand() is
    # only consumed when the first condition is false.
    if not np.any(action_values) or rg.rand() < epsilon:
        # Draw over however many actions the table holds, not a hard-coded 2.
        return int(rg.randint(0, len(action_values)))
    return int(np.argmax(action_values))


In [None]:
#discretize the spaces
discrete_factor = 50

# Bin edges, one array per observation component. The two rate components
# use a coarser grid (discrete_factor // 5 edges) than position and angle.
pose_discrete = np.linspace(start=-2.4, stop=2.4, num=discrete_factor)
vel_discrete = np.linspace(start=-2.0, stop=2.0, num=discrete_factor // 5)
theta_discrete = np.linspace(start=-0.20943951, stop=0.20943951, num=discrete_factor)
theta_dot_discrete = np.linspace(start=-2.0, stop=2.0, num=discrete_factor // 5)


def discrete_states(obs):
    """Convert a 4-component continuous observation into a tuple of bin indices.

    ``obs`` is unpacked into exactly four values (position, velocity, angle,
    angular velocity, in that order). Each value is located among its bin
    edges with ``np.digitize``, so every returned index lies between 0 and
    the number of edges for that component, inclusive.

    Returns a 4-tuple of plain Python ints usable as a dictionary key.
    """
    position, velocity, angle, angular_velocity = obs
    readings_with_edges = (
        (position, pose_discrete),
        (velocity, vel_discrete),
        (angle, theta_discrete),
        (angular_velocity, theta_dot_discrete),
    )
    return tuple(int(np.digitize(reading, edges)) for reading, edges in readings_with_edges)

In [None]:
# Initialize Q-values
# One entry per discretised state. np.ndindex walks the 4-D index grid with the
# last axis varying fastest, i.e. in the same order as four nested range loops.
# Each entry holds two action values drawn from U(0, 1) via the global NumPy RNG.
Q = {
    grid_cell: np.random.uniform(0, 1, size=2)
    for grid_cell in np.ndindex(
        discrete_factor + 1,       # position bins
        discrete_factor // 5 + 1,  # velocity bins
        discrete_factor + 1,       # angle bins
        discrete_factor // 5 + 1,  # angular-velocity bins
    )
}

In [None]:
##SARSA
def sarsa(envt, Q, epsilon_input, alpha_input,rg, gamma = 0.99, choose_action = choose_action_epsilon, episodes = 22000):
    """Train a tabular SARSA agent and return the learned table with statistics.

    Parameters
    ----------
    envt : environment
        Object whose ``reset()`` returns ``(observation, info)`` and whose
        ``step(action)`` returns
        ``(observation, reward, terminated, truncated, info)``.
    Q : dict
        Maps discretised states (as produced by ``discrete_states``) to an
        indexable vector of action values. It is updated in place.
    epsilon_input : float
        Exploration rate handed to ``choose_action``.
    alpha_input : float
        Learning rate of the SARSA update.
    rg : numpy.random.RandomState
        Random generator used for every action selection in this run.
    gamma : float, optional
        Discount factor.
    choose_action : callable, optional
        Policy called as ``choose_action(Q, state, epsilon, rg)``.
    episodes : int, optional
        Number of training episodes (the default matches the previously
        hard-coded value).

    Returns
    -------
    Q : dict
        The same table object, after training.
    episode_rewards : numpy.ndarray
        Total reward of each episode, shape ``(episodes,)``.
    steps_to_completion : numpy.ndarray
        Number of steps of each episode, shape ``(episodes,)``.
    mean_final : float
        Mean of ``episode_rewards`` over all episodes.
    mean_score_l100 : list
        Trailing 100-episode mean reward, one entry per episode from the
        100th episode onward.
    """
    episode_rewards = np.zeros(episodes)
    steps_to_completion = np.zeros(episodes)
    mean_score_l100 = []
    eps = epsilon_input
    alp = alpha_input
    threshold_announced = False  # print the 200-average notice only once per run

    for ep in range(episodes):
        tot_reward, steps = 0, 0

        # Reset environment and discretise the first observation
        obs, _ = envt.reset()
        state = discrete_states(obs)
        # Use the caller's generator here as well, so the whole run is driven
        # by one RNG stream rather than partly by choose_action's default.
        action = choose_action(Q, state, eps, rg)

        done = False
        while not done:
            obs_, reward, terminated, truncated, _ = envt.step(action)
            # An episode ends on failure (terminated) or on the time limit
            # (truncated); ignoring the latter lets episodes run past the cap.
            done = terminated or truncated

            state_next = discrete_states(obs_)
            action_next = choose_action(Q, state_next, eps, rg)

            # SARSA update. A terminal state has no future return, so only
            # bootstrap when the episode did not terminate; a time-limit
            # truncation still bootstraps because the state is not terminal.
            future_value = 0.0 if terminated else Q[state_next][action_next]
            td_error = reward + gamma * future_value - Q[state][action]
            Q[state][action] = Q[state][action] + alp * td_error

            tot_reward += reward
            steps += 1
            state, action = state_next, action_next

        episode_rewards[ep] = tot_reward
        steps_to_completion[ep] = steps

        # Trailing windows include the episode that just finished (ep + 1 is
        # the exclusive slice end), giving exactly 100 / 500 episodes.
        if ep >= 99:
            mean_score_l100.append(np.mean(episode_rewards[ep - 99:ep + 1]))
        if ep >= 499 and not threshold_announced:
            mean_score = np.mean(episode_rewards[ep - 499:ep + 1])
            if mean_score >= 200:
                print("Avg score above 200 for last 500 episode")
                threshold_announced = True

    mean_final = np.mean(episode_rewards)
    return Q, episode_rewards, steps_to_completion, mean_final, mean_score_l100

In [None]:
# Initialize lists to store results across experiments and seeds
all_rewards = []
all_steps = []
num_seeds = 5
# Each row is one experiment: [epsilon, alpha]
epsilon_alpha_tuning = np.array([[0.08, 0.1],[0.08, 0.3],[0.1, 0.1],[0.1, 0.3]])
mean_rewards_per_epsilon = []
std_rewards_per_epsilon = []
mean_rewards_per_epsilon_100 = []
std_rewards_per_epsilon_100 = []
regret={}

# Matplotlib's bundled style sheet already provides the whitegrid look, so no
# seaborn call is needed (seaborn is not imported in this notebook).
plt.style.use('seaborn-v0_8-whitegrid')

plt.rcParams.update({
    'grid.alpha': 0.5,
    'grid.linestyle': '--',
    'figure.figsize': (13, 6)
})
colors = plt.cm.plasma(np.linspace(0, 1, len(epsilon_alpha_tuning)))
for exp_idx, exp in enumerate(epsilon_alpha_tuning):  # Use enumerate for safer indexing
    eps, alpha = exp
    print(f"\nStarting Experiment {exp_idx+1} with ε={exp[0]}, α={exp[1]}")

    # Per-seed results for this (epsilon, alpha) pair
    seed_rewards = []
    avg_reward_exp=[]
    seed_rewards_100 = []

    for seed in range(num_seeds):
        print(f"  Running seed {seed+1}")

        # One generator per run drives both the Q-table initialisation and the
        # action selection, so a given seed reproduces the same run.
        rg = np.random.RandomState(seed+40)
        # Seed the environment's own RNG too; later reset() calls inside
        # sarsa continue from this seeded stream.
        envt.reset(seed=seed+40)

        # Fresh Q-table: two action values per discretised state, U(0, 1)
        Q = {}
        for j in range(discrete_factor + 1):
            for k in range(discrete_factor // 5 + 1):
                for l in range(discrete_factor + 1):
                    for m in range(discrete_factor // 5 + 1):
                        Q[(j, k, l, m)] = rg.uniform(0, 1, size=2)

        # Run SARSA with the current seed
        epsilon_start = exp[0]
        alpha_input = exp[1]
        Q, rewards, steps, avg, rewards_100 = sarsa(envt, Q, epsilon_start, alpha_input,rg)

        seed_rewards.append(rewards)
        avg_reward_exp.append(avg)
        seed_rewards_100.append(rewards_100)


    # Convert to numpy arrays for easier calculations
    seed_rewards = np.array(seed_rewards)  # shape: (num_seeds, num_episodes)
    avg_reward_exp = np.array(avg_reward_exp)
    seed_rewards_100 = np.array(seed_rewards_100)

    # Mean / std across seeds, per episode, for this (epsilon, alpha) pair
    mean_avg = np.mean(avg_reward_exp)
    print(mean_avg)
    mean_rewards = np.mean(seed_rewards, axis=0)
    mean_rewards_per_epsilon.append(mean_rewards)
    std_rewards = np.std(seed_rewards, axis=0)
    std_rewards_per_epsilon.append(std_rewards)
    mean_rewards_100 = np.mean(seed_rewards_100, axis=0)
    mean_rewards_per_epsilon_100.append(mean_rewards_100)
    std_rewards_100 = np.std(seed_rewards_100, axis=0)
    std_rewards_per_epsilon_100.append(std_rewards_100)

    # Regret: summed shortfall of the seed-averaged episode reward against a
    # per-episode target of 475.
    # NOTE(review): 475 is presumably CartPole-v1's "solved" reward threshold — confirm.
    regret[exp_idx]=np.sum(475-np.array(mean_rewards))
    print(f'Calculated Regret for epsilon ={eps} & alpha ={alpha} = {regret[exp_idx]}')

    plt.figure()
    plt.plot(mean_rewards_100,label=f'ε={eps}, α={alpha}',color=colors[exp_idx])
    # Shaded band: +/- one standard deviation across seeds
    plt.fill_between(range(len(mean_rewards_100)),
                     mean_rewards_100 - std_rewards_100,
                     mean_rewards_100 + std_rewards_100,color=colors[exp_idx],
                     alpha=0.1, linewidth=0)

    # Set consistent axes and labels
    plt.ylim(0, 550)  # Fixed y-axis range
    # The seed count comes from num_seeds instead of a hard-coded "5".
    plt.title(f'SARSA Performance means reward across {num_seeds} seeds (moving avg of 100 used) (ε={eps}, α={alpha})')
    plt.xlabel('Episode')
    plt.ylabel('Mean Reward')

    # Standardized legend and grid
    plt.legend(bbox_to_anchor=(1.05, 1))
    plt.grid(True)
    plt.tight_layout()
    plt.show()





# Combined figure: one 100-episode moving-average curve per (epsilon, alpha)
# setting, each with a +/- one-standard-deviation band across seeds.
plt.figure()

# One index per experiment row, instead of a hard-coded [0, 1, 2, 3].
position_ = list(range(len(epsilon_alpha_tuning)))

for g in position_:
    eps, alpha = epsilon_alpha_tuning[g]
    mean_rewards = mean_rewards_per_epsilon_100[g]
    std_rewards = std_rewards_per_epsilon_100[g]
    colorss = colors[g]
    # Label with both hyperparameters: several settings share the same epsilon,
    # so epsilon alone would give indistinguishable legend entries.
    plt.plot(mean_rewards, label=f'ε={eps}, α={alpha}', color=colorss, linewidth=1)
    # Take the x-range from the curve being drawn, not from a variable left
    # over from the experiment loop.
    plt.fill_between(range(len(mean_rewards)),
                     mean_rewards - std_rewards,
                     mean_rewards + std_rewards,
                     color=colorss, alpha=0.1, linewidth=0)  # Light transparency

plt.ylim(0, 550)  # Fixed y-axis range
plt.title('SARSA Performance Cartpole-V1: Hyperparameters (epsilon & alpha)')
plt.xlabel('Episode')
plt.ylabel('Mean Reward')

# Standardized legend and grid
plt.legend(bbox_to_anchor=(1.05, 1))
plt.grid(True)
plt.tight_layout()
plt.show()