In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import seaborn as sns

In [None]:
# Create the Gymnasium 'MountainCar-v0' environment; this single instance is
# the one passed to sarsa() by the training cells further down.
envt = gym.make('MountainCar-v0')

In [None]:
seed = 200
rg = np.random.RandomState(seed)

# Epsilon greedy
def choose_action_epsilon(Q, state, epsilon, rg=rg):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    Parameters
    ----------
    Q : mapping
        Maps a state key to a 1-D array holding one value per action.
    state : hashable
        Key into ``Q``.
    epsilon : float
        Probability of exploring instead of taking the greedy action.
    rg : numpy.random.RandomState, optional
        Generator used for exploration. The default is the module-level
        generator as it existed when this function was defined; pass an
        explicit generator to make a run depend on its own seed.

    Returns
    -------
    int or numpy integer
        Index of the chosen action.
    """
    action_values = Q[state]
    # An all-zero row is treated as "nothing learned yet" and forces a random
    # action. The short-circuit means rg.rand() is not consumed in that case.
    if not np.any(action_values) or rg.rand() < epsilon:
        # Size the draw from the Q row rather than hard-coding 3 actions, so
        # the helper also works for tables with a different action count.
        return rg.randint(0, len(action_values))
    else:
        return np.argmax(action_values)


In [None]:

# Bin edges used to turn the continuous observation into table indices:
# 36 edges over position [-1.2, 0.6] and 25 edges over velocity [-0.07, 0.07].
pose_discrete = np.linspace(-1.2, 0.6, 36)
vel_discrete = np.linspace(-0.07, 0.07, 25)

def discrete_states(obs):
    """Map a (position, velocity) observation to a pair of integer bin indices.

    Each component is passed through ``np.digitize`` against its edge array,
    so the position index lies in 0..36 and the velocity index in 0..25
    (values below the first edge give 0, values at or above the last edge
    give ``len(edges)``).

    Returns
    -------
    tuple of (int, int)
        ``(position_index, velocity_index)``, usable as a dictionary key.
    """
    position, velocity = obs
    position_bin = np.digitize(position, pose_discrete)
    velocity_bin = np.digitize(velocity, vel_discrete)
    return (int(position_bin), int(velocity_bin))

In [None]:
# Initialize Q-values
# discrete_states() digitizes against 36 position edges and 25 velocity edges,
# which yields indices 0..36 and 0..25. The table therefore needs 37 x 26
# entries; a smaller table raises KeyError as soon as the car reaches a state
# outside it.
Q = {}
for i in range(37):
    for j in range(26):
        # Small random negative values, so no row is all-zero and initial
        # greedy choices are arbitrary rather than always action 0.
        Q[(i, j)] = np.random.uniform(-0.5, 0, size=3)

In [None]:
##SARSA

def sarsa(envt, Q, epsilon_input, alpha_input, rg, gamma = 0.99, choose_action = choose_action_epsilon,
          episodes = 30000, decay = 0.9999, epsilon_start = 1):
    """Train a tabular SARSA agent on ``envt`` with a shaped reward.

    Parameters
    ----------
    envt : environment
        Must provide ``reset() -> (obs, info)`` and
        ``step(action) -> (obs, reward, terminated, truncated, info)``, with
        ``obs`` unpacking to ``(position, velocity)``.
    Q : dict
        Maps discretized states (as produced by ``discrete_states``) to arrays
        of action values. It is updated IN PLACE and also returned.
    epsilon_input : float
        Lower bound that the exploration rate decays towards.
    alpha_input : float
        Learning rate.
    rg : numpy.random.RandomState
        Random generator forwarded to ``choose_action`` for every decision.
    gamma : float
        Discount factor.
    choose_action : callable
        Policy called as ``choose_action(Q, state, epsilon, rg)``.
    episodes : int
        Number of training episodes.
    decay : float
        Multiplicative epsilon decay, applied after every environment step.
    epsilon_start : float
        Initial exploration rate.

    Returns
    -------
    Q : dict
        The same table that was passed in, after training.
    episode_rewards : numpy.ndarray
        Sum of the raw (unshaped) environment rewards for each episode.
    steps_to_completion : numpy.ndarray
        Number of steps taken in each episode.
    mean_final : float
        Mean of ``episode_rewards`` over the last 1000 episodes (or over all
        of them if fewer were run).
    mean_score_l100 : list of float
        Trailing 100-episode mean of ``episode_rewards``; one entry per
        episode starting at episode index 99.
    """
    episode_rewards = np.zeros(episodes)
    steps_to_completion = np.zeros(episodes)
    mean_score_l100 = []
    eps = epsilon_start
    alp = alpha_input
    # States seen so far in the whole run (not reset per episode); a set keeps
    # the per-step novelty lookup O(1).
    visited_states = set()
    threshold_reported = False

    for ep in range(episodes):
        tot_reward, steps = 0, 0

        # Reset environment and discretize the initial observation
        obs, _ = envt.reset()
        state = discrete_states(obs)
        # Use the caller's policy and generator here as well, so the whole run
        # is governed by the seed behind `rg`.
        action = choose_action(Q, state, eps, rg)
        done = False
        truncated = False
        while not done and not truncated:
            obs_, reward, done, truncated, _ = envt.step(action)
            position, velocity = obs_
            state_next = discrete_states(obs_)

            # --- reward shaping (affects the learning target only) ---
            rewards = reward
            # NOTE(review): the velocity bins above span only +/-0.07, so the
            # 0.5 and 0.6 thresholds below look unreachable - confirm whether
            # 0.05 / 0.06 were intended. Left unchanged to avoid guessing.
            if position > -0.4 and velocity > 0.5:
                rewards = rewards + position + 15 * velocity
            #case1
            if position > 0.4 and velocity > 0.6:
                rewards = rewards + 0.7
            # Small bonus the first time a discretized state is ever reached.
            if state_next not in visited_states:
                visited_states.add(state_next)
                rewards += 0.1

            action_next = choose_action(Q, state_next, eps, rg)
            # A terminated episode has no future return, so do not bootstrap
            # from Q(s', a'); a merely truncated episode still bootstraps.
            next_value = 0.0 if done else Q[state_next][action_next]
            # update equation
            Q[state][action] += alp * (rewards + gamma * next_value - Q[state][action])

            tot_reward += reward
            steps += 1
            state, action = state_next, action_next
            eps = max(epsilon_input, decay * eps)

        episode_rewards[ep] = tot_reward
        steps_to_completion[ep] = steps
        if ep >= 99:
            # Window covers the current episode and the 99 before it.
            mean_score_l100.append(np.mean(episode_rewards[ep-99:ep+1]))
        if ep >= 499 and not threshold_reported:
            mean_score = np.mean(episode_rewards[ep-499:ep+1])
            if mean_score >= -140:
                # Report once instead of on every subsequent episode.
                print("Avg score above -140 for last 500 episode")
                threshold_reported = True

    mean_final = np.mean(episode_rewards[-1000:])
    return Q, episode_rewards, steps_to_completion, mean_final, mean_score_l100

In [None]:
#Test Run
epsilon_input=0.06
alpha_input=0.13

rg = np.random.RandomState(40)
Q, rewards, steps, avg_reward, rewards_100 = sarsa(envt, Q, epsilon_input, alpha_input, rg)
print(f'avg_reward for last 1000 episode is {avg_reward}')

In [None]:
# --- Experiment configuration -------------------------------------------------
# Each row of epsilon_alpha_tuning is one (epsilon, alpha) pair to evaluate;
# every pair is run num_seeds times.
num_seeds = 5
epsilon_alpha_tuning = np.array([
    [0.03, 0.1],
    [0.01, 0.15],
    [0.06, 0.08],
])

# --- Result containers --------------------------------------------------------
# Lists are appended to once per experiment, in experiment order.
all_rewards, all_steps = [], []
mean_rewards_per_epsilon, std_rewards_per_epsilon = [], []
mean_rewards_per_epsilon_100, std_rewards_per_epsilon_100 = [], []

# Dictionaries are keyed by experiment index.
seed_rewards_, seed_rewards_100_ = {}, {}
avg_reward_exp_ = {}
Q_stored_ = {}
steps_stored_ = {}
mean_seed_rewards, mean_seed_rewards_100 = {}, {}
regret = {}

# --- Plot appearance ----------------------------------------------------------
# Order matters: apply the matplotlib style sheet, then the seaborn style, and
# only then the rcParams overrides so they are not reset.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_style("whitegrid")
plt.rcParams.update({
    'grid.alpha': 0.5,
    'grid.linestyle': '--',
    'figure.figsize': (13, 6),
})

# One plasma-colormap color per experiment, reused across all figures.
colors = plt.cm.plasma(np.linspace(0, 1, len(epsilon_alpha_tuning)))

# Run every (epsilon, alpha) pair over num_seeds seeds, collect the results,
# report summary statistics and draw one figure per pair.
for exp_idx, exp in enumerate(epsilon_alpha_tuning):  # Use enumerate for safer indexing
    eps, alpha = exp
    print(f"\nStarting Experiment {exp_idx+1} with ε={exp[0]}, α={exp[1]}")

    # Initialize lists to store results for each seed in this experiment
    seed_rewards = []
    avg_reward_exp=[]
    Q_stored = []
    steps_stored = []
    seed_rewards_100 = []

    for seed in range(num_seeds):
        print(f"  Running seed {seed+1}")

        # Create the seeded generator first so the Q-table initialisation is
        # reproducible too (drawing from the global np.random made otherwise
        # identical seeds start from different tables).
        rg = np.random.RandomState(seed+40)

        # Fresh Q-table per run: 37 x 26 covers every index pair that
        # discrete_states() can return.
        Q = {}
        for i in range(37):
            for j in range(26):
                Q[(i, j)] = rg.uniform(-1, 1, size=3)

        # Run SARSA with the current seed. exp[0] is handed to sarsa as the
        # epsilon floor (its `epsilon_input` argument), despite the name here.
        epsilon_start = exp[0]
        alpha_input = exp[1]
        Q, rewards, steps, avg_last_1000, rewards_100 = sarsa(envt, Q, epsilon_start, alpha_input, rg)

        seed_rewards.append(rewards)
        avg_reward_exp.append(avg_last_1000)
        Q_stored.append(Q)
        steps_stored.append(steps)
        seed_rewards_100.append(rewards_100)

    # Convert to numpy arrays for easier calculations
    seed_rewards_[exp_idx] = np.array(seed_rewards)  # shape: (num_seeds, num_episodes)
    avg_reward_exp_[exp_idx] = np.array(avg_reward_exp)
    Q_stored_[exp_idx] = np.array(Q_stored)
    steps_stored_[exp_idx] = np.array(steps_stored)
    seed_rewards_100_[exp_idx] = np.array(seed_rewards_100)

    mean_avg = np.mean(avg_reward_exp)
    print(f'mean accross 5 seeds of last 1000 episode is {mean_avg}')

    # Mean / std across seeds of the raw per-episode rewards
    mean_seed_rewards[exp_idx] = np.mean(seed_rewards_[exp_idx], axis=0)
    mean_rewards_per_epsilon.append(mean_seed_rewards[exp_idx])
    std_rewards = np.std(seed_rewards_[exp_idx], axis=0)
    std_rewards_per_epsilon.append(std_rewards)

    # Mean / std across seeds of the 100-episode moving-average rewards
    mean_seed_rewards_100[exp_idx] = np.mean(seed_rewards_100_[exp_idx], axis=0)
    mean_rewards_per_epsilon_100.append(mean_seed_rewards_100[exp_idx])
    std_rewards_100 = np.std(seed_rewards_100_[exp_idx], axis=0)
    std_rewards_per_epsilon_100.append(std_rewards_100)

    # Regret: summed shortfall of the seed-averaged reward against -110
    regret[exp_idx]=np.sum(-110-np.array(mean_seed_rewards[exp_idx]))
    print(f'Calculated Regret for epsilon ={eps} & alpha ={alpha} = {regret[exp_idx]}')

    ##single plot: moving-average curve with a +/- one std band
    plt.figure()
    plt.plot(mean_seed_rewards_100[exp_idx],label=f'ε={eps}, α={alpha}',color=colors[exp_idx])
    plt.fill_between(range(len(mean_seed_rewards_100[exp_idx])),
                     mean_seed_rewards_100[exp_idx] - std_rewards_100,
                     mean_seed_rewards_100[exp_idx] + std_rewards_100,color=colors[exp_idx],
                     alpha=0.1, linewidth=0)

    # Set consistent axes and labels
    plt.ylim(-200, -110)  # Fixed y-axis range
    plt.title(f'SARSA Performance Moutain car means reward across 5 seeds (moving avg of 100 used) (ε={eps}, α={alpha}, {num_seeds} seeds)')
    plt.xlabel('Episode')
    plt.ylabel('Mean Reward')

    # Standardized legend and grid
    plt.legend(bbox_to_anchor=(1.05, 1))
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Comparison plot: overlay the seed-averaged 100-episode moving-average curve
# (with a +/- one std band) of every (epsilon, alpha) experiment in one figure.
plt.figure()

position_=epsilon_alpha_tuning.shape[0]  # number of experiments

# shape[0] is an int, so it must be wrapped in range() to be iterated.
for g in range(position_):
  eps, alpha = epsilon_alpha_tuning[g]
  mean_rewards=mean_rewards_per_epsilon_100[g]
  std_rewards=std_rewards_per_epsilon_100[g]
  colorss=colors[g]
  # Label with both hyperparameters so curves stay distinguishable even if two
  # experiments share the same epsilon (matches the per-experiment figures).
  plt.plot(mean_rewards, label=f'ε={eps}, α={alpha}', color=colorss, linewidth=1)
  plt.fill_between(range(len(mean_rewards)),
                   mean_rewards - std_rewards,
                   mean_rewards + std_rewards,
                   color=colorss, alpha=0.1, linewidth=0)  # Light transparency

plt.ylim(-200, -110)  # Fixed y-axis range
plt.title(f'SARSA Performance MountainCar V0: Hyperparameters (epsilon & alpha)')
plt.xlabel('Episode')
plt.ylabel('Mean Reward')

# Standardized legend and grid
plt.legend(bbox_to_anchor=(1.05, 1))
plt.grid(True)
plt.tight_layout()
plt.show()
