# Expected SARSA TD parametrized

In [22]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Tuple
import copy
import numpy as np
import random, json, csv, os

In [23]:
train = True
directory_name = 'Expected_SARSA_TD1_test1'

In [24]:
ACTIONS = [ "none",
            "attack",
            "scolding",
            "intimidate",
            "grudge",
            "sharing_happiness",
            "happy_person",
            "satisfaction",
            "sharing_fear",
            "running_away",
            "sharing_sadness",
            "disappointment",
            "surprise",
            "disbelief",
            "astonishment"
            ]

            
            
EMOTIONS = ["none",
            "fear",
            "joy",
            "surprise",
            "sadness",
            "anger"
            ]

EMOTION_PER_ACTION = { "none": "none",
            "attack" : "anger",
            "scolding": "anger",
            "intimidate": "anger",
            "grudge": "anger",
            "sharing_happiness": "joy",
            "happy_person": "joy",
            "satisfaction": "joy",
            "sharing_fear": "fear",
            "running_away": "fear",
            "sharing_sadness": "sadness",
            "disappointment": "sadness",
            "surprise": "surprise",
            "disbelief": "surprise",
            "astonishment": "surprise"
}

In [25]:
SCENIC_ACTION_SIZE = len(ACTIONS)
EMOTIONAL_REACTION_SIZE = len(EMOTIONS)

MAXIMUM_SKETCH_LENGTH = 15

In [26]:
# returns the code number of an action given by name
def parse_action(action):
    return ACTIONS.index(action)

#returns the action associated to a code
def unparse_action(code):
    return ACTIONS[code]

# returns the code number of an action given by name
def parse_emotion(action):
    return EMOTIONS.index(action)

#returns the action associated to a code
def unparse_emotion(code):
    return EMOTIONS[code]

def convert_action_into_emo(action):
    return EMOTION_PER_ACTION[action]

In [27]:
class ImprobotEnv(Env):

    def __init__(self, actor_space, robot_space, sketch_lenght, env_matrix, P0):

        self.action_space = Discrete(len(robot_space))
        # environment conditions space
        self.observation_space = Tuple((Discrete(len(robot_space)), Discrete(len(actor_space)))) # (robot reaction, actor action)
        self.state = () # (last robot reaction, last actor action)
        self.initial_action_P = P0
        self.sketch_length = sketch_lenght
        self.counter = 0
        self.P = env_matrix # {action: [probability, actor_action, reward]}
        self.q_table = np.zeros([self.observation_space[0].n, self.observation_space[1].n, self.action_space.n])
    
    def render(self):
        print(f"Last actor scenic action: {self.state[1]}")
        print(f"Last robot reaction: {self.state[0]}")

    def step(self, action):
        possible_next_states = self.P[action]
        probabilities = []

        for elem in possible_next_states:
            probabilities.append(elem[0])

        #print(probabilities)
        #print(possible_next_states)
        new_observation_idx = np.random.choice(len(possible_next_states), p=probabilities)
        
        new_observation = possible_next_states[new_observation_idx]
        # update state
        new_reaction = action
        new_actor_action = new_observation[1]
        self.state = tuple([new_reaction, new_actor_action])
        reward = new_observation[2]

        #self.counter += 1

        if self.counter == self.sketch_length:
            done = True
        else:
            done = False

        return self.state, reward, done

    def reset(self):
        actor_action = np.random.choice(SCENIC_ACTION_SIZE,p=self.initial_action_P)
        #print(f"initial action: {actor_action}")
        self.state = (0, actor_action) # (none, none)
        self.counter = 0
        return self.state


In [28]:
# initial action probability distribution
equal_prob = 1/(SCENIC_ACTION_SIZE-1)
P0 = np.zeros(15)
P0[1:] = np.full(14, equal_prob)

# Expected-SARSA

In [29]:
def expected_sarsa_train(alpha, gamma, epsilon, TD_step, min_q_diff, env):
    # mask in order to non choose action none during the training
    mask = np.ones(env.observation_space[0].n, dtype=np.int8)
    mask[0] = 0

    epochs_number = 100000

    for i in range(1, epochs_number):
        q_old = copy.deepcopy(env.q_table)
        state = env.reset()

        epochs, reward = 0, 0
        done = False
        
        epsilon = -0.9/50000 * epochs_number + 1 if epochs_number < 50000 else 0.1

        while not done:

            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample(mask=mask) # Explore action space
                #print(f"random action: {action}")
            else:
                #print(f"here the vector to choose: {q_table[state[0]][state[1]][1:]}")
                action = np.argmax(env.q_table[state[0]][state[1]][1:]) + 1 # Exploit learned values
                #print(f"argmax action: {action}")
            initial_action = action
            old_value = env.q_table[state[0]][state[1]][initial_action]

            G = 0
            rewds = []
            gamma_arr = []
            states = []
            old_value = env.q_table[state[0]][state[1]][action]
            for j in range(TD_step + 1):
                if env.counter + (j) < env.sketch_length:
                    next_state, reward, _ = env.step(action)
                    states.append(next_state)
                    rewds.append(reward)
                    gamma_arr.append(gamma**(j))
                    if random.uniform(0, 1) < epsilon:
                        action = env.action_space.sample(mask=mask) # Explore action space
                        #print(f"random action: {action}")
                    else:
                        #print(f"here the vector to choose: {q_table[state[0]][state[1]][1:]}")
                        action = np.argmax(env.q_table[next_state[0]][next_state[1]][1:]) + 1 # Exploit learned values
                        #print(f"argmax action: {action}")
                else:
                    break

            if env.counter + (j) < env.sketch_length:    
                probs = []
                q_values = env.q_table[states[-1][0]][states[-1][1]][1:]
                #if random.uniform(0, 1) < epsilon:
                #    action = env.action_space.sample(mask=mask) # Explore action space
                #    probs = np.ones_like(q_values)*epsilon / env.action_space.n
                #else:
                    
                probs = np.ones_like(q_values)  / len(q_values)
                best_action = np.argmax(q_values)
                
                action = np.random.choice(np.arange(len(q_values)), p=probs) + 1 # Choose action with respect to probabilities

                next_state, reward, _ = env.step(action)
            
                next_values = env.q_table[next_state[0]][next_state[1]][1:]
                expected_next_value = np.dot(next_values, probs)
                rewds.append(expected_next_value)
                gamma_arr.append(gamma**(j+1))

            G = np.multiply(rewds,gamma)
            
            new_value = (1 - alpha) * old_value + np.sum(alpha * np.array(G))
            
            env.q_table[state[0]][state[1]][initial_action] = new_value

            state = states[0]
            
            epochs += 1

            env.counter += 1

            if env.counter == env.sketch_length:
                done = True
            else:
                done = False
            
        if i % 100 == 0:
            print(f"Episode: {i}")
            print(f"action choosen: {action}")
            print(f"next_state: {next_state}")
            print(f"q_value updated: {env.q_table[state[0]][state[1]][action]}")
        
    
    print("Training finished.\n")


In Expected SARSA, the algorithm computes the expected value of the action-value function over all possible actions at a given state. The expected value is computed as a weighted average of the action-values, where the weights are the probabilities of selecting each action according to the current policy.

The probs variable computes the probabilities of selecting each action according to the current policy (i.e., the epsilon-greedy policy). The epsilon-greedy policy selects the greedy action with probability 1 - epsilon and a non-greedy action (i.e., a random action) with probability epsilon.

When computing the expected value of the action-value function, we need to weight the action-values by their respective probabilities. Multiplying the action-values by the probs variable gives us the weighted action-values, which are then summed to compute the expected value.

In [30]:
def policy_eval(env):
    total_epochs = 0
    total_rewards = 0
    episodes = 100

    for _ in range(episodes):
        state = env.reset()
        epochs, reward = 0, 0
        
        done = False
        
        while not done:
            action = np.argmax(env.q_table[state[0]][state[1]][1:]) + 1 # Exploit learned values
            state, reward, _ = env.step(action)

            total_rewards += reward

            epochs += 1

            env.counter += 1

            if env.counter == env.sketch_length:
                done = True
            else:
                done = False

        total_epochs += epochs

    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {total_epochs / episodes}")
    print(f"Average rewards per episode: {total_rewards / episodes}")
    return total_rewards / episodes

In [31]:
def save_q_table(env,table_name, dir_name):
    reshaped_Q = env.q_table.reshape((env.q_table.shape[0]*env.q_table.shape[1]),env.q_table.shape[2])
    header = ["S(robot reaction, actor action)", "none", "attack", "scolding", "intimidate", "grudge", "sharing_happiness",
          "happy_person", "satisfaction", "sharing_fear", "running_away", "sharing_sadness", "disappointment",
          "surprise", "disbelief", "astonishment"] if env.observation_space[0].n == 15 else ["S(robot reaction, actor action)", "none", "fear", "joy", "surprise", "sadness", "anger"]
    reaction_unparser = unparse_action if env.observation_space[0].n == 15 else unparse_emotion
    
    data = []
    
    for i in range(reshaped_Q.shape[0]):
        if int(i/env.observation_space[1].n) == 0:
            reaction = reaction_unparser(0)
            row = [(reaction,unparse_action(i))]
        else:
            reaction = int(i/env.observation_space[1].n)
            #print(f"here is the reaction: {reaction}")
            #print(f"here is the i: {i}")
            reaction = reaction_unparser(reaction)
            row = [(reaction, unparse_action((i%env.observation_space[1].n)))]
        row.extend(reshaped_Q[i])
        data.append(row)

    # Scrittura del file CSV
    if not os.path.exists('trials/' + dir_name):
        os.makedirs('trials/' + dir_name)

    with open('trials/' + dir_name + '/' + table_name + '.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(data)

# Action-per-Action learning Test

### Load P action per action matrix

In [32]:
with open('P_action_per_action.json') as f:
    P_aa = json.load(f)

print(P_aa)

# convert keys into numbers
new_P = {}
for k in P_aa.keys():
    new_P[int(k)]=P_aa[k]

P_aa = new_P
print(P_aa)

{'1': [[0.125, 3, 2], [0.125, 5, 0], [0.125, 6, -4], [0.125, 7, 0], [0.125, 8, 4], [0.25, 10, -1], [0.125, 11, 3]], '2': [[0.14285714285714285, 1, 1], [0.14285714285714285, 2, 4], [0.14285714285714285, 8, 0], [0.2857142857142857, 10, 13], [0.14285714285714285, 11, 4], [0.14285714285714285, 14, 2]], '3': [[0.25, 3, 5], [0.08333333333333333, 4, 0], [0.08333333333333333, 5, 12], [0.16666666666666666, 6, 6], [0.08333333333333333, 7, 0], [0.08333333333333333, 8, 2], [0.25, 10, 9]], '4': [[0.125, 3, 0], [0.125, 8, 1], [0.5, 10, 2], [0.25, 11, 1]], '5': [[0.1, 2, 1], [0.1, 5, 1], [0.2, 6, 8], [0.2, 8, 11], [0.2, 10, 15], [0.1, 11, 2], [0.1, 13, 6]], '6': [[0.08333333333333333, 2, 1], [0.08333333333333333, 3, -2], [0.08333333333333333, 5, 0], [0.3333333333333333, 6, 26], [0.08333333333333333, 8, 0], [0.16666666666666666, 11, -2], [0.08333333333333333, 12, -1], [0.08333333333333333, 14, 3]], '7': [[0.25, 3, 6], [0.125, 7, 0], [0.375, 10, 4], [0.125, 11, 3], [0.125, 12, -1]], '8': [[0.2, 3, 6], 

### Create action-per-action environment

In [33]:
env_aa = ImprobotEnv(actor_space=ACTIONS,robot_space=ACTIONS,sketch_lenght=15,env_matrix=P_aa, P0=P0)

## Q-Learning off-policy TD(1) Training

In [34]:
# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1
min_difference = 0.000001 # 10^-7

if train:
    expected_sarsa_train(alpha=alpha,gamma=gamma,epsilon=epsilon, TD_step=1, min_q_diff=min_difference,env=env_aa)

Episode: 100
action choosen: 5
next_state: (1, 3)
q_value updated: 5.46307877886765
Episode: 200
action choosen: 2
next_state: (1, 10)
q_value updated: 5.3267487757595875
Episode: 300
action choosen: 1
next_state: (6, 3)
q_value updated: 1.443979824530272
Episode: 400
action choosen: 5
next_state: (2, 10)
q_value updated: 8.685725278884231
Episode: 500
action choosen: 5
next_state: (2, 10)
q_value updated: 9.576717871993019
Episode: 600
action choosen: 3
next_state: (5, 13)
q_value updated: 3.0746244794571824
Episode: 700
action choosen: 2
next_state: (2, 11)
q_value updated: 5.260807847536715
Episode: 800
action choosen: 2
next_state: (2, 2)
q_value updated: 8.36145443408423
Episode: 900
action choosen: 2
next_state: (2, 8)
q_value updated: 6.825562437454836
Episode: 1000
action choosen: 2
next_state: (5, 5)
q_value updated: 9.545068383842064
Episode: 1100
action choosen: 5
next_state: (5, 8)
q_value updated: 8.616746517426904
Episode: 1200
action choosen: 1
next_state: (10, 2)
q_valu

## Q-Learning performance evaluation

In [35]:
avg_reward_aa = policy_eval(env=env_aa)

Results after 100 episodes:
Average timesteps per episode: 15.0
Average rewards per episode: 103.55


## Q_table visualization

In [36]:
save_q_table(env_aa,table_name='q_table_aa', dir_name=directory_name)

# Action-per-Emotion Model

### Load P action per emotion matrix

In [37]:
with open('P_action_per_emotion.json') as f:
    P_ae = json.load(f)

print(P_ae)

# convert keys into numbers
new_P = {}
for k in P_ae.keys():
    new_P[int(k)]=P_ae[k]

P_ae = new_P
print(P_ae)

{'1': [[0.15384615384615385, 3, 6], [0.07692307692307693, 5, 4], [0.07692307692307693, 6, 4], [0.07692307692307693, 7, 0], [0.3076923076923077, 8, 7], [0.3076923076923077, 10, 2]], '2': [[0.06666666666666667, 2, 2], [0.1, 3, 4], [0.06666666666666667, 5, 1], [0.2, 6, 34], [0.03333333333333333, 7, 0], [0.1, 8, 11], [0.16666666666666666, 10, 19], [0.13333333333333333, 11, 3], [0.06666666666666667, 12, -2], [0.03333333333333333, 13, 6], [0.03333333333333333, 14, 3]], '3': [[0.019230769230769232, 1, -1], [0.07692307692307693, 2, 6], [0.07692307692307693, 3, 2], [0.038461538461538464, 4, 4], [0.07692307692307693, 5, 1], [0.17307692307692307, 6, 8], [0.07692307692307693, 7, 6], [0.038461538461538464, 8, 3], [0.23076923076923078, 10, 6], [0.11538461538461539, 11, 6], [0.019230769230769232, 12, 4], [0.057692307692307696, 14, 3]], '4': [[0.041666666666666664, 2, 3], [0.08333333333333333, 3, 1], [0.08333333333333333, 4, 0], [0.041666666666666664, 5, -1], [0.16666666666666666, 6, 0], [0.125, 8, -2

### Create action-per-emotion environment

In [38]:
env_ae = ImprobotEnv(actor_space=ACTIONS,robot_space=EMOTIONS,sketch_lenght=15,env_matrix=P_ae, P0=P0)

## Q-Learning off-policy TD(1) Training

In [39]:
# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1
min_difference = 0.000001

if train:
    expected_sarsa_train(alpha=alpha,gamma=gamma,epsilon=epsilon, TD_step=1,min_q_diff=min_difference,env=env_ae)

Episode: 100
action choosen: 5
next_state: (2, 8)
q_value updated: 2.4332088437160997
Episode: 200
action choosen: 1
next_state: (4, 8)
q_value updated: 3.687195926375117
Episode: 300
action choosen: 5
next_state: (1, 10)
q_value updated: 8.924876038509007
Episode: 400
action choosen: 1
next_state: (2, 2)
q_value updated: 11.92976641454422
Episode: 500
action choosen: 1
next_state: (2, 10)
q_value updated: 9.595018319452176
Episode: 600
action choosen: 1
next_state: (2, 2)
q_value updated: 10.702405021542045
Episode: 700
action choosen: 1
next_state: (2, 6)
q_value updated: 10.991699087621456
Episode: 800
action choosen: 3
next_state: (1, 3)
q_value updated: 5.586027472386492
Episode: 900
action choosen: 2
next_state: (5, 3)
q_value updated: 10.154043965029464
Episode: 1000
action choosen: 1
next_state: (2, 2)
q_value updated: 11.52021409551727
Episode: 1100
action choosen: 2
next_state: (2, 6)
q_value updated: 15.351546652338799
Episode: 1200
action choosen: 2
next_state: (1, 10)
q_va

## Q-Learning performance evaluation

In [40]:
avg_reward_ae = policy_eval(env=env_ae)

Results after 100 episodes:
Average timesteps per episode: 15.0
Average rewards per episode: 161.35


## Q_table visualization

In [41]:
save_q_table(env_ae,table_name='q_table_ae', dir_name=directory_name)

In [42]:
#header = ["name", "avg_reward_aa", "avg_reward_ae"]
data = [directory_name, avg_reward_aa, avg_reward_ae]
with open('trials/summary.csv', mode='a', newline='') as file:
    writer = csv.writer(file)
    #writer.writerow(header)
    writer.writerow(data)