In [97]:
from itertools import product
import numpy as np
import random

In [98]:
user_states = [-1, 0, 1]
items = [-1, 1]
slates = list(product(items, repeat=4))
num_actions=len(slates)

In [99]:
def reward(u,r):
    rew=u*r
    return rew

In [100]:
def transition_function(current_state, action):
    # Update the new state based on the action
    if action == None:
        new_state = 0
    else:
        new_state = action
    return new_state

In [101]:
prob_i_u = np.zeros((len(user_states), len(items)))
for i in range(len(user_states)):
    for j in range(len(items)):
        if i == j:
            prob_i_u[i, j] = 0.6
        else:
            prob_i_u[i, j] = random.uniform(0, 0.5)

In [102]:
Q_item = np.zeros((len(user_states), len(items)))
Q_slate = np.zeros((len(user_states), len(slates)))

In [103]:
def update_q_value(Q_item,Q_slate, current_state, selected_item, slate, reward_value, next_state,alpha, gamma):
    state=user_states.index(current_state)
    action=slates.index(slate)
    if selected_item!=0:
        selected_item_index=items.index(selected_item)
        current_q_value=Q_item[state,selected_item_index]
        next_state=user_states.index(next_state)
        max_next_q_value=np.max(Q_item[next_state,:])
        new_q_value=current_q_value+alpha*(reward_value+gamma*max_next_q_value-current_q_value)
        Q_item[state,selected_item_index]=new_q_value
    
    for i in range(len(slate)):
        item_action=items.index(slate[i])
        Q_slate[state,action]+=Q_item[state,item_action]*prob_i_u[state,item_action]

    return Q_item, Q_slate

In [104]:
def epsilon_greedy_action(Q_slate, current_state, epsilon):
    state=user_states.index(current_state)
    if np.random.rand() < epsilon:
        # Choose a random action

        i = np.random.choice(num_actions)
        action = slates[i]
    else:
        # Choose the action with the highest Q-value
        i = np.argmax(Q_slate[state])
        action = slates[i]
    return action

In [105]:
def softmax(x):
    exp_x = np.exp(x)
    softmax_values = exp_x / np.sum(exp_x)
    return softmax_values

In [106]:
def run_episodes(Q_item,Q_slate, transition_function, reward, update_q_value, num_episodes, alpha, gamma, epsilon):
    for episode in range(num_episodes):
        # Randomly choose an initial state
        current_state = np.random.choice(user_states)
        
        while True:
            # Choose an action using epsilon-greedy strategy
            selected_slate = epsilon_greedy_action(Q_slate, current_state, epsilon)
            prob_selection=[]
            for i in range(len(selected_slate)):
                prob_selection.append(prob_i_u[user_states.index(current_state),items.index(selected_slate[i])])
            prob_selection.append(0.2)
            prob_selection=softmax(prob_selection)
            selected_index = np.random.choice(len(prob_selection), p=prob_selection)
            if selected_index<= len(selected_slate)-1:
                selected_item=selected_slate[selected_index]
            else:
                selected_item=0 #None
            
            
            next_state = transition_function(current_state, selected_item)
            
            # Assume a reward for the transition (you should replace this with the actual reward from your environment)
            reward_value = reward(current_state,selected_item) # Replace with the actual reward
            
            # Update the Q-value based on the transition
            Q_item,Q_slate = update_q_value(Q_item,Q_slate, current_state,selected_item,selected_slate, reward_value, next_state,alpha, gamma)
            
            # Move to the next state
            current_state = next_state
            
            # Terminate the episode if a terminal state is reached (you should replace this with your termination condition)
            if current_state in [-1, 1]:
                break

In [107]:
num_episodes = 5000
alpha = 0.1
gamma = 0.9
epsilon = 0.1

run_episodes(Q_item,Q_slate, transition_function, reward, update_q_value, num_episodes, alpha, gamma, epsilon)


In [108]:
first_max = np.argmax(Q_slate, axis=1)
second_max= np.argsort(Q_slate, axis=1)[:,-2]
# Print the result
for i in range(len(first_max)):
    print(first_max[i],second_max[i])
    print(f"For user state {user_states[i]}: optimal slate is {slates[first_max[i]]} and {slates[second_max[i]]} ")

0 10
For user state -1: optimal slate is (-1, -1, -1, -1) and (1, -1, 1, -1) 
4 7
For user state 0: optimal slate is (-1, 1, -1, -1) and (-1, 1, 1, 1) 
11 15
For user state 1: optimal slate is (1, -1, 1, 1) and (1, 1, 1, 1) 


In [109]:
Q_slate

array([[3.40734227e+04, 2.11474793e+02, 2.40398698e+02, 1.65709909e+02,
        1.91114850e+02, 1.76178796e+02, 1.52080637e+02, 8.43890435e+01,
        1.67178553e+02, 1.40993122e+02, 2.56452337e+02, 1.04495324e+02,
        2.01043256e+02, 1.14349522e+02, 7.71175986e+01, 4.09108976e+01],
       [2.19980087e+02, 2.57114794e+02, 2.32703441e+02, 1.48111158e+02,
        3.08970647e+04, 2.24953735e+02, 3.12545093e+02, 4.19739941e+02,
        2.95330700e+02, 2.49570273e+02, 2.85267109e+02, 3.65991398e+02,
        1.23573753e+02, 2.01393826e+02, 3.18940807e+02, 3.26545613e+02],
       [1.17676345e+01, 5.28654424e+01, 3.63354302e+01, 9.34517637e+01,
        5.96568259e+01, 1.22049569e+02, 7.68647489e+01, 9.68577743e+01,
        6.57254789e+01, 8.48527591e+01, 5.36585065e+01, 1.62438912e+04,
        1.07102987e+02, 9.79745791e+01, 1.12027817e+02, 1.81067819e+02]])

In [110]:
Q_item

array([[9.99999117, 7.97966925],
       [8.99999158, 8.9996102 ],
       [7.99997791, 9.99965739]])

In [111]:
prob_i_u

array([[0.6       , 0.21449518],
       [0.32792112, 0.6       ],
       [0.04852569, 0.40875872]])