In [18]:
import torch
from itertools import product
import numpy as np

In [19]:
# Discrete user states tracked by the environment.
user_states = [-1, 0, 1]
# Every recommendable item is encoded as either -1 or +1.
items = [-1, 1]
# A slate is an ordered 5-tuple of items; enumerate all 2**5 = 32 of them.
slates = [*product(items, repeat=5)]
# Each slate is one discrete action, addressed by its position in `slates`.
num_actions = len(slates)

In [20]:
def reward(u,r):
    """Return the reward of showing slate ``r`` to a user in state ``u``.

    Each item contributes ``u * item``, so the total is the sum of those
    products. A user in state 0 earns nothing from any item; in that case the
    result is the float ``0.0`` for a non-empty slate and the int ``0`` for an
    empty one.
    """
    if u == 0:
        # Every item adds 0.0, which turns the integer accumulator into a float.
        return 0.0 if len(r) > 0 else 0
    return sum(u * item for item in r)
        

In [21]:
def transition_function(current_state, action):
    """Return the state reached after serving slate ``action`` in ``current_state``.

    The items of the slate are summed and added to the current state; the
    result is then clamped to the closed interval [-1, 1].
    """
    shifted = current_state + sum(action)

    # Clamp to the valid range of states.
    if shifted > 1:
        return 1
    if shifted < -1:
        return -1
    return shifted

In [22]:
# Tabular action values: one row per user state, one column per slate, all zero initially.
Q_table = np.zeros(shape=(len(user_states), len(slates)), dtype=float)

In [23]:
def epsilon_greedy_action(Q_table, current_state, epsilon):
    """Pick a slate for ``current_state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a slate is drawn uniformly at random;
    otherwise the slate with the largest Q-value in the state's row is used.
    Raises ``ValueError`` if ``current_state`` is not in ``user_states``.
    """
    row = user_states.index(current_state)
    explore = np.random.rand() < epsilon
    # Exploration draws a random slate index; exploitation takes the row's arg-max.
    slate_idx = np.random.choice(num_actions) if explore else np.argmax(Q_table[row])
    return slates[slate_idx]

In [24]:
def randSlate(s=0,otherargs=[]):
    """Uniform-random policy: return a randomly chosen slate.

    Both arguments are ignored; they exist so the function can be called with
    the same ``(state, table)`` arguments as the other policies.
    """
    return slates[np.random.choice(num_actions)]

In [25]:
def qLearned(s=0,qtable=[]):
    """Greedy policy: return the slate with the highest Q-value for state ``s``.

    ``qtable`` is indexed as ``qtable[row, :]``, so it must support
    two-dimensional indexing (the default empty list does not).
    """
    row = user_states.index(s)
    best_idx = np.argmax(qtable[row, :])
    return slates[best_idx]

In [26]:
def run_episodes(Q_table,serveSlate=randSlate, num_episodes=1000, session_length=10, transition_function=transition_function, reward=reward,epsilon=0.1):
    """Roll out a slate-serving policy and log every transition.

    No learning happens here: the Q-table is only handed to the policy.

    Parameters
    ----------
    Q_table
        Forwarded unchanged as the second argument of ``serveSlate``; it is
        neither read nor modified by this function.
    serveSlate
        Policy callable ``(state, Q_table) -> slate``.
    num_episodes
        Number of independent sessions to simulate.
    session_length
        Number of steps in each session.
    transition_function
        Callable ``(state, slate) -> next_state``.
    reward
        Callable ``(state, slate) -> reward``.
    epsilon
        Accepted for signature compatibility only; it is not used.

    Returns
    -------
    tuple
        ``(cumr, training)`` where ``cumr`` is the reward summed over every
        step of every episode and ``training`` is a list of
        ``[state, slate, next_state, reward]`` records in visiting order.
    """
    training = []
    cumr = 0
    for _ in range(num_episodes):
        # Each session starts from a uniformly random user state.
        current_state = np.random.choice(user_states)
        sess_reward = 0

        for _ in range(session_length):
            # Ask the supplied policy for a slate.
            selected_action = serveSlate(current_state, Q_table)
            next_state = transition_function(current_state, selected_action)

            # The reward depends on the state *before* the transition.
            reward_value = reward(current_state, selected_action)
            sess_reward += reward_value
            training.append([current_state, selected_action, next_state, reward_value])

            current_state = next_state

        cumr += sess_reward
    return cumr, training


In [27]:
def evaluatePolicy(policy, Qtable =[],numSamples=100, numTrans=2000):
    """Estimate the average cumulative reward obtained by ``policy``.

    Parameters
    ----------
    policy
        Callable ``(state, table) -> slate`` passed to ``run_episodes``.
    Qtable
        Q-table handed to the policy. When it is left at its empty default
        (or is ``None``), the notebook-level ``Q_table`` is used instead.
    numSamples
        Number of independent ``run_episodes`` calls to average over.
    numTrans
        Forwarded to ``run_episodes`` as ``num_episodes``, i.e. the number of
        episodes per sample (each with the default session length).

    Returns
    -------
    The mean of the cumulative rewards over ``numSamples`` runs.
    """
    # The table argument used to be ignored in favour of the global; honour it
    # now, but keep the old behaviour for callers that pass no table.
    table = Q_table if Qtable is None or len(Qtable) == 0 else Qtable

    total = 0
    for _ in range(numSamples):
        cum_r, _transitions = run_episodes(table, policy, numTrans)
        total = total + cum_r

    return total / numSamples

In [28]:
def q_learning(Q_table, training, alpha, gamma, test=False):
    """Apply one tabular Q-learning update per logged transition.

    Parameters
    ----------
    Q_table
        2-D array indexed ``[state_index, slate_index]``; updated in place.
    training
        Iterable of ``(state, slate, next_state, reward)`` records.
    alpha
        Learning rate.
    gamma
        Discount factor applied to the best next-state value.
    test
        When true, every 500th record (starting with the first) the greedy
        policy is evaluated and the per-state greedy slate indices, the record
        counter and the evaluation score are printed.

    Returns
    -------
    The same ``Q_table`` object that was passed in.
    """
    for t, (current_state, slate_action, next_state, reward_value) in enumerate(training):
        # Translate the logged slate and state into table coordinates.
        col = slates.index(slate_action)
        row = user_states.index(current_state)
        old_q = Q_table[row, col]

        # Best value achievable from the successor state.
        next_row = user_states.index(next_state)
        best_next_q = np.max(Q_table[next_row, :])

        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        Q_table[row, col] = old_q + alpha * (reward_value + gamma * best_next_q - old_q)

        if test and t % 500 == 0:
            v = evaluatePolicy(qLearned, Q_table)
            greedy = np.argmax(Q_table, axis=1)
            print(greedy, end='')
            print(t, '\t', v)

    return Q_table

In [29]:
# Collect experience with the uniform-random policy: 100 sessions of 10 steps each.
cum_r, training = run_episodes(
    Q_table,
    randSlate,
    num_episodes=100,
    session_length=10,
    transition_function=transition_function,
    reward=reward,
    epsilon=0.1,
)

In [30]:
# Reward summed over every step of every episode in the random-policy rollout.
cum_r

98.0

In [31]:
# Learn from the logged transitions; q_learning updates Q_table in place and
# returns that same array, so `qtable` and `Q_table` refer to one object.
qtable = q_learning(Q_table, training, alpha=0.01, gamma=0.9, test=True)

[0 0 0]0 	 89999.3
[ 0  3 31]500 	 96657.7


array([ 0, 23, 31])

In [32]:
# Per user state: index of the highest-valued slate and of the runner-up.
first_max = np.argmax(Q_table, axis=1)
second_max = np.argsort(Q_table, axis=1)[:, -2]
# Print the result
for i, (best_idx, runner_up_idx) in enumerate(zip(first_max, second_max)):
    print(best_idx, runner_up_idx)
    print(f"For user state {user_states[i]}: optimal slate is {slates[best_idx]} and {slates[runner_up_idx]} ")

0 8
For user state -1: optimal slate is (-1, -1, -1, -1, -1) and (-1, 1, -1, -1, -1) 
23 1
For user state 0: optimal slate is (1, -1, 1, 1, 1) and (-1, -1, -1, -1, 1) 
31 23
For user state 1: optimal slate is (1, 1, 1, 1, 1) and (1, -1, 1, 1, 1) 


In [33]:
# Show the learned action values (rows follow user_states, columns follow slates).
Q_table

array([[ 0.69591718,  0.22880015,  0.34921306,  0.1050774 ,  0.50424737,
         0.21871536,  0.22819879, -0.12851021,  0.55948606,  0.17656434,
         0.15811515, -0.10352437,  0.14527352, -0.12060689, -0.05796252,
        -0.18539161,  0.38645638,  0.22609747,  0.10990446, -0.0764675 ,
         0.08264216, -0.07569952, -0.09418078, -0.29762767,  0.14472183,
        -0.05698649, -0.15456403, -0.32899376, -0.11158001, -0.41384508,
        -0.30058442, -0.4440331 ],
       [ 0.01587236,  0.03778851,  0.01997979,  0.0325706 ,  0.0232588 ,
         0.03016652,  0.01881125,  0.01937365,  0.01766536,  0.01447553,
         0.02023201,  0.01687905,  0.01228107,  0.02291382,  0.01383133,
         0.02891615,  0.01604562,  0.02101945,  0.01172297,  0.027377  ,
         0.0235266 ,  0.03072435,  0.01507656,  0.04009962,  0.02322299,
         0.03032662,  0.02451829,  0.01825815,  0.02504339,  0.01644053,
         0.02173083,  0.02326127],
       [-0.44728369, -0.26883099, -0.41629955, -0.1033