In [1]:
import numpy as np
from IPython.display import clear_output
import matplotlib.pylab as plt
from time import sleep
from copy import deepcopy
import tensorflow as tf
%matplotlib inline

In [2]:
import gym 
from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, accuracy, SVD
from sklearn.model_selection import train_test_split
from gym.utils import seeding

Interaction = namedtuple('Interaction', ['t', 'uid', 'recs', 'rewards', 'probs', 'best_ps', 'ranks'])

class MovieLens(gym.Env):
    def __init__(self):
        self.embedding_dimension = 3
        self.n_rec = 1
        self.seed(0)
        
        self.df = pd.read_csv('../data/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
        
        self.train_ratings, self.test_ratings = train_test_split(self.df, test_size=0.2, random_state=0)

        reader = Reader(rating_scale=(1, 5))
        train_spr = Dataset.load_from_df(self.train_ratings[['user_id','item_id','rating']],reader).build_full_trainset()
        test_spr = Dataset.load_from_df(self.test_ratings[['user_id','item_id','rating']],reader).build_full_trainset()
        test_spr = test_spr.build_testset()
        
        self.algo = SVD(n_factors=self.embedding_dimension, biased=False)
        self.algo.fit(train_spr)
        
        self.users = self.algo.pu[:40]
        self.items = self.algo.qi[:500]
        self.n_users = len(self.users)
        self.n_items = len(self.items)
        
        self.active_uid = self.np_random.choice(range(self.n_users))
        
        self.bought_items = defaultdict(set)
        
        # logs
        self.steps_count = 0
        self.interactions = []
        
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
            
    def _get_observation(self):
        pos = 0
        self.item_pos2id = {}
        possible_items = []
        
        for i in set(range(self.n_items)) - self.bought_items[self.active_uid]:
            possible_items.append(self.items[i])
            
            self.item_pos2id[pos] = i
            pos += 1
        
        user_repr = self.users[self.active_uid]
        return (np.array(user_repr), np.array(possible_items))
        
    
    def _reward(self, action):
        assert len(action) == self.n_rec
        uid = self.active_uid
        rewards = []
        for a in action:
            iid = self.item_pos2id[a]
            
            r = self.algo.estimate(u=uid, i=iid)
            rewards.append(r)
            
            self.bought_items[uid].add(iid)
        
        self.interactions.append(Interaction(t=self.steps_count,
                                             uid=uid,
                                             recs=action,
                                             rewards=rewards,
                                             probs=None,
                                             best_ps=None,
                                             ranks=None))
        
        return np.sum(rewards)
    
    def _evolve(self):
        
        users_to_play = []
        for i in range(self.n_users):
            if len(self.bought_items[i]) < (self.n_items - self.n_rec):
                users_to_play.append(i)
                
        self.active_uid = self.np_random.choice(users_to_play)
        
    def step(self, action):
        self.steps_count += 1
        reward = self._reward(action)
        
        self._evolve()
        observation = self._get_observation()
        done = None
        info = {'userid' : self.active_uid}
        #print(len(observation[1]))
        return observation, reward, done, info

    def reset(self):
        observation = self._get_observation()
        return observation

    

def compute_rewards_to_go(rewards, gamma):
    rewards_to_go = [rewards[-1]]
    for k in range(2, len(rewards)+1):
        rewards_to_go = [rewards[-k] + gamma * rewards_to_go[0]] + rewards_to_go
    return rewards_to_go

In [3]:
base_env = MovieLens()

In [4]:
env = deepcopy(base_env)

In [5]:
# make rollout 
def make_rollout(env, steps=10, n_rec=4):
    states = []
    chosen_actions = []
    possible_actions = []
    beta_probs = []
    rewards = []
    
    s, A = env.reset()
        
    for i in range(steps):
        N = A.shape[0]
        beta = 1./N
        a = np.random.choice(list(range(N)), size=n_rec)

        states.append(s)
        chosen_actions.append(a)
        possible_actions.append(A)
        beta_probs.append(beta)
        
        (s, A), r, done, info = env.step(a)
        
        rewards.append(r)
        
    return [np.array(x) for x in (states, chosen_actions, possible_actions, beta_probs, rewards)]
        

In [6]:
# rollout

In [7]:
gamma = 0.9
rollout_size = 10

In [8]:
rollout = make_rollout(env, steps=10, n_rec=1)
states, chosen_actions, possible_actions, beta_probs, rewards = rollout
rewards_to_go = compute_rewards_to_go(rewards, gamma)

In [9]:
state_size = 3
action_size = 3

In [10]:
sess = tf.InteractiveSession()

In [11]:
states_ph = tf.placeholder(tf.float32, [1, state_size], name='state')
actions_ph = tf.placeholder(tf.float32, [1, action_size], name='action')
action_ids_ph = tf.placeholder(tf.int64, [1], name='action_id')
possible_actions_ph = tf.placeholder(tf.float32, [None, action_size], name='possible_actions')
reward_to_go_ph = tf.placeholder(tf.float32, name='reward_to_go')
beta_ph = tf.placeholder(tf.float32, name='beta')

In [12]:
contexts = tf.concat([states_ph, actions_ph], axis=1)



context_size = action_size + state_size
s_size = action_size

#observations = tf.expand_dims(contexts, axis=0)
# print(observations.shape)
init_states_ph = tf.placeholder(tf.float32, (1, s_size))
init_states = init_states_ph

#with tf.variable_scope('FCN', reuse=tf.AUTO_REUSE):
    
ua_t = tf.transpose(contexts) # 1x6
s_t = tf.transpose(init_states) # 1x3

W_a = tf.get_variable("W_a", shape=[s_size, context_size],
                      initializer=tf.contrib.layers.xavier_initializer())

W_z = tf.get_variable("W_z", shape=[s_size, context_size],
                      initializer=tf.contrib.layers.xavier_initializer())

W_i = tf.get_variable("W_i", shape=[s_size, context_size],
                      initializer=tf.contrib.layers.xavier_initializer())

b_i = tf.get_variable("b_i", shape=[s_size, 1],
                      initializer=tf.contrib.layers.xavier_initializer())


U_z = tf.get_variable("U_z", shape=[s_size, s_size],
                      initializer=tf.contrib.layers.xavier_initializer())

U_i = tf.get_variable("U_i", shape=[s_size, s_size],
                      initializer=tf.contrib.layers.xavier_initializer())

b_z = tf.get_variable("b_z", shape=[s_size, 1],
                      initializer=tf.contrib.layers.xavier_initializer())


z = tf.sigmoid(tf.matmul(U_z, s_t) + tf.matmul(W_z, ua_t) + b_z) # s_size x 1
i = tf.sigmoid(tf.matmul(U_i, s_t) + tf.matmul(W_i, ua_t) + b_i) # s_size x 1

s_tp1 = z * tf.tanh(s_t) + i * tf.tanh(tf.matmul(W_a, ua_t))

output = final_state = s_tp1

# with tf.variable_scope("rnn"):
#     gru_cell = tf.contrib.rnn.BasicRNNCell(gru_unit_size)

#     output, final_state = tf.nn.dynamic_rnn(gru_cell, inputs=observations, initial_state=init_states)

output = tf.reshape(output, [-1, s_size])

# with tf.variable_scope("softmax"):
#     w_softmax = tf.get_variable("w_softmax", shape=[gru_unit_size, num_actions],
#         initializer=tf.contrib.layers.xavier_initializer())
#     b_softmax = tf.get_variable("b_softmax", shape=[num_actions],
#         initializer=tf.constant_initializer(0))

logit = output#tf.matmul(output, w_softmax) + b_softmax
#return logit, final_state

In [13]:
tf.matmul(U_i, s_t)

<tf.Tensor 'MatMul_5:0' shape=(3, 1) dtype=float32>

In [14]:
tf.matmul(W_i, ua_t) + b_i

<tf.Tensor 'add_5:0' shape=(3, 1) dtype=float32>

In [15]:
scores = tf.transpose(tf.matmul(possible_actions_ph, logit, transpose_b=True))

In [16]:
softmax = tf.math.softmax(scores)
log_softmax = tf.math.log_softmax(scores)

In [17]:
onehot_actions = tf.one_hot(action_ids_ph, depth=tf.shape(possible_actions_ph)[0])

In [18]:
loss = tf.reduce_sum(log_softmax*onehot_actions * tf.stop_gradient(softmax) )#* reward_to_go_ph / beta_ph)

In [19]:
opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)

In [20]:
trainable_variables = tf.trainable_variables()

grad = opt.compute_gradients(loss=loss, var_list=trainable_variables)
max_gradient = 5
grad = [(tf.clip_by_norm(grad, max_gradient), var)
                                  for grad, var in grad]

In [21]:
train_op = opt.apply_gradients(grads_and_vars=grad)

In [22]:
sess.run(tf.global_variables_initializer())

In [23]:
# sess.run(tf.global_variables_initializer())
rollout = make_rollout(env, steps=10, n_rec=1)
states, chosen_actions, possible_actions, beta_probs, rewards = rollout
rewards_to_go = compute_rewards_to_go(rewards, gamma)

curr_state = np.zeros((1, s_size))
grads = []

for t in range(1, rollout_size):
    feed_dict = {
                states_ph : states[t][np.newaxis],
                actions_ph : np.array([possible_actions[i][a] for i, a in enumerate(chosen_actions[t])]),
                action_ids_ph : chosen_actions[t],
                possible_actions_ph : possible_actions[t],
                reward_to_go_ph : rewards_to_go[t],
                beta_ph : beta_probs[t],
                init_states_ph : curr_state,
             }
      
    curr_state, gradient = sess.run([ 
                    # logit, 
                    tf.transpose(final_state), 
                    # log_softmax, 
                    # loss, 
                     grad
    ], 
                 feed_dict)
    
    grads.append(gradient)
    #curr_state = res[1]

    print(sess.run([loss, scores], feed_dict))

    
for g in grads:
    
    grads_dict = {}
    for (grad_tensor, var), (g, v) in zip(grad, g):
        grads_dict[grad_tensor] = v
    
    sess.run(train_op, feed_dict= grads_dict)

[-0.011710793, array([[-0.75300395, -0.94715452, -0.67407501, -0.98677683, -0.77380723,
        -0.870794  , -0.65751749, -0.29757482, -0.90901989, -0.42883751,
        -0.83743119, -1.14625239, -0.61388415, -1.02183771, -0.9642722 ,
        -0.71877116, -0.71428293, -0.88314974, -0.61482865, -0.72276556,
        -0.75079912, -0.51885962, -1.04742324, -0.94349951, -0.9927187 ,
        -0.91277206, -0.75414103, -0.78752196, -0.72001016, -0.75306439,
        -0.51899493, -1.13466275, -0.5505715 , -0.5142867 , -0.63168609,
        -0.61337054, -0.80517399, -0.9661783 , -0.64315462, -0.94426244,
        -0.89162576, -0.64516652, -0.66718876, -0.60275519, -0.69371361,
        -0.63089913, -0.45481986, -0.73092002, -0.7444483 , -0.723598  ,
        -0.45755073, -0.79652715, -0.59108853, -0.76640451, -0.37089556,
        -0.82995808, -0.62333423, -1.27762794, -0.93874538, -0.90302831,
        -0.33519256, -0.71877933, -0.62482226, -0.66643298, -0.95413566,
        -0.83807147, -0.63573384, -0

[-0.010792571, array([[-1.1713537 , -1.38536453, -1.07119823, -1.4202069 , -1.08286917,
        -1.19026113, -0.95412481, -0.44699013, -1.31316817, -0.75065601,
        -1.12445331, -1.74937987, -0.91828477, -1.49306464, -1.22164881,
        -0.91387105, -1.01904142, -1.20503318, -0.78923988, -1.04651964,
        -1.0812465 , -0.66766036, -1.51688087, -1.30586135, -1.39036393,
        -1.25329185, -1.10767126, -0.95475799, -0.9704513 , -1.05687571,
        -0.62951463, -1.53744936, -0.81867719, -0.73492134, -0.85124356,
        -0.91813743, -1.05692208, -1.38243985, -1.04640794, -1.31494236,
        -1.5457226 , -0.87740809, -0.9103992 , -0.85377747, -0.95586145,
        -1.05087876, -0.73070276, -1.07617807, -1.04772818, -1.02575052,
        -0.76672536, -1.13595665, -0.88656944, -1.20612872, -0.54119903,
        -1.13931239, -0.97215211, -1.86689341, -1.19235408, -1.29166365,
        -0.57237303, -1.04752302, -0.87913507, -0.90494269, -1.33825588,
        -1.10262227, -0.93403578, -0

In [24]:
curr_state

array([[-0.09119575,  0.3061817 , -1.07912779]], dtype=float32)

In [25]:
sess.run([final_state, output], feed_dict={
    states_ph : states[0][np.newaxis],
    actions_ph : np.array([possible_actions[i][a] for i, a in enumerate(chosen_actions[0])]),
    init_states_ph : curr_state
})


[array([[-0.0611067 ],
        [ 0.18754616],
        [-0.46794692]], dtype=float32),
 array([[-0.0611067 ,  0.18754616, -0.46794692]], dtype=float32)]

In [26]:
import numpy as np
import tensorflow as tf
from agents.baselines import Agent

class RnnReinforceAgent(Agent):
    def __init__(self, n_rec, state_dim, action_dim, sess):
        self.n_rec = n_rec
        self.state_size = state_dim
        self.action_size = action_dim 
        self.sess = sess

        
        self.t = 0
        self.warms_start = 200
        
        # params
        self.gamma = 0.9
        self.lr = 0.1
        self.gru_unit_size = 5
        
        num_actions = action_size
        
        # placeholders 
        self.states_ph = tf.placeholder(tf.float32, [1, self.state_size], name='state')
        self.actions_ph = tf.placeholder(tf.float32, [1, self.action_size], name='action')
        self.action_ids_ph = tf.placeholder(tf.int64, [1], name='action_id')
        self.possible_actions_ph = tf.placeholder(tf.float32, [None, self.action_size], name='possible_actions')
        self.reward_to_go_ph = tf.placeholder(tf.float32, name='reward_to_go')
        self.beta_ph = tf.placeholder(tf.float32, name='beta')
        self.init_states_ph = tf.placeholder(tf.float32, (1, self.gru_unit_size), name='rnn_state')

        # operations 
        contexts = tf.concat([self.states_ph, self.actions_ph], axis=1)
        
        observations = tf.expand_dims(contexts, axis=0)

        with tf.variable_scope("agent_rnn", reuse=tf.AUTO_REUSE):
            gru_cell = tf.contrib.rnn.BasicRNNCell(self.gru_unit_size)
            self.output, self.final_state = tf.nn.dynamic_rnn(gru_cell, inputs=observations, 
                                                         initial_state=self.init_states_ph)
            self.output = tf.reshape(self.output, [-1, self.gru_unit_size])

        with tf.variable_scope("agent_softmax", reuse=tf.AUTO_REUSE):
            w_softmax = tf.get_variable("w_softmax", shape=[self.gru_unit_size, self.action_size],
                initializer=tf.contrib.layers.xavier_initializer())
            b_softmax = tf.get_variable("b_softmax", shape=[self.action_size],
                initializer=tf.constant_initializer(0))

        self.logit = tf.matmul(self.output, w_softmax) + b_softmax
        
        self.scores = tf.transpose(tf.matmul(self.possible_actions_ph, self.logit, transpose_b=True))
        
        self.softmax = tf.math.softmax(self.scores)
        self.log_softmax = tf.math.log_softmax(self.scores)
        
        onehot_actions = tf.one_hot(self.action_ids_ph, depth=tf.shape(self.possible_actions_ph)[0])
        loss = self.log_softmax*onehot_actions * tf.stop_gradient(self.softmax) * self.reward_to_go_ph / self.beta_ph
        
        # train_ops
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
        
        self.trainable_variables = tf.trainable_variables()

        self.grad = self.opt.compute_gradients(loss=-loss, var_list=self.trainable_variables)
        
        self.train_op = self.opt.apply_gradients(grads_and_vars=self.grad)
        
        
        # last rollout 
        self.rollouts = defaultdict(lambda: defaultdict(list))
        
#         self.last_states = []
#         self.last_chosen_actions = []
#         self.last_possible_actions = []
#         self.last_beta_probs = []
#         self.last_rewards = []
        
        self.rollout_size = 10
        self.current_user_id = None
    
    def _prepare_rollout(self):
        
        last_states = self.rollouts[self.current_user_id]['last_states']
        last_chosen_actions = self.rollouts[self.current_user_id]['last_chosen_actions']
        last_possible_actions = self.rollouts[self.current_user_id]['last_possible_actions']
        last_beta_probs = self.rollouts[self.current_user_id]['last_beta_probs']
        last_rewards = self.rollouts[self.current_user_id]['last_rewards']
        
        return [np.array(x) for x in (last_states[-self.rollout_size:], 
                                      last_chosen_actions[-self.rollout_size:], 
                                      last_possible_actions[-self.rollout_size:], 
                                      last_beta_probs[-self.rollout_size:], 
                                      last_rewards[-self.rollout_size:])
               ]
    
    def _train(self):
        
        rollout = self._prepare_rollout()
        
        states, chosen_actions, possible_actions, beta_probs, rewards = rollout
        rewards_to_go = compute_rewards_to_go(rewards, self.gamma)
        rollout_size = len(states)
        
        curr_state = np.ones((1, self.gru_unit_size))
        grads = []
        for t in range(1, rollout_size):
            feed_dict = {
                        self.states_ph : states[t][np.newaxis],
                        self.actions_ph : possible_actions[t][chosen_actions[t]],
                        self.action_ids_ph : chosen_actions[t],
                        self.possible_actions_ph : possible_actions[t],
                        self.reward_to_go_ph : rewards_to_go[t],
                        self.beta_ph : beta_probs[t],
                        self.init_states_ph : curr_state,
                     }
            #print(feed_dict)
            #print(self.sess.run([self.final_state], feed_dict))
#             print(feed_dict)
            grad, state = self.sess.run([self.grad, self.final_state], feed_dict)
            grads.append(grad)
            curr_state = state

        for g in grads:
            grads_dict = {}
            for (grad_tensor, var), (g, v) in zip(self.grad, g):
                grads_dict[grad_tensor] = v

#             self.sess.run(self.train_op, feed_dict= grads_dict)
    
    def _predict(self, user, possible_items):
        
        rollout = self._prepare_rollout()
        
        states, chosen_actions, possible_actions, beta_probs, rewards = rollout
        rewards_to_go = compute_rewards_to_go(rewards, self.gamma)
        rollout_size = len(states)
        
        curr_state = np.ones((1, self.gru_unit_size))
        for t in range(1, rollout_size):
            feed_dict = {
                        self.states_ph : states[t][np.newaxis],
                        self.actions_ph : possible_actions[t][chosen_actions[t]],
                        #self.action_ids_ph : chosen_actions[t],
                        #self.possible_actions_ph : possible_actions[t],
                        #self.reward_to_go_ph : rewards_to_go[t],
                        #self.beta_ph : beta_probs[t],
                        self.init_states_ph : curr_state,
                     }
            
            curr_state = self.sess.run(self.final_state, feed_dict)
            print(states[t][np.newaxis])
            print(possible_actions[t][chosen_actions[t]])
            print( self.sess.run([self.output, self.final_state], feed_dict))
            print(curr_state)
            
        
        
        probs = self.sess.run(self.softmax, 
                              feed_dict = {
                                  self.states_ph : states[-1][np.newaxis],
                                  self.actions_ph : possible_actions[-1][chosen_actions[-1]],
                                  self.possible_actions_ph : possible_items,
                                  self.init_states_ph : curr_state
                              })[0]
        
        print(rollout_size)
        #print(curr_state)
        #print(np.mean(probs), np.std(probs), np.max(probs), np.min(probs))
        #print(list(range(len(possible_items))))
        idx = np.random.choice(list(range(len(possible_items))), p=probs)
        
        return idx, probs[idx]
    
    
    
    def _get_actions(self, observation):
        user, items = observation
        
        if self.t <= self.warms_start:
            # trake random action
            N = len(items)
            beta = 1./N
            idx = np.random.choice(list(range(N)))    
        else:
            idx, beta = self._predict(user, items)
            
        
        self.rollouts[self.current_user_id]['last_states'].append(user)
        self.rollouts[self.current_user_id]['last_chosen_actions'].append([idx])
        self.rollouts[self.current_user_id]['last_possible_actions'].append(items)
        self.rollouts[self.current_user_id]['last_beta_probs'].append(beta)
        
        #self.last_states.append(user)
        #self.last_chosen_actions.append([idx])
        #self.last_possible_actions.append(items)
        #self.last_beta_probs.append(beta)
        
        self.t += 1
        
        return [idx]
    
    #============= Agent interface ===============            
    def begin_episode(self, observation, info=None):
        
        if info:
            self.current_user_id = info['userid']
        
        self.sess.run(tf.global_variables_initializer())
        idxs = self._get_actions(observation)
        return idxs
        

    def step(self, reward, observation, info=None):
        
        self.rollouts[self.current_user_id]['last_rewards'].append(reward)
#         self.last_rewards.append(reward)
        
        if info:
            if self.current_user_id != info['userid']:
                self._train()
            self.current_user_id = info['userid']
        
        if len(self.rollouts[self.current_user_id]['last_rewards']) > self.rollout_size:
            self._train()
        
        idxs = self._get_actions(observation)
        return idxs

    def end_episode(self, reward, info=None):
        raise NotImplemented

In [27]:
env = deepcopy(base_env)


state_dim = env.embedding_dimension
action_dim = env.embedding_dimension
action_size = env.n_rec
sess.close()
sess = tf.InteractiveSession()


agent = RnnReinforceAgent(action_size, state_dim, action_dim, sess)

Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.


In [28]:
from rec_gym.runner import run_experiment

In [29]:
run_experiment(env, agent, t_train=300, t_test=0)

  0%|          | 0/300 [00:00<?, ?it/s]


TypeError: Fetch argument None has invalid type <class 'NoneType'>

In [None]:
t = 200
pg_rs = []
for i in env.interactions:
    pg_rs.extend(i.rewards)
print(np.sum(pg_rs))

In [None]:
from agents.baselines import RandomAgent

env =  deepcopy(base_env)

state_dim = env.embedding_dimension
action_dim = env.embedding_dimension
action_size = env.n_rec

agent = RandomAgent(action_size = action_size)
run_experiment(env, agent, t_train=4000, t_test=0)

In [None]:
t = 200
rand_rs = []
for i in env.interactions:
    rand_rs.extend(i.rewards)
print(np.sum(rand_rs))

In [None]:
plt.plot(np.cumsum(pg_rs), label='pg')
plt.plot(np.cumsum(rand_rs), label='random')