In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
from tqdm import tqdm
random.seed(0)
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
def ind_max(temp):
    """Return the position of the largest value in ``temp``, breaking ties at random.

    Parameters
    ----------
    temp : array-like
        Values to search. Lists, tuples and NumPy arrays are accepted; input
        with more than one dimension is treated as flattened, so the result
        is always a single flat index.

    Returns
    -------
    numpy integer
        Index of a maximal element. When several elements share the maximum,
        one of them is drawn uniformly via ``np.random.choice`` (so the result
        follows the global NumPy seed).

    Raises
    ------
    ValueError
        If ``temp`` is empty, or if no element compares equal to the maximum
        (which happens when the maximum is NaN).
    """
    # Coerce first: comparing a plain Python list with a scalar yields a single
    # ``False`` rather than an element-wise mask.
    values = np.asarray(temp).ravel()
    if values.size == 0:
        raise ValueError("ind_max() requires a non-empty sequence")

    candidates = np.flatnonzero(values == values.max())
    if candidates.size == 0:
        raise ValueError("ind_max() could not locate the maximum (NaN in input?)")

    return np.random.choice(candidates)

In [None]:
def categorical_draw(probs):
    """Sample an index from the discrete distribution described by ``probs``.

    A uniform number is drawn with ``random.random()`` and the entries of
    ``probs`` are accumulated from left to right; the first index whose
    running total strictly exceeds the draw is returned.

    Parameters
    ----------
    probs : indexable sequence of numbers
        Weight of each outcome, read as ``probs[0] .. probs[len(probs) - 1]``.

    Returns
    -------
    int
        The selected index, or ``len(probs) - 1`` when the running total never
        exceeds the draw (e.g. the weights sum to less than the drawn value).
    """
    threshold = random.random()
    n_outcomes = len(probs)
    running_total = 0.0

    for position in range(n_outcomes):
        running_total += probs[position]
        if running_total > threshold:
            return position

    # Nothing crossed the threshold: fall back to the final index.
    return n_outcomes - 1

In [None]:
class ReplaySimulator(object):
    """Replay logged feedback against an item-selection policy.

    The simulator walks through the logged rows of ``reward_history``. For
    each row it asks ``select_item()`` for a recommendation and only counts
    the row when the recommended item equals the logged ``movie_id``. Rows
    that did not match are carried over and replayed in the next epoch.

    The base class selects items uniformly at random; subclasses are expected
    to override ``select_item`` (and may extend ``record_result``).
    """

    def __init__(self,reward_history, n_iterations = 1,epochs = 2):
        """Store the log and derive the item / visitor catalogues.

        Parameters
        ----------
        reward_history : pandas.DataFrame
            Logged interactions. Must provide the columns ``user_id``,
            ``movie_id`` and ``reward``.
        n_iterations : int
            Number of independent replays; iteration ``k`` seeds NumPy with ``k``.
        epochs : int
            Number of passes per iteration over the not-yet-matched rows.
        """
        self.reward_history = reward_history
        self.n_iterations = n_iterations
        self.n_epochs = epochs
        self.aligned_time_steps = 0

        self.items = self.reward_history['movie_id'].unique()
        self.n_items = len(self.items)
        self.visitors = self.reward_history['user_id'].unique()
        self.n_visitors = len(self.visitors) # the number of unique users

    def reset(self):
        """Clear the per-item statistics and the aligned step counter."""
        self.aligned_time_steps = 0
        self.n_item_samples = np.zeros(self.n_items) # the number of each arm sampled
        self.n_item_rewards = np.zeros(self.n_items) # running mean reward for each arm
        self.reward_squared = np.zeros(self.n_items) # sum of reward^2 for each arm

    def replay(self):
        """Run every iteration/epoch and return one record per matched row.

        Returns
        -------
        list of dict
            Each dict has the keys ``iteration``, ``aligned_time_steps``,
            ``item_id``, ``user_id``, ``reward``, ``total_reward`` and
            ``aligned_ctr`` (cumulative reward divided by the number of
            matched rows so far in the iteration).
        """
        results = []
        for iteration in tqdm(range(0, self.n_iterations)):
            # Seed per iteration so each iteration number is reproducible.
            np.random.seed(iteration)
            self.filtered_data = self.reward_history.reindex(
                np.random.permutation(self.reward_history.index)
            ).reset_index(drop = True)
            self.shuffled_data = self.filtered_data.copy()
            self.reset()

            total_rewards = 0
            data = self.shuffled_data.copy()

            for epoch_iter in range(self.n_epochs):
                print('Epoch : ', epoch_iter)

                # Pull the columns once instead of doing a .loc lookup per row.
                user_ids = data["user_id"].to_numpy()
                movie_ids = data["movie_id"].to_numpy()
                rewards = data["reward"].to_numpy()

                # Positions of rows the policy did not match in this epoch.
                unused_positions = []

                for position in range(len(data)):
                    user_id = user_ids[position]
                    movie_id = movie_ids[position]
                    data_reward = rewards[position]

                    item_idx = self.select_item()
                    item_id = self.items[item_idx]

                    if item_id == movie_id:
                        self.aligned_time_steps += 1
                        total_rewards += data_reward
                        self.record_result(item_idx, data_reward)
                        results.append({
                            'iteration': iteration,
                            'aligned_time_steps': self.aligned_time_steps,
                            'item_id': item_id,
                            'user_id': user_id,
                            'reward': data_reward,
                            'total_reward': total_rewards,
                            'aligned_ctr': total_rewards/self.aligned_time_steps,
                        })
                    else:
                        unused_positions.append(position)

                # Build the carry-over frame in one slice; growing a DataFrame
                # row by row is quadratic and DataFrame.append no longer exists
                # in pandas >= 2.0.
                data = data.iloc[unused_positions].reset_index(drop = True)

        return results

    def select_item(self):
        """Return the index (into ``self.items``) of the item to recommend.

        The base policy picks uniformly at random with ``np.random.randint``.
        """
        return np.random.randint(self.n_items)

    def record_result(self,item_idx, reward):
        """Update the statistics of item ``item_idx`` with an observed ``reward``.

        Increments the sample count, adds ``reward**2`` to the squared-reward
        accumulator and updates the running mean reward incrementally.
        """
        self.n_item_samples[item_idx] += 1
        n = self.n_item_samples[item_idx]
        self.reward_squared[item_idx] += reward**2

        # Incremental mean: new = ((n-1)/n) * old + (1/n) * reward
        value = self.n_item_rewards[item_idx]
        new_value = ((n-1)/float(n))*value + (1/float(n)) * reward
        self.n_item_rewards[item_idx] = new_value

        