In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
from tqdm import tqdm
random.seed(0)
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the logged reward history (the top-10 subset, judging by the file name) into `df`.
df = pd.read_csv(filepath_or_buffer='reward_history_top10.csv')
# A bare last expression lets the notebook render the frame as the cell output.
df

## Lin-UCB

In [None]:
def lin_UCB(alpha,n_simulation, n_epochs,n_arms, n_features):
    """Replay-evaluate a linear UCB policy with one shared parameter vector.

    For every simulation the logged interactions in ``df`` are shuffled
    (seeded with the simulation index) and replayed row by row. Each row's
    user features are concatenated with the features of every candidate
    movie, each candidate is scored with
    ``theta . x + alpha * sqrt(x^T A^-1 x)`` and the arm returned by
    ``ind_max`` is chosen. Only rows whose logged movie equals the chosen
    arm update the model and produce a result record; the remaining rows
    are replayed in the next epoch.

    Reads these names from the notebook namespace: ``df`` (columns
    ``user_id``, ``movie_id``, ``reward``), ``reward_history`` (source of
    the candidate movie ids), ``user_features``, ``movie_features`` and
    ``ind_max``.
    NOTE(review): ``ind_max`` is defined elsewhere; it is presumably an
    argmax over the list of scores - confirm.

    Args:
        alpha: Weight of the exploration bonus.
        n_simulation: Number of independent shuffled replays.
        n_epochs: Maximum number of passes over the not-yet-matched rows.
        n_arms: Number of candidate movies, taken from the first unique
            ``movie_id`` values of ``reward_history``.
        n_features: Length of the concatenated user + movie feature vector.

    Returns:
        A list with one dict per matched row, with keys ``iteration``,
        ``aligned_time_steps``, ``item_id``, ``user_id``, ``reward``,
        ``total_reward`` and ``aligned_ctr``.

    Raises:
        IndexError: If ``n_arms`` exceeds the number of unique movies, or a
            user/movie has no row in its feature table.
    """
    items = reward_history['movie_id'].unique()
    # Index (rather than slice) so that an oversized n_arms still raises IndexError.
    arm_ids = [items[a] for a in range(n_arms)]

    def _feature_vector(table, key, value):
        """Return the non-key columns of the first row of `table` where `key` == `value`."""
        rows = table.loc[table[key] == value].drop(key, axis=1)
        return rows.to_numpy(dtype=float)[0]

    # Movie features never change during the replay, so look them up once per arm.
    arm_vectors = [_feature_vector(movie_features, "movie_id", movie_id) for movie_id in arm_ids]
    user_vectors = {}  # user_id -> feature vector, filled lazily

    results = []
    for sim in range(n_simulation):
        np.random.seed(sim)
        data = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)

        A = np.identity(n_features)
        b = np.zeros(n_features)
        # inv(A) and theta only change when A/b are updated, so they are cached.
        inv_A = np.linalg.inv(A)
        theta = inv_A.dot(b)

        total_rewards = 0
        aligned_time_steps = 0

        for epoch_iter in range(n_epochs):
            if data.empty:
                break  # every logged row has already been matched

            user_ids = data["user_id"].to_numpy()
            logged_items = data["movie_id"].to_numpy()
            rewards = data["reward"].to_numpy()
            unmatched = []  # positions of rows to replay in the next epoch

            for i in range(len(data)):
                user_id = user_ids[i]
                if user_id not in user_vectors:
                    user_vectors[user_id] = _feature_vector(user_features, "user_id", user_id)
                user_vec = user_vectors[user_id]

                # Score every arm: estimated payoff plus the confidence bonus.
                contexts = [np.concatenate((user_vec, movie_vec)) for movie_vec in arm_vectors]
                p = [theta.dot(x) + alpha * np.sqrt(x.dot(inv_A).dot(x)) for x in contexts]
                item_idx = ind_max(p)
                chosen_arm = arm_ids[item_idx]

                if chosen_arm != logged_items[i]:
                    # The log holds no feedback for this choice; retry the row later.
                    unmatched.append(i)
                    continue

                reward = rewards[i]
                x = contexts[item_idx]
                aligned_time_steps += 1
                total_rewards += reward
                A += np.outer(x, x)
                b += reward * x
                inv_A = np.linalg.inv(A)
                theta = inv_A.dot(b)

                results.append({
                    'iteration': sim,
                    'aligned_time_steps': aligned_time_steps,
                    'item_id': chosen_arm,
                    'user_id': user_id,
                    'reward': reward,
                    'total_reward': total_rewards,
                    'aligned_ctr': total_rewards/aligned_time_steps,
                })

            # Build the next epoch's data in one step; DataFrame.append no
            # longer exists in pandas 2.x and grew the frame quadratically.
            data = data.iloc[unmatched].reset_index(drop=True)

    return results

## Thompson Sampling with Linear Payoffs

In [None]:
def lin_TS(delta, R ,EPSILON , n_simulation, n_epochs ,n_arms, n_features):
    """Replay-evaluate Thompson Sampling with a shared linear payoff model.

    For every simulation the logged interactions in ``df`` are shuffled
    (seeded with the simulation index) and replayed row by row. For each
    row a parameter vector is sampled from
    ``N(A^-1 b, v^2 A^-1)``, every candidate movie is scored with
    ``theta . x`` (``x`` = user features followed by movie features) and
    the arm returned by ``ind_max`` is chosen. Only rows whose logged movie
    equals the chosen arm update the model and produce a result record;
    the remaining rows are replayed in the next epoch.

    Reads these names from the notebook namespace: ``df`` (columns
    ``user_id``, ``movie_id``, ``reward``), ``reward_history`` (source of
    the candidate movie ids), ``user_features``, ``movie_features`` and
    ``ind_max``.
    NOTE(review): ``ind_max`` is defined elsewhere; it is presumably an
    argmax over the list of scores - confirm.

    Args:
        delta: Enters the posterior scale through ``log(1 / delta)``.
        R: Multiplicative factor of the posterior scale ``v``.
        EPSILON: Enters the posterior scale through ``24 / EPSILON``.
        n_simulation: Number of independent shuffled replays.
        n_epochs: Maximum number of passes over the not-yet-matched rows.
        n_arms: Number of candidate movies, taken from the first unique
            ``movie_id`` values of ``reward_history``.
        n_features: Length of the concatenated user + movie feature vector.

    Returns:
        A list with one dict per matched row, with keys ``iteration``,
        ``aligned_time_steps``, ``item_id``, ``user_id``, ``reward``,
        ``total_reward`` and ``aligned_ctr``.

    Raises:
        IndexError: If ``n_arms`` exceeds the number of unique movies, or a
            user/movie has no row in its feature table.
    """
    items = reward_history['movie_id'].unique()
    # Index (rather than slice) so that an oversized n_arms still raises IndexError.
    arm_ids = [items[a] for a in range(n_arms)]

    # v = R * sqrt(24 / EPSILON * d * ln(1 / delta)); the dimension d is the
    # length of the context vector, i.e. n_features, so it is taken from the
    # argument instead of relying on a notebook-level global named `d`.
    v = R * np.sqrt(24/EPSILON*n_features*np.log(1/ delta))

    def _feature_vector(table, key, value):
        """Return the non-key columns of the first row of `table` where `key` == `value`."""
        rows = table.loc[table[key] == value].drop(key, axis=1)
        return rows.to_numpy(dtype=float)[0]

    # Movie features never change during the replay, so look them up once per arm.
    arm_vectors = [_feature_vector(movie_features, "movie_id", movie_id) for movie_id in arm_ids]
    user_vectors = {}  # user_id -> feature vector, filled lazily

    results = []
    for sim in range(n_simulation):
        np.random.seed(sim)
        data = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)

        A = np.identity(n_features)
        b = np.zeros(n_features)
        # The posterior mean/covariance only change when A/b are updated, so they are cached.
        inv_A = np.linalg.inv(A)
        mean = inv_A.dot(b)
        cov = v**2 * inv_A

        total_rewards = 0
        aligned_time_steps = 0

        for epoch_iter in range(n_epochs):
            if data.empty:
                break  # every logged row has already been matched

            user_ids = data["user_id"].to_numpy()
            logged_items = data["movie_id"].to_numpy()
            rewards = data["reward"].to_numpy()
            unmatched = []  # positions of rows to replay in the next epoch

            for i in range(len(data)):
                user_id = user_ids[i]
                if user_id not in user_vectors:
                    user_vectors[user_id] = _feature_vector(user_features, "user_id", user_id)
                user_vec = user_vectors[user_id]

                # One posterior draw per row, shared by all arms of that row.
                theta = np.random.multivariate_normal(mean, cov).reshape((n_features,))
                contexts = [np.concatenate((user_vec, movie_vec)) for movie_vec in arm_vectors]
                p = [theta.dot(x) for x in contexts]
                item_idx = ind_max(p)
                chosen_arm = arm_ids[item_idx]

                if chosen_arm != logged_items[i]:
                    # The log holds no feedback for this choice; retry the row later.
                    unmatched.append(i)
                    continue

                reward = rewards[i]
                x = contexts[item_idx]
                aligned_time_steps += 1
                total_rewards += reward
                A += np.outer(x, x)
                b += reward * x
                inv_A = np.linalg.inv(A)
                mean = inv_A.dot(b)
                cov = v**2 * inv_A

                results.append({
                    'iteration': sim,
                    'aligned_time_steps': aligned_time_steps,
                    'item_id': chosen_arm,
                    'user_id': user_id,
                    'reward': reward,
                    'total_reward': total_rewards,
                    'aligned_ctr': total_rewards/aligned_time_steps,
                })

            # Build the next epoch's data in one step; DataFrame.append no
            # longer exists in pandas 2.x and grew the frame quadratically.
            data = data.iloc[unmatched].reset_index(drop=True)

    return results