El modelo a aplicar se basa en el artículo [Building a Recommendation System for Amazon Fashion Products using DQN (beta)](https://medium.com/@vibhu12345/building-a-recommendation-system-for-amazon-fashion-products-using-dqn-beta-855b1ff7834e).

In [3]:
import random
from collections import deque

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import layers, models
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from tensorflow.keras.utils import plot_model

In [3]:
# Load the sampled Netflix Prize data from the local parquet file.
data = pd.read_parquet(
    "Netflix_Prize_data/netflix_data_sample.parquet",
)

In [9]:
# Count the distinct movies and customers present in the sample.
movie_count = len(data["movie_id"].unique())
customer_count = len(data["customer_id"].unique())

print(f"Cantidad de películas: {movie_count}")
print(f"Cantidad de clientes: {customer_count}")

Cantidad de películas: 17675
Cantidad de clientes: 18095


# Definición del ambiente

In [None]:

class RecommendationEnv(gym.Env):
    """Episodic environment that walks through a fixed sequence of states.

    Each call to ``step`` receives a list of recommended state indices and
    rewards the agent when one of them points at a product that the current
    reviewer interacts with later in time.

    NOTE(review): this class needs ``gym`` to be imported and a module-level
    ``reviewers`` mapping (reviewer id -> object exposing ``.products``) to be
    defined; neither is visible in this notebook - confirm before running.
    """

    def __init__(self, states, states_dict, iterations = 10):
        """Store the state sequence and start at its first element.

        Args:
            states: indexable sequence of state objects; ``step`` reads their
                ``reviewerId``, ``product_asin`` and ``time`` attributes.
            states_dict: mapping from ``(product, reviewerId)`` to the
                corresponding state object.
            iterations: maximum number of steps per episode.
        """
        self.states = states
        self.states_dict = states_dict
        self.iterations = iterations
        self.state = self.states[0]
        self.index = 0
        # The original assigned to `state.action`, which raised NameError
        # because no bare `state` name exists in this scope.
        self.action = 0

    def step(self, actions):
        """Score the recommendations and advance to the next state.

        Args:
            actions: non-empty sequence of indices into ``self.states``, in
                order of preference.

        Returns:
            A ``(state, reward, done, info)`` tuple. ``reward`` is 1 when any
            recommended product appears among the reviewer's future products
            and 0 otherwise; ``info`` is always an empty dict.
        """
        reviewer_id = self.state.reviewerId
        current_time = self.state.time
        # A set gives constant-time membership checks for each recommendation.
        future_asins = {
            product
            for product in reviewers[reviewer_id].products
            if self.states_dict[(product, reviewer_id)].time > current_time
        }

        # First recommendation (in the given order) that the reviewer will
        # actually interact with in the future, if any.
        matched = next(
            (i for i in actions if self.states[i].product_asin in future_asins),
            None,
        )
        if matched is not None:
            # Higher reward: the product is consumed by the user later on.
            self.action = matched
            reward = 1
        else:
            self.action = actions[0]
            reward = 0

        self.index += 1
        done = self.index >= self.iterations
        if self.index < len(self.states):
            self.state = self.states[self.index]
        else:
            # No state left to move to: keep the last one and end the episode
            # instead of raising IndexError.
            done = True
        print(f"iteration :{self.index}")

        return self.state, reward, done, {}

    def reset(self, iterations = 10):
        """Return to the first state and restart the step counter.

        Args:
            iterations: maximum number of steps for the new episode.

        Returns:
            The initial state.
        """
        self.state = self.states[0]
        self.iterations = iterations
        self.index = 0
        self.action = 0
        return self.state

# Instantiate the custom environment with ten steps per episode.
env = RecommendationEnv(states_list, states, iterations=10)

# Definición del Agente

In [4]:

class DQNAgent:
    """Deep Q-Network agent that recommends items with an epsilon-greedy policy.

    The Q-function is approximated by a small dense network that receives the
    current state as a single value and outputs one Q-value per action. A
    second copy of the network (the target model) supplies the bootstrap
    values used while training.
    """

    # Number of items returned by each recommendation.
    TOP_K = 10

    def __init__(self, state_size, action_size, states):
        """Set the hyperparameters and build the main and target networks.

        Args:
            state_size: number of states in the environment.
            action_size: number of possible actions (size of the Q output).
            states: the environment's states, kept for reference.
        """
        self.states = states
        self.state_size = state_size
        self.action_size = action_size

        # DQN hyperparameters
        self.discount_factor = 0.99

        # Learning rate of the neural network
        self.learning_rate = 0.001

        self.epsilon = 1
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        # Replay memory: the oldest samples are dropped once it is full.
        self.memory = deque(maxlen=2000)

        # Main model and target model share the same architecture.
        self.model = self.build_model()
        self.target_model = self.build_model()

        # Start with both networks holding identical weights.
        self.update_target_model()

    def build_model(self):
        """Build and compile the network that approximates the Q-function.

        Input: the current state (the movie the person is currently watching).
        Output: one Q-value per possible action.

        Returns:
            The compiled Keras model.
        """
        # Input layer for the current state.
        # Idea: also feed the current rating as part of the input.
        input_layer = layers.Input(shape=(1,), name='actual_state')

        # Hidden layers
        hidden_layer = layers.Dense(64, activation='relu')(input_layer)

        # One Q-value per action. The width follows action_size so that every
        # index produced by get_action / stored in memory is a valid column.
        output_layer = layers.Dense(
            self.action_size, activation='linear', name='q_values'
        )(hidden_layer)

        model = models.Model(
            inputs=input_layer, outputs=output_layer, name='DQN_model'
        )

        # Model summary
        model.summary()

        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.learning_rate))

        return model

    @staticmethod
    def _encode_states(states):
        """Stack states into the (batch, 1) float array fed to the network."""
        # NOTE(review): assumes every state is a single numeric value (e.g. a
        # movie id), matching Input(shape=(1,)) in build_model - confirm
        # against the state objects the environment actually produces.
        return np.asarray(states, dtype=np.float32).reshape(len(states), 1)

    def train_model(self):
        """Fit the main model on one random mini-batch of replayed samples.

        Does nothing until the replay memory holds at least ``train_start``
        samples.
        """
        if len(self.memory) < max(self.train_start, 1):
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        # Each sample is (state, action, reward, next_state, done).
        states, actions, rewards, next_states, dones = zip(*mini_batch)
        update_input = self._encode_states(states)
        update_target = self._encode_states(next_states)

        target = self.model.predict(update_input)
        target_val = self.target_model.predict(update_target)

        for i in range(batch_size):
            # Q-learning: bootstrap with the maximum Q-value at s' taken from
            # the target model, unless the episode ended at this transition.
            if dones[i]:
                target[i][actions[i]] = rewards[i]
            else:
                target[i][actions[i]] = (
                    rewards[i] + self.discount_factor * np.amax(target_val[i])
                )

        self.model.fit(update_input, target, batch_size=batch_size,
                       epochs=1, verbose=1)

    def update_target_model(self):
        """Copy the main model's weights into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Pick up to ``TOP_K`` recommendations with an epsilon-greedy policy.

        Args:
            state: the current state.

        Returns:
            A list of random distinct action indices when exploring, or an
            array with the indices of the highest Q-values (in no particular
            order) when exploiting.
        """
        # Never ask for more items than there are actions.
        k = min(self.TOP_K, self.action_size)
        if np.random.rand() <= self.epsilon:
            return random.sample(range(self.action_size), k)
        q_value = self.model.predict(self._encode_states([state]))
        return np.argpartition(q_value[0], -k)[-k:]

    def append_sample(self, state, action, reward, next_state, done):
        """Store a <s, a, r, s'> sample in replay memory and decay epsilon."""
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            # Clamp so the exploration rate never drops below its floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)


# Every other product can be a recommendation, so the action space has the
# same size as the state space.
action_size = state_size = len(env.states)
agent = DQNAgent(state_size, action_size, env.states)

NameError: name 'env' is not defined