In [None]:
import numpy as np
from tqdm import tqdm
import cv2
import pickle
from natsort import natsorted
import matplotlib.pyplot as plt
import random
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import tensorflow as tf
import os
from tensorflow import keras
from collections import deque
import random

## Data Loading

In [None]:
# Enter the dataset directory only when it is reachable from the current directory,
# so re-running this cell does not try to descend into ./datasets/datasets.
if os.path.isdir('./datasets'):
    os.chdir('./datasets')

# Open in read-binary mode: 'wb' would truncate the file and make pickle.load fail.
# SECURITY: unpickling can execute arbitrary code; only load buffers you created yourself.
with open('replay_buffer.pkl', 'rb') as f:
    replay_buffer = pickle.load(f) # deserialize the list

## Parameters

In [None]:
# Replay memory: bounded FIFO of (state, action, reward, next_state, done) tuples.
# Keep any experiences already stored under this name (e.g. loaded from disk by an
# earlier cell) instead of silently discarding them; start empty otherwise.
# With maxlen=50 only the 50 most recent experiences are retained.
try:
    _loaded_experiences = replay_buffer
except NameError:
    _loaded_experiences = ()
replay_buffer = deque(_loaded_experiences, maxlen=50)

# Epsilon-greedy exploration schedule: start fully random and decay
# multiplicatively towards epsilon_min.
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.995

# Number of training episodes.
episodes = 1000

## Model Creation and Training

In [None]:
def simple_Q_model_creation(layers=None, learning_rate = 0.001):

    '''
    Build, compile and summarise a fully connected Q-network over 50 tags.

    The input is a np.array of length 50, with each index representing a tag.
    The value of each index is the amount of saves that tag has received.

    The output is a np.array of length 50 of unbounded (linear) scores, one per tag.
    The network itself does not emit 0/1 decisions; callers turn the scores into
    recommendations by thresholding them (this notebook uses `score > 0`).

    Parameters
    ----------
    layers : sequence of int, optional
        Width of each hidden Dense(relu) layer; every hidden layer is followed by
        Dropout(0.2). None or an empty sequence yields a single linear layer.
    learning_rate : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    tf.keras.Sequential
        The compiled model (MSE loss).
    '''

    # Avoid a shared mutable default argument.
    hidden_sizes = [] if layers is None else list(layers)

    model = tf.keras.Sequential()
    # Register the input spec on the model itself so it is built right away;
    # otherwise summary() below is called on an unbuilt model.
    model.add(tf.keras.Input(shape=(50,)))
    for units in hidden_sizes:
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(50, activation='linear'))

    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # NOTE(review): 'accuracy' is kept for compatibility with existing history keys,
    # but it is not an informative metric for an MSE regression target.
    model.compile(optimizer=opt, loss='mse', metrics=['accuracy'])

    model.summary()

    return model

# Online network (fitted in replay()) and target network (used there to bootstrap
# the value of the next state).
q_network = simple_Q_model_creation([256, 128, 64])
target_network = simple_Q_model_creation([256, 128, 64])

# Start the target network as an exact copy of the online network; two independent
# random initialisations would make the first bootstrapped targets unrelated to the
# network being trained. Build first if needed so the weights exist to be copied.
for _net in (q_network, target_network):
    if not _net.built:
        _net.build(input_shape=(None, 50))
target_network.set_weights(q_network.get_weights())

In [None]:

def store_experience(state, action, reward, next_state, done):
    """Append one transition to the global replay buffer.

    The transition is stored as the tuple
    (state, action, reward, next_state, done), in that order.
    """
    transition = (state, action, reward, next_state, done)
    replay_buffer.append(transition)

In [None]:

def select_action(state, epsilon):
    """Choose a multi-hot recommendation vector with an epsilon-greedy policy.

    Parameters
    ----------
    state : array-like
        Current state; it is flattened into a single-row batch for the network.
    epsilon : float
        Probability of exploring (returning a random action).

    Returns
    -------
    np.ndarray
        Integer array of 0/1 flags. When exploring it is a random vector of
        length 50; otherwise entry i is 1 where the predicted value for
        output i is strictly positive.
    """
    if np.random.rand() < epsilon:
        return np.random.randint(0, 2, size=50)

    # Accept lists/tuples as well as ndarrays.
    state_batch = np.asarray(state).reshape(1, -1)
    # verbose=0: without it every call prints a Keras progress bar.
    q_values = q_network.predict(state_batch, verbose=0)
    return (q_values[0] > 0).astype(int)

In [None]:
# Training loss recorded by replay(); one value per performed update.
losses = []

def replay(batch_size=32, gamma=0.99):
    """Run one batched Q-learning update from randomly sampled experiences.

    Samples `batch_size` transitions from the global replay buffer, builds the
    targets ``reward + gamma * max(target_network(next_state))`` (just ``reward``
    for terminal transitions), fits the online network once on the whole batch,
    records the loss in `losses`, and decays the global `epsilon`.

    Does nothing while the buffer holds fewer than `batch_size` transitions.

    Parameters
    ----------
    batch_size : int
        Number of transitions sampled for the update.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    """
    global epsilon

    if batch_size < 1 or len(replay_buffer) < batch_size:
        return

    minibatch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Flatten every state to one row so the whole batch goes through the
    # networks in a single call instead of one predict/fit per sample.
    states = np.stack([np.asarray(s).reshape(-1) for s in states])
    next_states = np.stack([np.asarray(s).reshape(-1) for s in next_states])
    rewards = np.asarray(rewards, dtype=float)
    dones = np.asarray(dones, dtype=bool)

    # NOTE(review): np.argmax picks only the first set entry of a multi-hot action
    # (and index 0 for an all-zero action), so only that output is trained.
    # Confirm this is the intended credit assignment.
    action_idx = np.array([np.argmax(a) for a in actions])

    # Terminal transitions use the raw reward; the others bootstrap from the target net.
    next_q = target_network.predict(next_states, verbose=0)
    bootstrapped = rewards + gamma * np.amax(next_q, axis=1)
    targets = np.where(dones, rewards, bootstrapped)

    # Start from the current predictions so only the chosen output carries an error.
    q_targets = np.array(q_network.predict(states, verbose=0))
    q_targets[np.arange(batch_size), action_idx] = targets

    history = q_network.fit(states, q_targets, batch_size=batch_size, epochs=1, verbose=0)
    losses.append(history.history['loss'][0])

    # Clamp so the decay never pushes epsilon below its floor.
    if epsilon > epsilon_min:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)


In [None]:
# Replace with actual data once dataset is generated

# for episode in range(episodes):
#     state = np.random.rand(50)
#     done = False

#     while not done:

#         action = select_action(state, epsilon)

#         next_state = np.random.rand(50)
#         reward = np.sum(action)
#         done = np.random.rand() > 0.95

#         store_experience(state, action, reward, next_state, done)
#         state = next_state

#         replay()


#     if episode % 10 == 0:
#         target_network.set_weights(q_network.get_weights())

#     print(f"Episode {episode + 1}/{episodes}, Epsilon: {epsilon:.2f}")


## Script

## Explainability

In [None]:
# Demo inference on a random 50-dimensional state.
state = np.random.rand(50)  # Replace with actual state

# Add a leading batch axis, score every output with the online network, then
# flag each output whose predicted value is strictly positive.
q_values = q_network.predict(np.expand_dims(state, axis=0))
recommendations = np.greater(q_values[0], 0).astype(int)

print("Recommendations:", recommendations)

In [None]:
# Plot the training losses collected in `losses` using the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
ax.set_title('Training Progress: Loss')
plt.show()