# Exercise 1: Stationary Environment - Multi-Armed Bandits

In [None]:
import os
# Move the working directory one level up before the project-local `ENV`
# imports below (presumably so the package resolves from the repository
# root - confirm). NOTE(review): the path is relative, so re-running this
# cell moves up one more level each time.
os.chdir("../")

In [None]:
import numpy as np
import tensorflow as tf
from tqdm import trange
import matplotlib.pyplot as plt
from IPython.display import clear_output
import ENV.Stationary_Bandit_ENV as S_bandit_env
from ENV.Logger import Logger
import math
np.random.seed(42) # For reproducibility

In [None]:
# Display the current working directory (sanity check after the chdir above).
os.getcwd()

# $\epsilon$-greedy action value (sample average)

In [None]:
# Creating the environment 
# Create the stationary bandit environment. `env` is a notebook-level global
# that later cells pass to the solvers.
env = S_bandit_env.Stationary_Bandit()

# NOTE(review): bare attribute access; it is not the last expression of the
# cell, so the notebook displays nothing for it.
env.get_nb_bandit
# Last expression of the cell, so its return value is displayed (presumably
# the mean reward of each arm - confirm in ENV.Stationary_Bandit_ENV).
env.list_mean_bandit()

In [None]:
class Q_solver():
    """Epsilon-greedy bandit solver with sample-average action values.

    The solver keeps one value estimate and one pull counter per arm and
    talks to the environment through ``env.reset()`` and
    ``env.step(action)``; the value returned by ``step`` is used as the
    reward of the pull.

    Args:
        env: Bandit environment exposing ``get_nb_bandit`` (number of arms,
            read as an attribute), ``reset()`` and ``step(action)``.
        timestep: Number of pulls performed by one call to :meth:`run`.
    """

    def __init__(self, env, timestep=1000):
        self.env = env
        self.nb_bandits = env.get_nb_bandit
        # Column vectors: Q_table[a] and action_count[a] are length-1 arrays.
        self.Q_table = np.zeros((self.nb_bandits, 1))
        self.action_count = np.zeros((self.nb_bandits, 1))
        self.timestep = timestep
        # Parameters initialisation
        self.epsilon = 0.1
        # Fraction of `timestep` between two progress reports.
        self.print_delay = 0.25
        # Logger
        self.logger = Logger()

    def act(self, act_epsilon_greedy):
        """Pick an arm.

        Args:
            act_epsilon_greedy: When true, explore with a uniformly random
                arm; otherwise exploit the arm with the highest estimate.

        Returns:
            The index of the selected arm.
        """
        if act_epsilon_greedy:
            # randint returns a length-1 array; unwrap it to a scalar action.
            return np.random.randint(0, self.nb_bandits, 1)[0]
        return np.argmax(self.Q_table)

    def updateQtable(self, action, reward, action_count):
        """Apply the incremental sample-average update to one arm.

        Args:
            action: Index of the arm that was pulled.
            reward: Reward obtained for that pull.
            action_count: Number of pulls of ``action`` so far, including
                this one (must be non-zero).
        """
        self.Q_table[action] = self.Q_table[action] + 1 / action_count * (reward - self.Q_table[action])

    def run(self, force_epsilon=None):
        """Play ``self.timestep`` pulls and log progress periodically.

        The loop uses the environment and the solver instance themselves
        (``self.env`` / ``self``) rather than notebook globals, so the solver
        behaves the same whatever the surrounding variables are called.

        Args:
            force_epsilon: Optional exploration rate that replaces
                ``self.epsilon`` for this and subsequent runs.
        """
        # One reward list per arm, each seeded with a single 0 so that every
        # arm has data to plot; the lists are independent objects.
        self.boxplotter = [[0] for _ in range(self.nb_bandits)]
        if force_epsilon is not None:
            self.epsilon = force_epsilon
        self.env.reset()
        # Integer report period; avoids relying on float modulo arithmetic.
        report_every = max(1, int(round(self.timestep * self.print_delay)))
        cumul_reward = 0

        # 1-based and inclusive of `timestep`, so exactly `timestep` pulls are
        # played and the last reporting window is not dropped.
        for iteration in trange(1, self.timestep + 1):
            action = self.act(self.epsilon > np.random.random())
            self.action_count[action] += 1
            reward = self.env.step(action)
            cumul_reward += reward
            self.boxplotter[action].append(reward)
            self.updateQtable(action, reward, self.action_count[action][0])
            self.logger.epsilon_log(self.epsilon)
            self.logger.reward_log(reward)
            if iteration % report_every == 0:
                # Mean reward over the window that just ended.
                self.logger.mean_reward_log(cumul_reward / report_every)
                self.logger.plot_log()
                cumul_reward = 0
                plt.boxplot(self.boxplotter)
                plt.show()

In [None]:
# Fresh stationary environment and an epsilon-greedy solver with a budget of
# 100000 steps. `env` and `solver` are notebook-level globals reused by the
# following cells.
env = S_bandit_env.Stationary_Bandit()
solver = Q_solver(env, 100000)

In [None]:

# Run the solver with the exploration rate forced to 0.01.
solver.run(0.01)

In [None]:
import seaborn as sns
# Last expression of the cell: displays the learned action-value table.
solver.Q_table

In [None]:
# Heatmap of the action values; the table is transposed before plotting.
sns.heatmap(solver.Q_table.T, cmap="coolwarm")

# UCB

In [None]:
class Q_solver_UCB(Q_solver):
    """Bandit solver selecting arms with an upper-confidence-bound rule.

    Each arm is scored as ``Q(a) + c * sqrt(ln(t) / (1 + N(a)))`` and the
    best-scoring arm is pulled. Value estimates use the sample-average
    update inherited from ``Q_solver``.

    Args:
        env: Bandit environment, forwarded to ``Q_solver``.
        timestep: Number of pulls performed by one call to :meth:`run`.
    """

    def __init__(self, env, timestep=1000):
        Q_solver.__init__(self, env, timestep)
        # Weight of the exploration bonus.
        self.c = 0.75

    def act(self, time, act_counter):
        """Return the arm with the highest upper confidence bound.

        Args:
            time: Current 1-based step; must be >= 1 because its logarithm
                is taken.
            act_counter: Per-arm pull counts, one entry per arm laid out like
                ``self.Q_table``.

        Returns:
            The index of the selected arm.
        """
        # The "1 +" keeps the bonus finite for arms never pulled so far.
        bonus = self.c * np.sqrt(math.log(time) / (1 + np.asarray(act_counter)))
        return np.argmax(self.Q_table + bonus)

    def run(self, force_epsilon=None):
        """Play ``self.timestep`` pulls with UCB selection.

        Args:
            force_epsilon: Stored in ``self.epsilon`` when given, to keep the
                signature of ``Q_solver.run``; the UCB rule itself does not
                read epsilon.
        """
        if force_epsilon is not None:
            self.epsilon = force_epsilon
        # Use the environment given to the constructor, not a notebook global.
        self.env.reset()
        # Integer report period; avoids relying on float modulo arithmetic.
        report_every = max(1, int(round(self.timestep * self.print_delay)))
        cumul_reward = 0

        # 1-based and inclusive of `timestep`, so exactly `timestep` pulls are
        # played and the last reporting window is not dropped.
        for iteration in trange(1, self.timestep + 1):
            action = self.act(iteration, self.action_count)
            self.action_count[action] += 1
            reward = self.env.step(action)
            cumul_reward += reward
            self.updateQtable(action, reward, self.action_count[action][0])
            self.logger.reward_log(reward)
            if iteration % report_every == 0:
                # Mean reward over the window that just ended.
                self.logger.mean_reward_log(cumul_reward / report_every)
                self.logger.plot_log(5)
                cumul_reward = 0


In [None]:
# Fresh stationary environment and a UCB solver with the default step budget.
env = S_bandit_env.Stationary_Bandit()
solver = Q_solver_UCB(env)

# The argument is stored as solver.epsilon; Q_solver_UCB.act takes only the
# step index and the pull counts, so it does not influence arm selection.
solver.run(0.01)

## Policy based : Gradient bandit

In [None]:
class Q_solver_policy(Q_solver):
    """Gradient-bandit solver: learns arm preferences and samples a softmax.

    Args:
        env: Bandit environment, forwarded to ``Q_solver``.
        timestep: Number of pulls performed by one call to :meth:`run`.
    """

    def __init__(self, env, timestep=1000):
        Q_solver.__init__(self, env, timestep)
        # Step size of the preference update.
        self.alpha = 0.1

    def act(self, policy):
        """Sample an arm according to ``policy``.

        Args:
            policy: Array with one probability per arm; it is flattened and
                renormalised before sampling.

        Returns:
            The index of the sampled arm.
        """
        possible_actions = np.arange(self.nb_bandits)
        probabilities = policy.flatten() / np.sum(policy)
        return np.random.choice(possible_actions, size=1, p=probabilities)[0]

    def update_preferences(self, preferences, action, reward, mean_reward, policy):
        """Apply one gradient-bandit step to ``preferences`` in place.

        The pulled arm moves by ``alpha * (reward - mean_reward) * (1 - pi)``
        and every other arm by ``-alpha * (reward - mean_reward) * pi``.

        Args:
            preferences: Float array with one preference per arm (modified
                in place).
            action: Index of the arm that was pulled.
            reward: Reward obtained for that pull.
            mean_reward: Baseline the reward is compared against.
            policy: Probabilities the arm was sampled from, one per arm.

        Returns:
            The same ``preferences`` array, after the update.
        """
        policy = np.reshape(policy, np.shape(preferences))
        # one_hot - policy gives (1 - pi) for the pulled arm and -pi elsewhere.
        one_hot = np.zeros_like(policy)
        one_hot[action] = 1.0
        preferences += self.alpha * (reward - mean_reward) * (one_hot - policy)
        return preferences

    def compute_policy(self, preferences):
        """Return the softmax of ``preferences``.

        The maximum is subtracted before exponentiating: the result is
        mathematically unchanged but large preferences no longer overflow.
        """
        exp_pref = np.exp(preferences - np.max(preferences))
        return exp_pref / np.sum(exp_pref)

    def run(self, force_epsilon=None):
        """Play ``self.timestep`` pulls with the gradient-bandit policy.

        Args:
            force_epsilon: Unused; present to keep the signature of
                ``Q_solver.run``.
        """
        # Independent reward list per arm (``[[0]] * n`` would repeat one
        # shared list, sending every reward to all arms at once).
        self.boxplotter = [[0] for _ in range(self.nb_bandits)]
        # Use the environment given to the constructor, not a notebook global.
        self.env.reset()
        # Integer report period; avoids relying on float modulo arithmetic.
        report_every = max(1, int(round(self.timestep * self.print_delay)))
        preferences = np.zeros((self.nb_bandits, 1))
        average_reward = 0.0
        # 1-based (the index is the running-mean denominator) and inclusive of
        # `timestep`, so exactly `timestep` pulls are played.
        for iteration in trange(1, self.timestep + 1):
            policy = self.compute_policy(preferences)
            action = self.act(policy)
            reward = self.env.step(action)
            self.boxplotter[action].append(reward)
            self.logger.reward_log(reward)
            # Running mean of all rewards so far, used as the baseline.
            average_reward += (reward - average_reward) / iteration
            preferences = self.update_preferences(preferences, action, reward, average_reward, policy)
            if iteration % report_every == 0:
                self.logger.plot_mean_reward()


In [None]:
# Fresh stationary environment and a gradient-bandit solver limited to a
# budget of 100 steps.
env = S_bandit_env.Stationary_Bandit()
solver = Q_solver_policy(env, timestep=100)
solver.run()

In [None]:
# Box plot of the rewards recorded in solver.boxplotter during the last run.
plt.boxplot(solver.boxplotter)
plt.show()

# Exercise 2: Non-Stationary Environment - Multi-Armed Bandits

In [None]:
import ENV.Non_Stationary_Bandit_ENV as NS_bandit_env


# Experiment with the previous solver

In [None]:
# Run the sample-average epsilon-greedy solver on the non-stationary
# environment: 10000-step budget, exploration rate forced to 0.1.
env = NS_bandit_env.Non_Stationary_Bandit()
solver = Q_solver(env, 10000)
solver.run(0.1)

## Improvement 

In [None]:
class Q_solver_NS(Q_solver):
    """Epsilon-greedy solver using a constant step size for value updates.

    Only the value update differs from ``Q_solver``: the estimate moves
    towards the reward by a fixed fraction ``alpha`` instead of ``1 / count``.
    """

    def __init__(self, env, timestep=1000):
        Q_solver.__init__(self, env, timestep)
        # Constant step size of the value update.
        self.alpha = 0.1

    def updateQtable(self, action, reward, action_count):
        """Move the estimate of ``action`` towards ``reward`` by ``alpha``.

        ``action_count`` is accepted to match ``Q_solver.updateQtable`` but
        is not used by this rule.
        """
        estimate = self.Q_table[action]
        self.Q_table[action] = estimate + self.alpha * (reward - estimate)


In [None]:
# Same non-stationary experiment with the constant step-size solver
# (default step budget, exploration rate forced to 0.1).
env = NS_bandit_env.Non_Stationary_Bandit()
solver = Q_solver_NS(env)
solver.run(0.1)


# Exercise 3: Contextual Environment - Multi-Armed Bandits

In [None]:
import ENV.ContextualBandit as C_Bandit

In [None]:
class Q_solver_contextual(Q_solver):
    """Epsilon-greedy solver keeping one value estimate per (arm, context).

    Args:
        env: Contextual bandit environment exposing ``get_nb_bandit`` and
            ``get_nb_context`` (read as attributes), ``reset()`` returning a
            context and ``step(action)`` returning ``(reward, context)``.
            Contexts are used as integer column indices.
        timestep: Number of pulls performed by one call to :meth:`run`.
    """

    def __init__(self, env, timestep=1000):
        Q_solver.__init__(self, env, timestep)
        self.nb_context = env.get_nb_context
        # Rows are arms, columns are contexts.
        self.Q_table = np.zeros((self.nb_bandits, self.nb_context))
        self.action_count = np.zeros((self.nb_bandits, self.nb_context))
        # Parameters initialisation
        self.epsilon = 1.
        self.epsilon_origin = 1.
        # Number of steps for epsilon to go linearly from origin to zero.
        self.epsilon_decay = 10
        self.epsilon_min = 0.02
        self.print_delay = 0.1

    def act(self, act_epsilon_greedy, context):
        """Pick an arm for ``context``.

        Args:
            act_epsilon_greedy: When true, explore with a uniformly random
                arm; otherwise exploit the best arm for ``context``.
            context: Integer index of the current context.

        Returns:
            The index of the selected arm.
        """
        if act_epsilon_greedy:
            # randint returns a length-1 array; unwrap it to a scalar action.
            return np.random.randint(0, self.nb_bandits, 1)[0]
        # Compare the arms within the column of the current context.
        return np.argmax(self.Q_table[:, context])

    def updateQtable(self, action, reward, action_count, context):
        """Sample-average update of the estimate for ``(action, context)``.

        Args:
            action: Index of the arm that was pulled.
            reward: Reward obtained for that pull.
            action_count: Number of pulls of ``action`` in ``context`` so
                far, including this one (must be non-zero).
            context: Integer index of the context the arm was pulled in.
        """
        self.Q_table[action][context] = \
            self.Q_table[action][context] + 1 / action_count * (reward - self.Q_table[action][context])

    def run(self, force_epsilon=None):
        """Play ``self.timestep`` pulls and log progress periodically.

        Args:
            force_epsilon: When given, epsilon is held at this value for the
                whole run (first pull included) instead of decaying.
        """
        # Use the environment given to the constructor, not a notebook global.
        context = self.env.reset()
        if force_epsilon is not None:
            self.epsilon = force_epsilon
        cumul_reward = 0
        # Independent reward list per arm (``[[0]] * n`` would repeat one
        # shared list, sending every reward to all arms at once).
        self.boxplotter = [[0] for _ in range(self.nb_bandits)]
        # Integer report period; avoids relying on float modulo arithmetic.
        report_every = max(1, int(round(self.timestep * self.print_delay)))
        # 1-based and inclusive of `timestep`, so exactly `timestep` pulls are
        # played and the last reporting window is not dropped.
        for iteration in trange(1, self.timestep + 1):
            action = self.act(self.epsilon > np.random.random(), context)
            # Count and update the (arm, context) pair that was actually played.
            self.action_count[action][context] += 1
            # NOTE(review): assumes the context returned by step() is the one
            # for the next pull - confirm in ENV.ContextualBandit.
            reward, next_context = self.env.step(action)
            self.updateQtable(action, reward, self.action_count[action][context], context)
            context = next_context
            if force_epsilon is None:
                # Update epsilon with linear decay
                self.epsilon = max(self.epsilon - self.epsilon_origin / self.epsilon_decay, self.epsilon_min)
            self.boxplotter[action].append(reward)
            cumul_reward += reward
            self.logger.epsilon_log(self.epsilon)
            self.logger.reward_log(reward)
            if iteration % report_every == 0:
                # Mean reward over the window that just ended.
                self.logger.mean_reward_log(cumul_reward / report_every)
                self.logger.plot_log()
                plt.boxplot(self.boxplotter)
                plt.show()
                cumul_reward = 0

In [None]:
# Contextual bandit experiment with the default step budget and the
# exploration rate forced to 0.1.
env = C_Bandit.Contextual_bandit()
solver = Q_solver_contextual(env)
solver.run(force_epsilon = 0.1)


# Exercise 4: CartPole and DQN

In [None]:
import gym
from collections import deque
import tensorflow as tf
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import Dense, Input, Add, concatenate, RepeatVector, Flatten, Lambda, Conv2D, MaxPooling2D, UpSampling2D, Reshape
from tensorflow.python.keras.optimizers import Adam
import random

In [None]:
class DQNAgent:
    """Single-network DQN agent with replay memory and linear epsilon decay.

    The same network produces both the current Q-values and the bootstrap
    targets (no separate target network).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``; ``step``
            is unpacked as ``(next_state, reward, done, info)``.
        state_size: Length of the observation vector.
        action_size: Number of discrete actions.
    """

    def __init__(self, env, state_size, action_size):
        # Keep the environment so that run() does not rely on a global.
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.995  # discount rate
        self.exploration_rate = 1.  # exploration rate
        self.original_epsilon = 1.
        self.min_epsilon = 0.01
        # Number of episodes for epsilon to go linearly from original to zero.
        self.exploration_rate_decay = 150
        # NOTE(review): not read anywhere in this class.
        self.n_game_max = 300
        self.model = self._build_model()
        self.model.summary()
        np.random.seed(42)

    def _build_model(self):
        """Build and compile the Q-network (one 32-unit tanh hidden layer)."""
        input_state = Input((self.state_size,))
        hidden = Dense(32, activation="tanh")(input_state)
        q_value = Dense(self.action_size, activation="linear")(hidden)
        dqn = Model([input_state], q_value)
        # `learning_rate` is the documented argument name; `lr` is deprecated.
        dqn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss="mse")
        return dqn

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the bounded replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        Args:
            state: Observation reshaped to ``(1, state_size)``.
        """
        if np.random.rand() <= self.exploration_rate:
            return np.random.randint(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])  # returns action

    def _bellman_targets(self, minibatch, current_q, next_q):
        """Return the training targets for ``minibatch``.

        Args:
            minibatch: Sequence of ``(state, action, reward, next_state,
                done)`` transitions.
            current_q: Q-values predicted for each transition's state, one
                row per transition.
            next_q: Q-values predicted for each transition's next state, one
                row per transition.

        Returns:
            A copy of ``current_q`` in which the entry of the action taken is
            replaced by ``reward`` for terminal transitions, or by
            ``reward + gamma * max(next_q)`` otherwise.
        """
        targets = np.array(current_q, copy=True)
        for row, (_, action, reward, _, done) in enumerate(minibatch):
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(next_q[row])
            targets[row][action] = target
        return targets

    def replay(self, batch_s):
        """Fit the network on a random minibatch drawn from the memory.

        Args:
            batch_s: Maximum number of transitions to sample.

        Returns:
            The training loss of the fit (also stored in
            ``self.vanillaLoss``), or ``None`` when the memory is empty.
        """
        if not self.memory:
            return None
        minibatch = random.sample(self.memory, min(len(self.memory), batch_s))
        # Stored states have shape (1, state_size); stack them into batches so
        # the network is queried twice per replay instead of twice per sample.
        states = np.vstack([transition[0] for transition in minibatch])
        next_states = np.vstack([transition[3] for transition in minibatch])
        current_q = self.model.predict(states, verbose=0)
        next_q = self.model.predict(next_states, verbose=0)
        targets = self._bellman_targets(minibatch, current_q, next_q)
        history = self.model.fit(states, targets, epochs=1, verbose=0)
        self.vanillaLoss = history.history["loss"][0]
        return self.vanillaLoss

    def _plot_progress(self, score, score_average, epsilon_logger):
        """Redraw the score, running-mean score and epsilon curves."""
        clear_output(True)
        self.model.summary()
        panels = (
            (311, score, "Cumulative reward ", [0, 200]),
            (312, score_average, "Cumulative Mean reward ", [0, 200]),
            (313, epsilon_logger, "Epsilon", [0, 1]),
        )
        for position, series, label, y_limits in panels:
            plt.subplot(position)
            plt.gca().set_ylim(y_limits)
            plt.plot(series, label=label)
            plt.legend()
            plt.grid()
            plt.show()

    def run(self):
        """Interact with the environment for 100000 steps, training per episode.

        At the end of every episode the score is recorded, one replay fit is
        performed and epsilon is decayed linearly down to ``min_epsilon``.
        Progress plots are refreshed every 1000 steps.
        """
        batch_size = 32
        state = np.reshape(self.env.reset(), (1, self.state_size))
        self.cumulative_reward = 0
        score = []
        epsilon_logger = []
        score_average = []
        cpt_game = 0

        for timer in range(100000):
            action = self.act(state)
            next_state, reward, done, _ = self.env.step(action)
            next_state = np.reshape(next_state, (1, self.state_size))
            self.remember(state, action, reward, next_state, done)

            self.cumulative_reward += reward
            state = next_state
            if done:
                cpt_game += 1
                # Record the score first so the running mean includes this
                # episode and is never taken over an empty list.
                score.append(self.cumulative_reward)
                score_average.append(np.mean(score[-10:]))
                print("episode: {}, score: {}, e: {:.2} "
                      .format(cpt_game, self.cumulative_reward, self.exploration_rate))
                self.cumulative_reward = 0
                state = np.reshape(self.env.reset(), (1, self.state_size))
                self.replay(batch_size)
                self.exploration_rate = max(
                    self.exploration_rate - (self.original_epsilon / self.exploration_rate_decay),
                    self.min_epsilon)
                epsilon_logger.append(self.exploration_rate)
            if timer % 1000 == 0:
                self._plot_progress(score, score_average, epsilon_logger)

In [None]:

# Create the CartPole environment and read the problem dimensions from its
# action and observation spaces.
env = gym.make('CartPole-v0')
env.reset()
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Build the agent and start training.
agent_vanilla = DQNAgent(env, state_size, action_size)
agent_vanilla.run()

   

In [None]:
# Replay the trained network greedily (no exploration) and render each frame.
model = agent_vanilla.model

state = env.reset()
# The network expects a batch dimension: (1, state_size).
state = np.reshape(state, (1, state_size))
for i in range(2000):

    clear_output(True)
    # Greedy action: index of the largest predicted Q-value.
    action = np.argmax(model.predict(state)[0])
    next_st, reward, done, _= env.step(action)
    state = next_st
    # Start a new episode as soon as the current one ends.
    if done :
        state = env.reset()
    state = np.reshape(state, (1, state_size))

    # Show the current frame as an image.
    plt.imshow(env.render(mode="rgb_array"))
    plt.show()
