# HW8.3: Deep Reinforcement Learning - Solve Chung's House 2 problem
---
###### Name: Devson Butani
###### ID: 000732711
###### LTU Honor Code: "I pledge that on all academic work that I submit, I will neither give nor receive unauthorized aid, nor will I present another person's work as my own."

# Install Dependencies

In [None]:
# %pip install comet_ml --quiet
# %pip install pillow
# %pip install matplotlib


# Import Dependencies


In [None]:
import numpy as np
from tensorflow import keras
import tensorflow as tf
import random
import pylab as plt
from keras import layers
from keras import models
from keras import optimizers

# Report how many GPU devices TensorFlow can see (keras runs on whatever TensorFlow exposes)
gpus = tf.config.list_physical_devices("GPU")
print("Num GPUs Available: ", len(gpus))


# Build and Run DNN Model

In [None]:
def build_model():
    """Build and compile the dense network used for the deep Q-learning experiment.

    The network is a three-layer stack: Dense(13, relu) -> Dense(56, linear)
    -> Dense(13, softmax), compiled with MSE loss, the Adam optimizer
    (learning rate 1e-4) and MAE as a reported metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = models.Sequential()
    # `input_shape` is an argument of the first layer, not of `model.add()`;
    # passing it to `add()` raises a TypeError before the model is ever built.
    model.add(layers.Dense(13, activation="relu", input_shape=(1, 13)))  # input shape of possible actions
    model.add(layers.Dense(56, activation="linear"))
    # NOTE(review): softmax makes the 13 outputs sum to 1; confirm this is
    # intended if the outputs are meant to regress unbounded Q-values with MSE.
    model.add(layers.Dense(13, activation="softmax"))  # output shape of possible states

    model.compile(
        loss="mse",
        optimizer=optimizers.Adam(learning_rate=1e-4),
        metrics=["mae"],
    )
    return model

# Input Data: Reward Table

In [None]:
# STARTING TABLES
# >> CJ's House 2, GOAL: 12
# R1[state, next_state] is the reward for that move; -1 marks a move that is
# not available, and the two 100 entries are the moves that land on state 12.
R1 = np.array(
    [
        # 0   1   2   3   4   5   6   7   8   9  10  11   12
        [-1,  0, -1, -1,  0, -1, -1, -1, -1, -1, -1, -1,  -1],  # 0
        [ 0, -1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1,  -1],  # 1
        [-1,  0, -1,  0, -1, -1,  0, -1, -1, -1, -1, -1,  -1],  # 2
        [-1, -1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1,  -1],  # 3
        [ 0, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1,  -1],  # 4
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  -1],  # 5 (Blocked)
        [-1, -1,  0, -1, -1, -1, -1, -1, -1, -1,  0, -1,  -1],  # 6
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  -1],  # 7 (Blocked)
        [-1, -1, -1, -1,  0, -1, -1, -1, -1,  0, -1, -1,  -1],  # 8
        [-1, -1, -1, -1, -1, -1, -1, -1,  0, -1,  0, -1,  -1],  # 9
        [-1, -1, -1, -1, -1, -1,  0, -1, -1,  0, -1,  0,  -1],  # 10
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1, 100],  # 11
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, 100],  # 12
    ]
)
# Quality table starts at zero and has the same 13 x 13 shape as the reward table
Q1 = np.zeros(R1.shape)


In [None]:
# >> OLD Q-Learning methodology
def QLearn(np_reward_table, np_quality_table, goal_state, training_range=900, gamma=0.8):
    """Learn a Q table from a reward table using tabular Q-learning on random walks.

    Every episode starts in a randomly chosen state that has at least one
    allowed move, then keeps taking uniformly random allowed moves until
    ``goal_state`` is entered (at least one move is always taken, so an
    episode that starts on the goal still learns from it). After each move
    the table is updated with
    ``Q[s, s'] = R[s, s'] + gamma * max(Q[s', moves allowed from s'])``.

    The final table is printed as a rounded percentage of its largest entry
    and the per-episode learning curve is plotted.

    Args:
        np_reward_table: 2-D array indexed ``[state, next_state]``. An entry
            of ``-1`` marks a move that is not allowed; any other value is
            the reward for that move.
        np_quality_table: Ignored. A fresh zero table is always created; the
            parameter is kept so existing calls keep working.
        goal_state: Index of the state that ends an episode once entered.
        training_range: Number of episodes to run.
        gamma: Discount factor applied to the best value of the next state.

    Returns:
        The Q table as a float array, scaled to percent of its maximum and
        rounded. If nothing was learned (maximum is 0) the table is returned
        as all zeros.

    Raises:
        ValueError: If episodes are requested but no state has an allowed move.

    Note:
        An episode only stops at ``goal_state`` or at a state with no allowed
        moves, so ``goal_state`` should be reachable from the start states.
    """
    rewards = np.asarray(np_reward_table)
    # Force a float table: zeros_like() on an integer reward table would give
    # an integer Q table and silently truncate every discounted update.
    quality = np.zeros_like(rewards, dtype=float)
    scores = []  # >> ONE ENTRY PER EPISODE, USED TO PLOT THE LEARNING CURVE

    # >> ALLOWED MOVES FOR EVERY STATE, COMPUTED ONCE INSTEAD OF ON EVERY STEP
    moves = [np.flatnonzero(row != -1).tolist() for row in rewards]
    # Blocked states (rows that are all -1) cannot start an episode: there is
    # no move to pick from them.
    start_states = [state for state, options in enumerate(moves) if options]
    if training_range > 0 and not start_states:
        raise ValueError("reward table has no state with an allowed move")

    for _ in range(training_range):  # >> HOW MANY RANDOM PATHS TO TRAIN ON
        curr_state = random.choice(start_states)

        # >> WALK UNTIL THE GOAL STATE IS ENTERED, ALWAYS TAKING AT LEAST ONE STEP
        while True:
            next_state = random.choice(moves[curr_state])
            next_moves = moves[next_state]
            # A dead-end next state has no future value to discount.
            best_future = quality[next_state, next_moves].max() if next_moves else 0.0
            quality[curr_state, next_state] = rewards[curr_state, next_state] + gamma * best_future

            curr_state = next_state
            if curr_state == goal_state or not moves[curr_state]:
                break

        # >> SCORE = SUM OF THE TABLE EXPRESSED AS PERCENT OF ITS CURRENT MAX
        peak = quality.max()
        scores.append(np.sum(quality / peak * 100) if peak > 0 else 0)

    # >> Q TABLE AS PERCENTAGE OF MAX(Q) - ROUNDED AFTER PERCENTAGE
    # Skip the scaling when the maximum is 0 to avoid a 0/0 table of NaNs.
    peak = quality.max()
    if peak != 0:
        quality = np.round(quality * 100 / peak)  # >> ROUNDING PRECISION CAN BE CHANGED DEPENDING ON APPLICATION
    print("Quality Table = ")
    print(quality)

    # >> PLOT THE LEARNING CURVE USING SCORES GATHERED FOR EVERY TRAINING EPISODE
    plt.plot(scores)
    plt.show()

    return quality


# >> New Q-Learning model
def DeepQLearn(np_reward_table, np_quality_table, goal_state, training_range=900, gamma=0.8):
    """Work-in-progress deep Q-learning variant; no training is implemented yet.

    At the moment the function only draws a random start state per episode,
    never updates the quality table, prints the (all-zero) table, plots the
    (empty) score list and returns the table.

    Args:
        np_reward_table: 2-D reward table; only its shape is used so far.
        np_quality_table: Ignored. A fresh zero table is always created.
        goal_state: Currently unused.
        training_range: Number of episodes to iterate over.
        gamma: Currently unused.

    Returns:
        The quality table as a float array, scaled to percent of its maximum
        and rounded; all zeros while no training step exists.
    """
    # Float table so future discounted updates are not truncated to integers.
    np_quality_table = np.zeros_like(np_reward_table, dtype=float)  # >> ZERO ARRAY WITH SAME SHAPE AS REWARD TABLE
    scores = []  # >> INIT EMPTY LIST TO KEEP TRACK OF LEARNING

    for episode in range(training_range):  # >> HOW MANY TIMES TO TRAIN RANDOM PATHS
        currSt = random.randint(0, (np.shape(np_reward_table)[0] - 1))  # >> 0 to MAX STATE COUNT
        # TODO: training step for this episode is not implemented yet

    # Open design questions for the training step:
    # - Train per step inside a for loop, with the current state as input and reward[state] as labels?
    # - Batch size is the path length.
    # - Epoch count is training_range.
    # - Loss function = custom equation.
    # - Output is the model, not a path or table. Use the model to predict a Q table to show?
    #   Or just show the path taken on predict? Callbacks can help with this.

    # >> Q TABLE AS PERCENTAGE OF MAX(Q) - ROUNDED AFTER PERCENTAGE
    # The table is never updated yet, so its maximum is 0; skip the scaling in
    # that case instead of producing a 0/0 table of NaNs.
    peak = np.max(np_quality_table)
    if peak != 0:
        np_quality_table = np.round(
            np_quality_table * 100 / peak
        )  # >> ROUNDING PRECISION CAN BE CHANGED DEPENDING ON APPLICATION
    print("Quality Table = ")
    print(f"{np_quality_table}")

    # >> PLOT THE LEARNING CURVE USING SCORES GATHERED FOR EVERY TRAINING EPISODE
    plt.plot(scores)
    plt.show()

    return np_quality_table
