Imports

In [1]:
import numpy as np
import random
import matplotlib 
import matplotlib.style
import matplotlib.pyplot as plt
import pandas as pd
import re
import math
from matplotlib.collections import LineCollection
import time
import itertools


from tkinter import *
from tkinter import messagebox
import tkinter as tk

Auxiliary Classes: SimpleTableInput and Framegeneration

In [2]:
#Support Class to Frame Generation
class SimpleTableInput(tk.Frame):
    """Grid of ``tk.Entry`` widgets used to type the game board.

    Every cell is validated on each keystroke by ``_validate`` and the whole
    table is validated again by ``get`` when the caller asks for the values.
    """

    def __init__(self, parent, rows, columns):
        """Build a ``rows`` x ``columns`` grid of entries inside this frame.

        Two explanatory labels are packed into ``parent`` (not into this
        frame), exactly as the caller's layout expects.
        """
        tk.Frame.__init__(self, parent)

        self._entry = {}
        self.rows = rows
        self.columns = columns

        # Real Time Validation Command ("%P" = value the entry would have
        # if the edit were allowed)
        vcmd = (self.register(self._validate), "%P")

        # Create a table of TkInter Entries with the use of Grid.
        for row in range(self.rows):
            for column in range(self.columns):
                e = tk.Entry(self, validate="key", validatecommand=vcmd)
                e.grid(row=row, column=column, sticky="nsew")
                self._entry[(row, column)] = e
        # Column alignment
        for column in range(self.columns):
            self.grid_columnconfigure(column, weight=1)
        # Filling extra space: the row just below the last entry row absorbs it
        self.grid_rowconfigure(rows, weight=1)

        # Labels
        L1 = tk.Label(parent, text="Introduce a valid board with no repeated indexes")
        L1.pack()
        L2 = tk.Label(parent, text="Only values from 0 to " + str((self.rows*self.columns)-1) + " are allowed for this matrix")
        L2.pack()

    def get(self):
        """Return the table as a list of rows of ints, or ``False`` if invalid.

        The table is invalid when any cell is empty (or whitespace only),
        is not an integer, lies outside ``0 .. rows*columns-1``, or when a
        value is repeated.
        """
        highest = (self.rows * self.columns) - 1
        result = []
        for row in range(self.rows):
            current_row = []
            for column in range(self.columns):
                text = self._entry[(row, column)].get().strip()
                # Whitespace-only cells pass the keystroke validation, so
                # they must be treated as empty here instead of reaching int().
                if text == "":
                    return False
                try:
                    value = int(text)
                except ValueError:
                    return False
                if value < 0 or value > highest:
                    return False
                current_row.append(value)
            result.append(current_row)

        # Repeated Numbers on the Matrix Check
        flat = [value for current_row in result for value in current_row]
        if len(set(flat)) != self.rows * self.columns:
            return False
        # Return Result
        return result

    def _validate(self, P):
        """Keystroke validator: ``P`` is the prospective content of a cell.

        Returns ``True`` to accept the edit. Blank content is accepted so the
        user can clear a cell; anything else must be an integer in
        ``0 .. rows*columns-1``. Rejected edits ring the bell.
        """
        if P.strip() == "":
            return True

        try:
            f = int(P)
        except ValueError:
            # Only allows for Integers
            self.bell()
            return False

        # Doesn't allow indexes outside 0 .. rows*columns-1
        if f < 0 or f > (self.rows * self.columns) - 1:
            self.bell()
            return False

        return True
#Main Class in charge of Matrix Frame Generation for TkInter
class Framegeneration(tk.Frame):
    """Frame holding the board input table and its Submit button."""

    #Creates a frame for TkInter
    def __init__(self, parent,states, actions):
        """Create the table (``states`` rows by ``actions`` columns) and button.

        ``parent`` is also handed to ``on_submit`` when Submit is pressed.
        """
        tk.Frame.__init__(self, parent)
        self.table = SimpleTableInput(self, states, actions)
        self.submit = tk.Button(self, text="Submit", command=lambda: self.on_submit(parent))
        self.table.pack(side="top", fill="both", expand=True)
        self.submit.pack(side="bottom")

    #Controls the submit button on the tkinter frame
    def on_submit(self, parent):
        """Validate the table and hand the board over to ``Qlearn.pargui``.

        Returns ``False`` after showing the error window when the table is
        invalid; otherwise returns ``None``.
        """
        # Read the table once: get() walks and validates every cell.
        board = self.table.get()
        # get() signals failure with the literal False (a valid board is a list).
        if board is False:
            Qlearn.errorWindow(self)
            return False
        Qlearn.pargui(board, parent)


Class Declaration Q-Learn

BackEnd Auxiliary Methods

In [3]:
#Main Class
class Qlearn():
    """Back-end helpers of the Q-learning application.

    All functions are called through the class (``Qlearn.name(...)``) and
    take no instance. Several of them read ``Qlearn.states`` and
    ``Qlearn.actions``, which are not assigned in this cell and must be set
    by the caller before use.
    """

    #BackEnd Auxiliary methods

    #Turns a String Array Into a Integer Array
    @staticmethod
    def stringToArrayInt(checkpoint,win2):
        """Parse a comma separated list of state indexes.

        Returns the list of ints, or ``False`` (after calling
        ``Qlearn.errorWindow(win2)``) when a token is not an integer or is
        outside ``0 .. Qlearn.states*Qlearn.actions-1``.
        """
        highest = (Qlearn.states * Qlearn.actions) - 1
        res = []
        for token in checkpoint.strip().split(","):
            try:
                value = int(token)
            except ValueError:
                Qlearn.errorWindow(win2)
                return False
            if value < 0 or value > highest:
                Qlearn.errorWindow(win2)
                return False
            res.append(value)
        return res

    #Eliminates specified restrictions from the neighbors list
    @staticmethod
    def applyRestrictions(neighbors, restrictions,win2):
        """Remove forbidden moves from the neighbour graph (in place).

        ``restrictions`` is text such as ``"0-1;4-5"``; for each pair the two
        states are removed from each other's neighbour list. Pairs that are
        not neighbours (or were already removed) are ignored.

        Returns ``neighbors``, or ``False`` (after calling
        ``Qlearn.errorWindow(win2)``) when a pair is malformed or refers to a
        state above ``Qlearn.states*Qlearn.actions-1``.
        """
        highest = (Qlearn.states * Qlearn.actions) - 1
        for segment in restrictions.strip().split(";"):
            try:
                first, second = (int(part) for part in segment.strip().split("-"))
            except ValueError:
                # Not exactly two integers separated by "-"
                Qlearn.errorWindow(win2)
                return False
            if first > highest or second > highest:
                Qlearn.errorWindow(win2)
                return False
            for entry in neighbors:
                if entry[0] == first:
                    blocked = second
                elif entry[0] == second:
                    blocked = first
                else:
                    continue
                # Membership test instead of .index(): the pair may not be
                # adjacent, or may have been listed twice.
                if blocked in entry[1]:
                    entry[1].remove(blocked)
        return neighbors

    #This method generates a tuple graph with the possible movements of every state on the matrix
    #Note that the state itself is not reachable from itself. Only the goal square has that property, and it's not controlled in this method.
    @staticmethod
    def find_neighbours(arr):
        """Return ``[[value, [neighbour values...]], ...]`` for a 2-D board.

        Neighbours are the up to eight surrounding squares (including
        diagonals). Entries are produced row by row, left to right.
        """
        final_neighbors = []
        last_row = len(arr) - 1

        for i in range(len(arr)):
            last_col = len(arr[i]) - 1
            for j, value in enumerate(arr[i]):
                if i == 0 or i == last_row or j == 0 or j == last_col:
                    # When arr[i][j] object is not a center square
                    # We must check which neighbours are present before evaluating them
                    neighbors = []
                    if i != 0:
                        neighbors.append(arr[i - 1][j])  #Top
                    if j != last_col:
                        neighbors.append(arr[i][j + 1])  #Right
                    if i != last_row:
                        neighbors.append(arr[i + 1][j])  #Bottom
                    if j != 0:
                        neighbors.append(arr[i][j - 1])  #Left
                    if i != 0 and j != last_col:
                        neighbors.append(arr[i-1][j+1])  #Top Right
                    if i != 0 and j != 0:
                        neighbors.append(arr[i-1][j-1])  #Top Left
                    if i != last_row and j != last_col:
                        neighbors.append(arr[i+1][j+1])  #Bottom Right
                    if i != last_row and j != 0:
                        neighbors.append(arr[i+1][j-1])  #Bottom Left
                else:
                    #Center Squares
                    neighbors = [
                        arr[i - 1][j],  # Top
                        arr[i][j + 1],  # Right
                        arr[i + 1][j],  # Bottom
                        arr[i][j - 1],  # Left
                        arr[i-1][j-1],  # Top Left
                        arr[i-1][j+1],  # Top Right
                        arr[i+1][j-1],  # Bottom Left
                        arr[i+1][j+1]   # Bottom Right
                    ]
                final_neighbors.append([value, neighbors])
        return final_neighbors

    @staticmethod
    def rendimiento(q, states, actions):
        """Performance measure: ``sum(q) / max(q) * 100``.

        Only the first ``states`` x ``actions`` entries are considered and
        the maximum is floored at 0. Returns 0 when that maximum is 0, so an
        all-zero matrix does not divide by zero.
        """
        suma = 0
        biggest = 0
        #This loop gets the biggest number on the Q matrix
        for i in range(states):
            for j in range(actions):
                suma += q[i][j]
                if biggest < q[i][j]:
                    biggest = q[i][j]
        if biggest == 0:
            return 0
        #Calculation of the performance measure after fetching the biggest number on the Q matrix
        return (suma/biggest)*100

    @staticmethod
    def maximum(foresight, next_state, q):
        """Largest ``q[next_state][a]`` over the actions ``a`` in ``foresight``.

        Returns -1 when ``foresight`` is empty.
        """
        # A true maximum: with negative rewards every candidate may be below -1,
        # so -1 must not be used as a floor.
        return max((q[next_state][action] for action in foresight), default=-1)

    @staticmethod
    def getAction(current, states, actions,r):
        """List the states reachable from ``current`` according to ``r``.

        A target ``i`` in ``range(states)`` is reachable when
        ``r[(current, i)]`` is one of 0, -50, 10 or 100; any other reward
        marks the move as not allowed. ``actions`` is not used.
        """
        allowed_rewards = (0, -50, 10, 100)
        return [i for i in range(states) if r[(current, i)] in allowed_rewards]

    @staticmethod
    def normalize(q, states, actions):
        """Return a copy of ``q`` scaled so its maximum is 100, rounded up.

        The scaling multiplies by ``100 / max(q)``; the first ``states`` x
        ``actions`` entries are then rounded up to whole numbers. When the
        maximum is 0 the scaling is skipped to avoid dividing by zero.
        The input array is not modified.
        """
        peak = np.max(q)
        if peak == 0:
            scaled = np.array(q, dtype=float)
        else:
            scaled = q*(100/peak)
        scaled[:states, :actions] = np.ceil(scaled[:states, :actions])
        return scaled


FrontEnd Auxiliary Methods

In [4]:
class Qlearn(Qlearn):
#FrontEnd Auxiliary Methods
    """Front-end helpers: parameter validation, rendering and error window.

    The ``passToFaseN`` functions validate the user supplied parameters and,
    when they are acceptable, forward them unchanged to ``Qlearn.faseN``.
    On invalid input they show the error window and return ``None``.
    """

    # Whole-string formats accepted for the free-text fields (empty allowed).
    # Restrictions look like "0-1;4-5"; index lists look like "3,7,8".
    _RESTRICTIONS_RE = re.compile(r"(?:\d+-\d+;)*\d+-\d+|")
    _INDEX_LIST_RE = re.compile(r"(?:\d+,)*\d+|")

    @staticmethod
    def _indexesInRange(*indexes):
        """True when every index lies in ``0 .. Qlearn.states*Qlearn.actions-1``."""
        highest = (Qlearn.states * Qlearn.actions) - 1
        return all(0 <= index <= highest for index in indexes)

    @staticmethod
    def _specialFieldsValid(Ech, Ere, Ebo):
        """True when the checkpoint, restriction and boost texts are well formed."""
        return bool(
            Qlearn._RESTRICTIONS_RE.fullmatch(Ere)
            and Qlearn._INDEX_LIST_RE.fullmatch(Ech)
            and Qlearn._INDEX_LIST_RE.fullmatch(Ebo)
        )

        #Checks if the fields are not empty
    @staticmethod
    def passToFase1(Eep, Ego,Ega,Eis,win2):
        """Validate the Phase 1 parameters and start ``Qlearn.fase1``.

        ``Eep``, ``Ego`` and ``Ega`` are read through ``.get()``; ``Eis`` is
        converted directly with ``int``.
        """
        try:
            int(Eep.get())
            goal = int(Ego.get())
            float(Ega.get())
            initial = int(Eis)
        except ValueError:
            return Qlearn.errorWindow(win2)
        if not Qlearn._indexesInRange(goal, initial):
            return Qlearn.errorWindow(win2)

        return Qlearn.fase1(Eep, Ego,Ega,Eis,win2)

        #Check if the fields have the correct kind of value and are not empty
    @staticmethod
    def passToFase2(Eep, Ego,Ega,Eeps, Eaph,Eis,win2):
        """Validate the Phase 2 parameters (plain values) and start ``Qlearn.fase2``."""
        try:
            int(Eep)
            goal = int(Ego)
            float(Ega)
            initial = int(Eis)
            float(Eaph)
            float(Eeps)
        except ValueError:
            return Qlearn.errorWindow(win2)
        if not Qlearn._indexesInRange(goal, initial):
            return Qlearn.errorWindow(win2)

        return Qlearn.fase2(Eep, Ego,Ega,Eeps,Eaph,Eis,win2)

        #Check that the fields are not empty or the strings introduced matches the required format to process
    @staticmethod
    def passToFase3(Eep, Ego,Ega,Eis,Ech,Ere,Ebo,win2):
        """Validate the Phase 3 parameters and start ``Qlearn.fase3``.

        Besides the numeric fields, ``Ech`` and ``Ebo`` must be comma
        separated index lists and ``Ere`` a ``;`` separated list of ``a-b``
        pairs (each of them may be empty).
        """
        try:
            int(Eep)
            goal = int(Ego)
            float(Ega)
            initial = int(Eis)
        except ValueError:
            return Qlearn.errorWindow(win2)
        if not Qlearn._indexesInRange(goal, initial):
            return Qlearn.errorWindow(win2)
        if not Qlearn._specialFieldsValid(Ech, Ere, Ebo):
            return Qlearn.errorWindow(win2)

        return Qlearn.fase3(Eep, Ego,Ega,Eis,Ech,Ere,Ebo,win2)

    #Check that the fields are not empty or the strings introduced matches the required format to process
    @staticmethod
    def passToFase32(Eep, Ego,Ega,Eis,Ech,Ere,Ebo,Eaph, Eeps,win2):
        """Validate the Phase 3 (epsilon variant) parameters and start ``Qlearn.fase32``."""
        try:
            int(Eep)
            goal = int(Ego)
            float(Ega)
            initial = int(Eis)
            float(Eaph)
            float(Eeps)
        except ValueError:
            return Qlearn.errorWindow(win2)
        if not Qlearn._indexesInRange(goal, initial):
            return Qlearn.errorWindow(win2)
        if not Qlearn._specialFieldsValid(Ech, Ere, Ebo):
            return Qlearn.errorWindow(win2)

        return Qlearn.fase32(Eep, Ego,Ega,Eis,Ech,Ere,Ebo,Eaph,Eeps,win2)

        #This method validates if the entry is int
    @staticmethod
    def validateint(P):
        """Return True when ``P`` is blank or parses as an integer."""
        if P.strip() == "":
            return True
        try:
            int(P)
        except ValueError:
            return False
        return True

    #This method validates if the entry is float
    @staticmethod
    def validatefloat(P):
        """Return True when ``P`` is blank or parses as a float in ``[0, 1]``."""
        if P.strip() == "":
            return True
        try:
            f = float(P)
        except ValueError:
            return False
        return 0.0 <= f <= 1.0

    @staticmethod
    def render(q, states, actions):
        """Return a tab separated text table of ``q`` with values cast to int.

        The header row numbers the ``actions`` columns; each following line
        starts with the state index.
        """
        lines = ["+\t\t" + "".join(str(column) + "\t" for column in range(actions))]
        lines.append("-----------------------------------------------------------------------------------------------------------")
        for i in range(states):
            cells = "".join(str(int(q[i][j])) + "\t" for j in range(actions))
            lines.append(str(i) + "\t|\t" + cells)
        return "\n".join(lines)

    # looks for the highest q values in the q matrix
    @staticmethod
    def shortestpath(initialState, goal_state, q):
        """Follow the highest Q-value from ``initialState`` towards ``goal_state``.

        Returns the list of visited states, always taking at least one step.
        The walk is deterministic, so if it comes back to a state it already
        left it can never reach the goal; in that case the path found so far
        (ending with the repeated state) is returned instead of looping forever.
        """
        path = [initialState]
        visited = {initialState}
        next_state = int(np.argmax(q[initialState]))
        path.append(next_state)
        while next_state != goal_state and next_state not in visited:
            visited.add(next_state)
            next_state = int(np.argmax(q[next_state]))
            path.append(next_state)
        return path

    #Creates an error Windows in case of entry failure
    @staticmethod
    def errorWindow(win):
        """Show the error dialog, print a console hint and return ``None``.

        ``win`` is accepted for the callers' convenience and is not used.
        """
        messagebox.showerror("Incorrect Values Given", "You gave incorrect kind of values, try again next time")
        print("Try again next time")
        return None
                    


Reward Main Methods

In [5]:
class Qlearn(Qlearn):
#Reward Main Methods
    """Builders of the reward matrix used by the training loops."""

    @staticmethod
    def _baseRewards(states, actions, neighbors, blocked):
        """Matrix filled with ``blocked`` and 0 for every neighbouring pair.

        ``neighbors`` is a list of ``[state, [neighbour states...]]``; both
        directions of each pair are opened.
        """
        r = np.matrix(np.full((states, actions), float(blocked)))
        for state, reachable in neighbors:
            # Explicit (row, column) indexing: indexing with the raw
            # [state, [neighbours]] list is not accepted by current NumPy.
            for other in reachable:
                r[state, other] = 0
                r[other, state] = 0
        return r

        #This method will generate a reward matrix for any symetrical matrix
    @staticmethod
    def reward(goal_state, states, actions, neighbors):
        """Reward matrix: -1 not allowed, 0 allowed move, 100 move into the goal.

        The goal-to-goal entry is always 100. The matrix is printed and
        returned as an ``np.matrix``.
        """
        r = Qlearn._baseRewards(states, actions, neighbors, -1)

        #Whenever the goal state is reachable, the reward for taking that action will be 100
        for i in range(states):
            if r[i, goal_state] != -1:
                r[i, goal_state] = 100
        #When goal goes to itself, will always have 100 as reward
        r[goal_state, goal_state] = 100
        print(r)
        return r

    #Generate Reward Matrix with Checkpoint
    @staticmethod
    def reward2(goal_state, states, actions, neighbors, boost, checkpoint):
        """Reward matrix with pitfalls and boosts.

        Entries are -100 for moves that are not allowed and 0 for allowed
        ones. An allowed move into a column gets 10 when the column is in
        ``boost``, otherwise -50 when it is in ``checkpoint``, otherwise 100
        when it is the goal. ``boost`` and ``checkpoint`` are lists of state
        indexes or the empty string for "none". The goal-to-goal entry is
        always 100. The matrix is printed and returned as an ``np.matrix``.
        """
        r = Qlearn._baseRewards(states, actions, neighbors, -100)

        traps = set(checkpoint) if checkpoint != "" else set()
        boosts = set(boost) if boost != "" else set()

        for j in range(actions):
            # Boost takes precedence over pitfall, pitfall over goal.
            if j in boosts:
                value = 10
            elif j in traps:
                value = -50
            elif j == goal_state:
                value = 100
            else:
                continue
            for i in range(states):
                if r[i, j] != -100:
                    r[i, j] = value
        #When goal goes to itself, will always have 100 as reward
        r[goal_state, goal_state] = 100
        print(r)
        return r


Main Algorithms

In [6]:
class Qlearn(Qlearn):
#Main Algorythm for Phase 1 and 2
    """Q-learning update steps and the two training loops."""

    @staticmethod
    def evaluate(q, currentState, states, actions, gamma, r):
        """Take one random step from ``currentState`` and update ``q`` in place.

        Returns ``(q, next_state)``.
        """
        #Get all possible actions for the current state
        posactions = Qlearn.getAction(currentState, states, actions, r)
        #Random choice of the next state among the possible ones
        next_state = random.choice(posactions)
        #Get all possible actions for the chosen next state
        foresight = Qlearn.getAction(next_state, states,actions, r)
        #Q-function that evaluates the paths
        q[currentState][next_state] = r[(currentState,next_state)] + (gamma * Qlearn.maximum(foresight, next_state,q))
        #The chosen state becomes the current one
        return q, next_state

    @staticmethod
    def evaluateGreedy(q, currentState, states, actions, gamma, r, epsilon, alpha):
        """Take one random step and update ``q`` in place with an epsilon rule.

        With probability ``epsilon`` the update uses the Q-value of a random
        follow-up action, otherwise the best known one. ``epsilon`` is then
        multiplied by ``alpha``. Returns ``(q, next_state, epsilon)``.
        """
        #epsindex is a random number used for epsilon comparison
        epsindex = random.uniform(0., 1.)
        #Get all possible actions for the current state
        posactions = Qlearn.getAction(currentState, states, actions, r)
        #The next state is always random among the possible options
        next_state = random.choice(posactions)
        #Get all possible actions for the chosen next state
        foresight = Qlearn.getAction(next_state, states, actions, r)
        if epsindex < epsilon:
            #Exploration: use the value of a random follow-up action
            learn = q[next_state][random.choice(foresight)]
        else:
            #Exploitation: use the best option known by the agent
            learn = Qlearn.maximum(foresight,next_state,q)
        #Q-function that evaluates the paths
        q[currentState][next_state] = r[(currentState, next_state)] + gamma * learn
        #At the end of each iteration, epsilon multiplies with alpha
        return q, next_state, epsilon * alpha

    @staticmethod
    def _printProgress(episode, q, states, actions, rendimiento):
        """Print the episode number, the Q matrix and its performance."""
        print("Episodio: " + str(episode))
        print(Qlearn.render(q, states,actions))
        print("Rendimiento: " + str(rendimiento))

    @staticmethod
    def _plotPerformance(points, episodes, timetook):
        """Plot performance per episode and print the elapsed milliseconds."""
        plt.plot(points)
        plt.ylabel('Rendimiento')
        plt.xlabel('Episodios')
        plt.show()
        print("El algoritmo ha tardado en ejecutar "+str(episodes)+ " Episodios: " + str(timetook) + " milisegundos")

    #This is the main algorythm for Phase 1
    @staticmethod
    def train(q, states, actions, episodes,initialState, goal_state, gamma, r):
        """Train ``q`` for ``episodes`` episodes, each from a random start state.

        ``q`` is updated in place and also returned. ``initialState`` is
        accepted but not used by this loop. Progress is printed every fifth
        episode and a performance plot is shown at the end.
        """
        points = []
        # time.clock() no longer exists (removed in Python 3.8)
        start = time.perf_counter()
        for i in range(episodes):
            # The upper bound of randint is exclusive, so `states` makes
            # every state (including the last one) a possible start.
            currentState = np.random.randint(0, states)
            #While we haven't reached the goal state
            while currentState != goal_state:
                q, currentState = Qlearn.evaluate(q, currentState, states, actions, gamma, r)
            #Run once on the goal for the sake of convergence
            q, currentState = Qlearn.evaluate(q, currentState, states, actions, gamma, r)
            rendimiento = Qlearn.rendimiento(q,states,actions)
            points.append(rendimiento)
            if i%5 == 0:
                Qlearn._printProgress(i, q, states, actions, rendimiento)
        timetook = (time.perf_counter()-start)*1000
        Qlearn._plotPerformance(points, episodes, timetook)
        return q

    #Main Algorythm for Phase 2
    @staticmethod
    def train2(q, states, actions, episodes,initialState, goal_state, gamma, r, alpha, epsilon):
        """Train ``q`` with the epsilon rule, each episode from ``initialState``.

        Epsilon restarts from ``epsilon`` at every episode. ``q`` is updated
        in place and also returned, like ``train``.
        """
        points = []
        start = time.perf_counter()
        for i in range(episodes):
            #Return the agent to the first square
            currentState = initialState
            #Return epsilon to the starting value
            algepsilon = epsilon
            while currentState != goal_state:
                q, currentState, algepsilon = Qlearn.evaluateGreedy(q, currentState, states, actions, gamma, r, algepsilon, alpha)
            #Run once on the goal for the sake of convergence
            q, currentState, algepsilon = Qlearn.evaluateGreedy(q, currentState, states, actions, gamma, r, algepsilon, alpha)
            rendimiento = Qlearn.rendimiento(q,states,actions)
            points.append(rendimiento)
            if i%5 == 0:
                Qlearn._printProgress(i, q, states, actions, rendimiento)
        timetook = (time.perf_counter()-start)*1000
        Qlearn._plotPerformance(points, episodes, timetook)
        return q


Phase Handles BackEnd

In [7]:
class Qlearn(Qlearn):
#Phase Handles Back End
    """Phase handlers: build the matrices, train, and show the result window.

    Each handler reads the board from ``Qlearn.m`` and destroys ``win2``.
    """

    @staticmethod
    def _expandDimensions():
        """Turn the board dimensions into the Q/reward matrix size.

        Overwrites ``Qlearn.states`` and ``Qlearn.actions`` with
        ``states * actions`` and returns that size.
        NOTE(review): the class attributes are replaced permanently, so this
        must run only once per board; the original code behaves the same way.
        """
        Qlearn.states = Qlearn.states * Qlearn.actions
        Qlearn.actions = Qlearn.states
        return Qlearn.states

    @staticmethod
    def _showResults(q, initialState, goal_state):
        """Open the result window with the rendered Q matrix and greedy path."""
        res = tk.Tk()
        res.title("Q-Learning")
        res.geometry("800x400")
        Lres = tk.Label(res, text = Qlearn.render(q, Qlearn.states, Qlearn.actions))
        Lres.pack()
        Lpath = tk.Label(res, text = "Camino más corto: "+  str(Qlearn.shortestpath(initialState, goal_state, q)))
        Lpath.pack()

    @staticmethod
    def _parseSpecialSquares(Ech, Eer, Ebo, win2):
        """Parse pitfalls, restrictions and boosts for the Phase 3 handlers.

        Returns ``(checkpoints, graph, boost)`` where ``graph`` is the
        neighbour graph of ``Qlearn.m`` with the restrictions applied, or
        ``None`` when any field is rejected by its parser (which already
        showed the error window).
        """
        #Set Checkpoints
        if Ech != "":
            checkpoints = Qlearn.stringToArrayInt(Ech,win2)
            print("Traps: " + str(checkpoints))
            if checkpoints is False:
                return None
        else:
            checkpoints = ""
        #Generate graph for creating the reward matrix
        neighbors = Qlearn.find_neighbours(Qlearn.m)
        # With no restriction text the unrestricted graph is used; leaving
        # this unset made the later reward2 call fail with an unbound name.
        restrictions = neighbors
        #Set Restrictions
        if Eer != "":
            restrictions = Qlearn.applyRestrictions(neighbors, str(Eer),win2)
            print("Neighbors: " + str(restrictions))
            if restrictions is False:
                return None
        #Boost Square
        if Ebo != "":
            boost = Qlearn.stringToArrayInt(Ebo,win2)
            print("Boosts: " + str(boost))
            if boost is False:
                return None
        else:
            boost = ""
        return checkpoints, restrictions, boost

    @staticmethod
    def fase1(Eep, Ego, Ega,Eis, win2):
        """Phase 1: random-start training on the plain reward matrix.

        ``Eep``, ``Ego`` and ``Ega`` are read through ``.get()`` before
        ``win2`` is destroyed; ``Eis`` is converted directly.
        """
        #Set Initial State
        initialState = int(Eis)
        #Set Goal State
        goal_state = int(Ego.get())
        #Generate graph for creating the reward matrix
        neighbors = Qlearn.find_neighbours(Qlearn.m)
        #Set the Reward and Q matrix dimensions (Rows x Columns)
        size = Qlearn._expandDimensions()
        #Create Q Matrix with all 0
        q = np.zeros((size, size))
        #Create the Reward matrix for the Entry Matrix
        r = Qlearn.reward(goal_state, size, size, neighbors)
        #Set Episodes
        episodes = int(Eep.get())
        #Set Gamma
        gamma = float(Ega.get())
        #Destroy Matrix input window
        win2.destroy()
        #Call to Main Algorythm
        q = Qlearn.train(q, size, size,episodes,initialState,goal_state, gamma, r)
        #Normalize Q output
        q = Qlearn.normalize(q, size, size)
        #Print on console the result
        print(Qlearn.render(q, size, size))
        print(pd.DataFrame(q))
        #Create Result Window
        Qlearn._showResults(q, initialState, goal_state)

    @staticmethod
    def fase2(Eep, Ego, Ega, Eeps, Eaph, Eis, win2):
        """Phase 2: epsilon training from the initial state (plain values in)."""
        #Destroy previous Windows
        win2.destroy()
        #Set Initial State
        initialState = int(Eis)
        #Set Goal State
        goal_state = int(Ego)
        #Find the Neighbors to generate a graph
        neighbors = Qlearn.find_neighbours(Qlearn.m)
        #Set the dimensions of reward and q
        size = Qlearn._expandDimensions()
        #Create the Q matrix with all 0
        q = np.zeros((size, size))
        #Create the Reward Matrix for the matrix introduced in the input
        r = Qlearn.reward(goal_state, size, size, neighbors)
        #Set Episodes, Gamma, Alpha and Epsilon
        episodes = int(Eep)
        gamma = float(Ega)
        alpha = float(Eaph)
        epsilon = float(Eeps)

        #Call to the main algorythm (q is updated in place)
        Qlearn.train2(q, size, size,episodes,initialState,goal_state, gamma, r, alpha, epsilon)
        #Normalize the algorythm output
        q = Qlearn.normalize(q, size, size)
        #Console print for result q
        print(Qlearn.render(q, size, size))
        #Create result window
        Qlearn._showResults(q, initialState, goal_state)

    @staticmethod
    def fase3(Eep, Ego, Ega,Eis,Ech,Eer,Ebo, win2):
        """Phase 3: random-start training with pitfalls, restrictions and boosts.

        Returns early (leaving ``win2`` open) when a special field is rejected.
        """
        #Set Initial State
        initialState = int(Eis)
        parsed = Qlearn._parseSpecialSquares(Ech, Eer, Ebo, win2)
        if parsed is None:
            return
        checkpoints, restrictions, boost = parsed
        #Set Goal State
        goal_state = int(Ego)
        #Set the Reward and Q matrix dimensions
        size = Qlearn._expandDimensions()
        #Create Q Matrix with all 0
        q = np.zeros((size, size))
        #Create the Reward matrix for the Entry Matrix
        r = Qlearn.reward2(goal_state, size, size, restrictions,boost, checkpoints)
        #Set Episodes
        episodes = int(Eep)
        #Set Gamma
        gamma = float(Ega)
        #Destroy Matrix input window
        win2.destroy()
        #Call to Main Algorythm (q is updated in place)
        Qlearn.train(q, size, size,episodes,initialState,goal_state, gamma, r)
        #Normalize Q output
        q = Qlearn.normalize(q, size, size)
        #Print on console the result
        print(Qlearn.render(q, size, size))
        #Create Result Window
        Qlearn._showResults(q, initialState, goal_state)

    @staticmethod
    def fase32(Eep, Ego, Ega,Eis,Ech,Eer,Ebo,Eaph,Eeps, win2):
        """Phase 3 with epsilon training from the initial state.

        Returns early (leaving ``win2`` open) when a special field is rejected.
        """
        #Set Initial State
        initialState = int(Eis)
        parsed = Qlearn._parseSpecialSquares(Ech, Eer, Ebo, win2)
        if parsed is None:
            return
        checkpoints, restrictions, boost = parsed
        #Set Goal State
        goal_state = int(Ego)
        #Set the Reward and Q matrix dimensions
        size = Qlearn._expandDimensions()
        #Create Q Matrix with all 0
        q = np.zeros((size, size))
        #Create the Reward matrix for the Entry Matrix
        r = Qlearn.reward2(goal_state, size, size, restrictions,boost, checkpoints)
        #Set Episodes, Gamma, Alpha and Epsilon
        episodes = int(Eep)
        gamma = float(Ega)
        alpha = float(Eaph)
        epsilon = float(Eeps)
        #Destroy Matrix input window
        win2.destroy()
        #Call to Main Algorythm (q is updated in place)
        Qlearn.train2(q, size, size,episodes,initialState,goal_state, gamma, r, alpha, epsilon)
        #Normalize Q output
        q = Qlearn.normalize(q, size, size)
        #Print on console the result
        print(Qlearn.render(q, size, size))
        print(Qlearn.shortestpath(initialState, goal_state, q))
        #Create Result Window
        Qlearn._showResults(q, initialState, goal_state)


Main App Front End

In [8]:
class Qlearn(Qlearn):
    #Main App FrontEnd
    # Every function in this cell is called on the class itself
    # (Qlearn.startgui(), Qlearn.magui(...), ...) and never on an instance,
    # so they are declared as static methods and take no `self`.

    @staticmethod
    def _field_row(window, caption, y, hint=None, vcmd=None):
        """Place one "caption | entry | hint" row on `window` and return the Entry.

        window  -- Tk window that owns the widgets.
        caption -- text of the label placed at x=20.
        y       -- vertical pixel position shared by the whole row.
        hint    -- optional explanation shown at x=300; omitted when None.
        vcmd    -- optional Tk validate command; when given, the entry is
                   validated on every key stroke (validate="key").

        Widgets are created in the order caption, entry, hint so the keyboard
        traversal order follows the visual top-to-bottom order of the rows.
        Only `place` is used: a widget can be handled by a single geometry
        manager, so a preceding `pack` call would have no lasting effect.
        """
        tk.Label(window, text=caption).place(x=20, y=y)
        entry_options = {"bd": 5}
        if vcmd is not None:
            entry_options.update(validate="key", validatecommand=vcmd)
        entry = tk.Entry(window, **entry_options)
        entry.place(x=150, y=y)
        if hint is not None:
            tk.Label(window, text=hint).place(x=300, y=y)
        return entry

    @staticmethod
    def _action_row(window, caption, command, y, note=None, note_x=300):
        """Place a button at x=100 and, optionally, a note label next to it.

        window  -- Tk window that owns the widgets.
        caption -- button text.
        command -- zero-argument callable run when the button is pressed.
        y       -- vertical pixel position shared by button and note.
        note    -- optional text shown beside the button; omitted when None.
        note_x  -- horizontal pixel position of the note (default 300).
        """
        tk.Button(window, text=caption, command=command).place(x=100, y=y)
        if note is not None:
            tk.Label(window, text=note).place(x=note_x, y=y)

    @staticmethod
    def pargui(entrymatrix,previouswindow):
        """Show the parameter window and dispatch to the chosen training mode.

        entrymatrix    -- game board; stored on the class as Qlearn.m.
        previouswindow -- matrix input window, destroyed before the new
                          window is created.

        The window offers four buttons, each forwarding the entry contents to
        Qlearn.passToFase1 / passToFase2 / passToFase3 / passToFase32.
        The call blocks in the window's Tk main loop.
        """
        #Set the gameboard
        Qlearn.m = entrymatrix
        #Destroy the matrix input window
        previouswindow.destroy()
        #Param Window Generation
        win2 = tk.Tk()
        #Validation commands; "%P" hands the validator the prospective entry text
        vcmd = (win2.register(Qlearn.validateint), "%P")
        vcmdfloat = (win2.register(Qlearn.validatefloat), "%P")
        win2.title("Q-Learning parameters")
        win2.geometry("1000x500")

        #Basic parameters (integer / float validated)
        Eep = Qlearn._field_row(win2, "Episodes", 10,
                                "Number of Episodes to test", vcmd)
        Eis = Qlearn._field_row(win2, "Initial State", 40,
                                "Initial State of the Agent", vcmd)
        Ego = Qlearn._field_row(win2, "Goal State", 70,
                                "Final State of the Agent", vcmd)
        #Gamma and its explanation
        Ega = Qlearn._field_row(win2, "Gamma", 100,
                                "Future reward looking ratio, the greater it is more will evaluate the potential rewards of a path",
                                vcmdfloat)
        #Button fase 1
        # NOTE(review): unlike the other three buttons, passToFase1 is handed the
        # Entry widgets Eep/Ego/Ega themselves rather than their .get() strings;
        # kept as in the original call -- confirm passToFase1 reads them itself.
        Qlearn._action_row(win2, "Q-Learning",
                           lambda: Qlearn.passToFase1(Eep, Ego, Ega,Eis.get(),win2),
                           130, "All of the fields above are REQUIRED")

        #Exploration parameters
        Eeps = Qlearn._field_row(win2, "Epsilon", 160,
                                 "Choice Politics Cte, range [0,1]", vcmdfloat)
        Eaph = Qlearn._field_row(win2, "Alpha", 190,
                                 "Epsilon multiplier, range [0,1]", vcmdfloat)
        #Button fase 2
        Qlearn._action_row(win2, "Q-Learning with Exploration Rate",
                           lambda: Qlearn.passToFase2(Eep.get(), Ego.get(), Ega.get(), Eeps.get(), Eaph.get(),Eis.get(), win2),
                           210, "All of the fields above are REQUIRED")

        #Optional board modifiers (free text, not validated on key press)
        Ech = Qlearn._field_row(win2, "Traps", 240,
                                "Trap Squares (-50 on the reward matrix), format example: 1,2,3")
        Ere = Qlearn._field_row(win2, "Restrictions", 270,
                                "Forbidden Moves, format example: 0-1;1-3;1-4")
        Ebo = Qlearn._field_row(win2, "Boost Square", 300,
                                "Boost Square (+10 on the reward matrix), format example: 0,1,2")
        #Button fase 3 (restrictions only)
        Qlearn._action_row(win2, "Q-Learning with restrictions",
                           lambda:Qlearn.passToFase3(Eep.get(), Ego.get(), Ega.get(), Eis.get(), Ech.get(), Ere.get(),Ebo.get(), win2),
                           330, "All of the fields above are REQUIRED except Epsilon and Alpha. Traps, Restrictions and Boosts are Optional")
        #Button fase 3 2 (restrictions + exploration rate); the note sits further
        #right (x=400) than the other notes
        Qlearn._action_row(win2, "Q-Learning with restrictions and Exploration Rate",
                           lambda:Qlearn.passToFase32(Eep.get(), Ego.get(), Ega.get(), Eis.get(), Ech.get(), Ere.get(),Ebo.get(), Eaph.get(), Eeps.get(), win2),
                           360, "All of the fields above are REQUIRED. Traps, Restrictions and Boosts are Optional",
                           note_x=400)
        win2.mainloop()

    @staticmethod
    def magui(Eest, Eact,win):
        """Read the board dimensions and open the matrix input window.

        Eest -- Entry widget holding the number of rows.
        Eact -- Entry widget holding the number of columns.
        win  -- dimension window; destroyed once the input is accepted.

        The dimensions are stored in Qlearn.states (rows) and Qlearn.actions
        (columns). If either field is empty, not an integer, or smaller than 1,
        an error dialog is shown, nothing is stored and `win` stays open so the
        user can correct the values.
        """
        try:
            rows = int(Eest.get())
            columns = int(Eact.get())
        except ValueError:
            # int("") (empty field) or any other non-numeric text ends up here.
            rows = columns = 0
        if rows < 1 or columns < 1:
            messagebox.showerror("Invalid dimensions",
                                 "Rows and Columns must both be whole numbers greater than 0",
                                 parent=win)
            return
        #Rows
        Qlearn.states = rows
        #Columns
        Qlearn.actions = columns
        #Destroy previous window
        win.destroy()
        #Create Matrix Input Window
        ven = tk.Tk()
        ven.title("Matrix Input")
        #Matrix Input Frame
        Framegeneration(ven,Qlearn.states,Qlearn.actions).pack(side="top", fill="both", expand=True)

    @staticmethod
    def startgui():
        """Open the dimension window (rows / columns) and run its main loop.

        Both entries are validated on every key stroke with Qlearn.validateint;
        the button hands the two Entry widgets and the window to Qlearn.magui.
        """
        #Dimension Window generation
        win = tk.Tk()
        #Validation command for int
        vcmd = (win.register(Qlearn.validateint), "%P")
        win.title("Q-Learning")
        win.geometry("300x300")
        #Rows Entry
        Eest = Qlearn._field_row(win, "Rows", 10, vcmd=vcmd)
        #Columns Entry
        Eact = Qlearn._field_row(win, "Columns", 40, vcmd=vcmd)
        #Button
        Qlearn._action_row(win, "Q-Learning",
                           lambda: Qlearn.magui(Eest, Eact,win), 70)
        win.mainloop()


Initial States

In [9]:
class Qlearn(Qlearn):
    # Initial State
    # The board dimensions and the game board are kept as class variables,
    # so they are read and written as Qlearn.states / Qlearn.actions / Qlearn.m.
    # Both counters start at zero and the board starts out empty.
    states = actions = 0
    m = list()


Application Start

In [None]:
#Application Start
#Start the system with the GUI: startgui() builds the dimension window and
#enters its Tk main loop, so this cell keeps running while the GUI is in use.
Qlearn.startgui()

Try again next time
