## GridWorld
Ph.D Leonarod A, Espinosa, M.Sc Andrej Scherbakov-Parland, BIT Kristoffer Kuvaja Adolfsson

### Bibliography:

* Sutton, Richard S., and Andrew G. Barto. Reinforcement learning: An introduction. MIT press, 2018.
http://incompleteideas.net/book/bookdraft2017nov5.pdf  (chapter 4)

In [1]:
# imports
import numpy as np

In [2]:
# Utilits
def printV(V, grid):
    for idx, row in enumerate(grid.grid):
        for idy, _ in enumerate(row):            
            state = grid.m * idx + idy 
            print('%.2f' % V[state], end='\t')
        print('\n')
    print('--------------------')

def printPolicy(policy, grid):
    for idx, row in enumerate(grid.grid):
        for idy, _ in enumerate(row):            
            state = grid.m * idx + idy 
            if state in grid.stateSpace:
                string = ''.join(policy[state])
                print(string, end='\t')
            else:
                print('', end='\t')
        print('\n')
    print('--------------------')    

def printQ(Q, grid):
    for idx, row in enumerate(grid.grid):
        for idy, _ in enumerate(row):            
            state = grid.m * idx + idy            
            if state != grid.m * grid.n - 1:
                vals = [np.round(Q[state,action], 5) for action in grid.possibleActions]
                print(vals, end='\t')
        print('\n')
    print('--------------------')

def sampleReducedActionSpace(grid, action):
    actions = grid.possibleActions[:]
    actions.remove(action)
    sample = np.random.choice(actions)
    return sample

In [3]:
class WindyGrid(object):
    def __init__(self, m, n, wind):         
        self.grid = np.zeros((m,n))                            # representation of the grid
        self.m = m
        self.n = n
        self.stateSpace = [i for i in range(self.m*self.n)]        
        self.stateSpace.remove(28)                              # Terminal state
        self.stateSpacePlus = [i for i in range(self.m*self.n)] # State space + terminal state
        self.actionSpace = {'U': -self.m, 'D': self.m, 
                            'L': -1, 'R': 1}
        self.possibleActions = ['U', 'D', 'L', 'R']
        self.agentPosition = 0
        self.wind = wind

    def isTerminalState(self, state):
        return state in self.stateSpacePlus and state not in self.stateSpace 

    def getAgentRowAndColumn(self):                               # position of agent
        x = self.agentPosition // self.m
        y = self.agentPosition % self.n
        return x, y
    
    def setState(self, state):
        x, y = self.getAgentRowAndColumn() 
        self.grid[x][y] = 0            
        self.agentPosition = state        
        x, y = self.getAgentRowAndColumn() 
        self.grid[x][y] = 1   
    
    def offGridMove(self, newState, oldState):
        # if we move into a row not in the grid
        if newState not in self.stateSpacePlus:
            return True
        # if we're trying to wrap around to next row
        elif oldState % self.m == 0 and newState  % self.m == self.m - 1:
            return True
        elif oldState % self.m == self.m - 1 and newState % self.m == 0:
            return True
        else:
            return False   
        
    # Include wind stenght.
    def step22(self, action):
        agentX, agentY = self.getAgentRowAndColumn()
        if agentY >= 0 and agentY < len(self.wind):  # Kontrollera om agentY är ett giltigt index
            resultingState = self.agentPosition + self.actionSpace[action] + self.wind[agentY] * self.actionSpace['U']
            if resultingState < 0:  # If the wind is trying to push the agent off the grid
                resultingState += self.m

        if agentX > 0:
            resultingState = self.agentPosition + self.actionSpace[action] + \
                            self.wind[agentY] * self.actionSpace['U']
            if resultingState < 0: #if the wind is trying to push agent off grid
                resultingState += self.m
        else:
            if action == 'L' or action == 'R':
                resultingState = self.agentPosition + self.actionSpace[action]
            else:
                resultingState = self.agentPosition + self.actionSpace[action] + \
                            self.wind[agentY] * self.actionSpace['U']
        #reward = -1 if not self.isTerminalState(resultingState) else 0
        reward = -1
        if not self.offGridMove(resultingState, self.agentPosition):
            self.setState(resultingState)
            return resultingState, reward, self.isTerminalState(resultingState), None
        else:
            return self.agentPosition, reward, self.isTerminalState(self.agentPosition), None
        
    def step(self, action):
        agentX, agentY = self.getAgentRowAndColumn()

        if agentY >= 0 and agentY < len(self.wind):  # Kontrollera om agentY är ett giltigt index
            wind_effect = self.wind[agentY] * self.actionSpace['U']
        else:
            wind_effect = 0

        if agentX > 0:
            resultingState = self.agentPosition + self.actionSpace[action] + wind_effect
        else:
            if action == 'L' or action == 'R':
                resultingState = self.agentPosition + self.actionSpace[action]
            else:
                resultingState = self.agentPosition + self.actionSpace[action] + wind_effect

        if resultingState < 0:  # If the wind or action is trying to push the agent off the grid
            resultingState += self.m

        reward = -1

        if not self.offGridMove(resultingState, self.agentPosition):
            self.setState(resultingState)
            return resultingState, reward, self.isTerminalState(resultingState), None
        else:
            return self.agentPosition, reward, self.isTerminalState(self.agentPosition), None

    def reset(self):
        self.agentPosition = 0
        self.grid = np.zeros((self.m,self.n))
        return self.agentPosition, False


    def render(self):
        print('------------------------------------------')
        for row in self.grid:
            for col in row:
                if col == 0:
                    print('-', end='\t')
                elif col == 1:
                    print('X', end='\t')
            print('\n')
        print('------------------------------------------')
        

## First visit Monte Carlo Prediction

In [4]:
def MC_first_visit(X =6 ,loop=500,Y=1,  wind=[0, 0, 1, 2, 1, 0]):
    print("GAMMA:",Y)
    print("Size:",X,X)

    grid = WindyGrid(X,X, wind)
    GAMMA = 1.0
    if(Y):
        GAMMA = Y
    

    policy = {}                              #  a dictionary that maps each
    for state in grid.stateSpace:            #  state to the list of possible actions
        policy[state] = grid.possibleActions

    V = {}                                   # Initialize our initial estimate of the value
    for state in grid.stateSpacePlus:        # function. Each state gets a value of 0.
        V[state] = 0                                                              

    returns = {}                             #Initialize a dictionary that keeps a list
    for state in grid.stateSpace:            #of the returns for each state.
        returns[state] = []

    for i in range(loop):                     # Loop over 500 games,  
        observation, done = grid.reset()     # resetting the grid and memory with each game.
        memory = []                          # empty list to keep track of the states visited 
        statesReturns = []                   # and returns at each time step
        if i % 100 == 0:                     # Just to know if the game is running.
            print('starting episode', i)
        while not done:                      # While the game isn't done 
            # attempt to follow the policy. In this case choose an action 
            # according to the random equiprobable strategy.
            action = np.random.choice(policy[observation])    
            observation_, reward, done, info = grid.step(action)  # Take that action, get new state, reward and done
            memory.append((observation, action, reward))
            observation = observation_

        # append terminal state
        memory.append((observation, action, reward))

        G = 0                                  # set G=0
        last = True                            # initialize a Boolean to keep track of the visit to the last state                   
        for state, action, reward in reversed(memory): 
            if last:
                last = False
            else:                                    # Skip the terminal state and append the set of states
                statesReturns.append((state,G))      #  and returns to the statesReturns list. 
            G = GAMMA*G + reward

        statesReturns.reverse()                  # to ge it in chronological order
        statesVisited = []                       # keep track of the visited states during the episode.
        for state, G in statesReturns:
            if state not in statesVisited:       # Iterate over the episode and see 
                returns[state].append(G)         # if each state has been visited before. 
                V[state] = np.mean(returns[state]) 
                statesVisited.append(state)
                
                #If it hasn't, meaning this is the agent's first visit, go ahead and append 
                #the returns to the returns dictionary for that state.
                #Calculate the value function by taking the mean of the returns for that state, and finally, 
                #append that state to the list of statesVisited. 
    print("\n") 
    printV(V, grid)

## Del 1:

- Använd *first visit* Monte Carlo Metoden

1. Öka vindstyrkan med en enhet.
    - Hur ändras slutvärdesfunktionen?


2. Hur ändras värdefunktion om man ändra gamma till:
    - 𝛾=0.5
    - 𝛾=0,9
    - 𝛾=0,95


3. Testa rutnätsvärlden i storlekarna:
    - 8x8
        - Ändra på vinden, vad händer med värdefunktion?
        - Prova med 𝛾=0,9, vad händer med värdefunktion?
    - 10x10
        - Ändra på vinden, vad händer med värdefunktion?
        - Prova med 𝛾=0,9, vad händer med värdefunktion?

In [5]:
MC_first_visit(6,500,0.5)
MC_first_visit(6,500,0.9)
MC_first_visit(6,500,0.95)

GAMMA: 0.5
Size: 6 6
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-2.00	-2.00	-2.00	-2.00	-2.00	-2.00	

-2.00	-2.00	-2.00	-2.00	-2.00	-2.00	

-2.00	-2.00	-2.00	-2.00	-2.00	-1.99	

-2.00	-2.00	-2.00	-2.00	-2.00	-1.95	

-2.00	-2.00	-2.00	-2.00	0.00	-1.74	

-2.00	-2.00	-2.00	0.00	-1.96	-1.94	

--------------------
GAMMA: 0.9
Size: 6 6
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.99	-9.98	-9.99	-9.97	-9.96	-9.92	

-10.00	-9.99	-9.98	-9.98	-9.94	-9.78	

-9.99	-10.00	-9.98	-9.99	-9.91	-9.43	

-9.99	-10.00	-9.99	-9.97	-9.89	-8.43	

-10.00	-10.00	-10.00	-9.95	0.00	-6.02	

-9.99	-9.99	-9.98	0.00	-8.59	-8.03	

--------------------
GAMMA: 0.95
Size: 6 6
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-19.95	-19.94	-19.90	-19.86	-19.74	-19.62	

-19.94	-19.92	-19.91	-19.83	-19.74	-19.42	

-19.94	-19.90	-19.86	-19.87	-19.7

Här är observationer:

- 𝛾 = 0.5:
Värdefunktionen verkar inte ge mycket vikt åt framtida belöningar. Det är tydligt eftersom värdena i den nedre högra delen av rutnätet är fortfarande ganska låga, även när det finns en positiv belöning där.

- 𝛾 = 0.9:
Här ser det ut som att systemet ger mer vikt åt framtida belöningar. Värdena i den nedre högra delen av rutnätet är lägre än i det första fallet, vilket tyder på att systemet tar hänsyn till de långsiktiga konsekvenserna.

- 𝛾 = 0.95:
Detta scenario verkar ge ännu mer vikt åt framtida belöningar. Värdena i den nedre högra delen av rutnätet är ännu lägre, och systemet verkar vara mer inriktat på att maximera de långsiktiga belöningarna.

Sammanfattningsvis kan du säga att med ökande värden på gamma ger systemet mer vikt åt framtida belöningar och blir mer inriktat på att maximera de långsiktiga belöningarna jämfört med omedelbara belöningar. Detta är en typisk observation i förstärkningsinlärning, där valet av gamma påverkar agentens inlärningsbeteende.

In [6]:
MC_first_visit(8,500,0.9)
MC_first_visit(8,500,0.9,wind=[0,0,2,3,2,0])

GAMMA: 0.9
Size: 8 8
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.98	-9.97	-9.96	-9.91	-9.86	-9.77	-9.81	-9.75	

-9.98	-9.98	-9.96	-9.93	-9.88	-9.60	-9.60	-9.64	

-9.97	-9.97	-9.97	-9.95	-9.86	-9.10	-9.32	-9.49	

-9.96	-9.95	-9.95	-9.96	0.00	-6.69	-8.77	-9.39	

-9.94	-9.92	-9.92	-9.97	-8.91	-8.58	-9.17	-9.39	

-9.90	-9.86	-9.98	-6.81	-7.12	-8.73	-9.28	-9.57	

-9.77	-9.63	-9.15	-9.36	-8.19	-9.11	-9.41	-9.62	

-9.89	-9.83	-9.68	0.00	-8.91	-9.35	-9.53	-9.68	

--------------------
GAMMA: 0.9
Size: 8 8
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.99	-9.98	-9.98	-9.96	-9.92	-9.84	-9.78	-9.81	

-9.99	-9.98	-9.98	-9.96	-9.87	-9.66	-9.63	-9.66	

-9.99	-9.99	-9.98	-9.93	-9.89	-8.92	-9.28	-9.48	

-9.99	-9.99	-9.98	-9.96	0.00	-6.97	-8.81	-9.33	

-9.99	-9.97	-9.94	-9.98	-7.04	-8.29	-9.22	-9.37	

-9.99	-9.99	-9.96	-9.99	-9.01	-9.02	-9.25	-9.45	

-9.99	-9.98	-9.95	0.00	-7.85	-

Första simuleringen:
GAMMA: 0.9, Wind=[0,0,1,2,1,0]: Värdefunktionen påverkas av vind och gamma. Starkare vind leder till lägre värden, och systemet tar hänsyn till svårigheten att röra sig i den riktningen.

Andra simuleringen:
GAMMA: 0.9, Wind=[0,0,2,3,2,0]: Liknande påverkan som i den första simuleringen. Starkare vind ger lägre värden, och systemet tar hänsyn till ökad svårighet att navigera genom vinden.

In [7]:
MC_first_visit(10,500,0.9,wind=[0,0,1,2,1,0])
MC_first_visit(10,500,0.9,wind=[0,0,3,3,3,0])

GAMMA: 0.9
Size: 10 10
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.97	-9.95	-9.91	-9.86	-9.75	-9.48	-9.07	-8.38	-8.04	-8.03	

-9.97	-9.96	-9.93	-9.83	-9.72	-9.45	-8.74	-7.64	-6.34	-7.03	

-9.98	-9.95	-9.93	-9.83	-9.66	-9.31	-8.62	-6.55	0.00	-5.47	

-9.97	-9.96	-9.92	-9.84	-9.57	-9.39	-9.26	-7.76	-6.49	-7.34	

-9.96	-9.95	-9.93	-9.82	-9.53	-9.59	-9.38	-8.84	-8.31	-8.88	

-9.95	-9.91	-9.89	-9.86	-9.89	-9.70	-9.54	-9.55	-9.16	-9.43	

-9.98	-9.95	-9.92	-9.53	-9.68	-9.79	-9.79	-9.69	-9.47	-9.52	

-9.97	-9.96	-9.92	-10.00	-9.67	-9.82	-9.79	-9.81	-9.74	-9.68	

-9.95	-9.96	-9.89	-9.99	-9.90	-9.90	-9.96	-9.87	-9.77	-9.89	

-9.97	-9.99	-10.00	0.00	-9.89	-9.91	-9.93	-9.91	-9.90	-9.90	

--------------------
GAMMA: 0.9
Size: 10 10
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.97	-9.96	-9.91	-9.86	-9.71	-9.42	-9.03	-8.34	-7.81	-7.77	

-9.97	-9.96	-9.94	-9.93	-9.82	-9.33	-8.68

Första simuleringen (wind=[0,0,1,2,1,0]):
GAMMA: 0.9: Värdefunktionen påverkas av vind och gamma. Starkare vind leder till lägre värden, särskilt i områden där vinden är starkare. Systemet tar hänsyn till ökad svårighet att navigera genom vinden.

Andra simuleringen (wind=[0,0,3,3,3,0]):
GAMMA: 0.9: Påverkan av vind och gamma är liknande den första simuleringen. Starkare vind ger lägre värden, och systemet tar hänsyn till ökad svårighet att röra sig genom de områden där vinden är stark.

## Exploring Start Monte Carlo

In [20]:
def MC_exploring_starts(X =6,Y=1,wind=[0, 0, 1, 2, 1, 0]):
    grid = WindyGrid(X ,X, wind)
    GAMMA = Y
    print("GAMMA:",Y)
    print("Size:",X,X)
   
    # Initialize Q, returns, and pairs visited
    Q = {}          
    returns = {}
    pairsVisited = {}
    for state in grid.stateSpacePlus:
        for action in grid.possibleActions:
            Q[(state, action)] = 0
            returns[(state,action)] = 0
            pairsVisited[(state,action)] = 0
    
    # initialize a random policy
    policy = {}
    for state in grid.stateSpace:
        policy[state] = np.random.choice(grid.possibleActions)
    
    for i in range(1000000):  
        if i % 50000 == 0:
            print('starting episode', i)
        statesActionsReturns = []
        observation = np.random.choice(grid.stateSpace)
        action = np.random.choice(grid.possibleActions)
        grid.setState(observation)
        observation_, reward, done, info = grid.step(action)
        memory = [(observation, action, reward)]
        steps = 1
        while not done:
            action = policy[observation_]
            steps += 1
            observation, reward, done, info = grid.step(action)
            if steps > 15 and not done:
                done = True
                reward = -steps
            memory.append((observation_, action, reward))
            observation_ = observation

        # append the terminal state
        memory.append((observation_, action, reward))
        
        G = 0        
        last = True # start at t = T - 1
        for state, action, reward in reversed(memory):
            if last:
                last = False  
            else:
                statesActionsReturns.append((state,action, G))
            G = GAMMA*G + reward

        statesActionsReturns.reverse()
        statesAndActions = []
        for state, action, G in statesActionsReturns:
            if (state, action) not in statesAndActions:
                pairsVisited[(state,action)] += 1
                returns[(state,action)] += (1 / pairsVisited[(state,action)])*(G-returns[(state,action)])                   
                Q[(state,action)] = returns[(state,action)]
                statesAndActions.append((state,action))
                values = np.array([Q[(state,a)] for a in grid.possibleActions])
                best = np.argmax(values)
                policy[state] = grid.possibleActions[best]
            
    printQ(Q, grid)
    printPolicy(policy,grid)

## Del 2


- Använd  *exploring starts* Monte Carlo Metoden

1. Öka vindstyrkan med en enhet.
    - Hur ändras slutvärdesfunktionen?


2. Hur ändras policyn om man ändra gamma till:
    - 𝛾=0.5
    - 𝛾=0,9
    - 𝛾=0,95


3. Testa rutnätsvärlden i storlekarna:
    - 8x8
        - Ändra på vinden, vad händer med policyn?
        - Prova med 𝛾=0,9, vad händer med policyn?
    - 10x10
        - Ändra på vinden, vad händer med policyn?

In [21]:
MC_exploring_starts(6,0.5,wind=[0, 0, 2, 3, 2, 0])
MC_exploring_starts(6,0.9,wind=[0, 0, 2, 3, 2, 0])
MC_exploring_starts(6,0.95,wind=[0, 0, 2, 3, 2, 0])

GAMMA: 0.5
Size: 6 6
starting episode 0
starting episode 50000
starting episode 100000
starting episode 150000
starting episode 200000
starting episode 250000
starting episode 300000
starting episode 350000
starting episode 400000
starting episode 450000
starting episode 500000
starting episode 550000
starting episode 600000
starting episode 650000
starting episode 700000
starting episode 750000
starting episode 800000
starting episode 850000
starting episode 900000
starting episode 950000
[-1.99913, -1.99917, -1.99914, -1.99808]	[-1.9982, -1.99856, -1.9992, -1.99613]	[-1.99651, -1.99627, -1.99841, -1.99222]	[-1.99235, -1.99316, -1.99623, -1.98438]	[-1.98448, -1.98508, -1.99262, -1.96875]	[-1.96899, -1.9375, -1.9844, -1.96921]	

[-1.99912, -1.99909, -1.99908, -1.99826]	[-1.99807, -1.99815, -1.99987, -1.99612]	[-1.99625, -1.99638, -1.99814, -1.99219]	[-1.99612, -1.99219, -1.99617, -1.99622]	[-1.9844, -1.98445, -1.99221, -1.96878]	[-1.96891, -1.875, -1.98441, -1.93759]	

[-1.99925, -1.99

GAMMA: 0.5
Slutvärdesfunktionen verkar minska gradvis från det högsta värdet längst upp till vänster till det lägsta värdet längst ner till höger.

GAMMA: 0.9
Slutvärdesfunktionen verkar vara mindre benägen att minska snabbt jämfört med gamma 0.5. Det finns en ökad tendens till att höga värden sprider sig över området.

GAMMA: 0.95
Slutvärdesfunktionen verkar ha en ännu mindre benägenhet att minska snabbt. Det finns en ökad utjämning av höga värden över hela området.

In [22]:
MC_exploring_starts(8,0.9,wind=[0, 0, 2, 3, 2, 0])

GAMMA: 0.9
Size: 8 8
starting episode 0
starting episode 50000
starting episode 100000
starting episode 150000
starting episode 200000
starting episode 250000
starting episode 300000
starting episode 350000
starting episode 400000
starting episode 450000
starting episode 500000
starting episode 550000
starting episode 600000
starting episode 650000
starting episode 700000
starting episode 750000
starting episode 800000
starting episode 850000
starting episode 900000
starting episode 950000
[-6.63444, -6.56437, -6.63036, -6.15865]	[-6.15302, -6.23117, -6.73056, -5.70572]	[-5.80481, -5.71858, -6.19972, -5.22225]	[-5.35982, -5.26968, -5.73351, -4.68801]	[-4.74432, -4.70543, -5.22317, -4.09857]	[-4.1036, -3.44216, -4.69033, -4.72314]	[-4.6955, -4.09546, -4.13332, -5.22421]	[-5.22254, -4.70123, -4.68881, -5.22402]	

[-6.7038, -6.56719, -6.57393, -6.15349]	[-6.17294, -6.18371, -6.61247, -5.69768]	[-5.7287, -5.727, -6.19422, -5.21855]	[-5.74815, -5.22531, -5.70804, -5.71454]	[-4.69788, -4.776

In [23]:
MC_exploring_starts(10,0.9,wind=[0, 0, 2, 3, 2, 0])

GAMMA: 0.9
Size: 10 10
starting episode 0
starting episode 50000
starting episode 100000
starting episode 150000
starting episode 200000
starting episode 250000
starting episode 300000
starting episode 350000
starting episode 400000
starting episode 450000
starting episode 500000
starting episode 550000
starting episode 600000
starting episode 650000
starting episode 700000
starting episode 750000
starting episode 800000
starting episode 850000
starting episode 900000
starting episode 950000
[-7.19725, -6.98567, -7.25034, -6.67004]	[-6.7863, -6.88453, -7.26878, -6.18669]	[-6.40165, -6.62071, -7.15188, -5.71547]	[-5.75464, -5.89609, -6.76459, -5.21744]	[-5.47549, -5.23362, -5.76989, -4.69355]	[-4.73632, -4.10172, -5.27879, -4.12004]	[-4.10926, -3.45081, -4.72296, -3.44288]	[-3.44419, -2.71055, -4.10499, -2.76407]	[-2.72078, -1.90122, -3.47712, -3.46308]	[-3.45787, -2.712, -2.72561, -3.44871]	

[-7.23241, -6.9199, -6.95729, -6.62303]	[-6.76124, -6.68341, -6.97416, -6.13853]	[-6.29119, -6

Analys av 8x8:
Observationer:
Generellt sett verkar policyn anpassa sig till den givna vinden (wind) och försöker hitta en väg till målet.
I vissa områden där vinden är stark, kan policyn välja att gå neråt (D) för att dra nytta av vinden.
Policyn verkar vara känslig för både hinder och vindens påverkan, vilket kan leda till alternativa vägar för att nå målet.

Analys av 10x10:
Observationer:
Mönstret är liknande 8x8, men på grund av en större miljö har policyn mer utrymme att anpassa sig och hitta effektivare vägar.
Policyns beteende verkar vara mer robust i större miljöer och kan hantera de långa vindsträckorna bättre.

Sammanfattning:
Policyn tycks anpassa sig väl till både vind och hinder för att nå målet.
Större miljöer tillåter mer flexibilitet och bättre anpassning till vindförhållandena.

## On-policy first visit Monte Carlo for $\varepsilon$-soft policies

In [25]:

def MC_without_exploring_starts(X =6,Y=0.9,wind=[0, 0, 1, 2, 1, 0]):
    grid = WindyGrid(X,X, wind)
    GAMMA = Y
    EPS = 0.4

    Q = {}
    returns = {}
    pairsVisited = {}
    for state in grid.stateSpacePlus:
        for action in grid.actionSpace.keys():
            Q[(state, action)] = 0
            returns[(state,action)] = 0
            pairsVisited[(state,action)] = 0

    policy = {}
    for state in grid.stateSpace:
        policy[state] = grid.possibleActions

    for i in range(1000000):
        statesActionsReturns = []
        if i % 100000 == 0:
            print('starting episode', i)
        observation, done = grid.reset()       
        memory = []
        steps = 0
        while not done:       
            if len(policy[observation]) > 1:
                action = np.random.choice(policy[observation])
            else:
                action = policy[observation]
            observation_, reward, done, info = grid.step(action)
            steps += 1
            if steps > 25 and not done:
                done = True
                reward = -steps
            memory.append((observation, action, reward))
            observation = observation_

        #append the terminal state
        memory.append((observation, action, reward))

        G = 0        
        last = True # start at t = T - 1
        for state, action, reward in reversed(memory):                                    
            if last:
                last = False
            else:
                statesActionsReturns.append((state,action,G))           
            G = GAMMA*G + reward
        statesActionsReturns.reverse()

        statesAndActions = []
        for state, action, G in statesActionsReturns:
            if (state, action) not in statesAndActions:
                pairsVisited[(state,action)] += 1
                returns[(state,action)] += (1 / pairsVisited[(state,action)])*(G-returns[(state,action)])                   
                Q[(state,action)] = returns[(state,action)]
                statesAndActions.append((state,action))
                values = np.array([Q[(state,a)] for a in grid.possibleActions])
                best = np.random.choice(np.where(values==values.max())[0])                    
                rand = np.random.random()
                if rand < 1 - EPS:
                    policy[state] = grid.possibleActions[best]
                else:                        
                    policy[state] = np.random.choice(grid.possibleActions)

    printQ(Q, grid)
    printPolicy(policy,grid)

## Del 3
- Använd *without exploring starts* Monte Carlo Metoden

1. Öka vindstyrkan med en enhet.
    - Hur ändras slutvärdesfunktionen?


2. Hur ändras policyn om man ändra gamma till:
    - 𝛾=0.5
    - 𝛾=0,9
    - 𝛾=0,95


3. Testa rutnätsvärlden i storlekarna:
    - 8x8
        - Ändra på vinden, vad händer med policyn?
        - Prova med 𝛾=0,9, vad händer med policyn?
    - 10x10
        - Ändra på vinden, vad händer med policyn?

In [26]:
MC_without_exploring_starts(6,0.5,wind=[0, 0, 2, 3, 2, 0])
MC_without_exploring_starts(6,0.9,wind=[0, 0, 2, 3, 2, 0])
MC_without_exploring_starts(6,0.95,wind=[0, 0, 2, 3, 2, 0])


starting episode 0
starting episode 100000
starting episode 200000
starting episode 300000
starting episode 400000
starting episode 500000
starting episode 600000
starting episode 700000
starting episode 800000
starting episode 900000
[-2.0, -1.99997, -2.00001, -1.99999]	[-2.00002, -1.99998, -2.00001, -2.00013]	[-2.00014, -2.00009, -2.00013, -2.00061]	[-2.0014, -2.00074, -2.00006, -1.99821]	[-2.00012, -2.00024, -2.00012, -1.99478]	[-2.00025, -1.98498, -2.00032, -2.00024]	

[-2.00001, -1.99998, -2.00001, -1.99992]	[-2.00001, -1.99994, -2.00001, -1.99979]	[-2.00002, -1.99999, -2.00002, -1.99937]	[-2.00788, -2.00823, -2.00767, -2.00692]	[-2.00193, -2.00208, -2.00226, -2.00178]	[-2.00048, -1.95676, -2.00045, -2.0005]	

[-2.00002, -2.0, -2.00001, -1.99994]	[-1.99999, -1.99999, -2.00002, -1.99978]	[-2.00001, -1.99983, -2.00004, -1.99939]	[-2.00034, -1.99987, -2.00031, -1.99834]	[-2.00411, -2.0041, -2.01622, -2.00342]	[-2.00094, -1.87588, -2.00099, -2.00091]	

[-2.00004, -2.00001, -2.00004, -

Gamma = 0.5:
The agent seems to prioritize short-term rewards. You can see that the values tend to decrease quickly as you move away from positive rewards, indicating a strong discount on future rewards.

Gamma = 0.9:
With a higher gamma, the agent considers a balance between short-term and long-term rewards. The values decrease gradually, indicating a smoother transition in discounting future rewards. The agent is likely to consider both immediate and future gains.

Gamma = 0.95:
An even higher gamma could lead to the agent focusing more on long-term rewards. The values might decrease more slowly, indicating a stronger consideration of future rewards. The agent might be more patient in its decision-making.

In [27]:
MC_without_exploring_starts(8,0.5,wind=[0, 0, 2, 3, 2, 0])

starting episode 0
starting episode 100000
starting episode 200000
starting episode 300000
starting episode 400000
starting episode 500000
starting episode 600000
starting episode 700000
starting episode 800000
starting episode 900000
[-2.0, -2.0, -2.0, -2.0]	[-2.00003, -2.00003, -2.00003, -2.00004]	[-2.00012, -2.00017, -2.00011, -2.02577]	[-2.04985, -2.07643, -2.00577, -2.07419]	[-2.00036, -2.00082, -2.34832, -2.00067]	[-2.00283, -2.01371, -2.00173, -2.00106]	[-2.00612, -2.00819, -2.00646, -2.00204]	[-2.00632, -2.03818, -2.02268, -2.00311]	

[-2.0, -2.0, -2.0, -2.00001]	[-2.00001, -2.00001, -2.00003, -2.0009]	[-2.00007, -2.01656, -2.00006, -2.03414]	[-2.00062, -2.00066, -2.00061, -2.00073]	[-2.00226, 0, -2.0192, -2.57812]	[-2.00192, -3.15632, -2.28906, -2.00258]	[-2.00435, -2.0096, -2.00226, -2.07227]	[-2.14453, -4.3125, -2.57812, -2.00468]	

[-2.00001, -2.00002, -2.00002, -2.00002]	[-2.00002, -2.00002, -2.00002, -2.00003]	[-2.00027, -2.00016, -2.00015, -2.00013]	[-2.00052, -2.00074, 

In [28]:
MC_without_exploring_starts(10,0.5,wind=[0, 0, 2, 3, 2, 0])

starting episode 0
starting episode 100000
starting episode 200000
starting episode 300000
starting episode 400000
starting episode 500000
starting episode 600000
starting episode 700000
starting episode 800000
starting episode 900000
[-2.00024, -1.99995, -2.0, -1.99987]	[-2.00002, -1.99988, -2.00053, -1.99963]	[-2.00006, -2.00002, -2.00014, -1.99895]	[-2.00003, -2.00004, -2.00003, -1.99691]	[-2.00006, -2.00007, -2.00006, -1.99113]	[-2.00012, -1.98172, -2.00056, -1.97741]	[-2.00022, -1.94274, -2.00038, -1.94834]	[-2.0013, -1.90756, -2.00165, -1.87594]	[-2.00261, -1.74551, -2.00232, -1.95422]	[-2.00676, -1.8981, -2.0058, -2.00645]	

[-2.00001, -1.99997, -2.00001, -1.99986]	[-1.99996, -1.99989, -2.00001, -1.99961]	[-2.00002, -1.99967, -1.99998, -1.99897]	[-2.01166, -2.01641, -2.01369, -2.01682]	[-2.00161, -2.00136, -2.00128, -2.01337]	[-2.00052, -1.9534, -2.00045, -1.96209]	[-2.00033, -1.87325, -1.98521, -1.85677]	[-1.96078, -1.65607, -2.00082, -1.64914]	[-2.00067, -1.0, -2.00126, -1.879

Policyn verkar stöta på svårigheter i att anpassa sig till den givna miljön och vindförhållandena.
Det finns områden där policyn inte når målet och fastnar, som indikeras av återkommande värden som -2.0.
Policyn har svårt att hantera vissa vindsträckor och hinder, vilket resulterar i suboptimala rörelser och förseningar i att nå målet.
I vissa fall verkar det som policyn går in i en loop och kan inte hitta en effektiv väg till målet.

## Off-Policy Monte Carlo prediction

In [29]:
def MC_off_policy_prediction(X =6,Y=0.9, wind=[0, 0, 1, 2, 1, 0]):
    grid = WindyGrid(X,X, wind)
    GAMMA = Y

    print("GAMMA:",Y)
    print("Size:",X,X)

    Q = {}
    C = {}
    for state in grid.stateSpacePlus:
        for action in grid.possibleActions:
            Q[(state,action)] = 0
            C[(state,action)] = 0
    
    targetPolicy = {}
    for state in grid.stateSpace:
        targetPolicy[state] = np.random.choice(grid.possibleActions)

    for i in range(1000000):
        if i % 100000 == 0:
            print(i)            
        behaviorPolicy = {}
        for state in grid.stateSpace:
            behaviorPolicy[state] = grid.possibleActions
        memory = []
        observation, done = grid.reset()
        steps = 0
        while not done:
            action = np.random.choice(behaviorPolicy[observation])
            observation_, reward, done, info = grid.step(action)
            steps += 1
            if steps > 25:
                done = True
                reward = -steps
            memory.append((observation, action, reward))
            observation = observation_
        memory.append((observation, action, reward))
        
        G = 0
        W = 1
        last = True
        for (state, action, reward) in reversed(memory):            
            if last:
                last = False
            else:
                C[state,action] += W
                Q[state,action] += (W / C[state,action])*(G-Q[state,action])
                prob = 1 if action in targetPolicy[state] else 0
                W *= prob/(1/len(behaviorPolicy[state]))
                if W == 0:
                    break
            G = GAMMA*G + reward
    printQ(Q, grid)
    printPolicy(targetPolicy,grid)

## Del 4
- Använd *off-policy prediction* Monte Carlo Metoden

1. Öka vindstyrkan med en enhet.
    - Hur ändras slutvärdesfunktionen?


2. Hur ändras policyn om man ändra gamma till:
    - 𝛾=0.5
    - 𝛾=0,9
    - 𝛾=0,95


3. Testa rutnätsvärlden i storlekarna:
    - 8x8
        - Ändra på vinden, vad händer med policyn?
        - Prova med 𝛾=0,9, vad händer med policyn?
    - 10x10
        - Ändra på vinden, vad händer med policyn?

In [30]:
MC_off_policy_prediction(6,0.5, [0, 0, 1, 2, 1, 0])
MC_off_policy_prediction(6,0.9, [0, 0, 1, 2, 1, 0])
MC_off_policy_prediction(6,0.95, [0, 0, 1, 2, 1, 0])

GAMMA: 0.5
Size: 6 6
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
[-10.71822, -15.65889, -12.34482, -3.62715]	[-11.02789, -7.01795, -7.40019, -7.11491]	[-12.20539, -14.11802, -12.91215, -11.94801]	[-11.9366, -9.70344, -15.26897, -14.59943]	[-14.39834, -7.76413, -12.54181, -12.6446]	[-15.77219, -11.73522, -14.5374, -15.70233]	

[-13.15019, -5.65206, -14.53301, -12.79667]	[-15.18637, -13.53124, -9.60344, -15.36218]	[-7.31196, -17.92787, -14.49805, -11.17038]	[-18.27904, -19.07607, -17.27642, -18.4919]	[-21.43665, -9.10704, -15.30995, -20.73424]	[-14.65261, -15.89583, -17.9774, -13.44308]	

[-8.4978, -14.73219, -8.24958, -12.48932]	[-12.9796, -15.3868, -14.24971, -16.14793]	[-19.20996, -16.24813, -18.07917, -16.54291]	[-3.10576, -15.19053, -25.44942, -15.61807]	[-16.67507, -20.85293, -16.39573, -19.52158]	[-18.25447, -18.6963, -16.54243, -18.76527]	

[-14.89202, -14.80185, -15.83627, -12.45649]	[-12.36147, -15.67557, -11.51872, -7.8353]	[-13.77151, -12.65753, -9.77406,

GAMMA = 0.5:
Den låga diskonteringsfaktorn gör att agenten ger mindre vikt åt framtida belöningar.
Det kan resultera i en kortare "synvinkel" där endast närliggande belöningar är betydelsefulla.
Det kan ses i vissa fall där agenten kanske inte verkar optimera för det mest långsiktiga fördelaktiga beslutet.

GAMMA = 0.9:
En högre diskonteringsfaktor ger mer vikt åt framtida belöningar.
Agenten tar mer hänsyn till långsiktiga konsekvenser av sina beslut.
Detta kan resultera i en försiktighet och en strävan att optimera för ett mer långsiktigt mål.

GAMMA = 0.95:
En ännu högre diskonteringsfaktor gör att agenten blir ännu mer inriktad på långsiktiga belöningar.
Det kan göra agenten mer benägen att offra kortsiktiga belöningar för att uppnå större långsiktiga vinster.
Observera att i detta fall kan det leda till mindre hänsyn till kortsiktiga vinster.

In [31]:
MC_off_policy_prediction(8,0.9, [0, 0, 1, 2, 1, 0])

GAMMA: 0.9
Size: 8 8
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
[-33.38676, -34.70979, -37.93479, -31.01806]	[-33.01396, -36.79171, -35.38746, -38.99121]	[-35.96386, -32.38546, -35.88426, -38.42559]	[-36.49026, -35.08748, -35.52896, -30.45335]	[-38.2124, -31.04574, -33.12158, -37.13681]	[-39.34743, -29.41528, -38.5736, -38.71746]	[-36.2378, -38.88799, -38.526, -25.76161]	[-39.84433, -38.42086, -39.60174, -28.26196]	

[-30.40114, -37.6033, -38.40985, -36.65975]	[-38.98797, -37.92558, -37.17248, -34.04191]	[-38.9199, -38.12026, -27.61864, -35.10073]	[-36.34852, -40.0459, -39.75427, -39.16561]	[-39.67262, -37.21322, -39.27908, -39.03474]	[-37.96333, -35.23268, -39.7147, -31.36915]	[-39.19903, -35.15434, -38.34476, -35.22413]	[-38.03693, -38.09592, -38.09634, -39.2727]	

[-39.01427, -37.44892, -38.19888, -37.54004]	[-38.12041, -39.35991, -35.44625, -38.87136]	[-39.75524, -40.27696, -35.35065, -39.67725]	[-36.30404, -40.36309, -36.39933, -32.03213]	[-39.77338, -40.2543

In [33]:
MC_off_policy_prediction(10,0.9 ,[0, 0, 2, 3, 2, 0])

GAMMA: 0.9
Size: 10 10
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
[-31.321, -35.43355, -36.72997, -34.89671]	[-34.44834, -37.89669, -36.25853, -31.58773]	[-35.4929, -36.11777, -35.93204, -32.45446]	[-31.48135, -36.2289, -37.1795, -37.85073]	[-31.26421, -35.20207, -27.14261, -35.46457]	[-37.2303, -37.1701, -36.95786, -38.49325]	[-39.38013, -32.04814, -37.79362, -34.67127]	[-37.03926, -39.62635, -39.59036, -38.9121]	[-40.22593, -38.93862, -40.18552, -39.87263]	[-39.8266, -38.35926, -40.54031, -39.82219]	

[-36.98188, -33.97512, -34.23349, -39.19558]	[-34.00614, -39.43199, -38.01651, -37.75582]	[-36.28924, -38.0781, -38.53278, -40.48671]	[-33.78653, -37.05266, -39.38818, -37.2783]	[-40.00042, -37.03202, -40.08101, -33.87233]	[-35.59036, -38.42471, -39.30617, -38.44632]	[-38.11898, -34.90753, -38.41528, -38.08177]	[-39.34439, -10.43822, -39.39738, -39.44095]	[-38.83327, -4.54081, -39.42862, -39.18213]	[-40.09143, -40.15345, -39.901, -40.78873]	

[-38.95805, -35.3539, 

För 8x8 miljön med gamma = 0.9:
Slutvärdesfunktionen är representerad som en matris av belöningar i varje tillstånd.
Policyn representeras som en sekvens av handlingar för varje tillstånd.

-Slutvärdesfunktionen:
Högsta värden ses nära det nedre högra hörnet, vilket indikerar att dessa områden har höga förväntade avkastningar.

-Policy:
Handlingarna varierar beroende på det förväntade värdet i varje tillstånd.
Agenten väljer att röra sig mot områden med högre förväntade avkastningar.


För 10x10 miljön med gamma = 0.9:
Resultaten förändras beroende på miljöns storlek och gamma-värdet.

-Slutvärdesfunktionen:
Här kan vi se förväntade avkastningar över hela miljön. Höga värden indikerar områden med hög förväntad avkastning.

-Policy:
Policyn varierar beroende på det förväntade värdet i varje tillstånd.
Agenten rör sig mot områden med höga förväntade avkastningar och undviker områden med låga förväntade avkastningar.

Sammanfattning:
Ökande gamma kan göra att agenten fokuserar mer på långsiktiga belöningar.
Ökande miljöns storlek påverkar hur agenten utforskar och utnyttjar områden med olika förväntade avkastningar.
Resultaten ger insikt i hur agentens beteende anpassar sig till olika scenarier och diskonteringsfaktorer.
För att ytterligare anpassa agentens beteende kan man experimentera med olika värden på gamma och andra relevanta parametrar i Q-learning-algoritmen.