## GridWorld
Ph.D Leonarod A, Espinosa, M.Sc Andrej Scherbakov-Parland, BIT Kristoffer Kuvaja Adolfsson

### Bibliography:

* Sutton, Richard S., and Andrew G. Barto. Reinforcement learning: An introduction. MIT press, 2018.
http://incompleteideas.net/book/bookdraft2017nov5.pdf  (chapter 4)

In [1]:
# imports
import numpy as np

In [2]:
# Utilits
def printV(V, grid):
    for idx, row in enumerate(grid.grid):
        for idy, _ in enumerate(row):            
            state = grid.m * idx + idy 
            print('%.2f' % V[state], end='\t')
        print('\n')
    print('--------------------')

def printPolicy(policy, grid):
    for idx, row in enumerate(grid.grid):
        for idy, _ in enumerate(row):            
            state = grid.m * idx + idy 
            if state in grid.stateSpace:
                string = ''.join(policy[state])
                print(string, end='\t')
            else:
                print('', end='\t')
        print('\n')
    print('--------------------')    

def printQ(Q, grid):
    for idx, row in enumerate(grid.grid):
        for idy, _ in enumerate(row):            
            state = grid.m * idx + idy            
            if state != grid.m * grid.n - 1:
                vals = [np.round(Q[state,action], 5) for action in grid.possibleActions]
                print(vals, end='\t')
        print('\n')
    print('--------------------')

def sampleReducedActionSpace(grid, action):
    actions = grid.possibleActions[:]
    actions.remove(action)
    sample = np.random.choice(actions)
    return sample

In [3]:
class WindyGrid(object):
    def __init__(self, m, n, wind):         
        self.grid = np.zeros((m,n))                            # representation of the grid
        self.m = m
        self.n = n
        self.stateSpace = [i for i in range(self.m*self.n)]        
        self.stateSpace.remove(28)                              # Terminal state
        self.stateSpacePlus = [i for i in range(self.m*self.n)] # State space + terminal state
        self.actionSpace = {'U': -self.m, 'D': self.m, 
                            'L': -1, 'R': 1}
        self.possibleActions = ['U', 'D', 'L', 'R']
        self.agentPosition = 0
        self.wind = wind

    def isTerminalState(self, state):
        return state in self.stateSpacePlus and state not in self.stateSpace 

    def getAgentRowAndColumn(self):                               # position of agent
        x = self.agentPosition // self.m
        y = self.agentPosition % self.n
        return x, y
    
    def setState(self, state):
        x, y = self.getAgentRowAndColumn() 
        self.grid[x][y] = 0            
        self.agentPosition = state        
        x, y = self.getAgentRowAndColumn() 
        self.grid[x][y] = 1   
    
    def offGridMove(self, newState, oldState):
        # if we move into a row not in the grid
        if newState not in self.stateSpacePlus:
            return True
        # if we're trying to wrap around to next row
        elif oldState % self.m == 0 and newState  % self.m == self.m - 1:
            return True
        elif oldState % self.m == self.m - 1 and newState % self.m == 0:
            return True
        else:
            return False   
        
    # Include wind stenght.
    def step22(self, action):
        agentX, agentY = self.getAgentRowAndColumn()
        if agentY >= 0 and agentY < len(self.wind):  # Kontrollera om agentY är ett giltigt index
            resultingState = self.agentPosition + self.actionSpace[action] + self.wind[agentY] * self.actionSpace['U']
            if resultingState < 0:  # If the wind is trying to push the agent off the grid
                resultingState += self.m

        if agentX > 0:
            resultingState = self.agentPosition + self.actionSpace[action] + \
                            self.wind[agentY] * self.actionSpace['U']
            if resultingState < 0: #if the wind is trying to push agent off grid
                resultingState += self.m
        else:
            if action == 'L' or action == 'R':
                resultingState = self.agentPosition + self.actionSpace[action]
            else:
                resultingState = self.agentPosition + self.actionSpace[action] + \
                            self.wind[agentY] * self.actionSpace['U']
        #reward = -1 if not self.isTerminalState(resultingState) else 0
        reward = -1
        if not self.offGridMove(resultingState, self.agentPosition):
            self.setState(resultingState)
            return resultingState, reward, self.isTerminalState(resultingState), None
        else:
            return self.agentPosition, reward, self.isTerminalState(self.agentPosition), None
        
    def step(self, action):
        agentX, agentY = self.getAgentRowAndColumn()

        if agentY >= 0 and agentY < len(self.wind):  # Kontrollera om agentY är ett giltigt index
            wind_effect = self.wind[agentY] * self.actionSpace['U']
        else:
            wind_effect = 0

        if agentX > 0:
            resultingState = self.agentPosition + self.actionSpace[action] + wind_effect
        else:
            if action == 'L' or action == 'R':
                resultingState = self.agentPosition + self.actionSpace[action]
            else:
                resultingState = self.agentPosition + self.actionSpace[action] + wind_effect

        if resultingState < 0:  # If the wind or action is trying to push the agent off the grid
            resultingState += self.m

        reward = -1

        if not self.offGridMove(resultingState, self.agentPosition):
            self.setState(resultingState)
            return resultingState, reward, self.isTerminalState(resultingState), None
        else:
            return self.agentPosition, reward, self.isTerminalState(self.agentPosition), None

    def reset(self):
        self.agentPosition = 0
        self.grid = np.zeros((self.m,self.n))
        return self.agentPosition, False


    def render(self):
        print('------------------------------------------')
        for row in self.grid:
            for col in row:
                if col == 0:
                    print('-', end='\t')
                elif col == 1:
                    print('X', end='\t')
            print('\n')
        print('------------------------------------------')
        

## First visit Monte Carlo Prediction

In [4]:
def MC_first_visit(X =6 ,loop=500,Y=1,  wind=[0, 0, 1, 2, 1, 0]):
    print("GAMMA:",Y)
    print("Size:",X,X)

    grid = WindyGrid(X,X, wind)
    GAMMA = 1.0
    if(Y):
        GAMMA = Y
    

    policy = {}                              #  a dictionary that maps each
    for state in grid.stateSpace:            #  state to the list of possible actions
        policy[state] = grid.possibleActions

    V = {}                                   # Initialize our initial estimate of the value
    for state in grid.stateSpacePlus:        # function. Each state gets a value of 0.
        V[state] = 0                                                              

    returns = {}                             #Initialize a dictionary that keeps a list
    for state in grid.stateSpace:            #of the returns for each state.
        returns[state] = []

    for i in range(loop):                     # Loop over 500 games,  
        observation, done = grid.reset()     # resetting the grid and memory with each game.
        memory = []                          # empty list to keep track of the states visited 
        statesReturns = []                   # and returns at each time step
        if i % 100 == 0:                     # Just to know if the game is running.
            print('starting episode', i)
        while not done:                      # While the game isn't done 
            # attempt to follow the policy. In this case choose an action 
            # according to the random equiprobable strategy.
            action = np.random.choice(policy[observation])    
            observation_, reward, done, info = grid.step(action)  # Take that action, get new state, reward and done
            memory.append((observation, action, reward))
            observation = observation_

        # append terminal state
        memory.append((observation, action, reward))

        G = 0                                  # set G=0
        last = True                            # initialize a Boolean to keep track of the visit to the last state                   
        for state, action, reward in reversed(memory): 
            if last:
                last = False
            else:                                    # Skip the terminal state and append the set of states
                statesReturns.append((state,G))      #  and returns to the statesReturns list. 
            G = GAMMA*G + reward

        statesReturns.reverse()                  # to ge it in chronological order
        statesVisited = []                       # keep track of the visited states during the episode.
        for state, G in statesReturns:
            if state not in statesVisited:       # Iterate over the episode and see 
                returns[state].append(G)         # if each state has been visited before. 
                V[state] = np.mean(returns[state]) 
                statesVisited.append(state)
                
                #If it hasn't, meaning this is the agent's first visit, go ahead and append 
                #the returns to the returns dictionary for that state.
                #Calculate the value function by taking the mean of the returns for that state, and finally, 
                #append that state to the list of statesVisited. 
    print("\n") 
    printV(V, grid)

## Del 1:

- Använd *first visit* Monte Carlo Metoden

1. Öka vindstyrkan med en enhet.
    - Hur ändras slutvärdesfunktionen?


2. Hur ändras värdefunktion om man ändra gamma till:
    - 𝛾=0.5
    - 𝛾=0,9
    - 𝛾=0,95


3. Testa rutnätsvärlden i storlekarna:
    - 8x8
        - Ändra på vinden, vad händer med värdefunktion?
        - Prova med 𝛾=0,9, vad händer med värdefunktion?
    - 10x10
        - Ändra på vinden, vad händer med värdefunktion?
        - Prova med 𝛾=0,9, vad händer med värdefunktion?

In [5]:
MC_first_visit(6,500,0.5)
MC_first_visit(6,500,0.9)
MC_first_visit(6,500,0.95)

GAMMA: 0.5
Size: 6 6
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-2.00	-2.00	-2.00	-2.00	-2.00	-2.00	

-2.00	-2.00	-2.00	-2.00	-2.00	-2.00	

-2.00	-2.00	-2.00	-2.00	-2.00	-1.99	

-2.00	-2.00	-2.00	-2.00	-2.00	-1.95	

-2.00	-2.00	-2.00	-2.00	0.00	-1.69	

-2.00	-2.00	-2.00	0.00	-1.97	-1.93	

--------------------
GAMMA: 0.9
Size: 6 6
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.99	-9.99	-9.98	-9.97	-9.92	-9.86	

-9.99	-9.99	-9.99	-9.97	-9.96	-9.77	

-9.99	-9.99	-9.99	-9.96	-9.93	-9.31	

-9.99	-9.99	-9.97	-9.97	-9.79	-8.51	

-9.99	-9.99	-9.99	-9.97	0.00	-5.93	

-10.00	-9.99	-9.98	0.00	-8.88	-8.11	

--------------------
GAMMA: 0.95
Size: 6 6
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-19.95	-19.94	-19.90	-19.86	-19.78	-19.64	

-19.95	-19.94	-19.89	-19.86	-19.74	-19.41	

-19.96	-19.94	-19.88	-19.80	-19.49	-18

Här är observationer:

- 𝛾 = 0.5:
Värdefunktionen verkar inte ge mycket vikt åt framtida belöningar. Det är tydligt eftersom värdena i den nedre högra delen av rutnätet är fortfarande ganska låga, även när det finns en positiv belöning där.

- 𝛾 = 0.9:
Här ser det ut som att systemet ger mer vikt åt framtida belöningar. Värdena i den nedre högra delen av rutnätet är lägre än i det första fallet, vilket tyder på att systemet tar hänsyn till de långsiktiga konsekvenserna.

- 𝛾 = 0.95:
Detta scenario verkar ge ännu mer vikt åt framtida belöningar. Värdena i den nedre högra delen av rutnätet är ännu lägre, och systemet verkar vara mer inriktat på att maximera de långsiktiga belöningarna.

Sammanfattningsvis kan du säga att med ökande värden på gamma ger systemet mer vikt åt framtida belöningar och blir mer inriktat på att maximera de långsiktiga belöningarna jämfört med omedelbara belöningar. Detta är en typisk observation i förstärkningsinlärning, där valet av gamma påverkar agentens inlärningsbeteende.

In [6]:
MC_first_visit(8,500,0.9)
MC_first_visit(8,500,0.9,wind=[0,0,2,3,2,0])

GAMMA: 0.9
Size: 8 8
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.99	-9.99	-9.97	-9.95	-9.88	-9.78	-9.74	-9.76	

-9.98	-9.99	-9.98	-9.94	-9.93	-9.65	-9.59	-9.63	

-9.97	-9.97	-9.96	-9.97	-9.84	-9.05	-9.31	-9.44	

-9.97	-9.94	-9.98	-9.98	0.00	-6.98	-8.68	-9.36	

-9.95	-9.90	-9.98	-9.92	-9.15	-8.79	-9.30	-9.45	

-9.87	-9.83	-9.93	-7.69	-6.74	-8.50	-9.29	-9.49	

-9.83	-9.60	-8.74	-9.77	-8.41	-9.09	-9.47	-9.63	

-9.83	-9.66	-9.99	0.00	-8.77	-9.13	-9.51	-9.69	

--------------------
GAMMA: 0.9
Size: 8 8
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.99	-9.99	-9.98	-9.95	-9.93	-9.86	-9.77	-9.74	

-9.99	-9.99	-9.99	-9.99	-9.90	-9.71	-9.67	-9.70	

-9.99	-9.98	-9.97	-9.87	-9.87	-9.20	-9.42	-9.49	

-9.99	-9.99	-9.99	-9.96	0.00	-7.08	-9.03	-9.32	

-9.98	-9.98	-9.98	-9.87	-7.53	-8.54	-9.09	-9.31	

-9.98	-9.97	-9.96	-9.98	-8.67	-8.95	-9.24	-9.45	

-9.96	-9.96	-9.92	0.00	-7.49	-

Första simuleringen:
GAMMA: 0.9, Wind=[0,0,1,2,1,0]: Värdefunktionen påverkas av vind och gamma. Starkare vind leder till lägre värden, och systemet tar hänsyn till svårigheten att röra sig i den riktningen.

Andra simuleringen:
GAMMA: 0.9, Wind=[0,0,2,3,2,0]: Liknande påverkan som i den första simuleringen. Starkare vind ger lägre värden, och systemet tar hänsyn till ökad svårighet att navigera genom vinden.

In [16]:
MC_first_visit(10,500,0.9,wind=[0,0,1,2,1,0])
MC_first_visit(10,500,0.9,wind=[0,0,3,3,3,0])

GAMMA: 0.9
Size: 10 10
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.97	-9.96	-9.93	-9.85	-9.71	-9.45	-8.92	-8.30	-7.76	-8.09	

-9.97	-9.96	-9.90	-9.81	-9.84	-9.37	-8.70	-7.48	-5.93	-6.86	

-9.96	-9.95	-9.90	-9.78	-9.59	-9.29	-8.46	-6.35	0.00	-5.06	

-9.96	-9.95	-9.94	-9.81	-9.55	-9.51	-8.91	-7.64	-5.74	-6.68	

-9.96	-9.94	-9.87	-9.89	-9.58	-9.66	-9.22	-8.56	-8.13	-8.23	

-9.96	-9.93	-9.93	-9.79	-9.81	-9.80	-9.63	-9.47	-9.29	-8.94	

-9.96	-9.94	-9.92	-9.64	-9.71	-9.77	-9.79	-9.55	-9.38	-9.61	

-9.95	-9.93	-9.88	-9.74	-9.74	-9.85	-9.83	-9.77	-9.85	-9.87	

-9.98	-9.98	-9.97	-9.73	-9.81	-9.89	-9.83	-9.81	-9.70	-9.82	

-9.96	-9.97	-9.97	0.00	-9.80	-9.93	-9.94	-9.95	-9.85	-9.84	

--------------------
GAMMA: 0.9
Size: 10 10
starting episode 0
starting episode 100
starting episode 200
starting episode 300
starting episode 400


-9.97	-9.95	-9.90	-9.85	-9.69	-9.41	-8.99	-8.43	-7.54	-7.61	

-9.97	-9.96	-9.95	-9.97	-9.76	-9.41	-8.76	-

Första simuleringen (wind=[0,0,1,2,1,0]):
GAMMA: 0.9: Värdefunktionen påverkas av vind och gamma. Starkare vind leder till lägre värden, särskilt i områden där vinden är starkare. Systemet tar hänsyn till ökad svårighet att navigera genom vinden.

Andra simuleringen (wind=[0,0,3,3,3,0]):
GAMMA: 0.9: Påverkan av vind och gamma är liknande den första simuleringen. Starkare vind ger lägre värden, och systemet tar hänsyn till ökad svårighet att röra sig genom de områden där vinden är stark.

## Exploring Start Monte Carlo

In [8]:
def MC_exploring_starts(X =6,Y=1):
    grid = WindyGrid(X ,X, wind=[0, 0, 1, 2, 1, 0])
    GAMMA = Y
    print("GAMMA:",Y)
    print("Size:",X,X)
   
    # Initialize Q, returns, and pairs visited
    Q = {}          
    returns = {}
    pairsVisited = {}
    for state in grid.stateSpacePlus:
        for action in grid.possibleActions:
            Q[(state, action)] = 0
            returns[(state,action)] = 0
            pairsVisited[(state,action)] = 0
    
    # initialize a random policy
    policy = {}
    for state in grid.stateSpace:
        policy[state] = np.random.choice(grid.possibleActions)
    
    for i in range(1000000):  
        if i % 50000 == 0:
            print('starting episode', i)
        statesActionsReturns = []
        observation = np.random.choice(grid.stateSpace)
        action = np.random.choice(grid.possibleActions)
        grid.setState(observation)
        observation_, reward, done, info = grid.step(action)
        memory = [(observation, action, reward)]
        steps = 1
        while not done:
            action = policy[observation_]
            steps += 1
            observation, reward, done, info = grid.step(action)
            if steps > 15 and not done:
                done = True
                reward = -steps
            memory.append((observation_, action, reward))
            observation_ = observation

        # append the terminal state
        memory.append((observation_, action, reward))
        
        G = 0        
        last = True # start at t = T - 1
        for state, action, reward in reversed(memory):
            if last:
                last = False  
            else:
                statesActionsReturns.append((state,action, G))
            G = GAMMA*G + reward

        statesActionsReturns.reverse()
        statesAndActions = []
        for state, action, G in statesActionsReturns:
            if (state, action) not in statesAndActions:
                pairsVisited[(state,action)] += 1
                returns[(state,action)] += (1 / pairsVisited[(state,action)])*(G-returns[(state,action)])                   
                Q[(state,action)] = returns[(state,action)]
                statesAndActions.append((state,action))
                values = np.array([Q[(state,a)] for a in grid.possibleActions])
                best = np.argmax(values)
                policy[state] = grid.possibleActions[best]
            
    printQ(Q, grid)
    printPolicy(policy,grid)

## Del 2


- Använd  *exploring starts* Monte Carlo Metoden

1. Öka vindstyrkan med en enhet.
    - Hur ändras slutvärdesfunktionen?


2. Hur ändras policyn om man ändra gamma till:
    - 𝛾=0.5
    - 𝛾=0,9
    - 𝛾=0,95


3. Testa rutnätsvärlden i storlekarna:
    - 8x8
        - Ändra på vinden, vad händer med policyn?
        - Prova med 𝛾=0,9, vad händer med policyn?
    - 10x10
        - Ändra på vinden, vad händer med policyn?

In [9]:
# MC_exploring_starts(6,0.5)
# MC_exploring_starts(6,0.9)
# MC_exploring_starts(6,0.95)

GAMMA: 0.5
Size: 6 6
starting episode 0
starting episode 50000
starting episode 100000
starting episode 150000
starting episode 200000
starting episode 250000
starting episode 300000
starting episode 350000
starting episode 400000
starting episode 450000
starting episode 500000
starting episode 550000
starting episode 600000
starting episode 650000
starting episode 700000
starting episode 750000
starting episode 800000
starting episode 850000
starting episode 900000
starting episode 950000
[-1.99905, -1.99913, -1.9991, -1.99806]	[-1.99812, -1.99828, -1.9991, -1.9961]	[-1.99616, -1.99615, -1.99808, -1.99221]	[-1.99222, -1.9922, -1.99642, -1.98438]	[-1.9844, -1.98448, -1.99221, -1.96875]	[-1.96879, -1.9375, -1.98439, -1.96877]	

[-1.99907, -1.99911, -1.99906, -1.99807]	[-1.99809, -1.99809, -1.99906, -1.9961]	[-1.99612, -1.99613, -1.99815, -1.99219]	[-1.99221, -1.99223, -1.99612, -1.98438]	[-1.98439, -1.98441, -1.99226, -1.96875]	[-1.96882, -1.875, -1.98442, -1.93754]	

[-1.99906, -1.9990

In [10]:
MC_exploring_starts(8,0.9)

GAMMA: 0.9
Size: 8 8
starting episode 0
starting episode 50000
starting episode 100000
starting episode 150000
starting episode 200000
starting episode 250000
starting episode 300000
starting episode 350000
starting episode 400000
starting episode 450000
starting episode 500000
starting episode 550000
starting episode 600000
starting episode 650000
starting episode 700000
starting episode 750000
starting episode 800000
starting episode 850000
starting episode 900000
starting episode 950000
[-6.55871, -6.52087, -6.55568, -6.15726]	[-6.16975, -6.24052, -6.56752, -5.69999]	[-5.77364, -5.76525, -6.36935, -5.21974]	[-5.30895, -5.26129, -5.98676, -4.68707]	[-4.70745, -4.76131, -5.29144, -4.09516]	[-4.10704, -3.43905, -4.69849, -4.73016]	[-4.69169, -4.10006, -4.13919, -5.22105]	[-5.21996, -4.68847, -4.6948, -5.22726]	

[-6.54622, -6.12981, -6.51661, -6.1329]	[-6.16094, -5.69668, -6.52862, -5.75458]	[-5.74617, -5.76074, -6.22307, -5.22632]	[-5.23139, -5.28776, -5.74796, -4.68838]	[-4.69633, -4

In [11]:
# MC_exploring_starts(10,0.9)

GAMMA: 0.9
Size: 10 10
starting episode 0
starting episode 50000
starting episode 100000
starting episode 150000
starting episode 200000
starting episode 250000
starting episode 300000
starting episode 350000
starting episode 400000
starting episode 450000
starting episode 500000
starting episode 550000
starting episode 600000
starting episode 650000
starting episode 700000
starting episode 750000
starting episode 800000
starting episode 850000
starting episode 900000
starting episode 950000
[-6.87397, -6.89381, -6.86985, -6.52562]	[-6.53508, -6.54839, -6.89165, -6.13259]	[-6.13776, -6.24099, -6.54736, -5.69917]	[-5.70051, -5.70498, -6.17375, -5.22222]	[-5.22309, -5.22807, -5.70456, -4.68853]	[-4.69106, -4.0978, -5.22279, -4.11224]	[-4.11552, -3.44038, -4.70503, -3.45012]	[-3.45265, -2.7157, -4.1261, -2.71099]	[-2.71957, -1.9, -3.4509, -3.4439]	[-3.44404, -2.71491, -2.71, -3.44332]	

[-6.89921, -6.8969, -6.88817, -6.51881]	[-6.51988, -6.54104, -6.91742, -6.1278]	[-6.15944, -6.12963, -6

## On-policy first visit Monte Carlo for $\varepsilon$-soft policies

In [12]:
def MC_without_exploring_starts(X =6,Y=0.9):
    grid = WindyGrid(X,X, wind=[0, 0, 1, 2, 1, 0])
    GAMMA = Y
    EPS = 0.4

    Q = {}
    returns = {}
    pairsVisited = {}
    for state in grid.stateSpacePlus:
        for action in grid.actionSpace.keys():
            Q[(state, action)] = 0
            returns[(state,action)] = 0
            pairsVisited[(state,action)] = 0

    policy = {}
    for state in grid.stateSpace:
        policy[state] = grid.possibleActions

    for i in range(1000000):
        statesActionsReturns = []
        if i % 100000 == 0:
            print('starting episode', i)
        observation, done = grid.reset()       
        memory = []
        steps = 0
        while not done:       
            if len(policy[observation]) > 1:
                action = np.random.choice(policy[observation])
            else:
                action = policy[observation]
            observation_, reward, done, info = grid.step(action)
            steps += 1
            if steps > 25 and not done:
                done = True
                reward = -steps
            memory.append((observation, action, reward))
            observation = observation_

        #append the terminal state
        memory.append((observation, action, reward))

        G = 0        
        last = True # start at t = T - 1
        for state, action, reward in reversed(memory):                                    
            if last:
                last = False
            else:
                statesActionsReturns.append((state,action,G))           
            G = GAMMA*G + reward
        statesActionsReturns.reverse()

        statesAndActions = []
        for state, action, G in statesActionsReturns:
            if (state, action) not in statesAndActions:
                pairsVisited[(state,action)] += 1
                returns[(state,action)] += (1 / pairsVisited[(state,action)])*(G-returns[(state,action)])                   
                Q[(state,action)] = returns[(state,action)]
                statesAndActions.append((state,action))
                values = np.array([Q[(state,a)] for a in grid.possibleActions])
                best = np.random.choice(np.where(values==values.max())[0])                    
                rand = np.random.random()
                if rand < 1 - EPS:
                    policy[state] = grid.possibleActions[best]
                else:                        
                    policy[state] = np.random.choice(grid.possibleActions)

    printQ(Q, grid)
    printPolicy(policy,grid)

## Del 3
- Använd *without exploring starts* Monte Carlo Metoden

1. Öka vindstyrkan med en enhet.
    - Hur ändras slutvärdesfunktionen?


2. Hur ändras policyn om man ändra gamma till:
    - 𝛾=0.5
    - 𝛾=0,9
    - 𝛾=0,95


3. Testa rutnätsvärlden i storlekarna:
    - 8x8
        - Ändra på vinden, vad händer med policyn?
        - Prova med 𝛾=0,9, vad händer med policyn?
    - 10x10
        - Ändra på vinden, vad händer med policyn?

In [13]:
# MC_without_exploring_starts(6,0.5)
# MC_without_exploring_starts(6,0.9)
# MC_without_exploring_starts(6,0.95)


starting episode 0
starting episode 100000
starting episode 200000
starting episode 300000
starting episode 400000
starting episode 500000
starting episode 600000
starting episode 700000
starting episode 800000
starting episode 900000
[-2.0, -2.0, -2.0, -2.0]	[-2.00001, -2.00001, -2.00001, -2.00001]	[-2.00002, -2.00003, -2.00002, -2.00003]	[-2.00007, -2.00006, -2.00008, -2.0001]	[-2.00021, -2.0002, -2.00017, -2.0006]	[-2.02788, -2.00769, -2.00048, -2.11629]	

[-2.00001, -2.00001, -2.00001, -2.00001]	[-2.00001, -2.00002, -2.00001, -2.00001]	[-2.00003, -2.00003, -2.00003, -2.00003]	[-2.00033, -2.00013, -2.00012, -2.00015]	[-10.00038, -2.00085, -2.00056, -2.00094]	[-2.0965, -2.00555, -39.0, -2.04841]	

[-2.01136, -2.00002, -2.01724, -2.00002]	[-2.00004, -2.00057, -2.00002, -2.00009]	[-2.00005, -2.00005, -2.00141, -2.00005]	[-2.00022, -2.00018, -2.00036, -2.00032]	[0, -2.00113, -2.14905, 0]	[-2.00071, -2.00103, -2.00452, -2.07227]	

[-2.07746, -2.00006, -2.00005, -2.0001]	[-2.00014, -2.000

KeyboardInterrupt: 

In [None]:
# MC_without_exploring_starts(8,0.5)

starting episode 0
starting episode 100000
starting episode 200000
starting episode 300000
starting episode 400000
starting episode 500000
starting episode 600000
starting episode 700000
starting episode 800000
starting episode 900000
[-2.0, -1.99992, -2.0, -1.99976]	[-2.00001, -1.9998, -2.00001, -1.99935]	[-2.00008, -2.00005, -2.00007, -1.99818]	[-2.00044, -2.00003, -2.00004, -1.99476]	[-2.00007, -2.00072, -2.00008, -1.98451]	[-2.00021, -1.95565, -2.00026, -1.99357]	[-2.00035, -1.98254, -2.00034, -1.99899]	[-2.00098, -1.99733, -2.08633, -2.00165]	

[-2.0, -1.99997, -2.0, -1.99978]	[-1.99994, -1.9998, -2.00003, -1.9994]	[-1.99954, -2.00005, -1.99999, -1.99847]	[-2.00526, -2.00161, -2.00462, -1.99841]	[-2.004, -2.002, -2.00356, -2.00558]	[-2.00034, -1.87481, -2.00032, -1.98759]	[-2.0006, -1.95408, -1.97496, -1.99745]	[-2.07366, -1.98644, -2.07395, -2.00279]	

[-2.00001, -2.00087, -2.00001, -2.00007]	[-2.00016, -2.00004, -2.00064, -1.99934]	[-1.99999, -2.00061, -2.00055, -1.99906]	[-2.02

In [None]:
# MC_without_exploring_starts(10,0.5)

starting episode 0
starting episode 100000
starting episode 200000
starting episode 300000
starting episode 400000
starting episode 500000
starting episode 600000
starting episode 700000
starting episode 800000
starting episode 900000
[-2.0, -1.99995, -2.0, -1.99986]	[-2.00001, -1.99986, -2.00001, -1.99962]	[-2.00038, -2.00001, -2.00004, -1.99894]	[-2.00059, -2.00008, -2.00026, -1.9969]	[-2.0001, -2.0001, -2.00021, -1.99111]	[-2.00013, -1.97935, -2.00015, -1.9775]	[-2.00024, -1.94854, -2.00024, -1.94317]	[-2.0005, -1.85604, -2.00051, -1.87188]	[-2.00189, -1.76744, -2.0021, -1.95507]	[-2.00452, -1.91031, -2.00333, -2.00453]	

[-2.00001, -1.99998, -2.00001, -1.99986]	[-1.99994, -1.99995, -2.00001, -1.9996]	[-1.99965, -2.00381, -1.99995, -1.99892]	[-2.00126, -2.0041, -2.00106, -2.00834]	[-2.00255, -2.00112, -2.00534, -2.16535]	[-2.00045, -1.95379, -2.00866, -1.95343]	[-1.99278, -1.87339, -1.99944, -1.88967]	[-2.0007, -1.64985, -1.95934, -1.64709]	[-2.00023, -1.0, -1.99964, -1.87946]	[-2.0

## Off-Policy Monte Carlo prediction

In [None]:
def MC_off_policy_prediction(X =6,Y=0.9):
    grid = WindyGrid(X,X, wind=[0,0,1,2,1,0])
    GAMMA = Y

    Q = {}
    C = {}
    for state in grid.stateSpacePlus:
        for action in grid.possibleActions:
            Q[(state,action)] = 0
            C[(state,action)] = 0
    
    targetPolicy = {}
    for state in grid.stateSpace:
        targetPolicy[state] = np.random.choice(grid.possibleActions)

    for i in range(1000000):
        if i % 100000 == 0:
            print(i)            
        behaviorPolicy = {}
        for state in grid.stateSpace:
            behaviorPolicy[state] = grid.possibleActions
        memory = []
        observation, done = grid.reset()
        steps = 0
        while not done:
            action = np.random.choice(behaviorPolicy[observation])
            observation_, reward, done, info = grid.step(action)
            steps += 1
            if steps > 25:
                done = True
                reward = -steps
            memory.append((observation, action, reward))
            observation = observation_
        memory.append((observation, action, reward))
        
        G = 0
        W = 1
        last = True
        for (state, action, reward) in reversed(memory):            
            if last:
                last = False
            else:
                C[state,action] += W
                Q[state,action] += (W / C[state,action])*(G-Q[state,action])
                prob = 1 if action in targetPolicy[state] else 0
                W *= prob/(1/len(behaviorPolicy[state]))
                if W == 0:
                    break
            G = GAMMA*G + reward
    printQ(Q, grid)
    printPolicy(targetPolicy,grid)

## Del 4
- Använd *off-policy prediction* Monte Carlo Metoden

1. Öka vindstyrkan med en enhet.
    - Hur ändras slutvärdesfunktionen?


2. Hur ändras policyn om man ändra gamma till:
    - 𝛾=0.5
    - 𝛾=0,9
    - 𝛾=0,95


3. Testa rutnätsvärlden i storlekarna:
    - 8x8
        - Ändra på vinden, vad händer med policyn?
        - Prova med 𝛾=0,9, vad händer med policyn?
    - 10x10
        - Ändra på vinden, vad händer med policyn?

In [None]:
# MC_off_policy_prediction(6,0.9)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
[-39.23447, -38.49958, -38.86191, -35.18632]	[-38.4638, -35.30407, -36.61028, -38.19792]	[-36.22497, -38.6344, -36.17559, -38.67755]	[-38.0837, -36.21945, -38.76053, -37.71766]	[-39.10127, -37.99514, -34.66265, -38.75918]	[-34.99953, -34.99247, -36.5026, -38.82799]	

[-38.02532, -34.86707, -36.70831, -37.92989]	[-29.72206, -38.19134, -36.45657, -34.4744]	[-36.61567, -39.92826, -40.00747, -38.79786]	[-39.84223, -37.30144, -39.55175, -37.29805]	[-39.40642, -37.84338, -39.39893, -36.89309]	[-31.19318, -36.80728, -39.25784, -39.2818]	

[-36.20115, -36.39004, -33.51558, -36.6242]	[-38.57087, -38.00882, -38.6983, -39.87783]	[-38.62826, -37.41869, -30.91866, -38.43645]	[-39.94549, -40.12308, -39.40274, -39.47136]	[-39.19754, -32.24161, -38.55747, -35.87728]	[-39.89892, -39.7686, -38.25772, -40.26807]	

[-38.56407, -37.51039, -35.32864, -38.18172]	[-37.4443, -39.45794, -36.07936, -37.41671]	[-39.24939, -39.69681, -38.63484, -38.0

In [None]:
# MC_off_policy_prediction(8,0.9)

In [None]:
# MC_off_policy_prediction(10,0.9)