In [1]:
import sys
import numpy as np
import gym
import csv
from random import random
from tqdm import tqdm
import pickle

# Send everything written to stderr into a line-buffered file named "log"
# (presumably to keep progress bars and library warnings out of the cell
# output -- TODO confirm). The handle is intentionally left open for the
# lifetime of the kernel.
sys.stderr = open("log", mode="w", buffering=1)

In [2]:
# Shared environment: grayscale observations, 60 emulator frames per step.
env = gym.make("MsPacmanDeterministic-v4", obs_type="grayscale", frameskip=60)

# States are identified by the flattened observation turned into a tuple so
# that they can be used as dictionary keys.
current_state = tuple(env.reset().flatten())
initial_state = current_state

gamma = 0.85  # discount factor handed to value_iteration
decay = 0.97  # multiplicative factor applied to epsilon after training

# state -> estimated value; filled in by value_iteration.
value_function = {}

# state -> {action: {"reward": ..., "next_state": ...}}.
# The start state is seeded with a zero-reward placeholder on action 1 whose
# next_state of 0 marks "no successor recorded".
known_paths = {current_state: {1: {"reward": 0, "next_state": 0}}}

In [3]:
def policy_improvement(states, nA, value_function, gamma=0.9, paths=None):
    """Derive a greedy policy from a state-value table.

    Every recorded transition of a state is scored as
    ``reward + gamma * V(next_state)`` and the best-scoring action is kept.

    Args:
        states: Iterable of state keys; each must be a key of the
            transition table.
        nA: Number of actions (length of the per-state score vector).
        value_function: Mapping ``state -> value``. A successor that is
            missing from the mapping is treated as having value 0.
        gamma: Discount factor applied to the successor's value.
        paths: Optional transition table of the form
            ``{state: {action: {"reward": r, "next_state": s2}}}``.
            When omitted, the module-level ``known_paths`` is used.

    Returns:
        dict mapping each state to the index (plain ``int``) of its best
        recorded action; 0 if the state has no recorded action at all.
    """
    transitions = known_paths if paths is None else paths

    policy = {}
    for s in states:
        # Start from -inf rather than 0 so an action that was never recorded
        # can neither tie with nor beat an action that actually was.
        value_per_action = np.full(nA, -np.inf)
        for action, outcome in transitions[s].items():
            successor_value = value_function.get(outcome["next_state"], 0.0)
            value_per_action[action] = outcome["reward"] + gamma * successor_value

        # argmax returns the lowest index on ties (and 0 when all are -inf).
        policy[s] = int(np.argmax(value_per_action))

    return policy


def value_iteration(nA, epsilon, episodes, gamma=0.9, tol=10):
    """Explore at random, then solve the recorded model with value iteration.

    Phase 1 plays ``episodes`` episodes in the module-level ``env`` using
    uniformly sampled actions and stores every observed transition in the
    module-level ``known_paths`` table. A later observation of the same
    (state, action) pair overwrites the earlier one.

    Phase 2 repeatedly sweeps over all recorded states and updates the
    module-level ``value_function`` in place until the largest tracked
    change within one sweep is at most ``tol``.

    Args:
        nA: Number of actions; forwarded to ``policy_improvement``.
        epsilon: Accepted for interface compatibility but not used here;
            exploration is always fully random.
        episodes: Number of exploration episodes to play.
        gamma: Discount factor.
        tol: Convergence threshold on the largest per-sweep value change.

    Returns:
        Tuple ``(value_function, policy)`` where ``policy`` is the result of
        ``policy_improvement(states, nA, value_function, gamma)``.
    """
    # ---- Phase 1: random exploration -------------------------------------
    for _ in tqdm(range(episodes)):
        s = tuple(env.reset().flatten())
        done = False
        while not done:
            a = env.action_space.sample()
            obs, reward, done, _ = env.step(a)
            next_state = tuple(obs.flatten())

            # setdefault: the state returned by reset() is not guaranteed to
            # have been registered already, so do not assume the key exists.
            known_paths.setdefault(s, {})[a] = {"reward": reward, "next_state": next_state}

            if next_state not in known_paths:
                # Zero-reward placeholder (next_state 0 = "no successor") so
                # that every recorded state has at least one entry to score.
                known_paths[next_state] = {0: {"reward": 0, "next_state": 0}}

            s = next_state

    print('Incepe convergenta')

    # ---- Phase 2: in-place value sweeps ----------------------------------
    states = list(known_paths.keys())
    numIters = 0
    maxChange = np.inf
    while maxChange > tol:
        numIters += 1
        maxChange = 0.0
        for s in states:
            bestActionValue = -np.inf
            for outcome in known_paths[s].values():
                successor = outcome['next_state']
                if successor not in value_function:
                    value_function[successor] = 0

                if successor == 0:
                    # Placeholder transition: nothing to bootstrap from.
                    value_for_thisAction = outcome['reward']
                else:
                    value_for_thisAction = outcome['reward'] + gamma * value_function[successor]

                if value_for_thisAction > bestActionValue:
                    bestActionValue = value_for_thisAction

            # Only states that already have an estimate contribute to the
            # convergence check; first-time estimates are not counted.
            if s in value_function:
                maxChange = max(maxChange, abs(value_function[s] - bestActionValue))

            value_function[s] = bestActionValue

    print(f"Value iteration converged after {numIters} steps\n")
    policy = policy_improvement(states, nA, value_function, gamma)
    return value_function, policy


In [4]:
def runEpisode(env, policy):
    """Play one episode following ``policy`` and print the collected reward.

    The episode ends when the environment reports ``done`` or as soon as the
    agent reaches a state that has no entry in ``policy``. The environment
    is closed before the score is printed.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning the
            4-tuple ``(observation, reward, done, info)``, and ``close()``.
        policy: Mapping from flattened-observation tuples to actions.
    """
    total_reward = 0
    print("Joaca ce a invatat")
    s = tuple(env.reset().flatten())

    done = False
    while not done:
        if s not in policy:
            # Unknown state: stop without touching the score. No step was
            # taken, so there is no new reward or observation to account for.
            break

        obs, reward, done, _ = env.step(policy[s])
        s = tuple(obs.flatten())
        total_reward += reward

    env.close()
    print(f"Scorul politicii {total_reward}")

In [5]:
if __name__ == "__main__":
    # One training round: explore for 70 episodes, solve the recorded model,
    # then persist the resulting policy to disk.
    epsilon = 1
    best_policy = {}

    print(f"##############Episode: {0}###############\nEpsilon: {epsilon}\n")

    # Snapshot of the states known before training starts.
    states = list(known_paths)

    best_value, best_policy = value_iteration(
        env.action_space.n,
        epsilon,
        70,
        gamma=gamma,
        tol=10e-3,
    )

    # Decay the exploration rate, never letting it drop below 0.05.
    epsilon = max(0.05, epsilon * decay)

    with open('policy_2.pkl', 'wb') as f:
        pickle.dump(best_policy, f)

    print(f"Stie {len(best_policy.keys())} mutari\n")

##############Episode: 0###############
Epsilon: 1



A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


Incepe convergenta
Value iteration converged after 52 steps

Stie 2130 mutari



In [6]:
# Replay the learned policy once in the shared environment and print its score.
runEpisode(env, best_policy)

Joaca ce a invatat
Scorul politicii 910.0
