In [14]:
import sys
import numpy as np
import gym
import csv
from random import random

# Send everything written through sys.stderr to the file "log" in the working
# directory (text mode, buffering=1 => line-buffered).
# NOTE(review): the file handle is never closed, and anything that writes to
# sys.stderr after this cell runs lands in the file instead of the notebook.
sys.stderr = open("log", "w", buffering=1)

In [15]:
# Discount factor handed to value_iteration, and the per-episode multiplier
# applied to the exploration rate epsilon in the training loop.
gamma = 0.85
decay = 0.80

# Grayscale Ms. Pac-Man; a state is the flattened observation turned into a
# tuple so that it can be used as a dictionary key.
env = gym.make("MsPacmanDeterministic-v4", obs_type="grayscale", frameskip=8)
current_state = tuple(env.reset().flatten())
initial_state = current_state

# state -> current value estimate (filled in by value_iteration).
value_function = {}

# state -> action -> {"reward": ..., "next_state": ...}.
# The start state is seeded with a placeholder entry for action 1 whose
# "next_state" is the sentinel 0.
known_paths = {current_state: {1: {"reward": 0, "next_state": 0}}}

In [16]:
def policy_improvement(states, nA, value_function, gamma=0.9, paths=None):
    """Derive a greedy policy from the recorded transitions and state values.

    For every state the value of each *recorded* action is computed as
    ``reward + gamma * value_function[next_state]`` and the action with the
    highest value is selected.

    Args:
        states: Iterable of states to build the policy for. Every state must
            be a key of ``paths``.
        nA: Number of actions; recorded action ids must be in ``range(nA)``.
        value_function: Mapping state -> value estimate. Successor states that
            are missing from it are treated as having value 0.0.
        gamma: Discount factor.
        paths: Mapping state -> action -> {"reward", "next_state"}. Defaults
            to the module-level ``known_paths`` table.

    Returns:
        dict mapping each state to the chosen action (a plain ``int``).
        A state without any recorded action maps to action 0.
    """
    if paths is None:
        paths = known_paths

    policy = {}
    for s in states:
        # Start at -inf (not 0) so that actions that were never tried cannot
        # win the argmax against recorded actions with a low or equal value.
        value_per_action = np.full(shape=(nA,), fill_value=-np.inf)
        for action, outcome in paths[s].items():
            successor_value = value_function.get(outcome["next_state"], 0.0)
            value_per_action[action] = outcome["reward"] + gamma * successor_value

        # argmax returns the lowest index on ties (and 0 if nothing is recorded).
        policy[s] = int(np.argmax(value_per_action))

    return policy


def _explore_episode(epsilon):
    """Play one episode in the global ``env`` and record every transition.

    The action is sampled at random when the current state is not in the
    global ``best_policy`` or with probability ``epsilon``; otherwise the
    action stored in ``best_policy`` is used. Random draws are repeated until
    a non-zero action comes up. Each observed transition is written to the
    global ``known_paths`` table.

    Returns:
        (total_reward, think, rand): the episode score, the number of moves
        taken from ``best_policy`` and the number of random moves.
    """
    total_reward = 0
    rand = 0
    think = 0
    done = False
    s = tuple(env.reset().flatten())

    while not done:
        if s not in best_policy or random() < epsilon:
            a = env.action_space.sample()
            rand += 1
            while not a:
                a = env.action_space.sample()
        else:
            think += 1
            a = best_policy[s]

        next_state, reward, done, _ = env.step(a)
        next_state = tuple(next_state.flatten())
        total_reward += reward

        # setdefault: do not crash if the reset state was never registered.
        known_paths.setdefault(s, {})[a] = {"reward": reward, "next_state": next_state}

        if next_state not in known_paths:
            # Newly seen states get a placeholder entry for action 0 that
            # points at the sentinel successor 0.
            known_paths[next_state] = {0: {"reward": 0, "next_state": 0}}

        s = next_state

    return total_reward, think, rand


def _sweep_values(states, gamma, tol):
    """Update the global ``value_function`` in place until it stops changing.

    Each sweep sets the value of every state in ``states`` to the best
    ``reward + gamma * value(next_state)`` over its recorded actions; a
    successor equal to the sentinel 0 contributes the reward only.

    Returns:
        Number of sweeps performed.
    """
    numIters = 0
    maxChange = np.inf

    while maxChange > tol:
        numIters += 1
        maxChange = 0.0
        for s in states:
            transitions = known_paths[s]
            if not transitions:
                # Nothing recorded: leave the value alone instead of storing -inf.
                continue

            bestActionValue = -np.inf
            for outcome in transitions.values():
                successor = outcome['next_state']
                if successor not in value_function:
                    value_function[successor] = 0

                if successor == 0:
                    value_for_thisAction = outcome['reward']
                else:
                    value_for_thisAction = outcome['reward'] + gamma * value_function[successor]

                if value_for_thisAction > bestActionValue:
                    bestActionValue = value_for_thisAction

            # A state without a stored value counts as 0.0, so its first
            # assignment is included in the convergence check.
            previous = value_function.get(s, 0.0)
            maxChange = max(maxChange, abs(previous - bestActionValue))
            value_function[s] = bestActionValue

    return numIters


def value_iteration(states, nA, epsilon, gamma=0.9, tol=1e-3):
    """Explore for one episode, then run value iteration and extract a policy.

    Relies on module-level state: ``env``, ``best_policy``, ``known_paths``
    and ``value_function`` (the latter two are mutated).

    Args:
        states: States whose values are updated and for which the policy is
            built. NOTE(review): states first seen during this call's
            exploration are added to ``known_paths`` but are not part of
            ``states``, so they only enter the sweep on a later call.
        nA: Number of actions, forwarded to ``policy_improvement``.
        epsilon: Probability of taking a random action while exploring.
        gamma: Discount factor.
        tol: The sweep stops once the largest value change is <= ``tol``.

    Returns:
        (value_function, policy): the global value table and the policy
        computed by ``policy_improvement`` for ``states``.
    """
    total_reward, think, rand = _explore_episode(epsilon)
    print(f"Scorul dupa explorare {total_reward}")
    print(f'Miscari gandite {think}\nMiscari aleatoare {rand}')

    numIters = _sweep_values(states, gamma, tol)
    print(f"Value iteration converged after {numIters} steps\n")

    policy = policy_improvement(states, nA, value_function, gamma)
    return value_function, policy


In [17]:
def runEpisode(env, policy):
    """Play one episode following ``policy`` and print the collected score.

    The episode ends when the environment reports ``done`` or as soon as the
    current state has no entry in ``policy``. The environment is closed before
    returning.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple whose first three items are observation, reward, done)
            and ``close()``. Observations must provide ``flatten()``.
        policy: Mapping from state (tuple of the flattened observation) to
            the action to take.
    """
    total_reward = 0
    print("Joaca ce a invatat")
    done = False
    s = tuple(env.reset().flatten())

    # Stop *before* touching the score when the state is unknown: only
    # rewards returned by an actual env.step call are accumulated.
    while not done and s in policy:
        newObs, reward, done, _ = env.step(policy[s])
        total_reward += reward
        s = tuple(newObs.flatten())

    env.close()
    print(f"Scorul politicii {total_reward}")

In [18]:
if __name__=="__main__":
    # Training loop: each iteration plays one exploratory episode, re-runs
    # value iteration over the states known so far, lowers the exploration
    # rate and overwrites policy.csv with the latest policy.

    epsilon=1
    best_policy={}
    for i in range(20):
        print(f"##############Episode: {i}###############\nEpsilon: {epsilon}\n")
        # Snapshot of the states known before this episode's exploration.
        states=list(known_paths)
        # Note: tol=10e-3 is 0.01.
        best_value, best_policy = value_iteration(states, env.action_space.n, epsilon, gamma=gamma, tol=10e-3)
        # Decay the exploration rate, but never below 0.05.
        epsilon =max(0.05,epsilon*decay)
        # newline='' is what the csv module expects for files it writes;
        # without it extra blank rows can appear on some platforms.
        with open('policy.csv','w', newline='') as f:
            writer =csv.DictWriter(f, fieldnames=['State','Action'])
            writer.writeheader()
            writer.writerows(
                {'State': state, 'Action': action}
                for state, action in best_policy.items()
            )

        print(f"Stie {len(best_policy)} mutari\n")

##############Episode: 0###############
Epsilon: 1

Scorul dupa explorare 340.0
Miscari gandite 0
Miscari aleatoare 243
Value iteration converged after 1 steps

Stie 1 mutari

##############Episode: 1###############
Epsilon: 0.8

Scorul dupa explorare 220.0
Miscari gandite 1
Miscari aleatoare 246
Value iteration converged after 44 steps

Stie 197 mutari

##############Episode: 2###############
Epsilon: 0.6400000000000001

Scorul dupa explorare 420.0
Miscari gandite 16
Miscari aleatoare 284
Value iteration converged after 44 steps

Stie 388 mutari

##############Episode: 3###############
Epsilon: 0.5120000000000001

Scorul dupa explorare 250.0
Miscari gandite 17
Miscari aleatoare 170
Value iteration converged after 44 steps

Stie 634 mutari

##############Episode: 4###############
Epsilon: 0.40960000000000013

Scorul dupa explorare 290.0
Miscari gandite 15
Miscari aleatoare 245
Value iteration converged after 44 steps

Stie 767 mutari

##############Episode: 5###############
Epsilon: 0.

In [19]:
# Play one episode with the policy produced by the training loop; runEpisode
# prints the resulting score and closes env when the episode ends.
runEpisode(env, policy=best_policy)

Joaca ce a invatat
Scorul politicii 360.0
