In [1]:
import collections
import numpy as np
import random
import pickle
import os
import sys
import time
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
q_dict = collections.defaultdict(dict)
state_track = collections.defaultdict(dict)

def initialize_track_states():
    sample_q = [('x-x-x-x-x-x-x-x-x', (5,9)), \
                ('x-x-x-x-x-x-x-6-1', (0,3)), \
                ('4-x-x-x-x-x-x-x-7', (4,1)), \
                ('x-x-x-x-x-x-x-8-5', (4,9)), \
                ]
    
    for q_vals in sample_q:
        state = q_vals[0]
        action = q_vals[1]

        state_track[state][action] = []

def save_obj(obj, name):
    with open(name + '.pkl',"wb") as f:
        pickle.dump(obj,f,pickle.HIGHEST_PROTOCOL)

def save_track_states():
    for state in state_track.keys():
        for action in state_track[state].keys():
            if state in q_dict and action in q_dict[state]:
                state_track[state][action].append(q_dict[state][action])
                

In [3]:
from Env import TicTacToe

env = TicTacToe()

def q_state(state):
    return ("-".join(str(e) for e in state)).replace('nan','x')


def valid_action(state):
    valid_actions = [i for i in env.action_space(state)[0]]
    return valid_actions

def add_to_dict(state):
    state_1 = q_state(state)
    valid_act = valid_action(state)
    if state_1 not in q_dict.keys():
        for action in valid_act:
            q_dict[state_1][action] = 0



In [4]:
def epsilon_greedy(state, step_account,z):
    if step_account > z:
        q_states = q_state(state)
        action = max(q_dict[q_states],key=q_dict[q_states].get)

    else:
        action = random.choice(valid_action(state))

    return action

In [5]:
initialize_track_states()

In [6]:
episodes = 500
lr = 0.01
gamma = 0.95
debug = False

def run_agent(z, episodes=episodes, lr=lr, gamma=gamma, debug=debug,env=env):
    start_time = time.time()
    summary = {10:0, -10:0, 0:0}

    for episode in range(episodes):
        env = TicTacToe()
        current_state = env.state
        add_to_dict(current_state)

        step_count = 0
        total_reward = 0
        terminated = False

        if debug:
            print(f"\nEpisodes:{episode}")
            print(str(current_state[0]).replace('nan','x'), str(current_state[1]).replace('nan','x'),str(current_state[2]).replace('nan','x'))
            print(str(current_state[3]).replace('nan','x'), str(current_state[4]).replace('nan','x'),str(current_state[5]).replace('nan','x'))
            print(str(current_state[6]).replace('nan','x'), str(current_state[7]).replace('nan','x'),str(current_state[8]).replace('nan','x'))

        while terminated == False:
            current_action = epsilon_greedy(current_state,step_count,z)
            next_state, reward, terminated = env.step(current_state,current_action)
            add_to_dict(next_state)
            
            next_q_state = q_state(next_state)
            current_q_state = q_state(current_state)

            if len(q_dict[next_q_state]) > 0:
                next_q_value = max(q_dict[next_q_state], key=q_dict[next_q_state].get)
                q_dict[current_q_state][current_action] += lr *((reward + (gamma * q_dict[next_q_state][max_next]))-q_dict[current_q_state][current_action])

            current_state = next_state
            total_reward += 1 
            step_count += 1 


            if debug:
                print(f"\nEpisodes:{episode}")
                print(str(current_state[0]).replace('nan','x'), str(current_state[1]).replace('nan','x'),str(current_state[2]).replace('nan','x'))
                print(str(current_state[3]).replace('nan','x'), str(current_state[4]).replace('nan','x'),str(current_state[5]).replace('nan','x'))
                print(str(current_state[6]).replace('nan','x'), str(current_state[7]).replace('nan','x'),str(current_state[8]).replace('nan','x'))
                print()

        summary[reward] = summary[reward]+1
        total_steps = total_steps + step_count

        if debug:
            print(f"Results reward: {reward}")

        save_track_states()

    elapsed_time = time.time() - start_time
    save_obj(state_track,'state_tracked')
    save_obj(q_dict,"Policy")
    
    return elapsed_time, summary, round(total_steps/episodes,2)

df = {"z":[], "run_time":[], "win_count": [], "losse_count": [], "tie_count":[], "average_step_per_episode":[]}
env = TicTacToe()
for z in range(-1,6):
    q_dict = collections.defaultdict(dict)
    state_track = collections.defaultdict(dict)

    run_time, reward_summary, average_step_per_episodes = run_agent(z, episodes=5000, env=env) 
    df["z"].append(z)
    df["run_time"].append(run_time)
    df["win_count"].append(reward_summary[1])
    df["losse_count"].append(reward_summary[-1])
    df["tie_count"].append(reward_summary[0])
    df["average_step_per_episode"].append(average_step_per_episodes)
    print("Completed!!!")




TypeError: unhashable type: 'list'