In [1]:
import collections
import numpy as np
import random
import pickle
import os
import sys
import time
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Q-table: maps a state string to its {action: value} dict.
q_dict = collections.defaultdict(dict)
# Per-(state, action) value histories; save_track_states() appends to these lists.
state_track = collections.defaultdict(dict)

def initialize_track_states():
    """Register the (state, action) pairs whose Q-values are tracked.

    Every pair gets an empty history list in the module-level ``state_track``;
    an existing history for the same pair is replaced by the empty list.
    """
    tracked_pairs = (
        ('x-x-x-x-x-x-x-x-x', (5, 9)),
        ('x-x-x-x-x-x-x-6-1', (0, 3)),
        ('4-x-x-x-x-x-x-x-7', (4, 1)),
        ('x-x-x-x-x-x-x-8-5', (4, 9)),
    )

    for board_key, move in tracked_pairs:
        state_track[board_key][move] = []

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``<name>.pkl`` with the highest protocol."""
    target_path = name + '.pkl'
    with open(target_path, "wb") as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def save_track_states():
    """Append the current Q-value of each tracked (state, action) pair to its history.

    Pairs that have no entry in ``q_dict`` yet are skipped.
    """
    for board_key, histories in state_track.items():
        if board_key not in q_dict:
            continue
        q_row = q_dict[board_key]
        for move, history in histories.items():
            if move in q_row:
                history.append(q_row[move])
                

In [3]:
from Env import TicTacToe

# Shared environment instance; valid_action() asks it for the available actions.
env = TicTacToe()

def q_state(state):
    """Build the Q-table key for a board: cells joined by '-', with 'nan' written as 'x'."""
    cells = [str(cell) for cell in state]
    joined = "-".join(cells)
    return joined.replace('nan', 'x')


def valid_action(state):
    """Return the agent's available actions for ``state`` as a list.

    Actions that the environment yields as lists are converted to tuples so
    they can be used as dictionary keys; any other action is passed through.
    """
    agent_moves, _env_moves = env.action_space(state)
    moves = []
    for move in agent_moves:
        # Lists are unhashable, tuples are not.
        moves.append(tuple(move) if isinstance(move, list) else move)
    return moves


def add_to_dict(state):
    """Ensure ``state`` has a row in the Q-table with every valid action set to 0.

    Args:
        state: Board in the form accepted by ``q_state`` and ``valid_action``.

    The row is stored in ``q_dict``, the table that ``epsilon_greedy`` and
    ``run_agent`` read. It is created only the first time a state is seen, so
    values learned earlier are never reset.
    """
    # q_state() returns a string, which is already usable as a dictionary key.
    state_key = q_state(state)
    if state_key in q_dict:
        return

    row = {}
    for action in valid_action(state):
        # Dictionary keys must be hashable; normalise anything that is not a tuple.
        if not isinstance(action, tuple):
            action = tuple(action)
        row[action] = 0
    q_dict[state_key] = row





In [4]:
def epsilon_greedy(state, step_account,z):
    """Choose the agent's next action for ``state``.

    Args:
        state: Current board.
        step_account: Number of steps already taken in the current episode.
        z: Exploration threshold; while ``step_account <= z`` the action is random.

    Returns:
        The highest-valued stored action once ``step_account > z``. Otherwise,
        or when no Q-values are stored for the state, a random valid action.
    """
    if step_account > z:
        # .get() avoids inserting an empty row into the defaultdict for unseen states.
        known_values = q_dict.get(q_state(state))
        if known_values:
            return max(known_values, key=known_values.get)

    # Exploration, and the fallback for states where max() would fail on an empty row.
    return random.choice(valid_action(state))

In [5]:
# Create the empty history lists for the tracked (state, action) pairs.
initialize_track_states()

In [6]:
# Default training settings; bound as the default arguments of run_agent below.
episodes = 500  # number of episodes per training run
lr = 0.01  # learning rate of the Q-value update
gamma = 0.95  # discount applied to the next state's Q-value
debug = False  # when True, print the board after every step

def _show_board(board):
    """Print a 9-cell board as three rows, rendering 'nan' cells as 'x'."""
    for row_start in (0, 3, 6):
        print(*(str(cell).replace('nan', 'x') for cell in board[row_start:row_start + 3]))


def run_agent(z, episodes=episodes, lr=lr, gamma=gamma, debug=debug,env=env):
    """Train the Q-table by playing ``episodes`` games and report summary statistics.

    Args:
        z: Exploration threshold passed to ``epsilon_greedy``.
        episodes: Number of games to play.
        lr: Learning rate of the Q-value update.
        gamma: Discount factor applied to the best Q-value of the next state.
        debug: When True, print the board after every step and the final reward.
        env: Kept for backward compatibility; a fresh ``TicTacToe`` is created
            for every episode regardless of the value passed in.

    Returns:
        Tuple ``(elapsed_seconds, summary, average_steps_per_episode)`` where
        ``summary`` counts the episodes by their final reward (10, -10 or 0).

    Side effects:
        Updates ``q_dict``, calls ``save_track_states()`` after every episode
        and writes ``state_tracked.pkl`` and ``Policy.pkl``.
    """
    start_time = time.time()
    summary = {10:0, -10:0, 0:0}
    total_steps = 0  # accumulated over all episodes for the average below

    for episode in range(episodes):
        env = TicTacToe()
        current_state = env.state
        add_to_dict(current_state)

        step_count = 0
        terminated = False

        if debug:
            print(f"\nEpisodes:{episode}")
            _show_board(current_state)

        while not terminated:
            current_action = epsilon_greedy(current_state,step_count,z)
            next_state, reward, terminated = env.step(current_state,current_action)
            add_to_dict(next_state)

            current_q_state = q_state(current_state)
            next_values = q_dict.get(q_state(next_state))
            # A finished game has no future return, so only the reward is learned;
            # the same applies when the next state has no stored actions.
            if terminated or not next_values:
                best_next = 0
            else:
                best_next = max(next_values.values())

            row = q_dict[current_q_state]
            old_value = row.get(current_action, 0)
            row[current_action] = old_value + lr * (reward + gamma * best_next - old_value)

            current_state = next_state
            step_count += 1

            if debug:
                print(f"\nEpisodes:{episode}")
                _show_board(current_state)
                print()

        # `reward` is the reward of the last step of the episode.
        summary[reward] = summary[reward]+1
        total_steps = total_steps + step_count

        if debug:
            print(f"Results reward: {reward}")

        save_track_states()

    elapsed_time = time.time() - start_time
    save_obj(state_track,'state_tracked')
    save_obj(q_dict,"Policy")

    # Guard the average against a run with zero episodes.
    average_steps = round(total_steps/episodes,2) if episodes else 0.0
    return elapsed_time, summary, average_steps

# Results of the sweep over the exploration threshold z, one list entry per run.
df = {"z":[], "run_time":[], "win_count": [], "losse_count": [], "tie_count":[], "average_step_per_episode":[]}
env = TicTacToe()
for z in range(-1,6):
    # Every run starts from an empty Q-table and fresh tracking histories.
    q_dict = collections.defaultdict(dict)
    state_track = collections.defaultdict(dict)
    # Re-register the tracked pairs: the reset above discarded them, and
    # save_track_states() only records pairs present in state_track.
    initialize_track_states()

    run_time, reward_summary, average_steps_per_episode = run_agent(z, episodes=50000, env=env) 
    df["z"].append(z)
    df["run_time"].append(run_time)
    # run_agent keys its summary by the final reward: 10 (win), -10 (loss), 0 (tie).
    df["win_count"].append(reward_summary[10])
    df["losse_count"].append(reward_summary[-10])
    df["tie_count"].append(reward_summary[0])
    df["average_step_per_episode"].append(average_steps_per_episode)
    print("Completed!!!")




TypeError: unhashable type: 'list'

In [7]:
import collections
import numpy as np
import random
import pickle
import os
import sys
import time
import datetime
import matplotlib.pyplot as plt
from Env import TicTacToe

# NOTE(review): epsilon_greedy, run_agent and save_track_states use `q_dict`
# (lowercase); confirm whether this capitalised table is still needed.
Q_dict = collections.defaultdict(dict)
# Per-(state, action) value histories; save_track_states() appends to these lists.
state_track = collections.defaultdict(dict)

def initialize_track_states():
    """Give each tracked (state, action) pair an empty history in ``state_track``.

    A history that already exists for one of the pairs is replaced.
    """
    tracked_states = ('x-x-x-x-x-x-x-x-x',
                      'x-x-x-x-x-x-x-6-1',
                      '4-x-x-x-x-x-x-x-7',
                      'x-x-x-x-x-x-x-8-5')
    tracked_actions = ((5, 9), (0, 3), (4, 1), (4, 9))

    for board_key, move in zip(tracked_states, tracked_actions):
        state_track[board_key][move] = []

def save_obj(obj, name):
    """Serialise ``obj`` with pickle (highest protocol) into ``<name>.pkl``."""
    file_name = f"{name}.pkl" if isinstance(name, str) else name + '.pkl'
    with open(file_name, "wb") as out_file:
        pickle.dump(obj, out_file, pickle.HIGHEST_PROTOCOL)

def save_track_states():
    """Record the current Q-value of every tracked (state, action) pair.

    A pair is skipped while ``q_dict`` holds no value for it.
    """
    for board_key, histories in state_track.items():
        if board_key not in q_dict:
            continue
        current_values = q_dict[board_key]
        for move, history in histories.items():
            if move in current_values:
                history.append(current_values[move])

def q_state(state):
    """Encode a board as a Q-table key: integer cells joined by '-', NaN floats as 'x'."""
    def _cell_text(cell):
        if isinstance(cell, float) and np.isnan(cell):
            return 'x'
        return str(int(cell))

    return "-".join(map(_cell_text, state))

def valid_action(state):
    """List the agent's available actions for ``state``.

    Actions delivered as lists are turned into tuples so they can be used as
    dictionary keys; every other action is returned as-is.
    """
    agent_moves, _env_moves = env.action_space(state)
    hashable_moves = []
    for move in agent_moves:
        if isinstance(move, list):
            move = tuple(move)
        hashable_moves.append(move)
    return hashable_moves

def convert_to_hashable(obj):
    """Return a hashable equivalent of ``obj``, converting containers recursively.

    Args:
        obj: Any value; lists, tuples, dicts and sets are converted, everything
            else is returned unchanged.

    Returns:
        * list / tuple -> tuple of converted items
        * dict -> tuple of ``(key, converted_value)`` pairs in insertion order
        * set -> sorted tuple of converted items
        * anything else -> ``obj`` itself
    """
    if isinstance(obj, (list, tuple)):
        # Tuples are rebuilt too: a tuple that holds a list is itself unhashable.
        return tuple(convert_to_hashable(item) for item in obj)
    if isinstance(obj, dict):
        return tuple((key, convert_to_hashable(value)) for key, value in obj.items())
    if isinstance(obj, set):
        # Sorting gives equal sets the same tuple regardless of iteration order.
        return tuple(sorted(convert_to_hashable(item) for item in obj))
    # Numbers, strings and other values are returned as they are.
    return obj

def add_to_dict(state):
    """Ensure ``state`` has a row in the Q-table with every valid action set to 0.

    Args:
        state: Board in the form accepted by ``q_state`` and ``valid_action``.

    The row is stored in ``q_dict``, the table that ``epsilon_greedy``,
    ``run_agent`` and ``save_track_states`` read. It is created only the first
    time a state is seen, so values learned earlier are never reset.
    """
    # q_state() returns a string, which is already usable as a dictionary key.
    state_key = q_state(state)
    if state_key in q_dict:
        return

    row = {}
    for action in valid_action(state):
        # Dictionary keys must be hashable; normalise anything that is not a tuple.
        if not isinstance(action, tuple):
            action = tuple(action)
        row[action] = 0
    q_dict[state_key] = row





def epsilon_greedy(state, step_account, z):
    """Choose the agent's next action for ``state``.

    Returns ``None`` when the state has no valid action. Once
    ``step_account > z`` and Q-values are stored for the state, the
    highest-valued stored action is returned; otherwise a random valid action.
    """
    board_key = q_state(state)
    candidates = valid_action(state)

    if not candidates:
        return None

    # Only look the row up when exploiting, and only if it already exists.
    q_row = q_dict[board_key] if (step_account > z and board_key in q_dict) else None
    if q_row:
        return max(q_row, key=q_row.get)
    return random.choice(candidates)

def run_agent(z, episodes=500, lr=0.01, gamma=0.95, debug=False, env=None):
    """Train the Q-table by playing ``episodes`` games and report summary statistics.

    Args:
        z: Exploration threshold passed to ``epsilon_greedy``.
        episodes: Number of games to play.
        lr: Learning rate of the Q-value update.
        gamma: Discount factor applied to the best Q-value of the next state.
        debug: When True, print the board at the start of and after every step.
        env: Kept for backward compatibility; a fresh ``TicTacToe`` is created
            for every episode regardless of the value passed in.

    Returns:
        Tuple ``(elapsed_seconds, summary, average_steps_per_episode)`` where
        ``summary`` counts the episodes whose final reward was 10, -10 or 0.

    Side effects:
        Updates ``q_dict``, calls ``save_track_states()`` after every episode,
        prints a progress line every 1000 episodes and writes
        ``state_tracked.pkl`` and ``Policy.pkl``.
    """
    start_time = time.time()
    summary = {10: 0, -10: 0, 0: 0}
    total_steps = 0

    for episode in range(episodes):
        env = TicTacToe()  # Reset environment
        current_state = env.state
        add_to_dict(current_state)

        step_count = 0
        terminated = False
        reward = None  # stays None if the episode ends before any step is taken

        if debug:
            print(f"\nEpisode: {episode}")
            print_state(current_state)

        while not terminated:
            current_action = epsilon_greedy(current_state, step_count, z)
            if current_action is None:
                break

            next_state, reward, terminated = env.step(current_state, current_action)
            add_to_dict(next_state)

            current_q_state = q_state(current_state)
            next_values = q_dict.get(q_state(next_state))
            # A finished game has no future return, so only the reward is learned;
            # the same applies when the next state has no stored actions.
            if terminated or not next_values:
                next_max_q = 0
            else:
                next_max_q = max(next_values.values())

            row = q_dict[current_q_state]
            old_value = row.get(current_action, 0)
            row[current_action] = old_value + lr * (reward + gamma * next_max_q - old_value)

            current_state = next_state
            step_count += 1

            if debug:
                print(f"\nStep: {step_count}")
                print_state(current_state)

        # Only the three known outcomes are counted.
        if reward in summary:
            summary[reward] += 1
        total_steps += step_count

        if episode % 1000 == 0:
            print(f"Episode {episode} completed")

        save_track_states()

    elapsed_time = time.time() - start_time
    save_obj(state_track, 'state_tracked')
    save_obj(q_dict, "Policy")

    # Guard the average against a run with zero episodes.
    average_steps = round(total_steps/episodes, 2) if episodes else 0.0
    return elapsed_time, summary, average_steps

def print_state(state):
    """Print the board as three rows of three cells (NaN floats as 'x'), then a blank line."""
    def _symbol(cell):
        is_empty = isinstance(cell, float) and np.isnan(cell)
        return 'x' if is_empty else str(int(cell))

    for row_start in (0, 3, 6):
        row_cells = state[row_start:row_start + 3]
        print(" ".join(_symbol(cell) for cell in row_cells))
    print()

# Main execution
# Results of the sweep over the exploration threshold z, one list entry per run.
df = {
    "z": [], 
    "run_time": [], 
    "win_count": [], 
    "losse_count": [], 
    "tie_count": [],
    "average_step_per_episode": []
}

env = TicTacToe()
for z in range(-1, 6):
    print(f"\nStarting training with z={z}")
    # Every run starts from an empty Q-table and fresh tracking histories.
    q_dict = collections.defaultdict(dict)
    state_track = collections.defaultdict(dict)
    # Re-register the tracked pairs: the reset above discarded them, and
    # save_track_states() only records pairs present in state_track.
    initialize_track_states()
    
    run_time, reward_summary, average_steps_per_episode = run_agent(z, episodes=50000, env=env)
    
    df["z"].append(z)
    df["run_time"].append(run_time)
    # run_agent keys its summary by the final reward: 10, -10 or 0.
    df["win_count"].append(reward_summary[10])
    df["losse_count"].append(reward_summary[-10])
    df["tie_count"].append(reward_summary[0])
    df["average_step_per_episode"].append(average_steps_per_episode)
    print(f"Completed z={z}!")


Starting training with z=-1


TypeError: unhashable type: 'list'