<a href="https://colab.research.google.com/github/Bosy-Ayman/DSAI-402-RL/blob/main/Assignment3_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
NUM_DISKS = 3
PEGS = ('A', 'B', 'C')         # peg index -> label: 0->A, 1->B, 2->C
# A state is a tuple with one entry per disk (position 0 is disk 1); each entry
# is the index of the peg that disk sits on. The goal puts every disk on the
# last peg. It is derived from the constants above so it cannot fall out of
# sync when NUM_DISKS changes; with the current values it is (2, 2, 2).
GOAL_STATE = (len(PEGS) - 1,) * NUM_DISKS




In [27]:
def get_next_state(state, disk_to_move, to_peg_num):
    """Return a copy of ``state`` with one disk relocated.

    Args:
        state: Iterable of peg indices, one per disk; position ``i`` holds
            the peg of disk ``i + 1``.
        disk_to_move: 1-based number of the disk to relocate.
        to_peg_num: Peg index the disk ends up on.

    Returns:
        A new tuple; the input is not modified. No legality check is done.
    """
    # Work on a mutable copy so the caller's state stays untouched.
    peg_of_disk = [*state]
    slot = disk_to_move - 1  # disks are numbered from 1, positions from 0
    peg_of_disk[slot] = to_peg_num
    return tuple(peg_of_disk)



In [28]:

def is_legal_move(state, from_peg, to_peg):
    """Check whether the top disk of ``from_peg`` may be moved onto ``to_peg``.

    ``state`` holds one peg index per disk: position ``i`` describes disk
    ``i + 1``. A lower disk number means a smaller disk, so the top disk of a
    peg is the lowest-numbered disk located on it. The number of disks is
    taken from ``len(state)``, so states of any size are handled.

    Args:
        state: Sequence of peg indices, one per disk.
        from_peg: Peg index to take the top disk from.
        to_peg: Peg index to put that disk on.

    Returns:
        ``(True, disk)`` with the number of the disk that would move when the
        move is legal, otherwise ``(False, None)``. A move is illegal when
        both pegs are the same, when ``from_peg`` holds no disk, or when the
        moving disk is larger than the disk currently on top of ``to_peg``.
    """
    def _top_disk_on(peg):
        # Disks are scanned smallest-first, so the first match is the top disk.
        return next(
            (disk for disk, location in enumerate(state, start=1) if location == peg),
            None,
        )

    # Moving a disk onto the peg it already sits on changes nothing; it is not a move.
    if from_peg == to_peg:
        return False, None

    disk_to_move = _top_disk_on(from_peg)
    if disk_to_move is None:
        return False, None

    # Rule check: a disk may only rest on an empty peg or on a larger disk.
    top_disk_target = _top_disk_on(to_peg)
    if top_disk_target is not None and disk_to_move > top_disk_target:
        return False, None

    return True, disk_to_move

In [29]:

# Round-robin cursor shared across naive_policy calls: the disk number the
# next call considers first.
current_disk_to_try = 1

def naive_policy(state):
    """Pick a move for ``state`` by cycling through the disks round-robin.

    Starting at the module-level cursor ``current_disk_to_try``, each disk is
    considered in turn (wrapping around). For a disk, the other pegs are tried
    in index order 0, 1, 2; the first peg for which ``is_legal_move`` accepts
    the move *and* reports this disk as the one on top is chosen.

    Side effect: when a move is returned, the cursor is advanced to the disk
    after the one selected, so results depend on earlier calls.

    Args:
        state: Tuple of peg indices, one per disk (position ``i`` is disk ``i + 1``).

    Returns:
        ``(from_peg, to_peg, disk)`` for the chosen move, or
        ``(None, None, None)`` when ``state`` equals ``GOAL_STATE`` or no
        candidate disk has an acceptable move.
    """
    global current_disk_to_try

    no_move = (None, None, None)
    if state == GOAL_STATE:
        return no_move

    disk_count = len(state)
    first_slot = current_disk_to_try - 1  # 0-based position of the cursor

    for offset in range(disk_count):
        candidate = (first_slot + offset) % disk_count + 1
        source_peg = state[candidate - 1]

        for target_peg in (0, 1, 2):
            if target_peg == source_peg:
                continue
            allowed, top_disk = is_legal_move(state, source_peg, target_peg)
            # Only the disk on top of its peg can actually be moved.
            if not allowed or top_disk != candidate:
                continue
            current_disk_to_try = candidate % disk_count + 1
            return source_peg, target_peg, candidate

    return no_move


In [30]:

def print_pegs(state):
    """Print the disks on pegs A, B and C for a per-disk ``state``.

    The number of disks is taken from ``len(state)``, so the function works
    for states of any size rather than only the module's disk count.

    Args:
        state: Sequence of peg indices (0, 1 or 2), one per disk; position
            ``i`` is the peg of disk ``i + 1``.

    Prints one line per peg, listing disks from largest to smallest, followed
    by a blank line.
    """
    pegs = {0: [], 1: [], 2: []}
    # Walk from the largest disk down so each list reads bottom -> top.
    for disk in range(len(state), 0, -1):
        pegs[state[disk - 1]].append(disk)
    print(f"A: {pegs[0]}")
    print(f"B: {pegs[1]}")
    print(f"C: {pegs[2]}")
    print()





In [31]:

def print_pegs_stacks(stacks):
    """Print one ``<label>: <stack>`` line per peg, then a blank line.

    Args:
        stacks: Indexable by 0, 1 and 2; each entry is the disk list of the
            peg at that index. Labels are looked up in ``PEGS``.
    """
    for peg_index in (0, 1, 2):
        label, contents = PEGS[peg_index], stacks[peg_index]
        print(f"{label}: {contents}")
    print()




In [32]:
def legal_move(fr, to, stacks):
    """Move the top disk of peg ``fr`` onto peg ``to`` if the rules allow it.

    Despite the name this is not a pure check: on success ``stacks`` is
    mutated in place and the move is printed.

    Args:
        fr: Index of the peg to take a disk from.
        to: Index of the peg to put the disk on.
        stacks: List of per-peg disk lists; the last element of a list is the
            disk on top.

    Returns:
        True if the disk was moved; False if ``fr`` is empty or its top disk
        is not smaller than the top disk of ``to`` (``stacks`` is unchanged).
    """
    source = stacks[fr]
    if len(source) == 0:
        return False

    target = stacks[to]
    # A non-empty target only accepts a strictly smaller disk.
    blocked = len(target) != 0 and not source[-1] < target[-1]
    if blocked:
        return False

    disk = source.pop()
    target.append(disk)
    print(f"Move Disk {disk} {PEGS[fr]} → {PEGS[to]}")
    return True


In [33]:
def iterative_naive_solve(n=NUM_DISKS, src=0, aux=1, dest=2):
    """Play a fixed iterative Tower of Hanoi schedule and print every step.

    All ``n`` disks start on ``src``. The loop runs ``(1 << n) - 1`` rounds;
    round ``i`` works on one pair of pegs chosen by ``i % 3`` and moves a disk
    between them in whichever direction ``legal_move`` accepts. For even
    ``n`` the roles of ``aux`` and ``dest`` are swapped before the loop.

    Args:
        n: Number of disks.
        src: Index of the peg holding all disks initially.
        aux: Index of the spare peg.
        dest: Index of the target peg.

    Prints the initial layout, each move and resulting layout, and a final
    summary line. The summary is printed unconditionally; the final layout is
    not checked.
    """
    stacks = [[], [], []]
    stacks[src] = list(range(n, 0, -1))  # largest disk first, so the list end is the top
    total_moves = (1 << n) - 1
    if n % 2 == 0:
        aux, dest = dest, aux

    # Peg pair handled in round i, indexed by i % 3.
    pair_by_remainder = (
        (aux, dest),   # i % 3 == 0
        (src, dest),   # i % 3 == 1
        (src, aux),    # i % 3 == 2
    )

    print("Initial:")
    print_pegs_stacks(stacks)

    step = 0  # stays 0 when there are no rounds to play
    for step in range(1, total_moves + 1):
        first, second = pair_by_remainder[step % 3]
        # Try the pair in one direction, then fall back to the other.
        if not legal_move(first, second, stacks):
            legal_move(second, first, stacks)
        print_pegs_stacks(stacks)
    print(f"Goal reached in {step} moves.")


In [34]:

def one_iteration():
    """Score every state by the immediate reward of the naive policy's move.

    Every assignment of ``NUM_DISKS`` disks to the pegs in ``PEGS`` is
    enumerated. For each state ``naive_policy`` picks a move, and the state's
    value is 1 when the resulting state equals ``GOAL_STATE``, else 0. When
    the policy returns no move the state is treated as unchanged, so the goal
    state itself receives 1.

    Note: ``naive_policy`` keeps a module-level cursor between calls, so the
    moves chosen here depend on the enumeration order and on any calls made
    before this one.

    Returns:
        dict mapping each state, written as a tuple of peg labels (one per
        disk), to its 0/1 value.
    """
    # Local import: no shared import cell is visible in this notebook.
    from itertools import product

    value_function = {}
    # product() advances the last disk fastest, i.e. the same order as one
    # nested loop per disk, but without hard-coding the number of disks.
    for state in product(range(len(PEGS)), repeat=NUM_DISKS):
        from_peg, to_peg, disk = naive_policy(state)
        if from_peg is None:
            next_state = state
        else:
            next_state = get_next_state(state, disk, to_peg)

        reward = 1 if next_state == GOAL_STATE else 0
        value_function[tuple(PEGS[p] for p in state)] = reward
    return value_function


In [37]:
# Part 1: replay the iterative solver with its default arguments.
print("--- Iterative Naive Policy Simulation ---")
iterative_naive_solve()

# Part 2: one evaluation sweep of the naive policy, listed in sorted state order.
print("\n--- Value Function v_pi After One Iteration  ---")
v1 = one_iteration()
for s in sorted(v1):
    print(f"State {s}: {v1[s]}")

--- Iterative Naive Policy Simulation ---
Initial:
A: [3, 2, 1]
B: []
C: []

Move Disk 1 A → C
A: [3, 2]
B: []
C: [1]

Move Disk 2 A → B
A: [3]
B: [2]
C: [1]

Move Disk 1 C → B
A: [3]
B: [2, 1]
C: []

Move Disk 3 A → C
A: []
B: [2, 1]
C: [3]

Move Disk 1 B → A
A: [1]
B: [2]
C: [3]

Move Disk 2 B → C
A: [1]
B: []
C: [3, 2]

Move Disk 1 A → C
A: []
B: []
C: [3, 2, 1]

Goal reached in 7 moves.

--- Value Function v_pi After One Iteration  ---
State ('A', 'A', 'A'): 0
State ('A', 'A', 'B'): 0
State ('A', 'A', 'C'): 0
State ('A', 'B', 'A'): 0
State ('A', 'B', 'B'): 0
State ('A', 'B', 'C'): 0
State ('A', 'C', 'A'): 0
State ('A', 'C', 'B'): 0
State ('A', 'C', 'C'): 0
State ('B', 'A', 'A'): 0
State ('B', 'A', 'B'): 0
State ('B', 'A', 'C'): 0
State ('B', 'B', 'A'): 0
State ('B', 'B', 'B'): 0
State ('B', 'B', 'C'): 0
State ('B', 'C', 'A'): 0
State ('B', 'C', 'B'): 0
State ('B', 'C', 'C'): 0
State ('C', 'A', 'A'): 0
State ('C', 'A', 'B'): 0
State ('C', 'A', 'C'): 0
State ('C', 'B', 'A'): 0
State 