In [None]:
import numpy as np
import gym
import pandas as pd

# NumPy compatibility shim: `np.bool8` can be missing on some installations,
# so alias it to `np.bool_` when the attribute lookup fails.
try:
    np.bool8
except AttributeError:
    np.bool8 = np.bool_

# FrozenLake environment on the 4x4 map with slipping disabled (is_slippery=False)
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)

# Seed both random sources used for exploration so that a fresh
# "Restart & Run All" reproduces the same training run and printed logs.
SEED = 42
np.random.seed(SEED)         # drives np.random.rand() in the epsilon test
env.action_space.seed(SEED)  # drives env.action_space.sample()

# Q-table: one row per state, one column per action, initialised to zero
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Learning hyperparameters
alpha = 0.3      # learning rate
gamma = 0.99     # discount factor
epsilon = 0.5    # exploration rate (probability of a random action)
max_episodes = 3000
grid_size = 4    # side length of the square grid; must match map_name above

# Action index -> arrow symbol used in logs and table headers
action_map = {0: "‚Üê", 1: "‚Üì", 2: "‚Üí", 3: "‚Üë"}

# Wall-collision check
def is_invalid_move(state, action, size=None):
    """Return True when `action` would push the agent off the edge of the grid.

    The grid is square and states are numbered row by row, so
    ``divmod(state, size)`` yields ``(row, col)``.

    Args:
        state: Flat state index.
        action: Action index: 0 = left, 1 = down, 2 = right, 3 = up.
        size: Side length of the grid. Defaults to the module-level
            ``grid_size`` so existing two-argument calls keep working.

    Returns:
        bool: True if the move runs into the border on that side, False
        otherwise (including for any action index outside 0-3).
    """
    if size is None:
        size = grid_size
    row, col = divmod(state, size)
    last = size - 1

    if action == 0:   # left wall
        return bool(col == 0)
    if action == 1:   # bottom wall
        return bool(row == last)
    if action == 2:   # right wall
        return bool(col == last)
    if action == 3:   # top wall
        return bool(row == 0)
    return False

# Per-episode history: steps taken, whether the goal was reached, and a
# snapshot of the Q-table taken when the episode finished.
episode_logs = []

# Training loop
for episode in range(max_episodes):
    # Older gym releases return only the observation from reset(); newer
    # gym / gymnasium releases return an (observation, info) tuple.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    step = 0
    episode_info = {
        "episode": episode,
        "steps": [],
        "reached_goal": False
    }

    while not done:
        # Epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        if is_invalid_move(state, action):
            # Bumping into the border is handled locally: small penalty, the
            # agent stays in place, and the environment is not stepped.
            reward = -0.5
            next_state = state
        else:
            step_result = env.step(action)
            if len(step_result) == 5:
                # New step API: (obs, reward, terminated, truncated, info)
                next_state, reward, terminated, truncated, info = step_result
            else:
                # Legacy step API: (obs, reward, done, info). A step-limit
                # cut-off is reported through info["TimeLimit.truncated"].
                next_state, reward, finished, info = step_result
                truncated = bool(info.get("TimeLimit.truncated", False))
                terminated = bool(finished) and not truncated
            done = bool(terminated or truncated)

            # Only a genuine terminal transition that paid no reward counts as
            # a failure. An episode that merely hit a step limit must not be
            # penalised, otherwise ordinary moves pick up a spurious -1.
            if terminated and reward == 0:
                reward = -1

        # Q-value update (one-step Q-learning)
        old_q = q_table[state, action]
        next_max = np.max(q_table[next_state])
        q_table[state, action] = old_q + alpha * (reward + gamma * next_max - old_q)

        # Record this step
        episode_info["steps"].append({
            "step": step,
            "state": state,
            "action": action_map[action],
            "next_state": next_state,
            "reward": reward
        })

        if done and reward == 1:
            episode_info["reached_goal"] = True

        state = next_state
        step += 1

    # Copy so later updates do not alter this episode's snapshot
    episode_info["q_table"] = q_table.copy()

    # Append the log once the episode is over
    episode_logs.append(episode_info)


# Final Q-table, rounded to two decimals with one column per action arrow
print("\n‚úÖ Eƒüitim tamamlandƒ±. Son Q-tablosu:")
final_df = pd.DataFrame(
    data=np.round(q_table, decimals=2),
    columns=["‚Üê", "‚Üì", "‚Üí", "‚Üë"],
)
print(final_df)

# Greedy policy: for every state, the action with the highest Q-value
# (argmax picks the first one on ties).
print("\nüß≠ Optimal Policy:")
for s, best_a in zip(range(env.observation_space.n), np.argmax(q_table, axis=1)):
    print(f"State {s}: {action_map[best_a]}")



def print_episode_log(ep_log):
    """Print one episode's header, its step-by-step trace and its Q-table snapshot.

    Args:
        ep_log: Dict with the keys ``"episode"``, ``"reached_goal"``,
            ``"steps"`` (a list of dicts holding ``step``, ``state``,
            ``action``, ``next_state`` and ``reward``) and ``"q_table"``
            (a 2-D array with four columns, one per action).

    Returns:
        None. Everything is written to standard output.
    """
    # Collect every output line first, then emit them with a single print.
    lines = [f"\nüìò Episode {ep_log['episode']} | Reached Goal: {ep_log['reached_goal']}"]
    lines.extend(
        f"Step {step_info['step']}: {step_info['state']} ‚Üí {step_info['action']} ‚Üí {step_info['next_state']}, Reward: {step_info['reward']}"
        for step_info in ep_log["steps"]
    )

    # Q-table as it stood right after this episode (stored snapshot)
    lines.append("üìä Bu episode SONRASINDAKƒ∞ Q-Tablosu:")
    snapshot = pd.DataFrame(
        data=np.round(ep_log["q_table"], decimals=2),
        columns=["‚Üê", "‚Üì", "‚Üí", "‚Üë"],
    )
    lines.append(str(snapshot))

    print("\n".join(lines))



# First 10 episodes
print("\n--- üü¢ ƒ∞lk 10 Episode ---")
for ep in episode_logs[0:10]:
    print_episode_log(ep)

# Positions (within episode_logs) of the episodes that reached the goal
goal_episodes = [
    idx
    for idx in range(len(episode_logs))
    if episode_logs[idx]["reached_goal"]
]

def get_range_around(idx, before=1, after=5, total=None):
    """Return the episode indexes in a window around `idx`, clipped to valid bounds.

    Args:
        idx: Centre index of the window.
        before: How many indexes to include before `idx`.
        after: How many indexes to include after `idx`.
        total: Number of available episodes (exclusive upper bound). Defaults
            to ``len(episode_logs)`` so existing calls keep working.

    Returns:
        list[int]: Consecutive indexes from ``max(0, idx - before)`` up to and
        including ``min(total - 1, idx + after)``; empty if the window falls
        outside the valid range.
    """
    if total is None:
        total = len(episode_logs)
    start = max(0, idx - before)
    end = min(total, idx + after + 1)
    return list(range(start, end))

# Windows around the first, second and last goal-reaching episodes.
# The two slices yield nothing when goal_episodes is empty, and the set
# absorbs any overlap between the windows.
selected_indexes = {
    idx
    for goal_idx in goal_episodes[:2] + goal_episodes[-1:]
    for idx in get_range_around(goal_idx)
}

# Always include the last 5 episodes
selected_indexes |= set(range(max(0, len(episode_logs) - 5), len(episode_logs)))

# Print each selected episode once, in ascending order
print("\n--- üî∂ Belirli Episode √áƒ±ktƒ±larƒ± ---")
for i in sorted(selected_indexes):
    print_episode_log(episode_logs[i])


  deprecation(
  deprecation(



‚úÖ Eƒüitim tamamlandƒ±. Son Q-tablosu:
       ‚Üê     ‚Üì     ‚Üí     ‚Üë
0   0.44  0.95  0.95  0.44
1   0.94 -1.00  0.96  0.45
2   0.95  0.97  0.95  0.46
3   0.96 -0.99  0.45  0.45
4   0.45  0.96 -1.00  0.94
5   0.00  0.00  0.00  0.00
6  -1.00  0.98 -1.00  0.96
7   0.00  0.00  0.00  0.00
8   0.46 -1.00  0.97  0.95
9   0.96  0.98  0.98 -1.00
10  0.97  0.99 -1.00  0.97
11  0.00  0.00  0.00  0.00
12  0.00  0.00  0.00  0.00
13 -1.00  0.48  0.99  0.97
14  0.98  0.49  1.00  0.98
15  0.00  0.00  0.00  0.00

üß≠ Optimal Policy:
State 0: ‚Üì
State 1: ‚Üí
State 2: ‚Üì
State 3: ‚Üê
State 4: ‚Üì
State 5: ‚Üê
State 6: ‚Üì
State 7: ‚Üê
State 8: ‚Üí
State 9: ‚Üì
State 10: ‚Üì
State 11: ‚Üê
State 12: ‚Üê
State 13: ‚Üí
State 14: ‚Üí
State 15: ‚Üê

--- üü¢ ƒ∞lk 10 Episode ---

üìò Episode 0 | Reached Goal: False
Step 0: 0 ‚Üí ‚Üê ‚Üí 0, Reward: -0.5
Step 1: 0 ‚Üí ‚Üì ‚Üí 4, Reward: 0.0
Step 2: 4 ‚Üí ‚Üí ‚Üí 5, Reward: -1
üìä Bu episode SONRASINDAKƒ∞ Q-Tablosu:
       ‚Üê    ‚Üì    ‚Üí    ‚Üë
0  -

In [None]:
target_action = 0  # action index 0 (the left arrow in action_map)

# Index of the first episode whose Q-table snapshot has a positive value for
# state 0 and the target action; None when no episode qualifies.
found_index = next(
    (
        i
        for i, ep in enumerate(episode_logs)
        if ep["q_table"][0][target_action] > 0
    ),
    None,
)

if found_index is None:
    print("‚ùå Hi√ßbir episode'da state 0 i√ßin ‚Üê y√∂n√º pozitif olmamƒ±≈ü.")
else:
    print(f"\nüéØ State 0 i√ßin ‚Üê y√∂n√º ilk kez pozitif oldu ‚Üí Episode {found_index}")

    # Show the episode before the hit, the hit itself and the five after it,
    # clipped to the available range.
    start = max(0, found_index - 1)
    end = min(len(episode_logs), found_index + 6)

    for ep in episode_logs[start:end]:
        print(f"\nüìò Episode {ep['episode']} | Reached Goal: {ep['reached_goal']}")
        df = pd.DataFrame(
            data=np.round(ep["q_table"], decimals=2),
            columns=["‚Üê", "‚Üì", "‚Üí", "‚Üë"],
        )
        print(df)



üéØ State 0 i√ßin ‚Üê y√∂n√º ilk kez pozitif oldu ‚Üí Episode 71

üìò Episode 70 | Reached Goal: False
       ‚Üê     ‚Üì     ‚Üí     ‚Üë
0  -0.38  0.91  0.14  0.22
1   0.49  0.00  0.02  0.00
2   0.00  0.18  0.00  0.00
3   0.00  0.00  0.00  0.00
4   0.35  0.93 -0.99  0.57
5   0.00  0.00  0.00  0.00
6  -0.51  0.59 -0.30  0.04
7   0.00  0.00  0.00  0.00
8   0.21 -1.00  0.95  0.45
9   0.76  0.71  0.97 -0.88
10  0.41  0.99 -0.76  0.28
11  0.00  0.00  0.00  0.00
12  0.00  0.00  0.00  0.00
13 -0.66 -0.15  0.94  0.00
14  0.45  0.00  1.00  0.29
15  0.00  0.00  0.00  0.00

üìò Episode 71 | Reached Goal: False
       ‚Üê     ‚Üì     ‚Üí     ‚Üë
0   0.02  0.92  0.14  0.22
1   0.49  0.00  0.02  0.00
2   0.00  0.18  0.00  0.00
3   0.00  0.00  0.00  0.00
4   0.37  0.93 -0.99  0.67
5   0.00  0.00  0.00  0.00
6  -0.51  0.59 -0.30  0.04
7   0.00  0.00  0.00  0.00
8   0.21 -1.00  0.95  0.45
9   0.76  0.71  0.97 -0.88
10  0.41  0.99 -0.76  0.28
11  0.00  0.00  0.00  0.00
12  0.00  0.00  0.00  0.00
13