In [None]:
import numpy as np
import gym
import pandas as pd

# NumPy compatibility: on some installations the `bool8` alias is missing,
# so recreate it from `np.bool_` when it is absent.
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

# Fixed seed so that a fresh "Restart & Run All" reproduces the same training
# history. Both random sources used by the training loop are seeded below.
SEED = 42
np.random.seed(SEED)  # drives the epsilon test (np.random.rand)

# FrozenLake environment: 4x4 map, deterministic transitions (is_slippery=False)
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)
env.action_space.seed(SEED)  # drives exploratory actions (env.action_space.sample)

# Q-table: one row per state, one column per action, initialised to zero
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Learning hyper-parameters
alpha = 0.3      # learning rate
gamma = 0.99     # discount factor
epsilon = 0.5    # exploration rate (probability of a random action)
max_episodes = 3000
grid_size = 4    # side length of the square map; used for wall detection

# Action index -> arrow symbol used in logs and table headers
action_map = {0: "←", 1: "↓", 2: "→", 3: "↑"}

# Wall-collision check
def is_invalid_move(state, action):
    """Return True when `action` would push the agent off the grid from `state`.

    `state` is decoded into (row, col) with `divmod(state, grid_size)`.
    Actions 0, 1, 2 and 3 are checked against the left, bottom, right and
    top edges respectively; any other action value is never reported as
    invalid.
    """
    row, col = divmod(state, grid_size)
    far_edge = grid_size - 1
    hits_wall = (
        (action == 0 and col == 0)            # left wall
        or (action == 1 and row == far_edge)  # bottom wall
        or (action == 2 and col == far_edge)  # right wall
        or (action == 3 and row == 0)         # top wall
    )
    return bool(hits_wall)

# Per-episode history: the steps taken plus a Q-table snapshot
episode_logs = []

# Training loop: tabular Q-learning with epsilon-greedy action selection
for episode in range(max_episodes):
    # Older gym releases return only the observation from reset(); gym >= 0.26
    # returns (observation, info). Accept either form.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    step = 0
    episode_info = {
        "episode": episode,
        "steps": [],
        "reached_goal": False
    }

    while not done:
        # Explore with probability epsilon, otherwise act greedily
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        if is_invalid_move(state, action):
            # Bumping into a wall: penalise and stay put without stepping the
            # environment (so these moves do not count toward its step limit).
            reward = -0.5
            next_state = state
        else:
            step_result = env.step(action)
            if len(step_result) == 5:
                # gym >= 0.26: (obs, reward, terminated, truncated, info)
                next_state, reward, terminated, truncated, info = step_result
            else:
                # Older gym: (obs, reward, done, info). A time-limit cut-off is
                # reported through info["TimeLimit.truncated"] when available.
                next_state, reward, env_done, info = step_result
                truncated = bool(info.get("TimeLimit.truncated", False))
                terminated = bool(env_done) and not truncated
            done = bool(terminated or truncated)

            # A genuine termination without reward is treated as a failure and
            # penalised. A time-limit truncation is NOT penalised: the last
            # move was an ordinary one and should keep its bootstrapped value.
            if terminated and reward == 0:
                reward = -1

        # Q-value update
        old_q = q_table[state, action]
        next_max = np.max(q_table[next_state])
        q_table[state, action] = old_q + alpha * (reward + gamma * next_max - old_q)

        # Record this step
        episode_info["steps"].append({
            "step": step,
            "state": state,
            "action": action_map[action],
            "next_state": next_state,
            "reward": reward
        })

        if done and reward == 1:
            episode_info["reached_goal"] = True

        state = next_state
        step += 1

    # Snapshot of the Q-table as it stands after this episode
    episode_info["q_table"] = q_table.copy()

    # Append the finished episode to the history
    episode_logs.append(episode_info)


# Final Q-table, rounded to two decimals for display
print("\n✅ Eğitim tamamlandı. Son Q-tablosu:")
final_df = pd.DataFrame(q_table.round(2), columns=["←", "↓", "→", "↑"])
print(final_df)

# Greedy policy: the highest-valued action in every state.
# Rows that are entirely zero resolve to action 0 (←) because argmax
# returns the first maximal index.
print("\n🧭 Optimal Policy:")
for state_idx, greedy_action in enumerate(q_table.argmax(axis=1)):
    print(f"State {state_idx}: {action_map[greedy_action]}")



def print_episode_log(ep_log):
    """Print one episode record: header, every step, then its Q-table snapshot.

    `ep_log` is a dict with the keys "episode", "reached_goal", "steps"
    (a list of dicts with "step", "state", "action", "next_state" and
    "reward") and "q_table" (the values shown under the four arrow columns,
    rounded to two decimals).
    """
    episode_no = ep_log['episode']
    success = ep_log['reached_goal']
    print(f"\n📘 Episode {episode_no} | Reached Goal: {success}")

    for entry in ep_log["steps"]:
        print(f"Step {entry['step']}: {entry['state']} → {entry['action']} → {entry['next_state']}, Reward: {entry['reward']}")

    # Q-table as it stood right after this episode (stored snapshot)
    print("📊 Bu episode SONRASINDAKİ Q-Tablosu:")
    rounded_snapshot = np.round(ep_log["q_table"], 2)
    print(pd.DataFrame(rounded_snapshot, columns=["←", "↓", "→", "↑"]))



# First 10 episodes
print("\n--- 🟢 İlk 10 Episode ---")
for early_log in episode_logs[:10]:
    print_episode_log(early_log)

# Indexes of the episodes in which the goal was reached
goal_episodes = [
    log_idx
    for log_idx, log in enumerate(episode_logs)
    if log["reached_goal"]
]

def get_range_around(idx, before=1, after=5):
    """Return the list of indexes from `idx - before` to `idx + after` inclusive.

    The window is clipped so that it never starts below 0 and never runs
    past the last index of `episode_logs`.
    """
    lower = idx - before
    if lower < 0:
        lower = 0

    upper = idx + after + 1
    total = len(episode_logs)
    if upper > total:
        upper = total

    return [i for i in range(lower, upper)]

# Windows around the first, second and last goal-reaching episodes.
# The slices are simply empty when there are not enough such episodes.
selected_indexes = set()
for goal_idx in goal_episodes[:2] + goal_episodes[-1:]:
    selected_indexes.update(get_range_around(goal_idx))

# The final 5 episodes
selected_indexes.update(range(len(episode_logs))[-5:])

# Print in ascending order; the set has already removed duplicates
print("\n--- 🔶 Belirli Episode Çıktıları ---")
for log_idx in sorted(selected_indexes):
    print_episode_log(episode_logs[log_idx])


  deprecation(
  deprecation(



✅ Eğitim tamamlandı. Son Q-tablosu:
       ←     ↓     →     ↑
0   0.44  0.95  0.95  0.44
1   0.94 -1.00  0.96  0.45
2   0.95  0.97  0.95  0.46
3   0.96 -0.99  0.45  0.45
4   0.45  0.96 -1.00  0.94
5   0.00  0.00  0.00  0.00
6  -1.00  0.98 -1.00  0.96
7   0.00  0.00  0.00  0.00
8   0.46 -1.00  0.97  0.95
9   0.96  0.98  0.98 -1.00
10  0.97  0.99 -1.00  0.97
11  0.00  0.00  0.00  0.00
12  0.00  0.00  0.00  0.00
13 -1.00  0.48  0.99  0.97
14  0.98  0.49  1.00  0.98
15  0.00  0.00  0.00  0.00

🧭 Optimal Policy:
State 0: ↓
State 1: →
State 2: ↓
State 3: ←
State 4: ↓
State 5: ←
State 6: ↓
State 7: ←
State 8: →
State 9: ↓
State 10: ↓
State 11: ←
State 12: ←
State 13: →
State 14: →
State 15: ←

--- 🟢 İlk 10 Episode ---

📘 Episode 0 | Reached Goal: False
Step 0: 0 → ← → 0, Reward: -0.5
Step 1: 0 → ↓ → 4, Reward: 0.0
Step 2: 4 → → → 5, Reward: -1
📊 Bu episode SONRASINDAKİ Q-Tablosu:
       ←    ↓    →    ↑
0  -0.15  0.0  0.0  0.0
1   0.00  0.0  0.0  0.0
2   0.00  0.0  0.0  0.0
3   0.00  0.0  0

In [None]:
# Q-table entry to track: (state, action). Messages below are built from
# these values so they stay correct when the target is changed.
target_state = 0
target_action = 0  # ←
target_label = action_map[target_action]

# Index of the first episode whose post-episode snapshot holds a positive
# Q-value for the tracked entry; None when that never happens.
found_index = next(
    (
        i
        for i, ep in enumerate(episode_logs)
        if ep["q_table"][target_state][target_action] > 0
    ),
    None,
)

if found_index is not None:
    print(f"\n🎯 State {target_state} için {target_label} yönü ilk kez pozitif oldu → Episode {found_index}")

    # Show the episode before it, the episode itself and the five after it
    # (get_range_around defaults: before=1, after=5, clipped to the log length)
    for idx in get_range_around(found_index):
        ep = episode_logs[idx]
        print(f"\n📘 Episode {ep['episode']} | Reached Goal: {ep['reached_goal']}")
        df = pd.DataFrame(np.round(ep["q_table"], 2), columns=list(action_map.values()))
        print(df)
else:
    print(f"❌ Hiçbir episode'da state {target_state} için {target_label} yönü pozitif olmamış.")



🎯 State 0 için ← yönü ilk kez pozitif oldu → Episode 71

📘 Episode 70 | Reached Goal: False
       ←     ↓     →     ↑
0  -0.38  0.91  0.14  0.22
1   0.49  0.00  0.02  0.00
2   0.00  0.18  0.00  0.00
3   0.00  0.00  0.00  0.00
4   0.35  0.93 -0.99  0.57
5   0.00  0.00  0.00  0.00
6  -0.51  0.59 -0.30  0.04
7   0.00  0.00  0.00  0.00
8   0.21 -1.00  0.95  0.45
9   0.76  0.71  0.97 -0.88
10  0.41  0.99 -0.76  0.28
11  0.00  0.00  0.00  0.00
12  0.00  0.00  0.00  0.00
13 -0.66 -0.15  0.94  0.00
14  0.45  0.00  1.00  0.29
15  0.00  0.00  0.00  0.00

📘 Episode 71 | Reached Goal: False
       ←     ↓     →     ↑
0   0.02  0.92  0.14  0.22
1   0.49  0.00  0.02  0.00
2   0.00  0.18  0.00  0.00
3   0.00  0.00  0.00  0.00
4   0.37  0.93 -0.99  0.67
5   0.00  0.00  0.00  0.00
6  -0.51  0.59 -0.30  0.04
7   0.00  0.00  0.00  0.00
8   0.21 -1.00  0.95  0.45
9   0.76  0.71  0.97 -0.88
10  0.41  0.99 -0.76  0.28
11  0.00  0.00  0.00  0.00
12  0.00  0.00  0.00  0.00
13 -0.66 -0.15  0.94  0.00
14  0.4