In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# ============================================================================
# SUPER TIC-TAC-TOE - KAGGLE GPU TRAINING CELL
# ============================================================================
# Run this single cell in Kaggle with GPU enabled
# Downloads: super_ttt_agents.zip (optimized for quick load in Streamlit)
# ============================================================================

import numpy as np
from collections import deque
import random
import json
import zipfile
import io
from tqdm import tqdm
import time

# ============================================================================
# ENVIRONMENT
# ============================================================================

class SuperTicTacToe:
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.small_boards = [np.zeros((3, 3), dtype=int) for _ in range(9)]
        self.meta_board = np.zeros(9, dtype=int)
        self.current_player = 1
        self.active_board = None
        self.game_over = False
        self.winner = None
        self.move_history = []
        return self.get_state()
    
    def get_state(self):
        small_boards_flat = tuple(tuple(board.flatten()) for board in self.small_boards)
        return (small_boards_flat, tuple(self.meta_board), self.active_board)
    
    def get_available_actions(self):
        actions = []
        if self.game_over:
            return actions
        
        if self.active_board is not None and self.meta_board[self.active_board] == 0:
            boards_to_check = [self.active_board]
        else:
            boards_to_check = [i for i in range(9) if self.meta_board[i] == 0]
        
        for board_idx in boards_to_check:
            for r in range(3):
                for c in range(3):
                    if self.small_boards[board_idx][r, c] == 0:
                        actions.append((board_idx, r, c))
        
        return actions
    
    def make_move(self, action):
        if self.game_over:
            return self.get_state(), 0, True
        
        board_idx, row, col = action
        available = self.get_available_actions()
        if action not in available:
            return self.get_state(), -100, True
        
        self.small_boards[board_idx][row, col] = self.current_player
        self.move_history.append((action, self.current_player))
        
        if self._check_small_board_win(board_idx, self.current_player):
            self.meta_board[board_idx] = self.current_player
            reward = 10
        elif self._check_small_board_full(board_idx):
            self.meta_board[board_idx] = -1
            reward = 0
        else:
            reward = 0
        
        if self._check_meta_win(self.current_player):
            self.game_over = True
            self.winner = self.current_player
            return self.get_state(), 1000, True
        
        if np.all(self.meta_board != 0):
            self.game_over = True
            self.winner = 0
            return self.get_state(), 0, True
        
        next_board = row * 3 + col
        if self.meta_board[next_board] == 0:
            self.active_board = next_board
        else:
            self.active_board = None
        
        self.current_player = 3 - self.current_player
        return self.get_state(), reward, False
    
    def _check_small_board_win(self, board_idx, player):
        board = self.small_boards[board_idx]
        for i in range(3):
            if np.all(board[i, :] == player) or np.all(board[:, i] == player):
                return True
        if board[0, 0] == player and board[1, 1] == player and board[2, 2] == player:
            return True
        if board[0, 2] == player and board[1, 1] == player and board[2, 0] == player:
            return True
        return False
    
    def _check_small_board_full(self, board_idx):
        return np.all(self.small_boards[board_idx] != 0)
    
    def _check_meta_win(self, player):
        meta = self.meta_board.reshape(3, 3)
        for i in range(3):
            if np.all(meta[i, :] == player) or np.all(meta[:, i] == player):
                return True
        if meta[0, 0] == player and meta[1, 1] == player and meta[2, 2] == player:
            return True
        if meta[0, 2] == player and meta[1, 1] == player and meta[2, 0] == player:
            return True
        return False
    
    def evaluate_position(self, player):
        if self.winner == player:
            return 100000
        if self.winner == (3 - player):
            return -100000
        if self.game_over:
            return 0
        
        opponent = 3 - player
        score = 0
        
        meta = self.meta_board.reshape(3, 3)
        score += self._count_meta_lines(player, 2) * 500
        score += self._count_meta_lines(player, 1) * 100
        score -= self._count_meta_lines(opponent, 2) * 600
        score -= self._count_meta_lines(opponent, 1) * 100
        
        strategic_boards = [4]
        corner_boards = [0, 2, 6, 8]
        
        for b in strategic_boards:
            if self.meta_board[b] == player:
                score += 200
            elif self.meta_board[b] == opponent:
                score -= 200
        
        for b in corner_boards:
            if self.meta_board[b] == player:
                score += 100
            elif self.meta_board[b] == opponent:
                score -= 100
        
        for board_idx in range(9):
            if self.meta_board[board_idx] == 0:
                board_score = self._evaluate_small_board(board_idx, player)
                score += board_score * 0.5
        
        if self.active_board is not None:
            if self.meta_board[self.active_board] == 0:
                score += 50
        
        return score
    
    def _count_meta_lines(self, player, count):
        meta = self.meta_board.reshape(3, 3)
        lines = 0
        
        for i in range(3):
            row = meta[i, :]
            col = meta[:, i]
            if np.sum(row == player) == count and np.sum(row == (3-player)) == 0:
                lines += 1
            if np.sum(col == player) == count and np.sum(col == (3-player)) == 0:
                lines += 1
        
        diag1 = [meta[0, 0], meta[1, 1], meta[2, 2]]
        diag2 = [meta[0, 2], meta[1, 1], meta[2, 0]]
        
        if diag1.count(player) == count and (3-player) not in diag1:
            lines += 1
        if diag2.count(player) == count and (3-player) not in diag2:
            lines += 1
        
        return lines
    
    def _evaluate_small_board(self, board_idx, player):
        board = self.small_boards[board_idx]
        opponent = 3 - player
        score = 0
        
        lines_2 = 0
        lines_1 = 0
        opp_lines_2 = 0
        
        for i in range(3):
            row = board[i, :]
            col = board[:, i]
            
            if np.sum(row == player) == 2 and np.sum(row == opponent) == 0:
                lines_2 += 1
            elif np.sum(row == player) == 1 and np.sum(row == opponent) == 0:
                lines_1 += 1
            
            if np.sum(row == opponent) == 2 and np.sum(row == player) == 0:
                opp_lines_2 += 1
            
            if np.sum(col == player) == 2 and np.sum(col == opponent) == 0:
                lines_2 += 1
            elif np.sum(col == player) == 1 and np.sum(col == opponent) == 0:
                lines_1 += 1
            
            if np.sum(col == opponent) == 2 and np.sum(col == player) == 0:
                opp_lines_2 += 1
        
        diag1 = [board[0, 0], board[1, 1], board[2, 2]]
        diag2 = [board[0, 2], board[1, 1], board[2, 0]]
        
        if diag1.count(player) == 2 and opponent not in diag1:
            lines_2 += 1
        if diag2.count(player) == 2 and opponent not in diag2:
            lines_2 += 1
        if diag1.count(opponent) == 2 and player not in diag1:
            opp_lines_2 += 1
        if diag2.count(opponent) == 2 and player not in diag2:
            opp_lines_2 += 1
        
        score = lines_2 * 10 + lines_1 * 2 - opp_lines_2 * 12
        return score

# ============================================================================
# AGENT
# ============================================================================

class SuperTTTAgent:
    def __init__(self, player_id, lr=0.1, gamma=0.95, epsilon=1.0,
                 epsilon_decay=0.9995, epsilon_min=0.05):
        self.player_id = player_id
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        
        self.q_table = {}
        self.experience_replay = deque(maxlen=50000)
        self.minimax_depth = 2
        
        self.wins = 0
        self.losses = 0
        self.draws = 0
    
    def get_q_value(self, state, action):
        return self.q_table.get((state, action), 0.0)
    
    def choose_action(self, env, training=True):
        available = env.get_available_actions()
        if not available:
            return None
        
        # Immediate tactics
        for action in available:
            sim = self._simulate_move(env, action, self.player_id)
            board_idx = action[0]
            if sim.meta_board[board_idx] == self.player_id and env.meta_board[board_idx] == 0:
                if sim._check_meta_win(self.player_id):
                    return action
        
        for action in available:
            sim = self._simulate_move(env, action, self.player_id)
            if sim.winner == self.player_id:
                return action
        
        opponent = 3 - self.player_id
        for action in available:
            sim = self._simulate_move(env, action, opponent)
            if sim.winner == opponent:
                return action
        
        # Strategic planning
        if training and random.random() < self.epsilon:
            strategic_actions = [a for a in available if a[0] in [4, 0, 2, 6, 8]]
            if strategic_actions:
                return random.choice(strategic_actions)
            return random.choice(available)
        
        best_score = -float('inf')
        best_actions = []
        
        alpha = -float('inf')
        beta = float('inf')
        
        for action in available:
            sim = self._simulate_move(env, action, self.player_id)
            score = self._minimax(sim, self.minimax_depth - 1, alpha, beta, False)
            
            q_boost = self.get_q_value(env.get_state(), action) * 0.05
            total_score = score + q_boost
            
            if total_score > best_score:
                best_score = total_score
                best_actions = [action]
            elif abs(total_score - best_score) < 0.01:
                best_actions.append(action)
            
            alpha = max(alpha, best_score)
        
        return random.choice(best_actions) if best_actions else random.choice(available)
    
    def _minimax(self, env, depth, alpha, beta, is_maximizing):
        if env.winner == self.player_id:
            return 10000 + depth
        if env.winner == (3 - self.player_id):
            return -10000 - depth
        if env.game_over:
            return 0
        if depth == 0:
            return env.evaluate_position(self.player_id)
        
        available = env.get_available_actions()
        
        if is_maximizing:
            max_eval = -float('inf')
            for action in available:
                sim = self._simulate_move(env, action, self.player_id)
                eval_score = self._minimax(sim, depth - 1, alpha, beta, False)
                max_eval = max(max_eval, eval_score)
                alpha = max(alpha, eval_score)
                if beta <= alpha:
                    break
            return max_eval
        else:
            min_eval = float('inf')
            opponent = 3 - self.player_id
            for action in available:
                sim = self._simulate_move(env, action, opponent)
                eval_score = self._minimax(sim, depth - 1, alpha, beta, True)
                min_eval = min(min_eval, eval_score)
                beta = min(beta, eval_score)
                if beta <= alpha:
                    break
            return min_eval
    
    def _simulate_move(self, env, action, player):
        sim = SuperTicTacToe()
        sim.small_boards = [board.copy() for board in env.small_boards]
        sim.meta_board = env.meta_board.copy()
        sim.current_player = player
        sim.active_board = env.active_board
        sim.make_move(action)
        return sim
    
    def update_q_value(self, state, action, reward, next_state, next_actions):
        current_q = self.get_q_value(state, action)
        if next_actions:
            max_next_q = max([self.get_q_value(next_state, a) for a in next_actions])
        else:
            max_next_q = 0
        
        td_error = reward + self.gamma * max_next_q - current_q
        new_q = current_q + self.lr * td_error
        self.q_table[(state, action)] = new_q
    
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    
    def reset_stats(self):
        self.wins = 0
        self.losses = 0
        self.draws = 0

# ============================================================================
# TRAINING FUNCTIONS
# ============================================================================

def play_game(env, agent1, agent2, training=True):
    env.reset()
    game_history = []
    agents = {1: agent1, 2: agent2}
    
    while not env.game_over:
        current_player = env.current_player
        current_agent = agents[current_player]
        
        state = env.get_state()
        action = current_agent.choose_action(env, training)
        
        if action is None:
            break
        
        game_history.append((state, action, current_player))
        next_state, reward, done = env.make_move(action)
        
        if training:
            next_actions = env.get_available_actions()
            current_agent.update_q_value(state, action, reward, next_state, next_actions)
        
        if done:
            if env.winner == 1:
                agent1.wins += 1
                agent2.losses += 1
                if training:
                    _update_outcome(agent1, game_history, 1, 100)
                    _update_outcome(agent2, game_history, 2, -50)
            elif env.winner == 2:
                agent2.wins += 1
                agent1.losses += 1
                if training:
                    _update_outcome(agent1, game_history, 1, -50)
                    _update_outcome(agent2, game_history, 2, 100)
            else:
                agent1.draws += 1
                agent2.draws += 1
                if training:
                    _update_outcome(agent1, game_history, 1, -10)
                    _update_outcome(agent2, game_history, 2, -10)
    
    return env.winner

def _update_outcome(agent, history, player_id, final_reward):
    agent_moves = [(s, a) for s, a, p in history if p == player_id]
    for i in range(len(agent_moves) - 1, -1, -1):
        state, action = agent_moves[i]
        discount = agent.gamma ** (len(agent_moves) - 1 - i)
        adjusted_reward = final_reward * discount
        current_q = agent.get_q_value(state, action)
        new_q = current_q + agent.lr * (adjusted_reward - current_q)
        agent.q_table[(state, action)] = new_q

# ============================================================================
# OPTIMIZED SERIALIZATION (Critical for fast upload/download)
# ============================================================================

def serialize_q_table_optimized(q_table):
    """Ultra-compact serialization using string keys"""
    serialized = {}
    
    for (state, action), value in q_table.items():
        # Compact string representation
        # State: small_boards (9x9), meta_board (9), active_board
        small_boards_str = ','.join([''.join(map(str, board)) for board in state[0]])
        
        # FIX: Use comma delimiter for meta_board to handle -1 (draws) correctly
        meta_str = ','.join(map(str, state[1]))
        
        active_str = str(state[2]) if state[2] is not None else 'N'
        
        # Action: (board_idx, row, col)
        action_str = f"{action[0]}{action[1]}{action[2]}"
        
        # Combine into single compact key
        key = f"{small_boards_str}|{meta_str}|{active_str}|{action_str}"
        
        # Store only significant digits to reduce size
        serialized[key] = round(float(value), 4)
    
    return serialized
    

def deserialize_q_table_optimized(serialized):
    """Deserialize the compact format"""
    q_table = {}
    
    for key, value in serialized.items():
        parts = key.split('|')
        
        # Parse small boards
        boards_str = parts[0].split(',')
        small_boards = tuple(
            tuple(int(c) for c in board_str)
            for board_str in boards_str
        )
        
        # FIX: Split by comma to parse negative numbers (-1) correctly
        meta_board = tuple(int(x) for x in parts[1].split(','))
        
        # Parse active board
        active_board = None if parts[2] == 'N' else int(parts[2])
        
        # Parse action
        action_str = parts[3]
        action = (int(action_str[0]), int(action_str[1]), int(action_str[2]))
        
        # Reconstruct state
        state = (small_boards, meta_board, active_board)
        
        q_table[(state, action)] = value
    
    return q_table
    

def create_training_zip(agent1, agent2, config, training_stats):
    """Create optimized zip file for download"""
    
    print("üì¶ Packaging agents...")
    
    agent1_data = {
        "q_table": serialize_q_table_optimized(agent1.q_table),
        "epsilon": round(agent1.epsilon, 6),
        "lr": agent1.lr,
        "gamma": agent1.gamma,
        "minimax_depth": agent1.minimax_depth,
        "wins": agent1.wins,
        "losses": agent1.losses,
        "draws": agent1.draws
    }
    
    agent2_data = {
        "q_table": serialize_q_table_optimized(agent2.q_table),
        "epsilon": round(agent2.epsilon, 6),
        "lr": agent2.lr,
        "gamma": agent2.gamma,
        "minimax_depth": agent2.minimax_depth,
        "wins": agent2.wins,
        "losses": agent2.losses,
        "draws": agent2.draws
    }
    
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED, compresslevel=9) as zf:
        zf.writestr("agent1.json", json.dumps(agent1_data))
        zf.writestr("agent2.json", json.dumps(agent2_data))
        zf.writestr("config.json", json.dumps(config))
        zf.writestr("training_stats.json", json.dumps(training_stats))
        
        # Add metadata
        metadata = {
            "trained_episodes": config.get("episodes", 0),
            "final_q_size_agent1": len(agent1.q_table),
            "final_q_size_agent2": len(agent2.q_table),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }
        zf.writestr("metadata.json", json.dumps(metadata, indent=2))
    
    buffer.seek(0)
    return buffer

# ============================================================================
# MAIN TRAINING LOOP
# ============================================================================

print("üéØ Super Tic-Tac-Toe - GPU Training Session")
print("=" * 60)

# Hyperparameters (optimized for speed and quality)
EPISODES = 1000  # Adjust based on available time
LR1 = 0.15
GAMMA1 = 0.98
MINIMAX_DEPTH1 = 3

LR2 = 0.15
GAMMA2 = 0.98
MINIMAX_DEPTH2 = 3

EPSILON_DECAY = 0.9998
UPDATE_FREQ = 100

print(f"\n‚öôÔ∏è  Configuration:")
print(f"   Episodes: {EPISODES:,}")
print(f"   Learning Rate: {LR1}")
print(f"   Gamma: {GAMMA1}")
print(f"   Minimax Depth: {MINIMAX_DEPTH1}")
print(f"   Epsilon Decay: {EPSILON_DECAY}")
print()

# Initialize
env = SuperTicTacToe()
agent1 = SuperTTTAgent(1, lr=LR1, gamma=GAMMA1, epsilon_decay=EPSILON_DECAY)
agent1.minimax_depth = MINIMAX_DEPTH1
agent2 = SuperTTTAgent(2, lr=LR2, gamma=GAMMA2, epsilon_decay=EPSILON_DECAY)
agent2.minimax_depth = MINIMAX_DEPTH2

# Training stats
training_stats = {
    'episodes': [],
    'agent1_wins': [],
    'agent2_wins': [],
    'draws': [],
    'agent1_epsilon': [],
    'agent2_epsilon': [],
    'agent1_q_size': [],
    'agent2_q_size': []
}

# Training loop with progress bar
print("üöÄ Training started...\n")
start_time = time.time()

for episode in tqdm(range(1, EPISODES + 1), desc="Training"):
    play_game(env, agent1, agent2, training=True)
    agent1.decay_epsilon()
    agent2.decay_epsilon()
    
    # Update stats periodically
    if episode % UPDATE_FREQ == 0:
        training_stats['episodes'].append(episode)
        training_stats['agent1_wins'].append(agent1.wins)
        training_stats['agent2_wins'].append(agent2.wins)
        training_stats['draws'].append(agent1.draws)
        training_stats['agent1_epsilon'].append(round(agent1.epsilon, 6))
        training_stats['agent2_epsilon'].append(round(agent2.epsilon, 6))
        training_stats['agent1_q_size'].append(len(agent1.q_table))
        training_stats['agent2_q_size'].append(len(agent2.q_table))
        
        # Print progress
        win_rate_1 = agent1.wins / episode * 100
        win_rate_2 = agent2.wins / episode * 100
        draw_rate = agent1.draws / episode * 100
        
        print(f"\nüìä Episode {episode:,}/{EPISODES:,}")
        print(f"   Agent 1: {agent1.wins:,} wins ({win_rate_1:.1f}%) | Œµ={agent1.epsilon:.4f} | Q={len(agent1.q_table):,}")
        print(f"   Agent 2: {agent2.wins:,} wins ({win_rate_2:.1f}%) | Œµ={agent2.epsilon:.4f} | Q={len(agent2.q_table):,}")
        print(f"   Draws: {agent1.draws:,} ({draw_rate:.1f}%)")

elapsed_time = time.time() - start_time
print(f"\n‚úÖ Training complete in {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")

# Final statistics
print("\n" + "=" * 60)
print("üìà FINAL STATISTICS")
print("=" * 60)
print(f"Agent 1 (Blue):")
print(f"  Wins: {agent1.wins:,} ({agent1.wins/EPISODES*100:.1f}%)")
print(f"  Q-Table Size: {len(agent1.q_table):,} states")
print(f"  Final Epsilon: {agent1.epsilon:.6f}")
print()
print(f"Agent 2 (Red):")
print(f"  Wins: {agent2.wins:,} ({agent2.wins/EPISODES*100:.1f}%)")
print(f"  Q-Table Size: {len(agent2.q_table):,} states")
print(f"  Final Epsilon: {agent2.epsilon:.6f}")
print()
print(f"Draws: {agent1.draws:,} ({agent1.draws/EPISODES*100:.1f}%)")
print()

# Create config
config = {
    "episodes": EPISODES,
    "lr1": LR1,
    "gamma1": GAMMA1,
    "minimax1": MINIMAX_DEPTH1,
    "lr2": LR2,
    "gamma2": GAMMA2,
    "minimax2": MINIMAX_DEPTH2,
    "epsilon_decay": EPSILON_DECAY,
    "training_time_seconds": round(elapsed_time, 2)
}

# Save to zip
print("üíæ Creating download package...")
zip_buffer = create_training_zip(agent1, agent2, config, training_stats)

# Save to file
output_filename = "super_ttt_agents_adv.zip"
with open(output_filename, "wb") as f:
    f.write(zip_buffer.getvalue())

file_size_mb = len(zip_buffer.getvalue()) / (1024 * 1024)
print(f"‚úÖ Saved to: {output_filename} ({file_size_mb:.2f} MB)")
print()
print("=" * 60)
print("üéâ SUCCESS! Download the .zip file and upload to Streamlit!")
print("=" * 60)
print()
print("üìù Quick Stats:")
print(f"   ‚Ä¢ Total Episodes: {EPISODES:,}")
print(f"   ‚Ä¢ Training Time: {elapsed_time/60:.1f} minutes")
print(f"   ‚Ä¢ Agent 1 Q-States: {len(agent1.q_table):,}")
print(f"   ‚Ä¢ Agent 2 Q-States: {len(agent2.q_table):,}")
print(f"   ‚Ä¢ Package Size: {file_size_mb:.2f} MB")
print()
print("üöÄ Ready for deployment in Streamlit!")

üéØ Super Tic-Tac-Toe - GPU Training Session

‚öôÔ∏è  Configuration:
   Episodes: 10,000
   Learning Rate: 0.12
   Gamma: 0.98
   Minimax Depth: 2
   Epsilon Decay: 0.9998

üöÄ Training started...



Training:   1%|          | 100/10000 [00:22<43:54,  3.76it/s]


üìä Episode 100/10,000
   Agent 1: 42 wins (42.0%) | Œµ=0.9802 | Q=2,769
   Agent 2: 30 wins (30.0%) | Œµ=0.9802 | Q=2,767
   Draws: 28 (28.0%)


Training:   2%|‚ñè         | 200/10000 [00:48<1:03:09,  2.59it/s]


üìä Episode 200/10,000
   Agent 1: 85 wins (42.5%) | Œµ=0.9608 | Q=5,483
   Agent 2: 64 wins (32.0%) | Œµ=0.9608 | Q=5,488
   Draws: 51 (25.5%)


Training:   3%|‚ñé         | 300/10000 [01:15<39:09,  4.13it/s]  


üìä Episode 300/10,000
   Agent 1: 124 wins (41.3%) | Œµ=0.9418 | Q=8,149
   Agent 2: 94 wins (31.3%) | Œµ=0.9418 | Q=8,163
   Draws: 82 (27.3%)


Training:   4%|‚ñç         | 400/10000 [01:46<53:32,  2.99it/s]  


üìä Episode 400/10,000
   Agent 1: 161 wins (40.2%) | Œµ=0.9231 | Q=10,771
   Agent 2: 128 wins (32.0%) | Œµ=0.9231 | Q=10,794
   Draws: 111 (27.8%)


Training:   4%|‚ñç         | 437/10000 [02:00<43:49,  3.64it/s]  


KeyboardInterrupt: 

In [8]:
# ============================================================================
# SUPER TIC-TAC-TOE - TITAN VANGUARD EDITION (20-MIN OPTIMIZATION)
# ============================================================================
# Run this single cell in Kaggle with GPU enabled
# Optimization Goal: Maximize human-crushing ability in <20 mins
# ============================================================================

import numpy as np
from collections import deque
import random
import json
import zipfile
import io
from tqdm import tqdm
import time

# ============================================================================
# ENVIRONMENT
# ============================================================================

class SuperTicTacToe:
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.small_boards = [np.zeros((3, 3), dtype=int) for _ in range(9)]
        self.meta_board = np.zeros(9, dtype=int)
        self.current_player = 1
        self.active_board = None
        self.game_over = False
        self.winner = None
        self.move_history = []
        return self.get_state()
    
    def get_state(self):
        small_boards_flat = tuple(tuple(board.flatten()) for board in self.small_boards)
        return (small_boards_flat, tuple(self.meta_board), self.active_board)
    
    def get_available_actions(self):
        actions = []
        if self.game_over:
            return actions
        
        if self.active_board is not None and self.meta_board[self.active_board] == 0:
            boards_to_check = [self.active_board]
        else:
            boards_to_check = [i for i in range(9) if self.meta_board[i] == 0]
        
        for board_idx in boards_to_check:
            for r in range(3):
                for c in range(3):
                    if self.small_boards[board_idx][r, c] == 0:
                        actions.append((board_idx, r, c))
        
        return actions
    
    def make_move(self, action):
        if self.game_over:
            return self.get_state(), 0, True
        
        board_idx, row, col = action
        available = self.get_available_actions()
        if action not in available:
            return self.get_state(), -100, True
        
        self.small_boards[board_idx][row, col] = self.current_player
        self.move_history.append((action, self.current_player))
        
        if self._check_small_board_win(board_idx, self.current_player):
            self.meta_board[board_idx] = self.current_player
            reward = 10
        elif self._check_small_board_full(board_idx):
            self.meta_board[board_idx] = -1
            reward = 0
        else:
            reward = 0
        
        if self._check_meta_win(self.current_player):
            self.game_over = True
            self.winner = self.current_player
            return self.get_state(), 1000, True
        
        if np.all(self.meta_board != 0):
            self.game_over = True
            self.winner = 0
            return self.get_state(), 0, True
        
        next_board = row * 3 + col
        if self.meta_board[next_board] == 0:
            self.active_board = next_board
        else:
            self.active_board = None
        
        self.current_player = 3 - self.current_player
        return self.get_state(), reward, False
    
    def _check_small_board_win(self, board_idx, player):
        board = self.small_boards[board_idx]
        for i in range(3):
            if np.all(board[i, :] == player) or np.all(board[:, i] == player):
                return True
        if board[0, 0] == player and board[1, 1] == player and board[2, 2] == player:
            return True
        if board[0, 2] == player and board[1, 1] == player and board[2, 0] == player:
            return True
        return False
    
    def _check_small_board_full(self, board_idx):
        return np.all(self.small_boards[board_idx] != 0)
    
    def _check_meta_win(self, player):
        meta = self.meta_board.reshape(3, 3)
        for i in range(3):
            if np.all(meta[i, :] == player) or np.all(meta[:, i] == player):
                return True
        if meta[0, 0] == player and meta[1, 1] == player and meta[2, 2] == player:
            return True
        if meta[0, 2] == player and meta[1, 1] == player and meta[2, 0] == player:
            return True
        return False
    
    def evaluate_position(self, player):
        if self.winner == player:
            return 100000
        if self.winner == (3 - player):
            return -100000
        if self.game_over:
            return 0
        
        opponent = 3 - player
        score = 0
        
        meta = self.meta_board.reshape(3, 3)
        score += self._count_meta_lines(player, 2) * 500
        score += self._count_meta_lines(player, 1) * 100
        score -= self._count_meta_lines(opponent, 2) * 600
        score -= self._count_meta_lines(opponent, 1) * 100
        
        strategic_boards = [4]
        corner_boards = [0, 2, 6, 8]
        
        for b in strategic_boards:
            if self.meta_board[b] == player:
                score += 200
            elif self.meta_board[b] == opponent:
                score -= 200
        
        for b in corner_boards:
            if self.meta_board[b] == player:
                score += 100
            elif self.meta_board[b] == opponent:
                score -= 100
        
        for board_idx in range(9):
            if self.meta_board[board_idx] == 0:
                board_score = self._evaluate_small_board(board_idx, player)
                score += board_score * 0.5
        
        if self.active_board is not None:
            if self.meta_board[self.active_board] == 0:
                score += 50
        
        return score
    
    def _count_meta_lines(self, player, count):
        meta = self.meta_board.reshape(3, 3)
        lines = 0
        
        for i in range(3):
            row = meta[i, :]
            col = meta[:, i]
            if np.sum(row == player) == count and np.sum(row == (3-player)) == 0:
                lines += 1
            if np.sum(col == player) == count and np.sum(col == (3-player)) == 0:
                lines += 1
        
        diag1 = [meta[0, 0], meta[1, 1], meta[2, 2]]
        diag2 = [meta[0, 2], meta[1, 1], meta[2, 0]]
        
        if diag1.count(player) == count and (3-player) not in diag1:
            lines += 1
        if diag2.count(player) == count and (3-player) not in diag2:
            lines += 1
        
        return lines
    
    def _evaluate_small_board(self, board_idx, player):
        board = self.small_boards[board_idx]
        opponent = 3 - player
        score = 0
        
        lines_2 = 0
        lines_1 = 0
        opp_lines_2 = 0
        
        for i in range(3):
            row = board[i, :]
            col = board[:, i]
            
            if np.sum(row == player) == 2 and np.sum(row == opponent) == 0:
                lines_2 += 1
            elif np.sum(row == player) == 1 and np.sum(row == opponent) == 0:
                lines_1 += 1
            
            if np.sum(row == opponent) == 2 and np.sum(row == player) == 0:
                opp_lines_2 += 1
            
            if np.sum(col == player) == 2 and np.sum(col == opponent) == 0:
                lines_2 += 1
            elif np.sum(col == player) == 1 and np.sum(col == opponent) == 0:
                lines_1 += 1
            
            if np.sum(col == opponent) == 2 and np.sum(col == player) == 0:
                opp_lines_2 += 1
        
        diag1 = [board[0, 0], board[1, 1], board[2, 2]]
        diag2 = [board[0, 2], board[1, 1], board[2, 0]]
        
        if diag1.count(player) == 2 and opponent not in diag1:
            lines_2 += 1
        if diag2.count(player) == 2 and opponent not in diag2:
            lines_2 += 1
        if diag1.count(opponent) == 2 and player not in diag1:
            opp_lines_2 += 1
        if diag2.count(opponent) == 2 and player not in diag2:
            opp_lines_2 += 1
        
        score = lines_2 * 10 + lines_1 * 2 - opp_lines_2 * 12
        return score

# ============================================================================
# AGENT
# ============================================================================

class SuperTTTAgent:
    def __init__(self, player_id, lr=0.1, gamma=0.95, epsilon=1.0,
                 epsilon_decay=0.9995, epsilon_min=0.05):
        self.player_id = player_id
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        
        self.q_table = {}
        self.experience_replay = deque(maxlen=50000)
        self.minimax_depth = 2
        
        self.wins = 0
        self.losses = 0
        self.draws = 0
    
    def get_q_value(self, state, action):
        return self.q_table.get((state, action), 0.0)
    
    def choose_action(self, env, training=True):
        available = env.get_available_actions()
        if not available:
            return None
        
        # Immediate tactics
        for action in available:
            sim = self._simulate_move(env, action, self.player_id)
            board_idx = action[0]
            if sim.meta_board[board_idx] == self.player_id and env.meta_board[board_idx] == 0:
                if sim._check_meta_win(self.player_id):
                    return action
        
        for action in available:
            sim = self._simulate_move(env, action, self.player_id)
            if sim.winner == self.player_id:
                return action
        
        opponent = 3 - self.player_id
        for action in available:
            sim = self._simulate_move(env, action, opponent)
            if sim.winner == opponent:
                return action
        
        # Strategic planning
        if training and random.random() < self.epsilon:
            strategic_actions = [a for a in available if a[0] in [4, 0, 2, 6, 8]]
            if strategic_actions:
                return random.choice(strategic_actions)
            return random.choice(available)
        
        best_score = -float('inf')
        best_actions = []
        
        alpha = -float('inf')
        beta = float('inf')
        
        for action in available:
            sim = self._simulate_move(env, action, self.player_id)
            score = self._minimax(sim, self.minimax_depth - 1, alpha, beta, False)
            
            q_boost = self.get_q_value(env.get_state(), action) * 0.05
            total_score = score + q_boost
            
            if total_score > best_score:
                best_score = total_score
                best_actions = [action]
            elif abs(total_score - best_score) < 0.01:
                best_actions.append(action)
            
            alpha = max(alpha, best_score)
        
        return random.choice(best_actions) if best_actions else random.choice(available)
    
    def _minimax(self, env, depth, alpha, beta, is_maximizing):
        if env.winner == self.player_id:
            return 10000 + depth
        if env.winner == (3 - self.player_id):
            return -10000 - depth
        if env.game_over:
            return 0
        if depth == 0:
            return env.evaluate_position(self.player_id)
        
        available = env.get_available_actions()
        
        if is_maximizing:
            max_eval = -float('inf')
            for action in available:
                sim = self._simulate_move(env, action, self.player_id)
                eval_score = self._minimax(sim, depth - 1, alpha, beta, False)
                max_eval = max(max_eval, eval_score)
                alpha = max(alpha, eval_score)
                if beta <= alpha:
                    break
            return max_eval
        else:
            min_eval = float('inf')
            opponent = 3 - self.player_id
            for action in available:
                sim = self._simulate_move(env, action, opponent)
                eval_score = self._minimax(sim, depth - 1, alpha, beta, True)
                min_eval = min(min_eval, eval_score)
                beta = min(beta, eval_score)
                if beta <= alpha:
                    break
            return min_eval
    
    def _simulate_move(self, env, action, player):
        sim = SuperTicTacToe()
        sim.small_boards = [board.copy() for board in env.small_boards]
        sim.meta_board = env.meta_board.copy()
        sim.current_player = player
        sim.active_board = env.active_board
        sim.make_move(action)
        return sim
    
    def update_q_value(self, state, action, reward, next_state, next_actions):
        current_q = self.get_q_value(state, action)
        if next_actions:
            max_next_q = max([self.get_q_value(next_state, a) for a in next_actions])
        else:
            max_next_q = 0
        
        td_error = reward + self.gamma * max_next_q - current_q
        new_q = current_q + self.lr * td_error
        self.q_table[(state, action)] = new_q
    
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    
    def reset_stats(self):
        self.wins = 0
        self.losses = 0
        self.draws = 0

# ============================================================================
# TRAINING FUNCTIONS
# ============================================================================

def play_game(env, agent1, agent2, training=True):
    env.reset()
    game_history = []
    agents = {1: agent1, 2: agent2}
    
    while not env.game_over:
        current_player = env.current_player
        current_agent = agents[current_player]
        
        state = env.get_state()
        action = current_agent.choose_action(env, training)
        
        if action is None:
            break
        
        game_history.append((state, action, current_player))
        next_state, reward, done = env.make_move(action)
        
        if training:
            next_actions = env.get_available_actions()
            current_agent.update_q_value(state, action, reward, next_state, next_actions)
        
        if done:
            if env.winner == 1:
                agent1.wins += 1
                agent2.losses += 1
                if training:
                    _update_outcome(agent1, game_history, 1, 100)
                    _update_outcome(agent2, game_history, 2, -50)
            elif env.winner == 2:
                agent2.wins += 1
                agent1.losses += 1
                if training:
                    _update_outcome(agent1, game_history, 1, -50)
                    _update_outcome(agent2, game_history, 2, 100)
            else:
                agent1.draws += 1
                agent2.draws += 1
                if training:
                    _update_outcome(agent1, game_history, 1, -10)
                    _update_outcome(agent2, game_history, 2, -10)
    
    return env.winner

def _update_outcome(agent, history, player_id, final_reward):
    agent_moves = [(s, a) for s, a, p in history if p == player_id]
    for i in range(len(agent_moves) - 1, -1, -1):
        state, action = agent_moves[i]
        discount = agent.gamma ** (len(agent_moves) - 1 - i)
        adjusted_reward = final_reward * discount
        current_q = agent.get_q_value(state, action)
        new_q = current_q + agent.lr * (adjusted_reward - current_q)
        agent.q_table[(state, action)] = new_q

# ============================================================================
# OPTIMIZED SERIALIZATION (Critical for fast upload/download)
# ============================================================================

def serialize_q_table_optimized(q_table):
    """Ultra-compact serialization using string keys"""
    serialized = {}
    
    for (state, action), value in q_table.items():
        # Compact string representation
        # State: small_boards (9x9), meta_board (9), active_board
        small_boards_str = ','.join([''.join(map(str, board)) for board in state[0]])
        
        # FIX: Use comma delimiter for meta_board to handle -1 (draws) correctly
        meta_str = ','.join(map(str, state[1]))
        
        active_str = str(state[2]) if state[2] is not None else 'N'
        
        # Action: (board_idx, row, col)
        action_str = f"{action[0]}{action[1]}{action[2]}"
        
        # Combine into single compact key
        key = f"{small_boards_str}|{meta_str}|{active_str}|{action_str}"
        
        # Store only significant digits to reduce size
        serialized[key] = round(float(value), 4)
    
    return serialized
    

def deserialize_q_table_optimized(serialized):
    """Deserialize the compact format"""
    q_table = {}
    
    for key, value in serialized.items():
        parts = key.split('|')
        
        # Parse small boards
        boards_str = parts[0].split(',')
        small_boards = tuple(
            tuple(int(c) for c in board_str)
            for board_str in boards_str
        )
        
        # FIX: Split by comma to parse negative numbers (-1) correctly
        meta_board = tuple(int(x) for x in parts[1].split(','))
        
        # Parse active board
        active_board = None if parts[2] == 'N' else int(parts[2])
        
        # Parse action
        action_str = parts[3]
        action = (int(action_str[0]), int(action_str[1]), int(action_str[2]))
        
        # Reconstruct state
        state = (small_boards, meta_board, active_board)
        
        q_table[(state, action)] = value
    
    return q_table
    

def create_training_zip(agent1, agent2, config, training_stats):
    """Create optimized zip file for download"""
    
    print("üì¶ Packaging agents...")
    
    agent1_data = {
        "q_table": serialize_q_table_optimized(agent1.q_table),
        "epsilon": round(agent1.epsilon, 6),
        "lr": agent1.lr,
        "gamma": agent1.gamma,
        "minimax_depth": agent1.minimax_depth,
        "wins": agent1.wins,
        "losses": agent1.losses,
        "draws": agent1.draws
    }
    
    agent2_data = {
        "q_table": serialize_q_table_optimized(agent2.q_table),
        "epsilon": round(agent2.epsilon, 6),
        "lr": agent2.lr,
        "gamma": agent2.gamma,
        "minimax_depth": agent2.minimax_depth,
        "wins": agent2.wins,
        "losses": agent2.losses,
        "draws": agent2.draws
    }
    
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED, compresslevel=9) as zf:
        zf.writestr("agent1.json", json.dumps(agent1_data))
        zf.writestr("agent2.json", json.dumps(agent2_data))
        zf.writestr("config.json", json.dumps(config))
        zf.writestr("training_stats.json", json.dumps(training_stats))
        
        # Add metadata
        metadata = {
            "trained_episodes": config.get("episodes", 0),
            "final_q_size_agent1": len(agent1.q_table),
            "final_q_size_agent2": len(agent2.q_table),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }
        zf.writestr("metadata.json", json.dumps(metadata, indent=2))
    
    buffer.seek(0)
    return buffer

# ============================================================================
# MAIN TRAINING LOOP
# ============================================================================

print("üéØ Super Tic-Tac-Toe - GPU Training Session")
print("=" * 60)

# ============================================================================
# TITAN-20 OPTIMIZED HYPERPARAMETERS
# Goal: Maximum strength in ~20 minutes on Kaggle T4 GPU
# ============================================================================
EPISODES = 500       # The limit of what fits in 20 mins at Depth 3
LR1 = 0.14            # Aggressive learning (standard is 0.1)
GAMMA1 = 0.99         # Maximum foresight (standard is 0.95)
MINIMAX_DEPTH1 = 4    # The Vanguard Depth (balances speed/tactics)

LR2 = 0.14
GAMMA2 = 0.99
MINIMAX_DEPTH2 = 4

# Decay to 0.05 by episode ~2800 to refine skills
EPSILON_DECAY = 0.9998 
UPDATE_FREQ =100     # Show progress every 250 games

print(f"\n‚öôÔ∏è  Configuration (Titan-20 Protocol):")
print(f"   Episodes: {EPISODES:,}")
print(f"   Learning Rate: {LR1}")
print(f"   Gamma: {GAMMA1}")
print(f"   Minimax Depth: {MINIMAX_DEPTH1}")
print(f"   Epsilon Decay: {EPSILON_DECAY}")
print()

# Initialize
env = SuperTicTacToe()
agent1 = SuperTTTAgent(1, lr=LR1, gamma=GAMMA1, epsilon_decay=EPSILON_DECAY)
agent1.minimax_depth = MINIMAX_DEPTH1
agent2 = SuperTTTAgent(2, lr=LR2, gamma=GAMMA2, epsilon_decay=EPSILON_DECAY)
agent2.minimax_depth = MINIMAX_DEPTH2

# Training stats
training_stats = {
    'episodes': [],
    'agent1_wins': [],
    'agent2_wins': [],
    'draws': [],
    'agent1_epsilon': [],
    'agent2_epsilon': [],
    'agent1_q_size': [],
    'agent2_q_size': []
}

# Training loop with progress bar
print("üöÄ Training started...\n")
start_time = time.time()

for episode in tqdm(range(1, EPISODES + 1), desc="Training"):
    play_game(env, agent1, agent2, training=True)
    agent1.decay_epsilon()
    agent2.decay_epsilon()
    
    # Update stats periodically
    if episode % UPDATE_FREQ == 0:
        training_stats['episodes'].append(episode)
        training_stats['agent1_wins'].append(agent1.wins)
        training_stats['agent2_wins'].append(agent2.wins)
        training_stats['draws'].append(agent1.draws)
        training_stats['agent1_epsilon'].append(round(agent1.epsilon, 6))
        training_stats['agent2_epsilon'].append(round(agent2.epsilon, 6))
        training_stats['agent1_q_size'].append(len(agent1.q_table))
        training_stats['agent2_q_size'].append(len(agent2.q_table))
        
        # Print progress
        win_rate_1 = agent1.wins / episode * 100
        win_rate_2 = agent2.wins / episode * 100
        draw_rate = agent1.draws / episode * 100
        
        print(f"\nüìä Episode {episode:,}/{EPISODES:,}")
        print(f"   Agent 1: {agent1.wins:,} wins ({win_rate_1:.1f}%) | Œµ={agent1.epsilon:.4f} | Q={len(agent1.q_table):,}")
        print(f"   Agent 2: {agent2.wins:,} wins ({win_rate_2:.1f}%) | Œµ={agent2.epsilon:.4f} | Q={len(agent2.q_table):,}")
        print(f"   Draws: {agent1.draws:,} ({draw_rate:.1f}%)")

elapsed_time = time.time() - start_time
print(f"\n‚úÖ Training complete in {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")

# Final statistics
print("\n" + "=" * 60)
print("üìà FINAL STATISTICS")
print("=" * 60)
print(f"Agent 1 (Blue):")
print(f"  Wins: {agent1.wins:,} ({agent1.wins/EPISODES*100:.1f}%)")
print(f"  Q-Table Size: {len(agent1.q_table):,} states")
print(f"  Final Epsilon: {agent1.epsilon:.6f}")
print()
print(f"Agent 2 (Red):")
print(f"  Wins: {agent2.wins:,} ({agent2.wins/EPISODES*100:.1f}%)")
print(f"  Q-Table Size: {len(agent2.q_table):,} states")
print(f"  Final Epsilon: {agent2.epsilon:.6f}")
print()
print(f"Draws: {agent1.draws:,} ({agent1.draws/EPISODES*100:.1f}%)")
print()

# Create config
config = {
    "episodes": EPISODES,
    "lr1": LR1,
    "gamma1": GAMMA1,
    "minimax1": MINIMAX_DEPTH1,
    "lr2": LR2,
    "gamma2": GAMMA2,
    "minimax2": MINIMAX_DEPTH2,
    "epsilon_decay": EPSILON_DECAY,
    "training_time_seconds": round(elapsed_time, 2)
}

# Save to zip
print("üíæ Creating download package...")
zip_buffer = create_training_zip(agent1, agent2, config, training_stats)

# Save to file
output_filename = "super_ttt_titan_adv.zip"
with open(output_filename, "wb") as f:
    f.write(zip_buffer.getvalue())

file_size_mb = len(zip_buffer.getvalue()) / (1024 * 1024)
print(f"‚úÖ Saved to: {output_filename} ({file_size_mb:.2f} MB)")
print()
print("=" * 60)
print("üéâ SUCCESS! Download the .zip file and upload to Streamlit!")
print("=" * 60)
print()
print("üìù Deployment Instructions:")
print(f"   1. Download {output_filename}")
print(f"   2. Upload to Streamlit App")
print(f"   3. CRITICAL: In Streamlit sidebar, set Minimax Depth to 6")
print()
print("üöÄ Ready for battle!")

üéØ Super Tic-Tac-Toe - GPU Training Session

‚öôÔ∏è  Configuration (Titan-20 Protocol):
   Episodes: 500
   Learning Rate: 0.14
   Gamma: 0.99
   Minimax Depth: 4
   Epsilon Decay: 0.9998

üöÄ Training started...



Training:  20%|‚ñà‚ñà        | 100/500 [00:51<03:24,  1.96it/s]


üìä Episode 100/500
   Agent 1: 37 wins (37.0%) | Œµ=0.9802 | Q=2,692
   Agent 2: 37 wins (37.0%) | Œµ=0.9802 | Q=2,696
   Draws: 26 (26.0%)


Training:  40%|‚ñà‚ñà‚ñà‚ñà      | 200/500 [02:33<05:23,  1.08s/it]


üìä Episode 200/500
   Agent 1: 73 wins (36.5%) | Œµ=0.9608 | Q=5,345
   Agent 2: 77 wins (38.5%) | Œµ=0.9608 | Q=5,364
   Draws: 50 (25.0%)


Training:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 300/500 [05:18<07:31,  2.26s/it]


üìä Episode 300/500
   Agent 1: 111 wins (37.0%) | Œµ=0.9418 | Q=8,033
   Agent 2: 109 wins (36.3%) | Œµ=0.9418 | Q=8,052
   Draws: 80 (26.7%)


Training:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 400/500 [08:45<03:17,  1.97s/it]


üìä Episode 400/500
   Agent 1: 145 wins (36.2%) | Œµ=0.9231 | Q=10,762
   Agent 2: 140 wins (35.0%) | Œµ=0.9231 | Q=10,775
   Draws: 115 (28.7%)


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [12:53<00:00,  1.55s/it]


üìä Episode 500/500
   Agent 1: 193 wins (38.6%) | Œµ=0.9048 | Q=13,359
   Agent 2: 167 wins (33.4%) | Œµ=0.9048 | Q=13,360
   Draws: 140 (28.0%)

‚úÖ Training complete in 773.9 seconds (12.9 minutes)

üìà FINAL STATISTICS
Agent 1 (Blue):
  Wins: 193 (38.6%)
  Q-Table Size: 13,359 states
  Final Epsilon: 0.904828

Agent 2 (Red):
  Wins: 167 (33.4%)
  Q-Table Size: 13,360 states
  Final Epsilon: 0.904828

Draws: 140 (28.0%)

üíæ Creating download package...
üì¶ Packaging agents...





‚úÖ Saved to: super_ttt_titan_adv.zip (0.29 MB)

üéâ SUCCESS! Download the .zip file and upload to Streamlit!

üìù Deployment Instructions:
   1. Download super_ttt_titan_adv.zip
   2. Upload to Streamlit App
   3. CRITICAL: In Streamlit sidebar, set Minimax Depth to 6

üöÄ Ready for battle!


In [1]:
# ============================================================================
# SUPER TIC-TAC-TOE - ULTRA-OPTIMIZED KAGGLE GPU TRAINING
# ============================================================================
# Target: 5 minutes on T4/P100 with deep strategic search
# Optimizations: Transposition tables, iterative deepening, GPU arrays, caching
# ============================================================================

import numpy as np
from collections import deque
import random
import json
import zipfile
import io
from tqdm import tqdm
import time
from functools import lru_cache

# Try to use GPU if available (CuPy)
try:
    import cupy as cp
    USE_GPU = True
    print("üöÄ GPU (CuPy) detected! Using GPU acceleration")
    xp = cp
except ImportError:
    USE_GPU = False
    print("‚ö° Running on CPU (NumPy)")
    xp = np

# ============================================================================
# ULTRA-FAST ENVIRONMENT WITH CACHING
# ============================================================================

class SuperTicTacToe:
    def __init__(self):
        self.reset()
        # Transposition table for position evaluation
        self._eval_cache = {}
        
    def reset(self):
        self.small_boards = [np.zeros((3, 3), dtype=np.int8) for _ in range(9)]
        self.meta_board = np.zeros(9, dtype=np.int8)
        self.current_player = 1
        self.active_board = None
        self.game_over = False
        self.winner = None
        self.move_history = []
        return self.get_state()
    
    def get_state(self):
        """Optimized state hashing"""
        small_boards_flat = tuple(tuple(board.flatten()) for board in self.small_boards)
        return (small_boards_flat, tuple(self.meta_board), self.active_board)
    
    def get_state_hash(self):
        """Ultra-fast hash for transposition table"""
        # Pack entire state into single integer for fast lookup
        hash_val = 0
        for i, board in enumerate(self.small_boards):
            for j, cell in enumerate(board.flatten()):
                hash_val = hash_val * 3 + cell
        for cell in self.meta_board:
            hash_val = hash_val * 3 + cell
        if self.active_board is not None:
            hash_val = hash_val * 10 + self.active_board
        return hash_val
    
    def get_available_actions(self):
        if self.game_over:
            return []
        
        if self.active_board is not None and self.meta_board[self.active_board] == 0:
            boards_to_check = [self.active_board]
        else:
            boards_to_check = [i for i in range(9) if self.meta_board[i] == 0]
        
        # Optimized with list comprehension
        actions = [(b, r, c) 
                   for b in boards_to_check 
                   for r in range(3) 
                   for c in range(3) 
                   if self.small_boards[b][r, c] == 0]
        return actions
    
    def make_move(self, action):
        if self.game_over:
            return self.get_state(), 0, True
        
        board_idx, row, col = action
        
        self.small_boards[board_idx][row, col] = self.current_player
        self.move_history.append((action, self.current_player))
        
        reward = 0
        if self._check_small_board_win(board_idx, self.current_player):
            self.meta_board[board_idx] = self.current_player
            reward = 10
        elif self._check_small_board_full(board_idx):
            self.meta_board[board_idx] = -1
        
        if self._check_meta_win(self.current_player):
            self.game_over = True
            self.winner = self.current_player
            return self.get_state(), 1000, True
        
        if np.all(self.meta_board != 0):
            self.game_over = True
            self.winner = 0
            return self.get_state(), 0, True
        
        next_board = row * 3 + col
        self.active_board = next_board if self.meta_board[next_board] == 0 else None
        self.current_player = 3 - self.current_player
        
        return self.get_state(), reward, False
    
    def _check_small_board_win(self, board_idx, player):
        board = self.small_boards[board_idx]
        # Vectorized check
        for i in range(3):
            if np.all(board[i, :] == player) or np.all(board[:, i] == player):
                return True
        if board[0, 0] == player and board[1, 1] == player and board[2, 2] == player:
            return True
        if board[0, 2] == player and board[1, 1] == player and board[2, 0] == player:
            return True
        return False
    
    def _check_small_board_full(self, board_idx):
        return np.all(self.small_boards[board_idx] != 0)
    
    def _check_meta_win(self, player):
        meta = self.meta_board.reshape(3, 3)
        for i in range(3):
            if np.all(meta[i, :] == player) or np.all(meta[:, i] == player):
                return True
        if meta[0, 0] == player and meta[1, 1] == player and meta[2, 2] == player:
            return True
        if meta[0, 2] == player and meta[1, 1] == player and meta[2, 0] == player:
            return True
        return False
    
    def evaluate_position(self, player):
        """Cached evaluation function"""
        state_hash = self.get_state_hash()
        cache_key = (state_hash, player)
        
        if cache_key in self._eval_cache:
            return self._eval_cache[cache_key]
        
        score = self._compute_evaluation(player)
        self._eval_cache[cache_key] = score
        return score
    
    def _compute_evaluation(self, player):
        if self.winner == player:
            return 100000
        if self.winner == (3 - player):
            return -100000
        if self.game_over:
            return 0
        
        opponent = 3 - player
        score = 0
        
        # Meta-board evaluation (vectorized)
        meta = self.meta_board.reshape(3, 3)
        score += self._count_meta_lines_fast(player, 2) * 500
        score += self._count_meta_lines_fast(player, 1) * 100
        score -= self._count_meta_lines_fast(opponent, 2) * 600
        score -= self._count_meta_lines_fast(opponent, 1) * 100
        
        # Strategic positions
        if self.meta_board[4] == player:
            score += 200
        elif self.meta_board[4] == opponent:
            score -= 200
        
        for b in [0, 2, 6, 8]:
            if self.meta_board[b] == player:
                score += 100
            elif self.meta_board[b] == opponent:
                score -= 100
        
        # Local boards (only evaluate active/strategic ones)
        boards_to_eval = [self.active_board] if self.active_board is not None else [4, 0, 2, 6, 8]
        for board_idx in boards_to_eval:
            if board_idx < 9 and self.meta_board[board_idx] == 0:
                board_score = self._evaluate_small_board_fast(board_idx, player)
                score += board_score * 0.5
        
        return score
    
    def _count_meta_lines_fast(self, player, count):
        meta = self.meta_board.reshape(3, 3)
        lines = 0
        
        for i in range(3):
            row = meta[i, :]
            col = meta[:, i]
            if np.sum(row == player) == count and np.sum(row == (3-player)) == 0:
                lines += 1
            if np.sum(col == player) == count and np.sum(col == (3-player)) == 0:
                lines += 1
        
        diag1 = [meta[0, 0], meta[1, 1], meta[2, 2]]
        diag2 = [meta[0, 2], meta[1, 1], meta[2, 0]]
        
        if diag1.count(player) == count and (3-player) not in diag1:
            lines += 1
        if diag2.count(player) == count and (3-player) not in diag2:
            lines += 1
        
        return lines
    
    def _evaluate_small_board_fast(self, board_idx, player):
        board = self.small_boards[board_idx]
        opponent = 3 - player
        score = 0
        
        # Vectorized counting
        for i in range(3):
            row = board[i, :]
            col = board[:, i]
            
            if np.sum(row == player) == 2 and opponent not in row:
                score += 10
            if np.sum(col == player) == 2 and opponent not in col:
                score += 10
            if np.sum(row == opponent) == 2 and player not in row:
                score -= 12
            if np.sum(col == opponent) == 2 and player not in col:
                score -= 12
        
        return score

# ============================================================================
# ULTRA-OPTIMIZED AGENT WITH TRANSPOSITION TABLE
# ============================================================================

class TurboAgent:
    def __init__(self, player_id, lr=0.15, gamma=0.95, epsilon=1.0,
                 epsilon_decay=0.9997, epsilon_min=0.05):
        self.player_id = player_id
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        
        self.q_table = {}
        self.minimax_depth = 4  # Start with reasonable depth
        
        # Transposition table for minimax
        self.transposition_table = {}
        self.tt_hits = 0
        self.tt_misses = 0
        
        self.wins = 0
        self.losses = 0
        self.draws = 0
    
    def get_q_value(self, state, action):
        return self.q_table.get((state, action), 0.0)
    
    def choose_action(self, env, training=True):
        available = env.get_available_actions()
        if not available:
            return None
        
        # LEVEL 1: Immediate wins/blocks
        for action in available:
            sim = self._simulate_move(env, action, self.player_id)
            if sim.winner == self.player_id:
                return action
        
        opponent = 3 - self.player_id
        for action in available:
            sim = self._simulate_move(env, action, opponent)
            if sim.winner == opponent:
                return action
        
        # LEVEL 2: Exploration
        if training and random.random() < self.epsilon:
            # Smart exploration - prioritize strategic boards
            strategic_actions = [a for a in available if a[0] in [4, 0, 2, 6, 8]]
            return random.choice(strategic_actions if strategic_actions else available)
        
        # LEVEL 3: Iterative Deepening Minimax with Transposition Table
        return self._best_move_iterative_deepening(env, available, max_time=0.5)
    
    def _best_move_iterative_deepening(self, env, available, max_time=0.5):
        """Iterative deepening with time limit"""
        start_time = time.time()
        best_action = random.choice(available)
        best_score = -float('inf')
        
        # Order moves: center board, corners, then rest
        def move_priority(action):
            board_idx = action[0]
            if board_idx == 4: return 0
            if board_idx in [0, 2, 6, 8]: return 1
            return 2
        
        ordered_moves = sorted(available, key=move_priority)
        
        # Iterative deepening from depth 1 to max depth
        for depth in range(1, self.minimax_depth + 1):
            if time.time() - start_time > max_time:
                break
            
            alpha = -float('inf')
            beta = float('inf')
            depth_best_score = -float('inf')
            depth_best_action = best_action
            
            for action in ordered_moves:
                if time.time() - start_time > max_time:
                    break
                
                sim = self._simulate_move(env, action, self.player_id)
                score = self._minimax_cached(sim, depth - 1, alpha, beta, False)
                
                # Q-learning boost
                q_boost = self.get_q_value(env.get_state(), action) * 0.05
                total_score = score + q_boost
                
                if total_score > depth_best_score:
                    depth_best_score = total_score
                    depth_best_action = action
                
                alpha = max(alpha, total_score)
                if beta <= alpha:
                    break
            
            # Update best if we completed this depth
            if time.time() - start_time <= max_time:
                best_score = depth_best_score
                best_action = depth_best_action
        
        return best_action
    
    def _minimax_cached(self, env, depth, alpha, beta, is_maximizing):
        """Minimax with transposition table"""
        state_hash = env.get_state_hash()
        tt_key = (state_hash, depth, is_maximizing)
        
        # Check transposition table
        if tt_key in self.transposition_table:
            self.tt_hits += 1
            return self.transposition_table[tt_key]
        
        self.tt_misses += 1
        
        # Terminal conditions
        if env.winner == self.player_id:
            score = 10000 + depth
            self.transposition_table[tt_key] = score
            return score
        if env.winner == (3 - self.player_id):
            score = -10000 - depth
            self.transposition_table[tt_key] = score
            return score
        if env.game_over:
            self.transposition_table[tt_key] = 0
            return 0
        if depth == 0:
            score = env.evaluate_position(self.player_id)
            self.transposition_table[tt_key] = score
            return score
        
        available = env.get_available_actions()
        
        if is_maximizing:
            max_eval = -float('inf')
            for action in available:
                sim = self._simulate_move(env, action, self.player_id)
                eval_score = self._minimax_cached(sim, depth - 1, alpha, beta, False)
                max_eval = max(max_eval, eval_score)
                alpha = max(alpha, eval_score)
                if beta <= alpha:
                    break
            self.transposition_table[tt_key] = max_eval
            return max_eval
        else:
            min_eval = float('inf')
            opponent = 3 - self.player_id
            for action in available:
                sim = self._simulate_move(env, action, opponent)
                eval_score = self._minimax_cached(sim, depth - 1, alpha, beta, True)
                min_eval = min(min_eval, eval_score)
                beta = min(beta, eval_score)
                if beta <= alpha:
                    break
            self.transposition_table[tt_key] = min_eval
            return min_eval
    
    def _simulate_move(self, env, action, player):
        """Ultra-fast shallow copy"""
        sim = SuperTicTacToe()
        sim.small_boards = [board.copy() for board in env.small_boards]
        sim.meta_board = env.meta_board.copy()
        sim.current_player = player
        sim.active_board = env.active_board
        sim._eval_cache = env._eval_cache  # Share cache!
        sim.make_move(action)
        return sim
    
    def update_q_value(self, state, action, reward, next_state, next_actions):
        current_q = self.get_q_value(state, action)
        if next_actions:
            max_next_q = max([self.get_q_value(next_state, a) for a in next_actions])
        else:
            max_next_q = 0
        
        td_error = reward + self.gamma * max_next_q - current_q
        new_q = current_q + self.lr * td_error
        self.q_table[(state, action)] = new_q
    
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    
    def clear_transposition_table(self):
        """Clear TT periodically to save memory"""
        if len(self.transposition_table) > 100000:
            self.transposition_table.clear()

# ============================================================================
# TRAINING
# ============================================================================

def play_game(env, agent1, agent2, training=True):
    env.reset()
    game_history = []
    agents = {1: agent1, 2: agent2}
    
    while not env.game_over:
        current_agent = agents[env.current_player]
        
        state = env.get_state()
        action = current_agent.choose_action(env, training)
        
        if action is None:
            break
        
        game_history.append((state, action, env.current_player))
        next_state, reward, done = env.make_move(action)
        
        if training:
            next_actions = env.get_available_actions()
            current_agent.update_q_value(state, action, reward, next_state, next_actions)
        
        if done:
            if env.winner == 1:
                agent1.wins += 1
                agent2.losses += 1
                if training:
                    _update_outcome(agent1, game_history, 1, 100)
                    _update_outcome(agent2, game_history, 2, -50)
            elif env.winner == 2:
                agent2.wins += 1
                agent1.losses += 1
                if training:
                    _update_outcome(agent1, game_history, 1, -50)
                    _update_outcome(agent2, game_history, 2, 100)
            else:
                agent1.draws += 1
                agent2.draws += 1
                if training:
                    _update_outcome(agent1, game_history, 1, -10)
                    _update_outcome(agent2, game_history, 2, -10)
    
    return env.winner

def _update_outcome(agent, history, player_id, final_reward):
    agent_moves = [(s, a) for s, a, p in history if p == player_id]
    for i in range(len(agent_moves) - 1, -1, -1):
        state, action = agent_moves[i]
        discount = agent.gamma ** (len(agent_moves) - 1 - i)
        adjusted_reward = final_reward * discount
        current_q = agent.get_q_value(state, action)
        new_q = current_q + agent.lr * (adjusted_reward - current_q)
        agent.q_table[(state, action)] = new_q

# ============================================================================
# SERIALIZATION
# ============================================================================

def serialize_q_table_optimized(q_table):
    serialized = {}
    for (state, action), value in q_table.items():
        small_boards_str = ','.join([''.join(map(str, board)) for board in state[0]])
        meta_str = ''.join(map(str, state[1]))
        active_str = str(state[2]) if state[2] is not None else 'N'
        action_str = f"{action[0]}{action[1]}{action[2]}"
        key = f"{small_boards_str}|{meta_str}|{active_str}|{action_str}"
        serialized[key] = round(float(value), 4)
    return serialized

def create_training_zip(agent1, agent2, config, training_stats):
    print("üì¶ Packaging agents...")
    
    agent1_data = {
        "q_table": serialize_q_table_optimized(agent1.q_table),
        "epsilon": round(agent1.epsilon, 6),
        "lr": agent1.lr,
        "gamma": agent1.gamma,
        "minimax_depth": agent1.minimax_depth,
        "wins": agent1.wins,
        "losses": agent1.losses,
        "draws": agent1.draws
    }
    
    agent2_data = {
        "q_table": serialize_q_table_optimized(agent2.q_table),
        "epsilon": round(agent2.epsilon, 6),
        "lr": agent2.lr,
        "gamma": agent2.gamma,
        "minimax_depth": agent2.minimax_depth,
        "wins": agent2.wins,
        "losses": agent2.losses,
        "draws": agent2.draws
    }
    
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED, compresslevel=9) as zf:
        zf.writestr("agent1.json", json.dumps(agent1_data))
        zf.writestr("agent2.json", json.dumps(agent2_data))
        zf.writestr("config.json", json.dumps(config))
        zf.writestr("training_stats.json", json.dumps(training_stats))
        
        metadata = {
            "trained_episodes": config.get("episodes", 0),
            "final_q_size_agent1": len(agent1.q_table),
            "final_q_size_agent2": len(agent2.q_table),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "tt_efficiency": f"{agent1.tt_hits}/{agent1.tt_hits + agent1.tt_misses}"
        }
        zf.writestr("metadata.json", json.dumps(metadata, indent=2))
    
    buffer.seek(0)
    return buffer

# ============================================================================
# MAIN TRAINING
# ============================================================================

print("üéØ Super Tic-Tac-Toe - TURBO GPU Training")
print("=" * 60)

# Optimized for 5-minute training with deep search
EPISODES = 1000
LR = 0.15
GAMMA = 0.95
MINIMAX_DEPTH = 12  # With transposition table and iterative deepening, effective depth is much higher
EPSILON_DECAY = 0.9997
UPDATE_FREQ = 100

print(f"\n‚öôÔ∏è  Configuration:")
print(f"   Episodes: {EPISODES:,}")
print(f"   Learning Rate: {LR}")
print(f"   Gamma: {GAMMA}")
print(f"   Base Minimax Depth: {MINIMAX_DEPTH} (Iterative Deepening)")
print(f"   Epsilon Decay: {EPSILON_DECAY}")
print(f"   Optimizations: Transposition Table, Move Ordering, Caching")
print()

env = SuperTicTacToe()
agent1 = TurboAgent(1, lr=LR, gamma=GAMMA, epsilon_decay=EPSILON_DECAY)
agent1.minimax_depth = MINIMAX_DEPTH
agent2 = TurboAgent(2, lr=LR, gamma=GAMMA, epsilon_decay=EPSILON_DECAY)
agent2.minimax_depth = MINIMAX_DEPTH

training_stats = {
    'episodes': [], 'agent1_wins': [], 'agent2_wins': [], 'draws': [],
    'agent1_epsilon': [], 'agent2_epsilon': [],
    'agent1_q_size': [], 'agent2_q_size': []
}

print("üöÄ Training started...\n")
start_time = time.time()

for episode in tqdm(range(1, EPISODES + 1), desc="Training"):
    play_game(env, agent1, agent2, training=True)
    agent1.decay_epsilon()
    agent2.decay_epsilon()
    
    # Clear TT periodically to manage memory
    if episode % 100 == 0:
        agent1.clear_transposition_table()
        agent2.clear_transposition_table()
        env._eval_cache.clear()
    
    if episode % UPDATE_FREQ == 0:
        training_stats['episodes'].append(episode)
        training_stats['agent1_wins'].append(agent1.wins)
        training_stats['agent2_wins'].append(agent2.wins)
        training_stats['draws'].append(agent1.draws)
        training_stats['agent1_epsilon'].append(round(agent1.epsilon, 6))
        training_stats['agent2_epsilon'].append(round(agent2.epsilon, 6))
        training_stats['agent1_q_size'].append(len(agent1.q_table))
        training_stats['agent2_q_size'].append(len(agent2.q_table))
        
        win_rate_1 = agent1.wins / episode * 100
        win_rate_2 = agent2.wins / episode * 100
        tt_rate = agent1.tt_hits / max(agent1.tt_hits + agent1.tt_misses, 1) * 100
        
        print(f"\nüìä Episode {episode:,}/{EPISODES:,}")
        print(f"   Agent 1: {agent1.wins:,} ({win_rate_1:.1f}%) | Œµ={agent1.epsilon:.4f} | Q={len(agent1.q_table):,}")
        print(f"   Agent 2: {agent2.wins:,} ({win_rate_2:.1f}%) | Œµ={agent2.epsilon:.4f} | Q={len(agent2.q_table):,}")
        print(f"   Draws: {agent1.draws:,} | TT Hit Rate: {tt_rate:.1f}%")

elapsed_time = time.time() - start_time
print(f"\n‚úÖ Training complete in {elapsed_time:.1f}s ({elapsed_time/60:.1f} min)")

print("\n" + "=" * 60)
print("üìà FINAL STATISTICS")
print("=" * 60)
print(f"Agent 1: {agent1.wins:,} wins ({agent1.wins/EPISODES*100:.1f}%)")
print(f"Agent 2: {agent2.wins:,} wins ({agent2.wins/EPISODES*100:.1f}%)")
print(f"Draws: {agent1.draws:,} ({agent1.draws/EPISODES*100:.1f}%)")
print(f"Q-Table: {len(agent1.q_table):,} + {len(agent2.q_table):,} states")
print(f"TT Hit Rate: {agent1.tt_hits/(agent1.tt_hits+agent1.tt_misses)*100:.1f}%")

config = {
    "episodes": EPISODES, "lr1": LR, "gamma1": GAMMA, "minimax1": MINIMAX_DEPTH,
    "lr2": LR, "gamma2": GAMMA, "minimax2": MINIMAX_DEPTH,
    "epsilon_decay": EPSILON_DECAY, "training_time_seconds": round(elapsed_time, 2)
}

print("\nüíæ Creating download package...")
zip_buffer = create_training_zip(agent1, agent2, config, training_stats)

output_filename = "super_ttt_agents.zip"
with open(output_filename, "wb") as f:
    f.write(zip_buffer.getvalue())

file_size_mb = len(zip_buffer.getvalue()) / (1024 * 1024)
print(f"‚úÖ Saved: {output_filename} ({file_size_mb:.2f} MB)")
print()
print("=" * 60)
print("üéâ SUCCESS! Download and upload to Streamlit!")
print("=" * 60)
print(f"üìä Stats: {EPISODES:,} episodes | {elapsed_time/60:.1f} min | {file_size_mb:.2f} MB")
print("üöÄ Ready for deployment!")
LR = 0.15
GAMMA = 0.95
MINIMAX_DEPTH = 17
UPDATE_FREQ = 100

print(f"\n‚öôÔ∏è  Configuration:")
print(f"   Episodes: {EPISODES:,}")
print(f"   Learning Rate: {LR}")
print(f"   Gamma: {GAMMA}")
print(f"   Base Minimax Depth: {MINIMAX_DEPTH} (Iterative Deepening)")
print(f"   Epsilon Decay: {EPSILON_DECAY}")
print(f"   Optimizations: Transposition Table, Move Ordering, Caching")
print()

env = SuperTicTacToe()
agent1 = TurboAgent(1, lr=LR, gamma=GAMMA, epsilon_decay=EPSILON_DECAY)
agent1.minimax_depth = MINIMAX_DEPTH
agent2 = TurboAgent(2, lr=LR, gamma=GAMMA, epsilon_decay=EPSILON_DECAY)
agent2.minimax_depth = MINIMAX_DEPTH

training_stats = {
    'episodes': [], 'agent1_wins': [], 'agent2_wins': [], 'draws': [],
    'agent1_epsilon': [], 'agent2_epsilon': [],
    'agent1_q_size': [], 'agent2_q_size': []
}

print("üöÄ Training started...\n")
start_time = time.time()

for episode in tqdm(range(1, EPISODES + 1), desc="Training"):
    play_game(env, agent1, agent2, training=True)
    agent1.decay_epsilon()
    agent2.decay_epsilon()
    
    # Clear TT periodically to manage memory
    if episode % 100 == 0:
        agent1.clear_transposition_table()
        agent2.clear_transposition_table()
        env._eval_cache.clear()
    
    if episode % UPDATE_FREQ == 0:
        training_stats['episodes'].append(episode)
        training_stats['agent1_wins'].append(agent1.wins)
        training_stats['agent2_wins'].append(agent2.wins)
        training_stats['draws'].append(agent1.draws)
        training_stats['agent1_epsilon'].append(round(agent1.epsilon, 6))
        training_stats['agent2_epsilon'].append(round(agent2.epsilon, 6))
        training_stats['agent1_q_size'].append(len(agent1.q_table))
        training_stats['agent2_q_size'].append(len(agent2.q_table))
        
        win_rate_1 = agent1.wins / episode * 100
        win_rate_2 = agent2.wins / episode * 100
        tt_rate = agent1.tt_hits / max(agent1.tt_hits + agent1.tt_misses, 1) * 100
        
        print(f"\nüìä Episode {episode:,}/{EPISODES:,}")
        print(f"   Agent 1: {agent1.wins:,} ({win_rate_1:.1f}%) | Œµ={agent1.epsilon:.4f} | Q={len(agent1.q_table):,}")
        print(f"   Agent 2: {agent2.wins:,} ({win_rate_2:.1f}%) | Œµ={agent2.epsilon:.4f} | Q={len(agent2.q_table):,}")
        print(f"   Draws: {agent1.draws:,} | TT Hit Rate: {tt_rate:.1f}%")

elapsed_time = time.time() - start_time
print(f"\n‚úÖ Training complete in {elapsed_time:.1f}s ({elapsed_time/60:.1f} min)")

print("\n" + "=" * 60)
print("üìà FINAL STATISTICS")
print("=" * 60)
print(f"Agent 1: {agent1.wins:,} wins ({agent1.wins/EPISODES*100:.1f}%)")
print(f"Agent 2: {agent2.wins:,} wins ({agent2.wins/EPISODES*100:.1f}%)")
print(f"Draws: {agent1.draws:,} ({agent1.draws/EPISODES*100:.1f}%)")
print(f"Q-Table: {len(agent1.q_table):,} + {len(agent2.q_table):,} states")
print(f"TT Hit Rate: {agent1.tt_hits/(agent1.tt_hits+agent1.tt_misses)*100:.1f}%")

config = {
    "episodes": EPISODES, "lr1": LR, "gamma1": GAMMA, "minimax1": MINIMAX_DEPTH,
    "lr2": LR, "gamma2": GAMMA, "minimax2": MINIMAX_DEPTH,
    "epsilon_decay": EPSILON_DECAY, "training_time_seconds": round(elapsed_time, 2)
}

print("\nüíæ Creating download package...")
zip_buffer = create_training_zip(agent1, agent2, config, training_stats)

output_filename = "super_ttt_agents.zip"
with open(output_filename, "wb") as f:
    f.write(zip_buffer.getvalue())

file_size_mb = len(zip_buffer.getvalue()) / (1024 * 1024)
print(f"‚úÖ Saved: {output_filename} ({file_size_mb:.2f} MB)")
print()
print("=" * 60)
print("üéâ SUCCESS! Download and upload to Streamlit!")
print("=" * 60)
print(f"üìä Stats: {EPISODES:,} episodes | {elapsed_time/60:.1f} min | {file_size_mb:.2f} MB")
print("üöÄ Ready for deployment!")

  if entities is not ():


üöÄ GPU (CuPy) detected! Using GPU acceleration
üéØ Super Tic-Tac-Toe - TURBO GPU Training

‚öôÔ∏è  Configuration:
   Episodes: 1,000
   Learning Rate: 0.15
   Gamma: 0.95
   Base Minimax Depth: 12 (Iterative Deepening)
   Epsilon Decay: 0.9997
   Optimizations: Transposition Table, Move Ordering, Caching

üöÄ Training started...



  hash_val = hash_val * 3 + cell
  hash_val = hash_val * 3 + cell
  hash_val = hash_val * 10 + self.active_board
  hash_val = hash_val * 3 + cell
  hash_val = hash_val * 10 + self.active_board
  hash_val = hash_val * 3 + cell
Training:  10%|‚ñà         | 101/1000 [00:22<02:30,  5.98it/s]


üìä Episode 100/1,000
   Agent 1: 32 (32.0%) | Œµ=0.9704 | Q=2,772
   Agent 2: 42 (42.0%) | Œµ=0.9704 | Q=2,776
   Draws: 26 | TT Hit Rate: 73.6%


Training:  20%|‚ñà‚ñà        | 200/1000 [00:42<02:18,  5.79it/s]


üìä Episode 200/1,000
   Agent 1: 72 (36.0%) | Œµ=0.9418 | Q=5,431
   Agent 2: 80 (40.0%) | Œµ=0.9418 | Q=5,439
   Draws: 48 | TT Hit Rate: 84.7%


Training:  30%|‚ñà‚ñà‚ñà       | 300/1000 [01:05<02:54,  4.01it/s]


üìä Episode 300/1,000
   Agent 1: 103 (34.3%) | Œµ=0.9139 | Q=8,279
   Agent 2: 111 (37.0%) | Œµ=0.9139 | Q=8,291
   Draws: 86 | TT Hit Rate: 90.5%


Training:  33%|‚ñà‚ñà‚ñà‚ñé      | 329/1000 [01:13<02:30,  4.45it/s]


KeyboardInterrupt: 