In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with each file name to get its full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [14]:
import numpy as np
import random
import json
import zipfile
import io
import math
import time
from collections import deque
from copy import deepcopy

# ============================================================================
# 1. CONSTANTS & TILE CLASS
# ============================================================================
# The six tile colors; combined with SHAPES they define every distinct tile.
COLORS = [
    'red',
    'orange',
    'yellow',
    'green',
    'blue',
    'purple',
]

# The six tile shapes.
SHAPES = [
    'circle',
    'square',
    'diamond',
    'star',
    'clover',
    'cross',
]

class Tile:
    """A single Qwirkle tile, identified by its color and its shape.

    Two tiles compare equal (and hash equal) when both attributes match, so
    separate copies of the same color/shape pair are interchangeable in sets
    and dict keys.
    """

    def __init__(self, color, shape):
        self.color = color
        self.shape = shape

    def __eq__(self, other):
        # Comparing against a non-Tile (e.g. None) must not raise
        # AttributeError; returning NotImplemented lets Python fall back to
        # its default handling, which yields False for ``==``.
        if not isinstance(other, Tile):
            return NotImplemented
        return self.color == other.color and self.shape == other.shape

    def __hash__(self):
        return hash((self.color, self.shape))

    def __repr__(self):
        # Two-letter debug label: first letter of color + first letter of shape.
        # Slicing (instead of indexing) keeps this safe for empty strings.
        # NOTE(review): shapes sharing an initial (circle/clover/cross,
        # square/star) produce the same label, so it is not a unique id.
        return f"{self.color[:1].upper()}{self.shape[:1].upper()}"

# ============================================================================
# 2. QWIRKLE ENVIRONMENT (Headless)
# ============================================================================
class Qwirkle:
    """Headless Qwirkle environment used for self-play training.

    State kept on the instance:
      * ``board``  - dict mapping ``(row, col)`` to the tile placed there
      * ``bag``    - list of undrawn tiles (drawn from the end)
      * ``hands``  - one list of tiles per player
      * ``scores`` - one running score per player
      * ``passes`` - consecutive trades; four in a row end the game

    Actions are tuples ``('place', [(row, col, hand_index), ...])`` or
    ``('trade', [])``.
    """

    # Orthogonal neighbour offsets, in the order used by move generation.
    _NEIGHBOR_OFFSETS = ((0, 1), (1, 0), (0, -1), (-1, 0))

    def __init__(self, num_players=2):
        self.num_players = num_players
        self.reset()

    def reset(self):
        """Start a fresh game and return the initial state key."""
        # Three copies of every color/shape combination.
        self.bag = []
        for color in COLORS:
            for shape in SHAPES:
                for _ in range(3):
                    self.bag.append(Tile(color, shape))
        random.shuffle(self.bag)
        self.board = {}
        self.hands = [self._draw_tiles(6) for _ in range(self.num_players)]
        self.current_player = 0
        self.scores = [0] * self.num_players
        self.game_over = False
        self.winner = None
        self.passes = 0
        return self.get_state()

    def _draw_tiles(self, count):
        """Pop up to ``count`` tiles from the end of the bag and return them."""
        drawn = []
        for _ in range(min(count, len(self.bag))):
            drawn.append(self.bag.pop())
        return drawn

    def get_state(self):
        """Return a small hashable summary of the game for Q-table keys.

        The summary is a tuple of sorted ``(name, value)`` pairs covering the
        board size, the current player's hand size, the bag size, and the
        current player's score minus the best opposing score.
        """
        rival_scores = [s for i, s in enumerate(self.scores) if i != self.current_player]
        state_data = {
            'board_size': len(self.board),
            'hand_size': len(self.hands[self.current_player]),
            'tiles_left': len(self.bag),
            'score_diff': self.scores[self.current_player] - max(rival_scores, default=0),
        }
        return tuple(sorted(state_data.items()))

    def get_available_actions(self):
        """Return up to 50 candidate actions for the current player.

        Single-tile placements are listed first, then a limited sample of
        two-tile placements, then ``('trade', [])``. Because the list is cut
        to 50 entries after the trade is appended, the trade is dropped
        whenever there are 50 or more placements.
        """
        actions = []
        hand = self.hands[self.current_player]
        if not self.board:
            # Opening move: any single tile at the origin, or any compatible
            # pair laid side by side starting at the origin.
            for i in range(len(hand)):
                actions.append(('place', [(0, 0, i)]))
                for j in range(i + 1, len(hand)):
                    if self._tiles_compatible([hand[i], hand[j]]):
                        actions.append(('place', [(0, 0, i), (0, 1, j)]))
        else:
            empty_neighbors = self._get_empty_neighbors()
            for pos in empty_neighbors:
                for idx, tile in enumerate(hand):
                    if self._is_valid_placement(pos, tile):
                        actions.append(('place', [(pos[0], pos[1], idx)]))
            # Two-tile moves are only sampled (first 5 cells, first 4 hand
            # tiles) to keep move generation cheap.
            if len(hand) >= 2:
                for pos1 in list(empty_neighbors)[:5]:
                    for idx1, tile1 in enumerate(hand[:4]):
                        if not self._is_valid_placement(pos1, tile1):
                            continue
                        # Temporarily lay the first tile so the second one is
                        # validated against the real board (its own row and
                        # column), not merely against the first tile.
                        self.board[pos1] = tile1
                        try:
                            for dr, dc in self._NEIGHBOR_OFFSETS:
                                pos2 = (pos1[0] + dr, pos1[1] + dc)
                                for idx2, tile2 in enumerate(hand[:4]):
                                    if idx2 != idx1 and self._is_valid_placement(pos2, tile2):
                                        actions.append(
                                            ('place', [(pos1[0], pos1[1], idx1), (pos2[0], pos2[1], idx2)])
                                        )
                        finally:
                            del self.board[pos1]
        actions.append(('trade', []))
        return actions[:50]  # Limit actions to prevent memory explosion

    def _get_empty_neighbors(self):
        """Return the set of empty cells orthogonally adjacent to any placed tile."""
        neighbors = set()
        for (r, c) in self.board:
            for dr, dc in self._NEIGHBOR_OFFSETS:
                if (r + dr, c + dc) not in self.board:
                    neighbors.add((r + dr, c + dc))
        return neighbors

    def _tiles_compatible(self, tiles):
        """Return True if ``tiles`` can form one line.

        A line is valid when all tiles share a color and every shape is
        distinct, or all share a shape and every color is distinct. Zero or
        one tile is always compatible.
        """
        if len(tiles) <= 1:
            return True
        colors = set(t.color for t in tiles)
        shapes = set(t.shape for t in tiles)
        same_color = len(colors) == 1 and len(shapes) == len(tiles)
        same_shape = len(shapes) == 1 and len(colors) == len(tiles)
        return same_color or same_shape

    def _is_valid_placement(self, pos, tile):
        """Return True if ``tile`` may be placed at the empty cell ``pos``.

        The cell must be free, must touch an existing tile unless the board
        is empty, and the horizontal and vertical lines it would join must
        each stay compatible and at most six tiles long.
        """
        r, c = pos
        if (r, c) in self.board:
            return False
        adj = []
        for dr, dc in self._NEIGHBOR_OFFSETS:
            if (r + dr, c + dc) in self.board:
                adj.append(self.board[(r + dr, c + dc)])
        if not adj and len(self.board) > 0:
            return False

        # Horizontal line through (r, c): extend left, then right.
        h_tiles = [tile]
        cc = c - 1
        while (r, cc) in self.board:
            h_tiles.insert(0, self.board[(r, cc)])
            cc -= 1
        cc = c + 1
        while (r, cc) in self.board:
            h_tiles.append(self.board[(r, cc)])
            cc += 1
        if len(h_tiles) > 1 and not self._tiles_compatible(h_tiles):
            return False

        # Vertical line through (r, c): extend up, then down.
        v_tiles = [tile]
        rr = r - 1
        while (rr, c) in self.board:
            v_tiles.insert(0, self.board[(rr, c)])
            rr -= 1
        rr = r + 1
        while (rr, c) in self.board:
            v_tiles.append(self.board[(rr, c)])
            rr += 1
        if len(v_tiles) > 1 and not self._tiles_compatible(v_tiles):
            return False

        if len(h_tiles) > 6 or len(v_tiles) > 6:
            return False
        return True

    def make_move(self, action):
        """Apply ``action`` for the current player.

        Returns ``(state, reward, done)``. A placement rewards the points
        scored; a trade swaps the whole hand and rewards -2. The move that
        ends the game gets an extra +50 on its reward. The game ends when the
        mover has no tiles and the bag is empty, or after four consecutive
        trades. ``winner`` is the index of the sole top scorer, or ``None``
        when the top score is shared (a draw).
        """
        if self.game_over:
            return self.get_state(), 0, True
        action_type, data = action
        reward = 0
        mover = self.current_player
        if action_type == 'place':
            placed = []
            for r, c, idx in data:
                tile = self.hands[mover][idx]
                self.board[(r, c)] = tile
                placed.append((r, c, idx))
            reward = self._calculate_score(placed)
            self.scores[mover] += reward
            # Remove played tiles from the highest hand index down so the
            # remaining indices stay valid.
            for _, _, idx in sorted(placed, key=lambda x: x[2], reverse=True):
                self.hands[mover].pop(idx)
            self.hands[mover].extend(self._draw_tiles(len(placed)))
            self.passes = 0
        elif action_type == 'trade':
            traded = self.hands[mover][:]
            self.hands[mover] = []
            self.bag.extend(traded)
            random.shuffle(self.bag)
            self.hands[mover] = self._draw_tiles(len(traded))
            reward = -2
            self.passes += 1

        hand_empty = not self.hands[mover]
        if (hand_empty and not self.bag) or self.passes >= 4:
            self.game_over = True
            if hand_empty:
                self.scores[mover] += 6  # bonus for emptying the hand
            # A shared top score is a draw rather than a win for whichever
            # player happens to have the lowest index.
            top_score = max(self.scores)
            leaders = [i for i, s in enumerate(self.scores) if s == top_score]
            self.winner = leaders[0] if len(leaders) == 1 else None
            return self.get_state(), reward + 50, True

        self.current_player = (self.current_player + 1) % self.num_players
        return self.get_state(), reward, False

    def _calculate_score(self, placed):
        """Score the tiles just placed (they must already be on the board).

        Every horizontal/vertical line of two or more tiles through a placed
        tile scores its length, plus 6 when the line is exactly six long. A
        line whose cells have all been counted already is skipped. If no line
        scores, the result is the number of tiles placed.
        """
        score = 0
        scored = set()
        for r, c, _ in placed:
            for dr, dc in [(0, 1), (1, 0)]:
                line = set([(r, c)])
                for d in [-1, 1]:
                    curr_r, curr_c = r + d * dr, c + d * dc
                    while (curr_r, curr_c) in self.board:
                        line.add((curr_r, curr_c))
                        curr_r += d * dr
                        curr_c += d * dc
                if len(line) > 1:
                    ls = len(line) + (6 if len(line) == 6 else 0)
                    if not line.issubset(scored):
                        score += ls
                        scored.update(line)
        return score if score > 0 else len(placed)

    def evaluate_position(self, player):
        """Heuristic value of the position from ``player``'s point of view.

        Returns +/-100000 once a winner is set. Otherwise (including a drawn
        finished game) returns 100x the lead over the best opposing score
        plus 10 per tile in hand.
        """
        if self.winner == player:
            return 100000
        if self.winner is not None:
            return -100000
        # Compare against every opponent, not just the first two seats.
        rival_scores = [self.scores[i] for i in range(self.num_players) if i != player]
        score = (self.scores[player] - max(rival_scores, default=0)) * 100
        score += len(self.hands[player]) * 10
        return score

# ============================================================================
# 3. AGENT CLASSES (MCTS + Strategic Agent)
# ============================================================================
class MCTSNode:
    """A node of the Monte Carlo search tree.

    Holds the state key reached, the action that led here from ``parent``,
    the child nodes, visit/value statistics, and the actions not yet expanded.
    """

    def __init__(self, state, parent=None, action=None):
        self.state = state
        self.parent = parent
        self.action = action
        self.children = []
        self.visits = 0
        self.value = 0.0
        self.untried_actions = None

    def uct(self, c=1.41):
        """Return the UCT score of this node; unvisited nodes score infinity."""
        if self.visits == 0:
            return float('inf')
        mean_value = self.value / self.visits
        exploration = c * math.sqrt(math.log(self.parent.visits) / self.visits)
        return mean_value + exploration

    def best_child(self):
        """Return the child with the highest UCT score."""
        return max(self.children, key=lambda child: child.uct())

    def most_visited(self):
        """Return the child with the most visits."""
        return max(self.children, key=lambda child: child.visits)

class StrategicQwirkleAgent:
    """Self-play agent combining minimax, epsilon-greedy exploration and MCTS.

    Move selection (see ``choose_action``): with ``minimax_depth > 0`` the
    agent always uses depth-limited alpha-beta search; otherwise it explores
    randomly with probability ``epsilon`` during training and uses MCTS the
    rest of the time. A Q-table keyed by ``(state, str(action))`` is updated
    through ``update_q`` but is not consulted when choosing moves.
    """

    def __init__(self, pid, lr=0.1, gamma=0.95, epsilon=1.0, epsilon_decay=0.995, minimax_depth=0):
        self.pid = pid
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = 0.05
        self.minimax_depth = minimax_depth
        self.mcts_simulations = 50
        self.q_table = {}
        self.wins = 0
        self.losses = 0
        self.draws = 0
        self.total_score = 0
        self.games_played = 0

    def get_q(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0.0 if unseen."""
        return self.q_table.get((state, str(action)), 0.0)

    def choose_action(self, env, training=True):
        """Pick an action for the current position, or ``None`` if there is none."""
        acts = env.get_available_actions()
        if not acts:
            return None

        # 1. Minimax
        if self.minimax_depth > 0:
            _, best = self._minimax(env, self.minimax_depth, -float('inf'), float('inf'), True)
            return best if best else random.choice(acts)

        # 2. Epsilon Greedy
        if training and random.random() < self.epsilon:
            return random.choice(acts)

        # 3. MCTS
        return self._mcts(env, acts)

    def _minimax(self, env, depth, alpha, beta, maxing):
        """Alpha-beta search over at most the first five actions per node.

        Returns ``(value, best_action)`` where the value comes from
        ``env.evaluate_position(self.pid)`` at the leaves.
        """
        if depth == 0 or env.game_over:
            return env.evaluate_position(self.pid), None
        acts = env.get_available_actions()[:5]  # Limit breadth
        best_act = random.choice(acts) if acts else None

        if maxing:
            max_eval = -float('inf')
            for a in acts:
                sim = self._copy(env)
                sim.make_move(a)
                val, _ = self._minimax(sim, depth - 1, alpha, beta, False)
                if val > max_eval:
                    max_eval = val
                    best_act = a
                alpha = max(alpha, val)
                if beta <= alpha:
                    break
            return max_eval, best_act
        else:
            min_eval = float('inf')
            for a in acts:
                sim = self._copy(env)
                sim.make_move(a)
                val, _ = self._minimax(sim, depth - 1, alpha, beta, True)
                if val < min_eval:
                    min_eval = val
                    best_act = a
                beta = min(beta, val)
                if beta <= alpha:
                    break
            return min_eval, best_act

    def _mcts(self, env, acts):
        """Run ``mcts_simulations`` simulations and return the most-visited root action."""
        root = MCTSNode(env.get_state())
        root.untried_actions = acts[:]
        for _ in range(self.mcts_simulations):
            node = root
            sim = self._copy(env)

            # Selection: descend through fully expanded nodes by UCT.
            while not node.untried_actions and node.children:
                node = node.best_child()
                if node.action:
                    sim.make_move(node.action)

            # Expansion: try one not-yet-expanded action.
            # Only the root is given untried actions, so the tree is one level deep.
            if node.untried_actions:
                a = random.choice(node.untried_actions)
                node.untried_actions.remove(a)
                sim.make_move(a)
                child = MCTSNode(sim.get_state(), node, a)
                node.children.append(child)
                node = child

            # Rollout: random play for at most 20 moves.
            # NOTE(review): rewards are summed regardless of which player
            # made the move - confirm this is intended.
            rw = 0
            steps = 0
            while not sim.game_over and steps < 20:
                ac = sim.get_available_actions()
                if not ac:
                    break
                _, r, _ = sim.make_move(random.choice(ac))
                rw += r
                steps += 1
            if sim.game_over and sim.winner == self.pid:
                rw += 100

            # Backprop
            while node:
                node.visits += 1
                node.value += rw
                node = node.parent

        return root.most_visited().action if root.children else acts[0]

    def _copy(self, env):
        """Return an independent copy of ``env`` for look-ahead.

        The clone is allocated without running the constructor: constructing
        a new game would build and shuffle a complete bag (consuming global
        RNG state) only for every field to be overwritten below. Containers
        are copied one level deep; tiles themselves are shared.
        """
        new = env.__class__.__new__(env.__class__)
        new.num_players = env.num_players
        new.board = env.board.copy()
        new.hands = [h[:] for h in env.hands]
        new.bag = env.bag[:]
        new.current_player = env.current_player
        new.scores = env.scores[:]
        new.game_over = env.game_over
        new.winner = env.winner
        new.passes = env.passes
        return new

    def update_q(self, state, action, reward, next_state, next_acts):
        """Apply one Q-learning update for ``(state, action)``.

        The bootstrap term is the best stored Q-value among the first ten
        entries of ``next_acts`` (0 when there are none).
        """
        k = str(action)
        curr = self.get_q(state, action)
        mx = max([self.get_q(next_state, a) for a in next_acts[:10]], default=0) if next_acts else 0
        self.q_table[(state, k)] = curr + self.lr * (reward + self.gamma * mx - curr)

    def decay_epsilon(self):
        """Multiply epsilon by its decay factor, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def reset_stats(self):
        """Zero every win/loss/draw/score counter (the Q-table is kept)."""
        self.wins = 0
        self.losses = 0
        self.draws = 0  # previously left untouched, so draws leaked across resets
        self.total_score = 0
        self.games_played = 0

In [15]:
# ============================================================================
# 1. SERIALIZATION HELPERS
# ============================================================================
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Convert NumPy integers, floats and arrays; defer everything else to the base class."""
        converters = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
        )
        for numpy_type, convert in converters:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

def serialize_q_table(q_table):
    """Flatten a Q-table into a JSON-friendly dict.

    Each ``(state, action_key)`` key becomes the string
    ``"<state as JSON list>|<action_key>"`` (the Streamlit format) and each
    value is coerced to ``float``.
    """
    return {
        f"{json.dumps(list(state))}|{action_key}": float(value)
        for (state, action_key), value in q_table.items()
    }

def save_kaggle_brain(agent1, agent2, config, filename="qwirkle_brains.zip"):
    """Write both agents and the run configuration into one zip archive.

    The archive contains ``agent1.json``, ``agent2.json`` and ``config.json``.
    Each agent file holds the serialized Q-table plus epsilon and the
    win/loss/score/games counters.

    Args:
        agent1, agent2: agents exposing ``q_table``, ``epsilon``, ``wins``,
            ``losses``, ``total_score`` and ``games_played``.
        config: JSON-serializable dict (NumPy values are converted).
        filename: path of the zip file to create (overwritten if present).
    """

    def _agent_payload(agent):
        # One place defines the per-agent schema so both files stay identical in shape.
        return {
            "q_table": serialize_q_table(agent.q_table),
            "epsilon": float(agent.epsilon),
            "wins": int(agent.wins), "losses": int(agent.losses),
            "total_score": int(agent.total_score),
            "games_played": int(agent.games_played)
        }

    members = {
        "agent1.json": _agent_payload(agent1),
        "agent2.json": _agent_payload(agent2),
        "config.json": config,
    }

    # Status strings previously contained mis-decoded emoji bytes.
    print(f"💾 Saving: A1 Q-States={len(agent1.q_table)}, A2 Q-States={len(agent2.q_table)}")
    with zipfile.ZipFile(filename, "w", zipfile.ZIP_DEFLATED) as zf:
        for member_name, payload in members.items():
            zf.writestr(member_name, json.dumps(payload, cls=NumpyEncoder, indent=2))
    print(f"✅ Save Complete! Download '{filename}'")

# ============================================================================
# 2. TRAINING RUNNER
# ============================================================================
def play_game(env, agent1, agent2):
    """Play one game between two agents and return ``env.winner``.

    The environment is reset first. Each turn the agent to move picks an
    action, the move is applied, and that agent's Q-table is updated with
    the transition. Play stops when the game is over, after 150 moves, or
    when an agent returns no action. Per-agent statistics are recorded only
    when a move reports the game as done.
    """
    env.reset()
    players = [agent1, agent2]
    for _ in range(150):
        if env.game_over:
            break
        mover = players[env.current_player]
        state_before = env.get_state()
        action = mover.choose_action(env)
        if not action:
            break
        state_after, reward, finished = env.make_move(action)
        mover.update_q(state_before, action, reward, state_after, env.get_available_actions())
        if not finished:
            continue
        # Final bookkeeping for both seats.
        for seat, player in enumerate(players):
            player.games_played += 1
            player.total_score += env.scores[seat]
            if env.winner == seat:
                player.wins += 1
            elif env.winner is not None:
                player.losses += 1
            else:
                player.draws += 1
    return env.winner

def run_qwirkle_training(episodes=50, log_every=10, minimax_depth=4, mcts_simulations=50, seed=None):
    """Train two agents by self-play and save them with ``save_kaggle_brain``.

    Args:
        episodes: number of games to play.
        log_every: record a history snapshot every this many episodes.
        minimax_depth: search depth given to both agents. A depth above 0
            makes the agents always use minimax; 0 selects the
            epsilon-greedy / MCTS path instead.
        mcts_simulations: simulations per MCTS move for both agents (only
            relevant when ``minimax_depth`` is 0).
        seed: if given, seeds Python's ``random`` module for repeatable runs.

    The defaults reproduce the original hard-coded configuration.
    """
    if seed is not None:
        random.seed(seed)

    # Initialize Environment & Agents
    env = Qwirkle()
    agent1 = StrategicQwirkleAgent(0, lr=0.1, gamma=0.95, epsilon_decay=0.999, minimax_depth=minimax_depth)
    agent2 = StrategicQwirkleAgent(1, lr=0.1, gamma=0.95, epsilon_decay=0.999, minimax_depth=minimax_depth)
    agent1.mcts_simulations = mcts_simulations
    agent2.mcts_simulations = mcts_simulations

    # Each key holds one scalar per logged snapshot.
    history = {
        'agent1_wins': [], 'agent2_wins': [], 'draws': [],
        'agent1_epsilon': [], 'agent2_epsilon': [],
        'agent1_q_size': [], 'agent2_q_size': [],
        'agent1_avg_score': [], 'agent2_avg_score': [],
        'episode': []
    }

    print(f"🚀 Starting Qwirkle Training ({episodes} Episodes)...")
    start_time = time.time()

    for ep in range(1, episodes + 1):
        play_game(env, agent1, agent2)

        agent1.decay_epsilon()
        agent2.decay_epsilon()

        # Average over games that were actually recorded; play_game only
        # updates the counters for games that finished.
        avg1 = agent1.total_score / max(1, agent1.games_played)
        avg2 = agent2.total_score / max(1, agent2.games_played)

        # Snapshot periodically to keep history size manageable but detailed
        if ep % log_every == 0:
            history['agent1_wins'].append(agent1.wins)
            history['agent2_wins'].append(agent2.wins)
            history['draws'].append(agent1.draws)
            history['agent1_epsilon'].append(agent1.epsilon)
            history['agent2_epsilon'].append(agent2.epsilon)
            history['agent1_q_size'].append(len(agent1.q_table))
            history['agent2_q_size'].append(len(agent2.q_table))
            history['agent1_avg_score'].append(avg1)
            history['agent2_avg_score'].append(avg2)
            history['episode'].append(ep)

        # Progress line after every episode; uses the same average as the history.
        elapsed = time.time() - start_time
        print(f"Ep {ep}/{episodes} | A1 Wins: {agent1.wins} | Avg Score: {avg1:.1f} | {elapsed:.1f}s")

    # Save Config
    config = {
        "lr1": agent1.lr, "gamma1": agent1.gamma, "epsilon_decay1": agent1.epsilon_decay,
        "mcts_sims1": agent1.mcts_simulations, "depth1": agent1.minimax_depth,
        "lr2": agent2.lr, "gamma2": agent2.gamma, "epsilon_decay2": agent2.epsilon_decay,
        "mcts_sims2": agent2.mcts_simulations, "depth2": agent2.minimax_depth,
        "training_history": history
    }

    print("\n🏆 Training Finished!")
    save_kaggle_brain(agent1, agent2, config)

# Start training only when this code is executed as the main program,
# not when it is imported from another module.
if __name__ == "__main__":
    run_qwirkle_training()

🚀 Starting Qwirkle Training (50 Episodes)...
Ep 1/50 | A1 Wins: 0 | Avg Score: 144.0 | 9.7s
Ep 2/50 | A1 Wins: 1 | Avg Score: 158.0 | 18.9s
Ep 3/50 | A1 Wins: 1 | Avg Score: 165.0 | 29.3s
Ep 4/50 | A1 Wins: 2 | Avg Score: 165.2 | 39.2s
Ep 5/50 | A1 Wins: 3 | Avg Score: 164.2 | 49.0s
Ep 6/50 | A1 Wins: 4 | Avg Score: 167.2 | 58.7s
Ep 7/50 | A1 Wins: 4 | Avg Score: 166.3 | 68.5s
Ep 8/50 | A1 Wins: 5 | Avg Score: 166.0 | 79.4s
Ep 9/50 | A1 Wins: 6 | Avg Score: 148.2 | 79.5s
Ep 10/50 | A1 Wins: 7 | Avg Score: 150.6 | 88.6s
Ep 11/50 | A1 Wins: 7 | Avg Score: 151.6 | 98.4s
Ep 12/50 | A1 Wins: 8 | Avg Score: 152.1 | 107.9s
Ep 13/50 | A1 Wins: 8 | Avg Score: 151.8 | 117.6s
Ep 14/50 | A1 Wins: 8 | Avg Score: 152.1 | 127.6s
Ep 15/50 | A1 Wins: 9 | Avg Score: 153.2 | 137.4s
Ep 16/50 | A1 Wins: 9 | Avg Score: 153.3 | 147.4s
Ep 17/50 | A1 Wins: 9 | Avg Score: 152.8 | 156.2s
Ep 18/50 | A1 Wins: 10 | Avg Score: 153.9 | 165.8s
Ep 19/50 | A1 Wins: 11 | Avg Score: 154.4 | 176.0s
Ep 20/50 | A1 Wins: 1