In [None]:
import torch
import torch.nn as nn
import numpy as np
from board_processor import BoardProcessor
from feature_generator import FeatureGenerator
import os

class QNetwork(nn.Module):
    """Fully connected Q-value estimator built from Linear + Tanh pairs.

    Layer widths run input_dim -> 256 -> 128 -> 64 -> 32 -> 16 -> 8 -> 1 and
    every linear layer, including the last one, is followed by ``nn.Tanh``,
    so the output lies in [-1, 1].
    """

    def __init__(self, input_dim=138):
        super().__init__()
        widths = [input_dim, 256, 128, 64, 32, 16, 8, 1]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.Tanh())
        # Attribute name and module order are unchanged, so checkpoints keyed
        # as "net.<index>.weight" / "net.<index>.bias" keep loading.
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Run the stack and squeeze away the trailing size-1 output dimension."""
        return self.net(x).squeeze(-1)

In [None]:

def get_current_q(state_action_features, online_model, scaler, player):
    """Return the online model's Q estimate for one state-action feature vector.

    Args:
        state_action_features: 1-D feature vector for a single (state, action) pair.
        online_model: Callable network; its ``device`` attribute is used when
            present, otherwise inference runs on the CPU.
        scaler: Object exposing ``transform`` for a batch of feature rows.
        player: Sign multiplier (+1 or -1 in this notebook) applied to the raw
            network output to express it from that player's perspective.

    Returns:
        The scalar network output multiplied by ``player``.
    """
    device = getattr(online_model, 'device', 'cpu')
    # The scaler works on batches, so wrap the single sample in a list.
    scaled_batch = scaler.transform([state_action_features])
    model_input = torch.FloatTensor(scaled_batch).to(device)
    with torch.no_grad():
        raw_q = online_model(model_input).item()
    return raw_q * player

def calculate_target_q(moves, position_i, player, online_model, target_model, scaler, feature_gen, gamma=0.99):
    """Compute the Double DQN target for the move ``moves[position_i]``.

    The returned value is expressed from ``player``'s perspective (positive is
    good for ``player``).  The checks below treat a feature value of ``4`` as
    a completed four-in-a-row for player +1 and ``-4`` as one for player -1;
    the feature generator itself is not visible in this notebook.

    Order of evaluation:
      1. Position after our own move: a four-in-a-row yields ``winner * player``;
         running out of recorded moves yields 0.
      2. Position after the opponent's reply: any four-in-a-row yields -1.
      3. Otherwise the online network picks our best follow-up move and the
         target network evaluates it; the result is discounted by ``gamma``.
         A follow-up that wins immediately is valued 1.0 and one that fills
         the board without winning is valued 0.0 (draw) instead of being
         bootstrapped from a network.

    Args:
        moves: Full move list of the game (column indices).
        position_i: Index into ``moves`` of the move being evaluated.
        player: +1 or -1, the player who makes ``moves[position_i]``.
        online_model: Network used to select the follow-up action.
        target_model: Network used to evaluate the selected action.
        scaler: Object exposing ``transform`` for a batch of feature rows.
        feature_gen: Object exposing ``convolution_feature_gen(state_list)``.
        gamma: Discount factor for the bootstrapped value.

    Returns:
        The target Q-value as an int or float.
    """
    def predict(model, curr_feats, after_feats):
        """Player-perspective Q estimate of one (state, after-state) feature pair."""
        scaled = scaler.transform([np.concatenate([curr_feats, after_feats])])
        device = getattr(model, 'device', 'cpu')
        with torch.no_grad():
            return model(torch.FloatTensor(scaled).to(device)).item() * player

    # --- Position right after our own move (moves[0 .. position_i]) ---
    board_after_our_move = BoardProcessor()
    board_after_our_move.generate_state_list(moves[:position_i + 1])
    _, feats_after_our_move = feature_gen.convolution_feature_gen(board_after_our_move.state_list)

    if 4 in feats_after_our_move:
        return 1 * player  # Player +1 has four in a row; sign-flip for player -1
    if -4 in feats_after_our_move:
        return -1 * player  # Player -1 has four in a row; sign-flip for player -1
    if position_i + 1 >= len(moves):
        return 0  # No further recorded move and nobody won: draw
    # From here on position_i + 2 <= len(moves), so an opponent reply always exists.

    # --- Position after the opponent's reply (moves[0 .. position_i + 1]) ---
    board_after_opp = BoardProcessor()
    board_after_opp.generate_state_list(moves[:position_i + 2])
    state_after_opp = board_after_opp.state_list
    _, feats_after_opp = feature_gen.convolution_feature_gen(state_after_opp)

    if 4 in feats_after_opp or -4 in feats_after_opp:
        return -1  # Opponent wins

    # --- Non-terminal: score each of our legal follow-up moves once ---
    # Each candidate is (online Q, after-state features, fixed terminal value or None).
    candidates = []
    for col in range(7):
        if len(state_after_opp[col]) >= 6:
            continue  # Column is full
        after_state = [c[:] for c in state_after_opp]
        after_state[col].append(player)  # Same player's turn
        _, after_feats = feature_gen.convolution_feature_gen(after_state)

        if 4 * player in after_feats:
            fixed_value = 1.0  # Immediate win
        elif all(len(after_state[c]) >= 6 for c in range(7)):
            fixed_value = 0.0  # Board is full without a win: draw
        else:
            fixed_value = None

        if fixed_value is not None:
            online_q = fixed_value
        else:
            online_q = predict(online_model, feats_after_opp, after_feats)
        candidates.append((online_q, after_feats, fixed_value))

    if not candidates:
        return 0  # No legal moves = draw

    # DOUBLE DQN: the online network selects (first maximum wins ties) ...
    _, best_feats, best_fixed = max(candidates, key=lambda cand: cand[0])

    # ... and the target network evaluates the selected action.
    if best_fixed is not None:
        target_q_value = best_fixed
    else:
        target_q_value = predict(target_model, feats_after_opp, best_feats)

    return gamma * target_q_value

def generate_training_tuples(game_codes, online_model, target_model, scaler, feature_gen, alpha=0.1, gamma=0.99, max_tuples=512):
    """Build (state_action_features, new_q) training tuples from encoded games.

    For every move except the last one of each game, the features of the
    position before the move are concatenated with the features of the
    position after it, and the stored value is the TD-updated estimate
    ``current_q + alpha * (target_q - current_q)``.  Both ``current_q`` and
    ``target_q`` are expressed from the moving player's perspective.

    Args:
        game_codes: Iterable of encoded games accepted by
            ``BoardProcessor.decode_moves_code``.
        online_model: Network giving the current estimate and selecting actions.
        target_model: Network evaluating the selected follow-up action.
        scaler: Object exposing ``transform`` for a batch of feature rows.
        feature_gen: Object exposing ``convolution_feature_gen(state_list)``.
        alpha: TD step size.
        gamma: Discount factor passed on to ``calculate_target_q``.
        max_tuples: Stop once this many tuples were produced (default 512, the
            previously hard-coded cap).  ``None`` disables the cap.

    Returns:
        A list of ``(state_action_features, new_q)`` tuples, at most
        ``max_tuples`` long.
    """
    if max_tuples is not None and max_tuples <= 0:
        return []

    training_tuples = []

    for game_code in game_codes:
        moves = BoardProcessor().decode_moves_code(game_code)

        # Process each position except the final move of the game
        for i in range(len(moves) - 1):
            # State before move i; player +1 moves on even indices
            temp_board = BoardProcessor()
            temp_board.generate_state_list(moves[:i])
            player = 1 if (i % 2) == 0 else -1

            _, curr_feats = feature_gen.convolution_feature_gen(temp_board.state_list)

            # Apply the recorded action on a copy of the state
            action = moves[i]
            next_state = [col[:] for col in temp_board.state_list]
            next_state[action].append(player)
            _, next_feats = feature_gen.convolution_feature_gen(next_state)

            # Network input is the (state, after-state) feature pair
            state_action_features = np.concatenate([curr_feats, next_feats])

            # Double DQN TD update
            current_q = get_current_q(state_action_features, online_model, scaler, player)
            target_q = calculate_target_q(moves, i, player, online_model, target_model, scaler, feature_gen, gamma)
            new_q = current_q + alpha * (target_q - current_q)

            training_tuples.append((state_action_features, new_q))

            if max_tuples is not None and len(training_tuples) >= max_tuples:
                return training_tuples

    return training_tuples

In [None]:
def test_generate_training_tuples(skip=0, num_games=1):
    """Check ``generate_training_tuples`` against a manual recomputation and print a report.

    Loads the pretrained checkpoint into identical online and target networks,
    reads game codes from ``~/Downloads/replayMem.txt``, and for the last few
    positions of each game recomputes features, current Q, target Q and the
    TD-updated Q by hand, flagging any mismatch with the generated tuples.

    Args:
        skip: Number of leading lines of the replay file to skip.
        num_games: Number of game codes to read after the skipped lines
            (default 1, the previously hard-coded value).
    """
    # Load models and setup (same as show_double_dqn_updates)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the checkpoint carries a pickled scaler object next to the
    # weights.  On PyTorch releases where torch.load defaults to
    # weights_only=True this call needs weights_only=False; only do that for
    # checkpoint files you trust.
    checkpoint = torch.load("qnet_mc_pretrained.pth", map_location=device)

    # Online and Target networks
    online_net = QNetwork().to(device)
    online_net.load_state_dict(checkpoint['model_state_dict'])
    online_net.eval()
    online_net.device = device  # Read by get_current_q / calculate_target_q

    target_net = QNetwork().to(device)
    target_net.load_state_dict(checkpoint['model_state_dict'])
    target_net.eval()
    target_net.device = device

    scaler = checkpoint['scaler']
    feature_gen = FeatureGenerator()

    # Test parameters
    alpha = 0.1
    gamma = 0.99

    # Read test game codes
    codes_file = os.path.expanduser('~/Downloads/replayMem.txt')
    test_codes = []

    with open(codes_file, 'r') as f:
        print(f"Skipping {skip} rows...")
        for _ in range(skip):
            f.readline()
        for i, line in enumerate(f):
            if i >= num_games:
                break
            test_codes.append(line.strip())

    print(f"Testing with {len(test_codes)} game codes\n")
    print("="*60)

    # Test each game code
    for game_idx, game_code in enumerate(test_codes):
        print(f"\nGAME {game_idx + 1}: {game_code}")
        print("-"*40)

        # Generate training tuples
        tuples = generate_training_tuples(
            [game_code],
            online_net,
            target_net,
            scaler,
            feature_gen,
            alpha=alpha,
            gamma=gamma
        )

        # Decode game to show context
        board = BoardProcessor()
        moves = board.decode_moves_code(game_code)
        print(f"Total moves: {len(moves)}")
        # Up to three trailing moves; indexing by absolute position keeps this
        # safe for games shorter than three moves.
        tail_start = max(0, len(moves) - 3)
        last_moves = [
            ('X' if idx % 2 == 0 else 'O') + ':' + str(moves[idx]) + (' [final]' if idx == len(moves) - 1 else '')
            for idx in range(tail_start, len(moves))
        ]
        print(f"Last moves: {'; '.join(last_moves)}")
        print(f"Generated tuples: {len(tuples)}")

        # Show details for last few positions (like show_double_dqn_updates does)
        num_positions_to_show = min(3, len(tuples))
        start_idx = max(0, len(tuples) - num_positions_to_show)

        for i in range(start_idx, len(tuples)):
            print(f"\n  Position {i+1}/{len(moves)-1}:")

            # Recreate the position to verify; tuple index i corresponds to move index i
            temp_board = BoardProcessor()
            temp_board.generate_state_list(moves[:i])
            player = 1 if (i % 2) == 0 else -1
            action = moves[i]

            # Get the tuple
            state_action_features, target_q = tuples[i]

            # Manually rebuild the features for the recorded action
            _, curr_feats = feature_gen.convolution_feature_gen(temp_board.state_list)
            next_state = [col[:] for col in temp_board.state_list]
            next_state[action].append(player)
            _, next_feats = feature_gen.convolution_feature_gen(next_state)

            expected_features = np.concatenate([curr_feats, next_feats])

            # Verify features match
            features_match = np.allclose(state_action_features, expected_features)

            # Calculate current Q for comparison
            current_q = get_current_q(state_action_features, online_net, scaler, player)

            # Calculate what target should be
            expected_target = calculate_target_q(
                moves, i, player, online_net, target_net, scaler, feature_gen, gamma
            )

            # The new Q after TD update
            expected_new_q = current_q + alpha * (expected_target - current_q)
            values_match = np.isclose(target_q, expected_new_q)

            print(f"    Player: {'X' if player == 1 else 'O'}, Action: {action}")
            temp_board.display_board()
            print(f"    Features match: {features_match}")
            print(f"    Current Q: {current_q:+.4f}")
            print(f"    Target Q (raw): {expected_target:+.4f}")
            print(f"    Expected new Q: {expected_new_q:+.4f}")
            print(f"    Actual tuple Q: {target_q:+.4f}")
            print(f"    Match: {values_match}")

            if not values_match:
                print(f"    ⚠️  MISMATCH DETECTED!")
                print(f"    Difference: {abs(target_q - expected_new_q):.6f}")

    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)

    # Batch test for performance
    print(f"\nBatch testing all {len(test_codes)} games...")
    all_tuples = generate_training_tuples(
        test_codes,
        online_net,
        target_net,
        scaler,
        feature_gen,
        alpha=alpha,
        gamma=gamma
    )

    print(f"Total tuples generated: {len(all_tuples)}")
    print(f"Max tuples limit: 512")
    print(f"Actually returned: {min(len(all_tuples), 512)}")

    # Sample check of Q-value distribution
    if all_tuples:
        q_values = [q for _, q in all_tuples[:512]]
        print(f"\nQ-value statistics:")
        print(f"  Min: {min(q_values):+.4f}")
        print(f"  Max: {max(q_values):+.4f}")
        print(f"  Mean: {np.mean(q_values):+.4f}")
        print(f"  Std: {np.std(q_values):.4f}")

# Run the consistency check when this cell is executed as the main module.
# The replay file's first 72 lines are skipped before reading the test game.
if __name__ == "__main__":
    print("Testing generate_training_tuples consistency with show_double_dqn_updates\n")
    test_generate_training_tuples(skip = 72)

In [None]:

def show_double_dqn_updates(game_code, alpha=0.1, gamma=0.99):
    """Print a step-by-step Double DQN update trace for the last positions of a game.

    For each of the (up to) four positions before the final move this prints
    the online network's Q-values for every legal column, the move actually
    played, the Double DQN target (online network selects our follow-up move,
    target network evaluates it) and the resulting TD update.  All Q-values
    are reported from the perspective of the player to move.

    Args:
        game_code: Encoded game, decoded with ``BoardProcessor.decode_moves_code``.
        alpha: Step size of the displayed TD update.
        gamma: Discount applied to the bootstrapped next-state value.
    """
    # Load models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the checkpoint carries a pickled scaler object next to the
    # weights.  On PyTorch releases where torch.load defaults to
    # weights_only=True this call needs weights_only=False; only do that for
    # checkpoint files you trust.
    checkpoint = torch.load("qnet_mc_pretrained.pth", map_location=device)

    # Online and Target networks (for now, same weights)
    online_net = QNetwork().to(device)
    online_net.load_state_dict(checkpoint['model_state_dict'])
    online_net.eval()

    target_net = QNetwork().to(device)
    target_net.load_state_dict(checkpoint['model_state_dict'])
    target_net.eval()

    scaler = checkpoint['scaler']

    def q_estimate(net, state_feats, after_feats, sign):
        """Run ``net`` on one scaled (state, after-state) pair; result multiplied by ``sign``."""
        scaled = scaler.transform([np.concatenate([state_feats, after_feats])])
        with torch.no_grad():
            return net(torch.FloatTensor(scaled).to(device)).item() * sign

    # Setup
    board = BoardProcessor()
    feature_gen = FeatureGenerator()
    moves = board.decode_moves_code(game_code)
    board.generate_state_list(moves)

    print(f"Game: {game_code} ({len(moves)} moves)")

    # Show last few positions with Double DQN updates
    for i in range(max(0, len(moves)-5), len(moves)-1):
        print(f"\n{'='*60}")
        print(f"POSITION {i+1}/{len(moves)}")
        print(f"{'='*60}")

        board.display_board(index=i-1)

        # State before move i; player +1 moves on even indices
        temp_board = BoardProcessor()
        temp_board.generate_state_list(moves[:i])
        player = 1 if (i+1) % 2 == 1 else -1

        # Calculate ALL Q-values for current position (online network)
        _, curr_feats = feature_gen.convolution_feature_gen(temp_board.state_list)
        q_vals = []
        q_dict = {}

        for col in range(7):
            if len(temp_board.state_list[col]) < 6:  # Legal move
                next_state = [c[:] for c in temp_board.state_list]
                next_state[col].append(player)
                _, next_feats = feature_gen.convolution_feature_gen(next_state)

                q = q_estimate(online_net, curr_feats, next_feats, player)
                q_dict[col] = q
                q_vals.append(f"{q:+.3f}")
            else:
                q_vals.append(" --- ")

        print(f"\nPlayer {'X' if player == 1 else 'O'} to move")
        print(f"Online Q-values for all actions:")
        print(f"Columns: 0      1      2      3      4      5      6")
        print(f"Q(s,a):  {' '.join(q_vals)}")

        # Action taken (from replay memory)
        action = moves[i]
        print(f"\nAction taken: Column {action} (Q={q_dict[action]:+.4f})")

        # Current Q-value for the taken action
        current_q = q_dict[action]

        # Board after our move AND the opponent's reply (moves[0 .. i+1]).
        # The loop stops before the final move, so the game can only end here
        # through the reply or by running out of recorded moves.
        board_after_reply = BoardProcessor()
        board_after_reply.generate_state_list(moves[:i+2])
        _, feats_after_reply = feature_gen.convolution_feature_gen(board_after_reply.state_list)

        if 4 in feats_after_reply or -4 in feats_after_reply or i+2 >= len(moves):
            # Terminal once the reply has been played
            print("^"*15+ f"Debug This is allegedly terminal? if {i} plus two is greater or equal to {len(moves)}")

            if 4 in feats_after_reply:
                reward = 1 * player  # Player +1 has four in a row; sign-flip for player -1
            elif -4 in feats_after_reply:
                reward = -1 * player  # Player -1 has four in a row; sign-flip for player -1
            else:
                reward = 0  # Draw

            target_value = reward
            print(f"\n--- Double DQN Update ---")
            print(f"Terminal after our move, reward: {reward:+.4f}")

        else:
            # Game continues: bootstrap from our next decision point
            reward = 0  # No intermediate rewards

            print("^"*15+ f"Debug not yet terminal {i} plus two is greater or equal to {len(moves)}")

            next_state_list = board_after_reply.state_list
            next_curr_feats = feats_after_reply

            # Our follow-up would be piece number i + 3; if that fills the
            # 42-cell board without winning, the game is a draw (value 0).
            board_fills_up = i + 3 >= 42

            # Online Q-values for OUR next move (same player perspective)
            online_next_q_values = []
            for col in range(7):
                if len(next_state_list[col]) < 6:
                    ns = [c[:] for c in next_state_list]
                    ns[col].append(player)  # Same player
                    _, ns_feats = feature_gen.convolution_feature_gen(ns)
                    if 4 * player in ns_feats:
                        q = 1  # Immediate win: fix Q instead of asking the network
                    elif board_fills_up:
                        q = 0
                    else:
                        q = q_estimate(online_net, next_curr_feats, ns_feats, player)
                    online_next_q_values.append((col, q))

            if not online_next_q_values:
                target_value = 0  # No moves = draw
            else:
                # Double DQN: online selects, target evaluates
                best_action = max(online_next_q_values, key=lambda x: x[1])[0]

                best_ns = [c[:] for c in next_state_list]
                best_ns[best_action].append(player)
                _, best_ns_feats = feature_gen.convolution_feature_gen(best_ns)
                if 4 * player in best_ns_feats:
                    target_q_value = 1
                elif board_fills_up:
                    target_q_value = 0
                else:
                    # The selected action is scored by the TARGET network
                    target_q_value = q_estimate(target_net, next_curr_feats, best_ns_feats, player)

                target_value = reward + gamma * target_q_value

                print(f"\n--- Double DQN Update ---")
                print(f"After opponent plays column {moves[i+1]}:")
                print(f"Our next turn Q-values: {[f'{c}:{q:.3f}' for c,q in online_next_q_values]}")
                print(f"Online would select: column {best_action}")
                print(f"Target evaluates: {target_q_value:+.4f}")
                print(f"Target value: 0 + {gamma:.3f}*{target_q_value:+.4f} = {target_value:+.4f}")

        # TD error and update
        td_error = target_value - current_q
        new_q = current_q + alpha * td_error

        print(f"\nTD Error: {target_value:+.4f} - {current_q:+.4f} = {td_error:+.4f}")
        print(f"Q-update: {current_q:+.4f} + {alpha}*{td_error:+.4f} = {new_q:+.4f}")

        # Show the state-action pair whose Q-value is being updated
        print(f"\n--- Verification: State-Action Pair ---")
        print(f"STATE (before move {action}):")
        temp_board_before = BoardProcessor()
        temp_board_before.generate_state_list(moves[:i])
        temp_board_before.display_board()

        print(f"\nACTION: Player {'X' if player == 1 else 'O'} plays column {action}")

        print(f"\nRESULTING STATE (after move in position {action}):")
        temp_board_after = BoardProcessor()
        temp_board_after.generate_state_list(moves[:i+1])
        temp_board_after.display_board()

        print(f"\nQ-value being updated: Q(state, action={action}) = {current_q:.4f} → {new_q:.4f}")


In [None]:
# Replay-memory file with one encoded game per line; "~" is expanded to the
# current user's home directory. Read by the cell that calls
# show_double_dqn_updates.
codes_file = os.path.expanduser('~/Downloads/replayMem.txt')

In [None]:
# Trace the Double DQN updates for the first game found after the skipped rows.
skip_rows = 72  # Number of leading lines (games) of the replay file to skip
with open(codes_file, 'r') as f:
    print(f"Skipping {skip_rows} rows")
    for _ in range(skip_rows):
        f.readline()
    for i, line in enumerate(f):
        if i >= 1:  # Only look at the first line after the skipped rows
            break
        code = line.strip()
        if code:  # A blank line is not a game code
            show_double_dqn_updates(code)

In [None]:
# Scan the replay file for the first game that used all 42 cells.
# NOTE(review): a 42-move game is reported as a draw, but the last move could
# also complete a four-in-a-row - confirm with the feature generator if the
# distinction matters.
with open(os.path.expanduser('~/Downloads/replayMem.txt'), 'r') as f:
    for line_num, line in enumerate(f, 1):
        code = line.strip()
        if code:
            board = BoardProcessor()
            try:
                moves = board.decode_moves_code(code)
                if len(moves) == 42:
                    print(f"First draw found at line {line_num}")
                    print(f"Game code: {code}")
                    print(f"Code length: {len(code)} characters")
                    print(f"Moves: {len(moves)}")
                    # Optionally show the final position
                    board.generate_state_list(moves)
                    board.display_board()
                    break
            except Exception:
                # Best effort: skip lines that cannot be decoded or displayed,
                # but let KeyboardInterrupt / SystemExit stop the scan.
                continue
    else:
        # for-else: reached only when the loop finished without a break
        print("No draw games found!")

In [None]:
# Debugging aid: decode one hard-coded game code, replay all of its moves,
# print the resulting board, and leave the feature generator's return value
# as the cell's last expression so the notebook displays it.
board = BoardProcessor()
moves = board.decode_moves_code('8IIa2rwOJR')
# moves[:] passes a shallow copy of the full move list
board.generate_state_list(moves[:])
board.display_board()
feature_gen = FeatureGenerator()
feature_gen.convolution_feature_gen(board.state_list)

In [None]:
"Section 1: Let's take a look at target Q-values"
import torch
import torch.nn as nn
import numpy as np
from board_processor import BoardProcessor
from feature_generator import FeatureGenerator
import os

In [None]:

class QNetwork(nn.Module):
    """Tanh-activated multilayer perceptron that outputs one Q-value per input row.

    This cell re-defines the class with the same architecture as the
    definition near the top of the notebook:
    input_dim -> 256 -> 128 -> 64 -> 32 -> 16 -> 8 -> 1, each linear layer
    followed by ``nn.Tanh``.
    """

    def __init__(self, input_dim=138):
        super().__init__()
        sizes = (input_dim, 256, 128, 64, 32, 16, 8, 1)
        stack = []
        for idx in range(len(sizes) - 1):
            stack += [nn.Linear(sizes[idx], sizes[idx + 1]), nn.Tanh()]
        # Keep the "net" attribute so checkpoint keys ("net.<index>...") match.
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the stack and drop the trailing size-1 output dimension."""
        return self.net(x).squeeze(-1)

In [None]:

def show_q_values(game_code):
    """Print the boards and model Q-values for the last three positions of a game.

    For each of the last three positions the board is displayed; for every
    position except the final one, the Q-value of each legal column is printed
    from the perspective of the player about to move, followed by the move
    that was actually played next.

    Args:
        game_code: Encoded game, decoded with ``BoardProcessor.decode_moves_code``.
    """
    # Load model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the checkpoint carries a pickled scaler object next to the
    # weights.  On PyTorch releases where torch.load defaults to
    # weights_only=True this call needs weights_only=False; only do that for
    # checkpoint files you trust.
    checkpoint = torch.load("qnet_mc_pretrained.pth", map_location=device)
    model = QNetwork().to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    scaler = checkpoint['scaler']

    # Setup
    board = BoardProcessor()
    feature_gen = FeatureGenerator()
    moves = board.decode_moves_code(game_code)
    board.generate_state_list(moves)

    # Show last 3 positions (2 with Q-values, 1 terminal)
    for i in range(max(0, len(moves)-3), len(moves)):
        print(f"\n--- Position {i+1}/{len(moves)} ---")
        board.display_board(index=i)

        if i < len(moves)-1:  # Not terminal
            # State after moves[0..i], i.e. i + 1 pieces on the board
            temp_board = BoardProcessor()
            temp_board.generate_state_list(moves[:i+1])
            # The next move has index i + 1 and player +1 owns the even
            # indices, so +1 is to move exactly when (i + 1) is even.
            player = 1 if (i+1) % 2 == 0 else -1

            _, curr_feats = feature_gen.convolution_feature_gen(temp_board.state_list)
            q_vals = []

            for col in range(7):
                if len(temp_board.state_list[col]) < 6:  # Legal move
                    next_state = [c[:] for c in temp_board.state_list]
                    next_state[col].append(player)
                    _, next_feats = feature_gen.convolution_feature_gen(next_state)

                    features = np.concatenate([curr_feats, next_feats])
                    scaled = scaler.transform([features])

                    with torch.no_grad():
                        q = model(torch.FloatTensor(scaled).to(device)).item() * player
                    q_vals.append(f"{q:+.2f}")
                else:
                    q_vals.append(" --- ")

            print(f"Q-values: {' '.join(q_vals)}")
            print(f"Next move: {moves[i+1]} (Q={q_vals[moves[i+1]].strip()})")

In [None]:
# Replay-memory file with one encoded game per line; "~" is expanded to the
# current user's home directory. Read by the cell that calls show_q_values.
codes_file = os.path.expanduser('~/Downloads/replayMem.txt')

In [None]:

# Show Q-values for the five lines that follow the first million lines of the
# replay file.
with open(codes_file, 'r') as f:
    print("Skipping a million rows")
    # Consume the first 1,000,000 lines; readline() returns '' once the file
    # is exhausted, so a shorter file simply yields no games below.
    for _ in range(1_000_000):
        f.readline()
    for i, line in enumerate(f):
        if i >= 5:  # Stop after five lines
            break
        code = line.strip()
        show_q_values(code)

In [None]:
"""
Display last 3 moves from game codes using existing board utilities.
"""

import os
from board_processor import BoardProcessor

In [None]:

def display_last_moves(game_code, num_last_moves=3):
    """Print the board after each of the final moves of a game.

    Args:
        game_code: Encoded game string.
        num_last_moves: How many trailing moves to show.

    A decoding failure is reported on stdout and the function returns early.
    """
    processor = BoardProcessor()

    # Turn the game code into its move sequence; bail out on a bad code
    try:
        move_sequence = processor.decode_moves_code(game_code)
    except Exception as e:
        print(f"Error decoding {game_code}: {e}")
        return

    # Replay the whole game so every intermediate board can be displayed
    processor.generate_state_list(move_sequence)
    move_count = len(move_sequence)

    print(f"\nGame Code: {game_code}")
    print(f"Total moves: {move_count}")

    # Walk from the first move to show up to the end of the game
    move_idx = max(0, move_count - num_last_moves)
    while move_idx < move_count:
        print(f"\n--- After move {move_idx + 1} (column {move_sequence[move_idx]}) ---")
        processor.display_board(index=move_idx)
        move_idx += 1

    print("\n" + "="*40)

In [None]:

def main(replay_mem_filename, skip_rows=1_000_000, num_games=5):
    """Display the last three moves of a few games from a replay-memory file.

    Args:
        replay_mem_filename: Path to the replay file (one game code per line);
            a leading "~" is expanded to the user's home directory.
        skip_rows: Number of leading lines to skip before reading games
            (default: one million, the previously hard-coded value).
        num_games: Maximum number of non-empty game codes to read
            (default: 5, the previously hard-coded value).

    Returns:
        None.  A missing file is reported on stdout.
    """
    codes_file = os.path.expanduser(replay_mem_filename)

    # Check if file exists
    if not os.path.exists(codes_file):
        print(f"File not found: {codes_file}")
        print("Please check the path to the replay memory file")
        return

    print(f"Loading up to {num_games} games after skipping {skip_rows} rows...")
    game_codes = []

    with open(codes_file, 'r') as f:
        for _ in range(skip_rows):
            if not f.readline():
                break  # Reached end of file early; nothing left to skip
        for line in f:
            if len(game_codes) >= num_games:
                break
            code = line.strip()
            if code:  # Skip empty lines
                game_codes.append(code)

    print(f"Found {len(game_codes)} game codes")

    # Display last 3 moves for each game
    for idx, code in enumerate(game_codes, 1):
        print(f"\n{'='*40}")
        print(f"GAME {idx} OF {len(game_codes)}")
        print(f"{'='*40}")
        display_last_moves(code, num_last_moves=3)

In [None]:

# Entry point: show the last moves of a few games from the replay file in the
# user's Downloads folder.
if __name__ == "__main__":
    main("~/Downloads/replayMem.txt")