In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from blackjack import BlackjackGame # Importing your Week 1 code

class BlackjackEnv:
    """Minimal episodic environment wrapped around the Week 1 ``BlackjackGame``.

    States are ``(player_sum, dealer_upcard_value, usable_ace)`` tuples.
    Actions are ``1`` (hit); any other value is treated as stand, with ``0``
    being the documented stand action.
    """

    def __init__(self):
        self.game = BlackjackGame()

    @staticmethod
    def _rank_value(rank, ace_value=11):
        """Map a card rank ('2'..'10', 'J', 'Q', 'K', 'A') to a numeric value.

        ``ace_value`` selects how an ace is counted (11 for the dealer upcard,
        1 when computing a hard total).
        """
        if rank in ('J', 'Q', 'K'):
            return 10
        if rank == 'A':
            return ace_value
        return int(rank)

    def _get_state(self):
        """Returns (PlayerSum, DealerUpcard, UsableAce)"""
        # Dealer's second card is the 'upcard' in your Week 1 setup
        dealer_card = self.game.dealer_hand.cards[1]
        d_val = self._rank_value(dealer_card.rank)

        # An ace is "usable" only if one ace can still count as 11 without
        # busting, i.e. the hard total (every ace as 1) plus 10 stays <= 21.
        # Merely holding an ace is not enough: A + 10 + 5 is a hard 16.
        player_ranks = [c.rank for c in self.game.player_hand.cards]
        hard_total = sum(self._rank_value(r, ace_value=1) for r in player_ranks)
        usable_ace = 'A' in player_ranks and hard_total + 10 <= 21
        return (self.game.player_hand.value, d_val, usable_ace)

    def reset(self):
        """Starts a new round and returns initial state"""
        self.game.deal_initial()
        return self._get_state()

    def step(self, action):
        """
        Action: 0 = Stand, 1 = Hit
        Returns: (state, reward, done)

        Reward is -1 on a player bust or loss, +1 on a win, 0 on a push and
        0 for every non-terminal hit.
        """
        if action == 1:  # HIT
            self.game.player_hand.add_card(self.game.deck.draw())
            if self.game.player_hand.value > 21:
                return self._get_state(), -1, True  # Lose
            return self._get_state(), 0, False  # Game continues

        # STAND: dealer plays out the hand, hitting until reaching 17 or more
        while self.game.dealer_hand.value < 17:
            self.game.dealer_hand.add_card(self.game.deck.draw())

        p_val = self.game.player_hand.value
        d_val = self.game.dealer_hand.value

        if d_val > 21 or p_val > d_val:
            reward = 1
        elif d_val > p_val:
            reward = -1
        else:
            reward = 0
        return self._get_state(), reward, True

# Shared module-level environment instance; the task functions in later cells read `env` directly.
env = BlackjackEnv()

In [None]:
def task_1_prediction(n_episodes=10000, environment=None, hit_below=20):
    """First-visit Monte Carlo evaluation of a fixed threshold policy.

    The policy hits while the player sum (``state[0]``) is below ``hit_below``
    and stands otherwise.

    Args:
        n_episodes: Number of games to simulate.
        environment: Object exposing ``reset() -> state`` and
            ``step(action) -> (state, reward, done)``. Defaults to the
            notebook-level ``env`` when omitted.
        hit_below: Player-sum threshold under which the policy hits.

    Returns:
        defaultdict mapping each visited state to its estimated value
        (mean undiscounted return following the first visit).
    """
    # Fall back to the notebook-level `env` only when nothing is injected,
    # so existing calls keep working while tests can pass a stub.
    sim = env if environment is None else environment

    # Dictionaries to store total returns and visit counts for averaging
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)
    V = defaultdict(float)

    for _ in range(n_episodes):
        state = sim.reset()
        episode = []
        done = False

        # 1. Generate one full game
        while not done:
            action = 1 if state[0] < hit_below else 0
            next_state, reward, done = sim.step(action)
            episode.append((state, reward))
            state = next_state

        # 2. Process the results (First-Visit)
        # Walk backwards accumulating the undiscounted return; an earlier
        # visit overwrites a later one, so each state ends up paired with the
        # return that followed its FIRST visit.
        G = 0.0
        first_visit_return = {}
        for s, r in reversed(episode):
            G += r
            first_visit_return[s] = G

        for s, g in first_visit_return.items():
            returns_sum[s] += g
            returns_count[s] += 1
            V[s] = returns_sum[s] / returns_count[s]

    # Results
    print("--- Task 1 Results ---")
    print(f"Value of State (Player 21, Dealer 10): {V.get((21, 10, False), 0):.4f}")
    print(f"Value of State (Player 5, Dealer 10): {V.get((5, 10, False), 0):.4f}")
    return V

# Run it: evaluate the fixed policy with the default episode count and keep the returned value table
v_table = task_1_prediction()

In [None]:
def task_2_control(n_episodes=500000, alpha=0.02, epsilon=0.1,
                   environment=None, seed=None, progress_every=100000):
    """Learn action values with epsilon-greedy, constant-alpha Monte Carlo control.

    Every (state, action) pair seen in an episode is moved towards the
    episode's final reward ``G`` by ``alpha * (G - Q[s][a])``.

    Args:
        n_episodes: Number of games to simulate.
        alpha: Constant step size of the update.
        epsilon: Probability of picking a uniformly random action.
        environment: Object exposing ``reset() -> state`` and
            ``step(action) -> (state, reward, done)``. Defaults to the
            notebook-level ``env`` when omitted.
        seed: Optional seed for the exploration randomness. When ``None`` the
            global ``np.random`` state is used, as before.
        progress_every: Print a progress line every this many episodes;
            ``0`` or ``None`` disables the output.

    Returns:
        ``(Q, reward_history)`` where ``Q[state]`` is a length-2 array
        ``[value of Stand, value of Hit]`` and ``reward_history`` holds the
        final reward of every episode.
    """
    # Fall back to the notebook-level `env` only when nothing is injected
    sim = env if environment is None else environment
    # A private RandomState makes seeded runs reproducible without touching
    # the global stream; unseeded runs keep drawing from np.random.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Q[(state)][action] -> [Value of Stand, Value of Hit]
    Q = defaultdict(lambda: np.zeros(2))
    reward_history = []

    for i in range(n_episodes):
        state = sim.reset()
        episode = []
        done = False

        # 1. Play using Epsilon-Greedy
        while not done:
            if rng.random_sample() < epsilon:
                action = int(rng.choice([0, 1]))  # Explore
            else:
                # Ties resolve to index 0 (Stand), so unseen states stand
                action = int(np.argmax(Q[state]))  # Exploit

            next_state, reward, done = sim.step(action)
            episode.append((state, action))
            state = next_state

        # 2. Update Q-table at the end of the round
        G = reward  # The final result
        for s, a in episode:
            # Constant step-size update towards the episode outcome
            Q[s][a] += alpha * (G - Q[s][a])

        reward_history.append(G)

        if progress_every and i % progress_every == 0:
            print(f"Simulation Progress: {i}/{n_episodes} games...")

    return Q, reward_history

# Run the training with the default hyperparameters; keeps the learned Q-table and the per-episode rewards
Q_optimal, history = task_2_control()

In [None]:
def plot_results(history, Q):
    """Plot the learning curve and the greedy no-usable-ace strategy heatmap.

    Args:
        history: Sequence of per-episode rewards.
        Q: Mapping from ``(player_sum, dealer_card, usable_ace)`` to a
            length-2 array ``[value of Stand, value of Hit]``. It is only
            read; missing states are treated as all-zero and therefore show
            up as 0 (Stand) in the heatmap.
    """
    # 1. Rolling Average Reward
    rewards = np.asarray(history, dtype=float)
    # Clamp the window to the history length: with mode='valid' np.convolve
    # swaps its operands when the kernel is longer than the data, which would
    # silently plot a meaningless curve (and it raises on empty input).
    window = min(2000, rewards.size)

    plt.figure(figsize=(12, 5))
    if window > 0:
        rolling_avg = np.convolve(rewards, np.ones(window) / window, mode='valid')
        plt.plot(rolling_avg, color='green')
    plt.title("Learning Curve: Rolling Average Reward")
    plt.xlabel("Episodes")
    plt.ylabel(f"Reward (Average over {window} games)")
    plt.grid(True)
    plt.show()

    # 2. Strategy Card Heatmap (No Usable Ace)
    # We want to show Player Sum (12-21) vs Dealer Card (2-11)
    player_sums = range(12, 22)
    dealer_cards = range(2, 12)
    unseen = np.zeros(2)
    strategy = np.zeros((len(player_sums), len(dealer_cards)))
    for p_idx, p_sum in enumerate(player_sums):
        for d_idx, d_card in enumerate(dealer_cards):
            # .get() avoids inserting entries into a defaultdict Q and also
            # works when Q is a plain dict with unvisited states.
            q_values = Q.get((p_sum, d_card, False), unseen)
            # Pick the best action (0 or 1) from our Q-table
            strategy[p_idx, d_idx] = np.argmax(q_values)

    plt.figure(figsize=(10, 8))
    sns.heatmap(strategy, annot=True, cmap="coolwarm",
                xticklabels=dealer_cards, yticklabels=player_sums)
    plt.title("Optimal Strategy Heatmap (0=Stand, 1=Hit)")
    plt.xlabel("Dealer Upcard Value")
    plt.ylabel("Player Sum")
    plt.show()

# Visualise the training run: reward learning curve and the greedy no-usable-ace strategy heatmap
plot_results(history, Q_optimal)