In [1]:
# prompt: force reinstallation of numpy to this version pip install numpy==1.23.5

!pip install --upgrade pip
!pip uninstall numpy -y
!pip install numpy==1.23.5
import numpy as np
np.__version__


Found existing installation: numpy 1.23.5
Uninstalling numpy-1.23.5:
  Successfully uninstalled numpy-1.23.5
Collecting numpy==1.23.5
  Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.4.33 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
blosc2 3.0.0 requires numpy>=1.25.0, but you have numpy 1.23.5 which is incompatible.
imbalanced-learn 0.13.0 requires numpy<3,>=1.24.3, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.1 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.23.5 which is inc

'1.23.5'

In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import gym
from gym import spaces
from gym.vector import SyncVectorEnv
from collections import deque
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# GPU Configuration
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

# Mixed precision policy
tf.keras.mixed_precision.set_global_policy('mixed_float16')

class EnhancedFishingEnv(gym.Env):
    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(low=-1, high=1, shape=(5,), dtype=np.float32)
        self.max_steps = 50
        self.seed()
        self.reset()

    def seed(self, seed=None):
        self.np_random = np.random.RandomState(seed)
        return [seed]

    def reset(self):
        self.balance = 100.0
        self.fishes = 10.0
        self.step_count = 0
        return self._get_state()

    def step(self, action):
        self.step_count += 1
        terminated = False
        truncated = self.step_count >= self.max_steps
        reward = 0.0

        if action == 0:  # Fish
            if self.fishes > 0:
                self.fishes -= 1
                reward = 10 + self.balance * 0.01
                self.balance += 10
            else:
                reward = -100
        elif action == 1:  # Gamble
            if self.balance > 0 and self.fishes > 0:
                bet = min(self.balance, self.fishes * 10)
                outcome = self.np_random.choice([1.8, 0], p=[0.47, 0.53])
                self.balance += bet * outcome
                reward = bet * outcome
            else:
                reward = -10
        else:  # Quit
            terminated = True

        if self.balance <= 0:
            terminated = True

        return self._get_state(), reward / 100.0, terminated, truncated, {}

    def _get_state(self):
        return np.array([
            self.balance / 1000.0 - 0.5,
            self.fishes / 50.0 - 0.2,
            self.step_count / self.max_steps,
            (self.balance - 100) / 500.0,
            np.log1p(self.fishes) / 4.0
        ], dtype=np.float32)

class PrioritizedReplay:
    def __init__(self, capacity=1000000):
        self.capacity = capacity
        self.buffer = []
        self.priorities = np.zeros(capacity, dtype=np.float32)
        self.pos = 0
        self.size = 0

    def add(self, transition, priority=1.0):
        priority = max(priority, 1e-8)  # Ensure priority is at least 1e-8
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition

        self.priorities[self.pos] = priority
        self.pos = (self.pos + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size, alpha=0.6):
        if self.size == 0:
            return [], [], []

        probs = self.priorities[:self.size] ** alpha
        probs_sum = probs.sum()

        if probs_sum <= 1e-8:
            probs = np.ones_like(probs) / self.size
        else:
            probs /= probs_sum

        probs = probs / probs.sum()  # Ensure probabilities sum to 1

        indices = np.random.choice(self.size, batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]
        weights = (self.size * probs[indices]) ** (-0.4)
        weights /= weights.max() + 1e-8

        return samples, indices, weights.astype(np.float32)

class DQNAgent:
    def __init__(self, state_shape, action_size):
        self.action_size = action_size
        self.model = self._build_model(state_shape)
        self.target_model = self._build_model(state_shape)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=10.0)
        self.replay = PrioritizedReplay()
        self.gamma = 0.99
        self.batch_size = 8192
        self.env = SyncVectorEnv([lambda: EnhancedFishingEnv() for _ in range(8)], new_step_api=True)
        self.update_freq = 4
        self.step_count = 0

        # Early stopping parameters
        self.best_balance = 0
        self.patience_counter = 0
        self.patience_limit = 10  # Number of evaluations without improvement to wait
        self.improvement_threshold = 0.01  # Minimum improvement considered significant

    def _build_model(self, input_shape):
        inputs = tf.keras.Input(shape=input_shape)
        x = layers.Dense(512, activation='swish')(inputs)
        x = layers.Dense(512, activation='swish')(x)
        x = layers.Dense(256, activation='swish')(x)
        outputs = layers.Dense(self.action_size)(x)
        return tf.keras.Model(inputs, outputs)

    @tf.function
    def _train_step(self, states, actions, rewards, next_states, dones, weights):
        with tf.GradientTape() as tape:
            next_q = tf.cast(self.target_model(next_states), tf.float32)
            gamma = tf.constant(self.gamma, dtype=tf.float32)
            target_q = rewards + (1 - dones) * gamma * tf.reduce_max(next_q, axis=1)
            current_q = tf.reduce_sum(
                tf.cast(self.model(states), tf.float32) *
                tf.one_hot(actions, self.action_size),
                axis=1
            )
            td_error = tf.abs(target_q - current_q)
            loss = tf.reduce_mean(weights * tf.keras.losses.huber(target_q, current_q))

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss, td_error

    def learn(self):
        if self.replay.size < self.batch_size:
            return

        batch, indices, weights = self.replay.sample(self.batch_size)
        if not batch:
            return

        states = tf.convert_to_tensor([x[0] for x in batch], dtype=tf.float32)
        actions = tf.convert_to_tensor([x[1] for x in batch], dtype=tf.int32)
        rewards = tf.convert_to_tensor([x[2] for x in batch], dtype=tf.float32)
        next_states = tf.convert_to_tensor([x[3] for x in batch], dtype=tf.float32)
        dones = tf.convert_to_tensor([float(x[4]) for x in batch], dtype=tf.float32)
        weights = tf.convert_to_tensor(weights, dtype=tf.float32)

        loss, td_errors = self._train_step(states, actions, rewards, next_states, dones, weights)

        # Update priorities
        new_priorities = td_errors.numpy() + 1e-5
        for idx, priority in zip(indices, new_priorities):
            self.replay.priorities[idx] = priority

    def train(self, total_steps=1_000_000):
        stats = {'rewards': [], 'balances': [], 'steps': []}
        states = self.env.reset()

        while self.step_count < total_steps:
            # Collect experiences using vectorized environment
            q_values = self.model(tf.convert_to_tensor(states, dtype=tf.float32))
            actions = tf.argmax(q_values, axis=1).numpy()
            next_states, rewards, terminateds, truncateds, _ = self.env.step(actions)
            dones = np.logical_or(terminateds, truncateds)

            # Store experiences
            for i in range(self.env.num_envs):
                priority = max(abs(rewards[i]), 1e-8)
                self.replay.add((
                    states[i],
                    actions[i],
                    rewards[i],
                    next_states[i],
                    float(dones[i])
                ), priority=priority)

            states = next_states
            self.step_count += self.env.num_envs

            # Learn from experiences
            if self.step_count % self.update_freq == 0:
                self.learn()

            # Evaluation and early stopping check
            if self.step_count % 1000 == 0:
                self.target_model.set_weights(self.model.get_weights())
                avg_reward, avg_balance = self._evaluate()
                stats['rewards'].append(avg_reward)
                stats['balances'].append(avg_balance)
                stats['steps'].append(self.step_count)

                # Check for improvement
                improvement = (avg_balance - self.best_balance) / self.best_balance if self.best_balance != 0 else float('inf')

                if avg_balance > self.best_balance * (1 + self.improvement_threshold):
                    self.best_balance = avg_balance
                    self.patience_counter = 0  # Reset counter
                    self.model.save(f"best_model_{avg_balance:.0f}.keras")
                    print(f"Step {self.step_count} | Avg Balance: {avg_balance:.2f} | Avg Reward: {avg_reward:.2f} (Improvement: {improvement*100:.2f}%)")
                else:
                    self.patience_counter += 1
                    print(f"Step {self.step_count} | No improvement for {self.patience_counter}/{self.patience_limit} checks")

                # Early stopping condition
                if self.patience_counter >= self.patience_limit:
                    print(f"\nEarly stopping triggered! No improvement for {self.patience_limit} consecutive evaluations.")
                    print(f"Final model saved as 'final_model_{self.best_balance:.0f}.keras'")
                    self.model.save(f"final_model_{self.best_balance:.0f}.keras")
                    break

        # Generate final graphs
        self._plot_stats(stats)
        self._plot_rolling_stats(stats)

    def _evaluate(self, num_episodes=20):
        env = EnhancedFishingEnv()
        total_reward = 0
        total_balance = 0

        for _ in range(num_episodes):
            state = env.reset()
            episode_reward = 0
            done = False
            while not done:
                action = tf.argmax(self.model(tf.expand_dims(state, 0))[0]).numpy()
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                state = next_state
            total_reward += episode_reward
            total_balance += env.balance

        return total_reward/num_episodes, total_balance/num_episodes

    def _plot_stats(self, stats):
        plt.figure(figsize=(15,5))
        plt.subplot(1,2,1)
        plt.plot(stats['steps'], stats['rewards'])
        plt.title('Average Evaluation Reward')
        plt.xlabel('Training Steps')

        plt.subplot(1,2,2)
        plt.plot(stats['steps'], stats['balances'])
        plt.title('Average Final Balance')
        plt.xlabel('Training Steps')
        plt.tight_layout()
        plt.savefig('training_progress.png')
        plt.close()

    def _plot_rolling_stats(self, stats):
        # Calculate rolling averages for smoother trend visualization
        window_size = max(1, len(stats['rewards']) // 20)

        plt.figure(figsize=(15, 5))

        # Rolling reward
        plt.subplot(1, 2, 1)
        rolling_reward = pd.Series(stats['rewards']).rolling(window_size).mean()
        plt.plot(stats['steps'], rolling_reward)
        plt.title(f'Rolling Average Reward (Window: {window_size} evaluations)')
        plt.xlabel('Training Steps')
        plt.ylabel('Smoothed Reward')

        # Rolling balance
        plt.subplot(1, 2, 2)
        rolling_balance = pd.Series(stats['balances']).rolling(window_size).mean()
        plt.plot(stats['steps'], rolling_balance)
        plt.title(f'Rolling Average Balance (Window: {window_size} evaluations)')
        plt.xlabel('Training Steps')
        plt.ylabel('Smoothed Balance')

        plt.tight_layout()
        plt.savefig('rolling_metrics.png')
        plt.close()

if __name__ == "__main__":
    agent = DQNAgent((5,), 3)
    agent.train()
    print("Training complete! Check generated graphs:")
    print("- training_progress.png: Overall training metrics")
    print("- rolling_metrics.png: Smoothed trend analysis")

  and should_run_async(code)


Step 1000 | Avg Balance: 120.00 | Avg Reward: 0.22 (Improvement: inf%)
Step 2000 | No improvement for 1/10 checks
Step 3000 | No improvement for 2/10 checks
Step 4000 | No improvement for 3/10 checks
Step 5000 | No improvement for 4/10 checks
Step 6000 | No improvement for 5/10 checks
Step 7000 | No improvement for 6/10 checks
Step 8000 | No improvement for 7/10 checks
Step 9000 | Avg Balance: 130.00 | Avg Reward: 0.33 (Improvement: 8.33%)
Step 10000 | Avg Balance: 150.00 | Avg Reward: 0.56 (Improvement: 15.38%)
Step 11000 | Avg Balance: 160.00 | Avg Reward: 0.68 (Improvement: 6.67%)
Step 12000 | Avg Balance: 170.00 | Avg Reward: 0.79 (Improvement: 6.25%)
Step 13000 | Avg Balance: 180.00 | Avg Reward: 0.91 (Improvement: 5.88%)
Step 14000 | No improvement for 1/10 checks
Step 15000 | Avg Balance: 837.20 | Avg Reward: 7.61 (Improvement: 365.11%)
Step 16000 | Avg Balance: 4465.00 | Avg Reward: 43.65 (Improvement: 433.33%)
Step 17000 | No improvement for 1/10 checks
Step 18000 | No improve