In [17]:
import pygame
import numpy as np
import math
import random
import tensorflow as tf
from collections import deque
import os
import matplotlib.pyplot as plt

In [None]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

TRAINING_MODE = True
RENDER_SPEED = 50.0

class TankEnv:
    def __init__(self):
        pygame.init()
        self.width, self.height = 1000, 800
        self.screen = pygame.display.set_mode((self.width, self.height))
        pygame.display.set_caption("Tank RL Environment v2.3")
        self.clock = pygame.time.Clock()
        self.font = pygame.font.SysFont('Arial', 20)
        
        # Параметры танка
        self.tank_speed = 4
        self.tank_size = 25
        self.target_radius = 20
        self.bullet_speed = 10
        self.bullet_radius = 4
        self.bullets = []
        
        # Улучшенная система препятствий с граничными стенами
        wall_thickness = 20
        self.obstacles = [
            pygame.Rect(200, 150, 300, 40),
            pygame.Rect(500, 300, 40, 200),
            pygame.Rect(100, 500, 250, 30),
            pygame.Rect(700, 200, 50, 400),
            pygame.Rect(300, 400, 200, 50),
            pygame.Rect(600, 100, 150, 40),
            pygame.Rect(0, 0, self.width, wall_thickness),
            pygame.Rect(0, self.height - wall_thickness, self.width, wall_thickness),
            pygame.Rect(0, 0, wall_thickness, self.height),
            pygame.Rect(self.width - wall_thickness, 0, wall_thickness, self.height)
        ]
        
        # Инициализация
        self.reset()
        self.action_descriptions = ["ВПЕРЕД", "НАЗАД", "ВЛЕВО", "ВПРАВО", "ОГОНЬ"]
        
        # Стартовый рендеринг для инициализации экрана
        self.screen.fill((230, 230, 230))
        pygame.display.flip()

    def reset(self):
        """Сброс среды в начальное состояние"""
        # Генерация позиций танка и цели без пересечения с препятствиями
        max_attempts = 100
        for attempt in range(max_attempts):
            self.tank = {
                'x': random.randint(50, self.width-50),
                'y': random.randint(50, self.height-50),
                'angle': random.randint(0, 359),
                'health': 100,
                'cooldown': 0
            }
            
            self.target = {
                'x': random.randint(50, self.width-50),
                'y': random.randint(50, self.height-50)
            }
            
            # Проверка коллизий
            tank_rect = pygame.Rect(
                self.tank['x'] - self.tank_size//2,
                self.tank['y'] - self.tank_size//2,
                self.tank_size, self.tank_size
            )
            
            target_rect = pygame.Rect(
                self.target['x'] - self.target_radius,
                self.target['y'] - self.target_radius,
                self.target_radius*2, self.target_radius*2
            )
            
            valid_positions = True
            for obs in self.obstacles:
                if tank_rect.colliderect(obs) or target_rect.colliderect(obs):
                    valid_positions = False
                    break
            
            if valid_positions:
                break
            elif attempt == max_attempts - 1:
                print("Предупреждение: не удалось найти валидные позиции после 100 попыток")
        
        self.bullets = []
        self.prev_dist = math.dist((self.tank['x'], self.tank['y']), 
                                  (self.target['x'], self.target['y']))
        return self._get_state()

    def _get_state(self):
        """Получение текущего состояния среды"""
        dx = self.target['x'] - self.tank['x']
        dy = self.target['y'] - self.tank['y']
        distance = math.sqrt(dx**2 + dy**2)
        angle_to_target = math.atan2(dy, dx) - math.radians(self.tank['angle'])
        angle_to_target = (angle_to_target + math.pi) % (2 * math.pi) - math.pi
        
        # Расстояния до препятствий в 8 направлениях
        obstacle_distances = self._get_obstacle_distances(num_rays=8)
        
        # Состояние здоровья
        health_state = self.tank['health'] / 100.0
        
        return np.array([
            self.tank['x'] / self.width,
            self.tank['y'] / self.height,
            self.tank['angle'] / 360.0,
            distance / math.sqrt(self.width**2 + self.height**2),
            angle_to_target / math.pi,
            health_state,
            *obstacle_distances
        ], dtype=np.float32)

    def _get_obstacle_distances(self, num_rays=16):
        """Вычисление расстояний до препятствий в различных направлениях"""
        distances = []
        for i in range(num_rays):
            angle = math.radians(self.tank['angle'] + (360/num_rays)*i)
            dist = self._cast_ray(angle)
            distances.append(dist / max(self.width, self.height))
        return distances

    def _cast_ray(self, angle):
        """Трассировка луча для обнаружения препятствий"""
        step = 3
        x, y = self.tank['x'], self.tank['y']
        
        for d in range(0, 600, step):
            x += step * math.cos(angle)
            y += step * math.sin(angle)
            
            # Проверка границ
            if not (0 <= x <= self.width and 0 <= y <= self.height):
                return d
                
            # Проверка столкновения с препятствиями
            for obs in self.obstacles:
                if obs.collidepoint(x, y):
                    return d
                    
        return 600

    def step(self, action):
        """Выполнение действия и возврат нового состояния"""
        reward = -0.2
        done = False
        info = {'reached_target': False, 'hit_obstacle': False, 'target_hit': False}
        
        # Обработка кулдауна стрельбы
        if self.tank['cooldown'] > 0:
            self.tank['cooldown'] -= 1
        
        # Обработка действий
        if action == 0:  # Вперед
            self.tank['x'] += self.tank_speed * math.cos(math.radians(self.tank['angle']))
            self.tank['y'] += self.tank_speed * math.sin(math.radians(self.tank['angle']))
        elif action == 1:  # Назад
            self.tank['x'] -= self.tank_speed * math.cos(math.radians(self.tank['angle']))
            self.tank['y'] -= self.tank_speed * math.sin(math.radians(self.tank['angle']))
        elif action == 2:  # Влево
            self.tank['angle'] = (self.tank['angle'] - 5) % 360
        elif action == 3:  # Вправо
            self.tank['angle'] = (self.tank['angle'] + 5) % 360
        elif action == 4 and self.tank['cooldown'] == 0:  # Огонь
            self.bullets.append({
                'x': self.tank['x'] + (self.tank_size+5) * math.cos(math.radians(self.tank['angle'])),
                'y': self.tank['y'] + (self.tank_size+5) * math.sin(math.radians(self.tank['angle'])),
                'dx': self.bullet_speed * math.cos(math.radians(self.tank['angle'])),
                'dy': self.bullet_speed * math.sin(math.radians(self.tank['angle']))
            })
            self.tank['cooldown'] = 10
        
        # Обновление пуль
        for bullet in self.bullets[:]:
            bullet['x'] += bullet['dx']
            bullet['y'] += bullet['dy']
            
            # Проверка попадания в цель
            if math.dist((bullet['x'], bullet['y']), 
                        (self.target['x'], self.target['y'])) < self.target_radius + self.bullet_radius:
                reward = 100
                done = True
                info['target_hit'] = True
                self.bullets.remove(bullet)
                break
                
            # Проверка выхода за границы
            if not (0 <= bullet['x'] <= self.width and 0 <= bullet['y'] <= self.height):
                self.bullets.remove(bullet)
                continue
                
            # Проверка столкновения с препятствиями
            for obs in self.obstacles:
                if obs.collidepoint(bullet['x'], bullet['y']):
                    if bullet in self.bullets:
                        self.bullets.remove(bullet)
                    break
        
        # Ограничение позиции танка
        self.tank['x'] = np.clip(self.tank['x'], 0, self.width)
        self.tank['y'] = np.clip(self.tank['y'], 0, self.height)
        
        # Проверка столкновения танка с препятствиями
        tank_rect = pygame.Rect(
            self.tank['x'] - self.tank_size//2,
            self.tank['y'] - self.tank_size//2,
            self.tank_size, self.tank_size
        )
        
        for obs in self.obstacles:
            if tank_rect.colliderect(obs):
                reward = -15
                self.tank['health'] -= 20
                if self.tank['health'] <= 0:
                    reward = -30
                    done = True
                    info['hit_obstacle'] = True
                break
        
        # Проверка достижения цели
        if math.dist((self.tank['x'], self.tank['y']),
                    (self.target['x'], self.target['y'])) < self.target_radius + self.tank_size//2:
            reward = 80
            done = True
            info['reached_target'] = True
        
        # Награда за приближение/удаление
        new_dist = math.dist((self.tank['x'], self.tank['y']),
                            (self.target['x'], self.target['y']))
        
        if new_dist < self.prev_dist:
            reward += 1.0
        else:
            reward -= 0.8
            
        self.prev_dist = new_dist
        
        return self._get_state(), reward, done, info

    def render(self, episode=None, total_reward=None, action=None, speed=1.0):
        """Отрисовка текущего состояния среды"""
        # Обработка событий
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return False
        
        # Отрисовка
        self.screen.fill((230, 230, 230))
        
        # Отрисовка препятствий
        for i, obs in enumerate(self.obstacles):
            if i < 6:
                pygame.draw.rect(self.screen, (90, 90, 120), obs)
                pygame.draw.rect(self.screen, (50, 50, 80), obs, 2)
            else:
                pygame.draw.rect(self.screen, (150, 150, 160), obs)
                pygame.draw.rect(self.screen, (100, 100, 120), obs, 2)
        
        # Отрисовка цели
        pygame.draw.circle(self.screen, (220, 50, 50), 
                          (int(self.target['x']), int(self.target['y'])), 
                          self.target_radius)
        pygame.draw.circle(self.screen, (180, 30, 30), 
                          (int(self.target['x']), int(self.target['y'])), 
                          self.target_radius, 2)
        
        # Отрисовка пуль
        for bullet in self.bullets:
            pygame.draw.circle(self.screen, (30, 30, 30), 
                             (int(bullet['x']), int(bullet['y'])), 
                             self.bullet_radius)
        
        # Отрисовка танка
        angle_rad = math.radians(self.tank['angle'])
        tank_color = (50, 180, 100) if self.tank['health'] > 50 else (220, 140, 60)
        
        points = [
            (self.tank['x'] + self.tank_size * math.cos(angle_rad),
             self.tank['y'] + self.tank_size * math.sin(angle_rad)),
            (self.tank['x'] + self.tank_size//2 * math.cos(angle_rad + 2.4),
             self.tank['y'] + self.tank_size//2 * math.sin(angle_rad + 2.4)),
            (self.tank['x'] + self.tank_size//2 * math.cos(angle_rad - 2.4),
             self.tank['y'] + self.tank_size//2 * math.sin(angle_rad - 2.4))
        ]
        
        pygame.draw.polygon(self.screen, tank_color, points)
        pygame.draw.polygon(self.screen, (30, 80, 50), points, 2)
        
        # Линия направления
        end_x = self.tank['x'] + (self.tank_size+10) * math.cos(angle_rad)
        end_y = self.tank['y'] + (self.tank_size+10) * math.sin(angle_rad)
        pygame.draw.line(self.screen, (30, 30, 30), 
                        (self.tank['x'], self.tank['y']), 
                        (end_x, end_y), 2)
        
        # Панель информации
        info_y = 10
        if episode is not None:
            text = self.font.render(f"Эпизод: {episode}", True, (0, 0, 0))
            self.screen.blit(text, (10, info_y))
            info_y += 25
            
        if total_reward is not None:
            text = self.font.render(f"Награда: {total_reward:.1f}", True, (0, 0, 0))
            self.screen.blit(text, (10, info_y))
            info_y += 25
            
        if action is not None and 0 <= action < len(self.action_descriptions):
            text = self.font.render(f"Действие: {self.action_descriptions[action]}", True, (0, 100, 0))
            self.screen.blit(text, (10, info_y))
            info_y += 25
            
        # Полоса здоровья
        health_width = max(0, min(150, self.tank['health'] * 1.5))
        pygame.draw.rect(self.screen, (200, 200, 200), (self.width-160, 15, 150, 20))
        pygame.draw.rect(self.screen, (220, 80, 60) if self.tank['health'] < 50 else (80, 180, 100), 
                        (self.width-160, 15, health_width, 20))
        pygame.draw.rect(self.screen, (100, 100, 100), (self.width-160, 15, 150, 20), 2)
        health_text = self.font.render(f"Здоровье: {self.tank['health']}%", True, (0, 0, 0))
        self.screen.blit(health_text, (self.width-155, 16))
        
        # Обновление экрана
        pygame.display.flip()
        
        # Управление скоростью
        if speed > 0:
            self.clock.tick(60 * speed)
            
        return True



In [19]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=200000)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.0005
        self.batch_size = 128
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()
        self.loss_history = []

    def _build_model(self):
        model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, input_dim=self.state_size, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        
        model.compile(
            loss=tf.keras.losses.Huber(),
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            metrics=['mae']
        )
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
            
        state = np.reshape(state, [1, self.state_size])
        return np.argmax(self.model.predict(state, verbose=0)[0])

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
            
        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        next_states = np.array(next_states)
        dones = np.array(dones)
        
        # Double DQN
        current_q = self.model.predict(states, verbose=0)
        next_q = self.model.predict(next_states, verbose=0)
        next_target_q = self.target_model.predict(next_states, verbose=0)
        
        max_actions = np.argmax(next_q, axis=1)
        targets = rewards + self.gamma * next_target_q[np.arange(self.batch_size), max_actions] * (1 - dones)
        
        current_q[np.arange(self.batch_size), actions] = targets
        
        # Обучение модели
        history = self.model.fit(
            states, 
            current_q, 
            batch_size=self.batch_size, 
            verbose=0
        )
        self.loss_history.append(history.history['loss'][0])
        
        # Уменьшение epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, name):
        filename = f"{name}.weights.h5"
        self.model.save_weights(filename)
        print(f"Модель сохранена как: {filename}")

    def load(self, name):
        self.model.load_weights(f"{name}.weights.h5")


In [20]:
# Параметры обучения
EPISODES = 50
TARGET_UPDATE_FREQ = 5

# Инициализация среды и агента
env = TankEnv()
state_size = 14
action_size = 5

agent = DQNAgent(state_size, action_size)
episode_rewards = []
best_score = -float('inf')

# Основной цикл обучения
running = True
for episode in range(EPISODES):
    if not running:
        break
        
    state = env.reset()
    total_reward = 0
    done = False
    
    # Первоначальная отрисовка
    if not TRAINING_MODE:
        running = env.render(
            episode=episode+1, 
            total_reward=total_reward, 
            action=None,
            speed=RENDER_SPEED
        )
    
    while not done and running:
        # Выбор действия
        action = agent.act(state)
        
        # Выполнение действия
        next_state, reward, done, info = env.step(action)
        
        # Сохранение опыта
        agent.remember(state, action, reward, next_state, done)
        
        # Обновление состояния
        state = next_state
        total_reward += reward
        
        # Визуализация
        if not TRAINING_MODE:
            running = env.render(
                episode=episode+1, 
                total_reward=total_reward, 
                action=action,
                speed=RENDER_SPEED
            )
    
    # Сохранение статистики
    episode_rewards.append(total_reward)
    
    # Обучение на опыте
    agent.replay()
    
    # Периодическое обновление целевой сети
    if episode % TARGET_UPDATE_FREQ == 0:
        agent.update_target_model()
    
    # Вывод статистики
    print(f"Эпизод: {episode+1:2d}/{EPISODES}, "
          f"Награда: {total_reward:7.2f}, "
          f"Epsilon: {agent.epsilon:.4f}, "
          f"Средняя награда: {np.mean(episode_rewards[-10:] if episode_rewards else 0):.2f}")
    
    # Сохранение лучшей модели
    if total_reward > best_score:
        best_score = total_reward
        agent.save(f"tank_dqn_best_episode_{episode+1}")

# Сохранение финальной модели
if running:
    agent.save("tank_dqn_final")
    print("\nОбучение завершено!")

    # Анализ результатов
    if episode_rewards:
        print(f"\nРезультаты обучения за {min(episode+1, EPISODES)} эпизодов:")
        print(f"Максимальная награда: {max(episode_rewards):.2f}")
        print(f"Минимальная награда: {min(episode_rewards):.2f}")
        print(f"Средняя награда: {np.mean(episode_rewards):.2f}")
        if len(episode_rewards) >= 10:
            print(f"Средняя награда (последние 10): {np.mean(episode_rewards[-10:]):.2f}")

        # Сохранение истории наград
        with open("rewards_history.txt", "w") as f:
            for i, reward in enumerate(episode_rewards):
                f.write(f"{i+1},{reward}\n")

        # График наград
        plt.figure(figsize=(12, 6))
        plt.plot(episode_rewards, label='Награда за эпизод')
        
        if len(episode_rewards) >= 10:
            moving_avg = [np.mean(episode_rewards[max(0, i-9):i+1]) for i in range(len(episode_rewards))]
            plt.plot(moving_avg, 'r-', label='Скользящее среднее (10 эп.)')
        
        plt.title("История наград во время обучения")
        plt.xlabel("Эпизод")
        plt.ylabel("Награда")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig("rewards_history.png")
        plt.close()

        print("\nГрафик истории наград сохранен в rewards_history.png")

pygame.quit()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Эпизод:  1/50, Награда: -1090.20, Epsilon: 0.9950, Средняя награда: -1090.20
Модель сохранена как: tank_dqn_best_episode_1.weights.h5
Эпизод:  2/50, Награда: -2173.00, Epsilon: 0.9900, Средняя награда: -1631.60
Эпизод:  3/50, Награда: -3017.00, Epsilon: 0.9851, Средняя награда: -2093.40
Эпизод:  4/50, Награда: -114.80, Epsilon: 0.9801, Средняя награда: -1598.75
Модель сохранена как: tank_dqn_best_episode_4.weights.h5
Эпизод:  5/50, Награда: -427.40, Epsilon: 0.9752, Средняя награда: -1364.48
Эпизод:  6/50, Награда: -133.20, Epsilon: 0.9704, Средняя награда: -1159.27
Эпизод:  7/50, Награда: -1311.20, Epsilon: 0.9655, Средняя награда: -1180.97
Эпизод:  8/50, Награда: -1025.40, Epsilon: 0.9607, Средняя награда: -1161.52
Эпизод:  9/50, Награда: -3726.40, Epsilon: 0.9559, Средняя награда: -1446.51
Эпизод: 10/50, Награда: -4845.80, Epsilon: 0.9511, Средняя награда: -1786.44
Эпизод: 11/50, Награда:  -99.40, Epsilon: 0.9464, Средняя награда: -1687.36
Модель сохранена как: tank_dqn_best_episode