# 🎮 Agar.io DQN Trainer - Colab Edition

Train a reinforcement learning bot for Agar.io using Google Colab's free GPU.

## ⚡ Quick Start
1. **Runtime → Change runtime type → GPU** (for faster training)
2. Run all cells in order
3. Adjust `NUM_EPISODES` below (5000+ recommended)
4. Download `live_model.pth` when done

## 📊 Expected Performance
- **5,000 episodes**: ~10-20 minutes on GPU
- **10,000 episodes**: ~20-40 minutes on GPU


In [None]:
# Check GPU availability
# Reports whether CUDA is usable so the user can switch the Colab runtime
# before starting a long training run.
import torch
print(f"🔥 GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"🚀 GPU: {torch.cuda.get_device_name(0)}")
else:
    print("⚠️ No GPU detected! Go to Runtime → Change runtime type → GPU")


In [None]:
# ⚙️ CONFIGURATION - ADJUST THESE!
# Both values are passed to train_dqn() in the training cell further down.
NUM_EPISODES = 5000   # More = better bot (5000-10000 recommended)
SAVE_FREQ = 500       # Save checkpoint every N episodes


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
import math
import time
from collections import deque

# Food
class Food:
    """A static food pellet with a random radius and a fixed colour."""

    __slots__ = ['x', 'y', 'radius', 'color']

    def __init__(self, x, y):
        """Place the pellet at (x, y) and draw its radius uniformly from [4, 8]."""
        self.x = x
        self.y = y
        self.radius = random.uniform(4, 8)
        self.color = '#FF6B6B'

# Bot
class Bot:
    """Scripted opponent cell.

    The ``behavior`` string selects the movement rule applied by ``update``:

    * ``'patrol'`` - walk a square path, turning 90 degrees every
      ``steps_per_side`` steps; ignores the player.
    * ``'chase'``  - move toward the player when this bot is more than 10%
      larger and the player is within 400 units.
    * ``'flee'``   - move away (at 1.2x speed) when the player is more than
      10% larger and within 300 units.
    * ``'smart'``  - chase when more than 20% larger (within 350 units),
      flee at 1.3x speed when the player is more than 20% larger (within 250).

    Any other behavior string leaves the bot stationary.
    """

    __slots__ = ['x', 'y', 'radius', 'start_radius', 'color', 'speed', 'behavior', 'alive',
                 'direction', 'steps_in_dir', 'steps_per_side', 'wander_angle', 'wander_timer']

    def __init__(self, x, y, radius, color, speed=2, behavior='patrol'):
        """Create a live bot at (x, y) with a random patrol heading and side length."""
        self.x, self.y = x, y
        self.radius = self.start_radius = radius
        self.color, self.speed, self.behavior = color, speed, behavior
        self.alive = True
        # Patrol state: direction 0..3 means +x, +y, -x, -y.
        self.direction, self.steps_in_dir = random.randint(0, 3), 0
        self.steps_per_side = random.randint(40, 100)
        # Initialised here but not read by any method of this class.
        self.wander_angle, self.wander_timer = random.random() * 6.28, 0

    def _patrol(self):
        """Advance one step along the current heading; turn when a side is completed."""
        if self.direction == 0:
            self.x += self.speed
        elif self.direction == 1:
            self.y += self.speed
        elif self.direction == 2:
            self.x -= self.speed
        else:
            self.y -= self.speed
        self.steps_in_dir += 1
        if self.steps_in_dir >= self.steps_per_side:
            self.direction = (self.direction + 1) % 4
            self.steps_in_dir = 0

    def _shift(self, dx, dy, dist, scale=1.0):
        """Move along (dx, dy), normalised by ``dist``, at ``speed * scale``."""
        self.x += (dx / dist) * self.speed * scale
        self.y += (dy / dist) * self.speed * scale

    def update(self, w, h, px=None, py=None, pr=None):
        """Advance the bot by one tick and clamp it inside the ``w`` x ``h`` arena.

        Args:
            w, h: Arena width and height.
            px, py, pr: Player position and radius. Player-aware behaviors are
                skipped when any of them is ``None``.
        """
        if not self.alive:
            return

        if self.behavior == 'patrol':
            self._patrol()
        elif px is not None and py is not None and pr is not None:
            # Explicit None checks: a coordinate of 0 is a valid position and
            # must not switch the behavior off.
            to_x, to_y = px - self.x, py - self.y
            dist = math.sqrt(to_x * to_x + to_y * to_y)
            if dist > 0:  # direction is undefined when exactly on top of the player
                if self.behavior == 'chase':
                    if self.radius > pr * 1.1 and dist < 400:
                        self._shift(to_x, to_y, dist)
                elif self.behavior == 'flee':
                    if pr > self.radius * 1.1 and dist < 300:
                        self._shift(-to_x, -to_y, dist, 1.2)
                elif self.behavior == 'smart':
                    if self.radius > pr * 1.2 and dist < 350:
                        self._shift(to_x, to_y, dist)
                    elif pr > self.radius * 1.2 and dist < 250:
                        self._shift(-to_x, -to_y, dist, 1.3)

        # Keep the whole circle inside the arena.
        self.x = max(self.radius, min(w - self.radius, self.x))
        self.y = max(self.radius, min(h - self.radius, self.y))

    def reset(self, x, y):
        """Revive the bot at (x, y) with its starting radius and a fresh patrol heading."""
        self.x, self.y = x, y
        self.radius = self.start_radius
        self.alive = True
        self.direction, self.steps_in_dir = random.randint(0, 3), 0

# Player
class Player:
    """The agent-controlled cell: position, radius and a radius-derived score."""

    __slots__ = ['x', 'y', 'radius', 'start_radius', 'color', 'alive', '_score']

    def __init__(self, x, y, radius=15):
        """Create a live player at (x, y); the initial score equals ``radius``."""
        self.x = x
        self.y = y
        self.radius = radius
        self.start_radius = radius
        self.color = '#3498DB'
        self.alive = True
        self._score = radius

    @property
    def score(self):
        """Current score (the integer part of the radius after any growth)."""
        return self._score

    def move(self, dx, dy, w, h):
        """Translate by (dx, dy), keeping the whole circle inside the ``w`` x ``h`` arena."""
        new_x = min(w - self.radius, self.x + dx)
        new_y = min(h - self.radius, self.y + dy)
        self.x = max(self.radius, new_x)
        self.y = max(self.radius, new_y)

    def grow(self, amt):
        """Add ``amt * 0.318`` to the squared radius and refresh the score."""
        squared = self.radius**2 + amt * 0.318
        self.radius = math.sqrt(squared)
        self._score = int(self.radius)

    def reset(self, x, y):
        """Revive the player at (x, y) with its starting radius and score."""
        self.x = x
        self.y = y
        self.radius = self.start_radius
        self.alive = True
        self._score = int(self.start_radius)

# Environment
class AgarioEnv:
    """Single-player Agar.io-like environment with scripted bots and food.

    Observations are 24 float32 values (see ``_get_state``); actions are the
    indices 0-7 into ``self.dirs`` (four axis directions, then four diagonals).
    An episode ends only when a sufficiently larger bot touches the player.
    """

    W, H = 3000, 2000

    def __init__(self, num_food=150):
        """Build the arena, initialise counters and start the first episode."""
        self.width = self.W
        self.height = self.H
        self.num_food = num_food

        self.player = None
        self.bots = []
        self.food = []

        self.action_dim = 8
        self.state_dim = 24
        self.move_speed = 10

        # Diagonal moves are scaled so every action covers the same distance.
        diag = 1/math.sqrt(2)
        self.dirs = [
            (0, -1), (0, 1), (-1, 0), (1, 0),
            (diag, -diag), (diag, diag), (-diag, -diag), (-diag, diag),
        ]

        self.steps = 0
        self.episode = 0
        self.kills = 0
        self.deaths = 0
        self.last_score = 0
        self.reset()

    def _spawn_food(self):
        """Replace the food list with ``num_food`` pellets at random positions."""
        pellets = []
        for _ in range(self.num_food):
            fx = random.uniform(20, self.width-20)
            fy = random.uniform(20, self.height-20)
            pellets.append(Food(fx, fy))
        self.food = pellets

    def _spawn_bots(self):
        """Create one bot per tier, preferring spots away from the player's start."""
        self.bots = []
        # Each tier: (count, radius range, behaviors, colours, speed range).
        tiers = [
            (1, (8, 12), ['flee'], ['#98D8C8'], (3.5, 5)),
            (1, (12, 18), ['flee', 'patrol'], ['#2ECC71'], (3, 4.5)),
            (1, (20, 35), ['smart', 'chase'], ['#F39C12'], (2.5, 3.5)),
            (1, (40, 60), ['chase', 'smart'], ['#E74C3C'], (2, 3)),
            (1, (70, 90), ['patrol', 'smart'], ['#8E44AD'], (1.5, 2.5)),
        ]
        start_x, start_y = self.width*0.2, self.height*0.5
        for count, size_range, behaviors, colours, speed_range in tiers:
            for _ in range(count):
                # Up to 20 tries to land more than 200 units from the start;
                # the last candidate is used if none qualifies.
                for _attempt in range(20):
                    x = random.uniform(100, self.width-100)
                    y = random.uniform(100, self.height-100)
                    if (x-start_x)**2 + (y-start_y)**2 > 40000:
                        break
                radius = random.uniform(*size_range)
                colour = random.choice(colours)
                speed = random.uniform(*speed_range)
                behavior = random.choice(behaviors)
                self.bots.append(Bot(x, y, radius, colour, speed, behavior))

    def _get_state(self):
        """Return the 24-value float32 observation.

        Layout: player x, y, radius; score; nine -1 placeholders; the three
        nearest live bots as (x, y, radius) padded with -1; a constant 0; and
        a flag that is 1 once the player is dead.
        """
        player = self.player
        alive = player.alive

        if alive:
            features = [player.x, player.y, player.radius, player.score]
        else:
            features = [-1, -1, -1, 0]

        # Three unused (x, y, radius) slots, always -1.
        features.extend([-1] * 9)

        if alive:
            px, py = player.x, player.y
            ranked = sorted(
                ((b.x-px)**2 + (b.y-py)**2, b.x, b.y, b.radius)
                for b in self.bots if b.alive
            )
            nearest = ranked[:3]
            for entry in nearest:
                features.extend(entry[1:])
            features.extend([-1] * (3 * (3 - len(nearest))))
        else:
            features.extend([-1] * 9)

        features.append(0)
        features.append(0 if alive else 1)
        return np.array(features, dtype=np.float32)

    def reset(self):
        """Start a new episode and return its initial observation."""
        self.episode += 1
        self.steps = 0
        self.player = Player(self.width*0.2, self.height*0.5, 15)
        self.last_score = self.player.score
        self._spawn_bots()
        self._spawn_food()
        return self._get_state()

    def step(self, action):
        """Apply ``action`` for one tick.

        Returns:
            ``(state, reward, done, info)`` where ``info`` is
            ``{'score': <player score>}``.
        """
        self.steps += 1
        reward, done = 0, False
        player = self.player

        dir_x, dir_y = self.dirs[action]
        player.move(dir_x*self.move_speed, dir_y*self.move_speed, self.width, self.height)
        px, py, pr = player.x, player.y, player.radius

        for bot in self.bots:
            bot.update(self.width, self.height, px, py, pr)

        # Food: everything within 75% of the pre-meal radius is eaten this tick.
        reach_sq = (pr*0.75)**2
        eaten = [
            idx for idx, pellet in enumerate(self.food)
            if (pellet.x-px)**2 + (pellet.y-py)**2 < reach_sq
        ]
        for idx in eaten:
            player.grow(self.food[idx].radius**2*3)
            reward += 0.5
        for idx in reversed(eaten):
            del self.food[idx]
        for _ in range(self.num_food - len(self.food)):
            fx = random.uniform(20, self.width-20)
            fy = random.uniform(20, self.height-20)
            self.food.append(Food(fx, fy))

        # Bot contacts use the radius after eating food; it is not refreshed
        # again within this tick, even after the player eats a bot.
        pr = player.radius
        for bot in self.bots:
            if not bot.alive:
                continue
            gap_sq = (bot.x-px)**2 + (bot.y-py)**2
            touch = max(pr, bot.radius)*0.75
            if gap_sq < touch*touch:
                if pr > bot.radius*1.1:
                    reward += 10+bot.radius*0.2
                    player.grow(bot.radius**2)
                    bot.alive = False
                    self.kills += 1
                elif bot.radius > pr*1.1:
                    reward -= 100
                    done = True
                    player.alive = False
                    self.deaths += 1
                    break

        # Respawn eaten bots, preferring spots more than 250 units from the player.
        for bot in self.bots:
            if bot.alive:
                continue
            for _attempt in range(8):
                x = random.uniform(100, self.width-100)
                y = random.uniform(100, self.height-100)
                if (x-px)**2 + (y-py)**2 > 62500:
                    break
            bot.reset(x, y)
            bot.radius = random.uniform(10, max(50, pr*0.7))
            bot.start_radius = bot.radius

        # Score growth bonus.
        reward += (player.score - self.last_score)*0.3
        self.last_score = player.score

        if not done:
            # Survival bonus, minus a proximity penalty for each larger bot nearby.
            reward += 0.05
            for bot in self.bots:
                if bot.alive and bot.radius > pr*1.1:
                    gap = math.sqrt((px-bot.x)**2 + (py-bot.y)**2)
                    danger = bot.radius*3
                    if gap < danger:
                        reward -= (1 - gap/danger)

        return self._get_state(), reward, done, {'score': player.score}

# Q-Network
class QNet(nn.Module):
    """Three-layer MLP mapping a state vector to one Q-value per action."""

    def __init__(self, s, a, h=128):
        """Build the network.

        Args:
            s: Input (state) size.
            a: Output (action) count.
            h: Width of both hidden layers.
        """
        super().__init__()
        layers = [
            nn.Linear(s, h),
            nn.ReLU(),
            nn.Linear(h, h),
            nn.ReLU(),
            nn.Linear(h, a),
        ]
        # Stored under ``net`` so parameter names stay ``net.<index>.*``.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values for ``x``."""
        return self.net(x)

# Replay Buffer
class RLBuf:
    """Fixed-capacity replay buffer; the oldest entries are dropped when full."""

    def __init__(self, cap=10000):
        """Create an empty buffer holding at most ``cap`` transitions."""
        self.buf = deque(maxlen=cap)

    def push(self, *args):
        """Store one transition as the tuple of the given arguments."""
        self.buf.append(args)

    def sample(self, n):
        """Draw up to ``n`` distinct transitions and return them column-wise.

        Returns:
            A list with one NumPy array per transition field (an empty list
            when the buffer is empty).
        """
        count = min(n, len(self.buf))
        picked = random.sample(self.buf, count)
        columns = zip(*picked)
        return list(map(np.array, columns))

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buf)

# DQN Agent
class DQN:
    """DQN agent: epsilon-greedy policy, replay buffer and a periodically synced target net."""

    def __init__(self, s, a, lr=0.001, g=0.99, eps=1.0, eps_min=0.1, eps_d=0.995, buf=10000, bs=32, tuf=100):
        """Create the online/target networks, optimizer and replay buffer.

        Args:
            s: State size.
            a: Number of actions.
            lr: Adam learning rate.
            g: Discount factor.
            eps, eps_min, eps_d: Initial epsilon, its floor, and the
                multiplicative decay applied once per episode in ``end_ep``.
            buf: Replay buffer capacity.
            bs: Minibatch size.
            tuf: Target network is synced every ``tuf`` learning steps.
        """
        self.a_dim, self.g, self.bs, self.tuf = a, g, bs, tuf
        self.eps, self.eps_min, self.eps_d = eps, eps_min, eps_d
        self.dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.q = QNet(s, a).to(self.dev)
        self.qt = QNet(s, a).to(self.dev)
        self.qt.load_state_dict(self.q.state_dict())
        self.opt = optim.Adam(self.q.parameters(), lr=lr)
        self.buf = RLBuf(buf)
        self.steps, self.eps_cnt = 0, 0
        self.rewards = []

    def _t(self, data, dtype=torch.float32):
        """Convert array-like ``data`` to a tensor of ``dtype`` on the agent's device."""
        # torch.as_tensor replaces the legacy torch.FloatTensor/LongTensor
        # constructors and makes the dtype conversion explicit.
        return torch.as_tensor(data, dtype=dtype, device=self.dev)

    def act(self, s, train=True):
        """Pick an action for state ``s``.

        With ``train=True`` a uniformly random action is returned with
        probability ``eps``; otherwise the greedy action of the online network.
        """
        if train and random.random() < self.eps:
            return random.randint(0, self.a_dim - 1)
        with torch.no_grad():
            return self.q(self._t(s).unsqueeze(0)).argmax().item()

    def store(self, *args):
        """Append one transition to the replay buffer."""
        self.buf.push(*args)

    def learn(self):
        """Run one gradient step on a sampled minibatch (no-op until ``bs`` transitions exist)."""
        if len(self.buf) < self.bs:
            return
        s, a, r, ns, d = self.buf.sample(self.bs)
        s, r, ns, d = self._t(s), self._t(r), self._t(ns), self._t(d)
        a = self._t(a, torch.long)

        # Q(s, a) for the actions actually taken.
        cq = self.q(s).gather(1, a.unsqueeze(1)).squeeze(1)
        # Bootstrapped target; (1 - d) zeroes the future term on terminal transitions.
        with torch.no_grad():
            tq = r + self.g * self.qt(ns).max(1)[0] * (1 - d)
        loss = nn.MSELoss()(cq, tq)

        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        self.steps += 1
        if self.steps % self.tuf == 0:
            self.qt.load_state_dict(self.q.state_dict())

    def end_ep(self, rew):
        """Record the episode reward and decay epsilon toward ``eps_min``."""
        self.eps_cnt += 1
        self.rewards.append(rew)
        self.eps = max(self.eps_min, self.eps * self.eps_d)

    def save(self, p='dqn.pth'):
        """Write both networks, the optimizer state, epsilon and the step count to ``p``."""
        torch.save({'q_network': self.q.state_dict(), 'target_network': self.qt.state_dict(),
                    'optimizer': self.opt.state_dict(), 'epsilon': self.eps, 'steps': self.steps}, p)
        print(f"✅ Model saved: {p}")

# Confirmation that every definition in this cell executed without error.
print("✅ All classes loaded successfully!")


In [None]:
# 🚀 RUN TRAINING
def train_dqn(n_eps, save_freq=500, max_steps=10000):
    """Train a DQN agent on ``AgarioEnv`` and write checkpoints to disk.

    Files written: ``dqn_ep<N>.pth`` every ``save_freq`` episodes,
    ``dqn_best.pth`` whenever the 100-episode average improves (checked every
    50 episodes once 100 episodes exist), and ``dqn_final.pth`` /
    ``live_model.pth`` at the end.

    Args:
        n_eps: Number of episodes to run.
        save_freq: Periodic checkpoint interval, in episodes.
        max_steps: Per-episode step cap, or ``None`` for no cap. The
            environment only ends an episode when the player is eaten, so
            without a cap an episode in which the player outgrows every bot
            would never finish.

    Returns:
        The trained agent.
    """
    print('='*60)
    print('🎮 DQN TRAINING - ULTRA FAST MODE')
    print('='*60)
    env = AgarioEnv()
    agent = DQN(env.state_dim, env.action_dim)
    print(f'Device: {agent.dev} | Episodes: {n_eps}')
    print('='*60)

    t0 = time.time()
    rews, lens = [], []
    best = None          # best 100-episode average seen so far, if any
    lt, le = t0, 0       # time / episode index of the previous progress line

    for ep in range(n_eps):
        s, er, st = env.reset(), 0, 0
        while True:
            st += 1
            a = agent.act(s)
            ns, r, d, info = env.step(a)
            # Store the environment's own terminal flag: a step-cap cut-off is
            # not a real terminal state, so its target must still bootstrap.
            agent.store(s, a, r, ns, float(d))
            agent.learn()
            er += r
            s = ns
            if d or (max_steps is not None and st >= max_steps):
                break

        agent.end_ep(er)
        rews.append(er)
        lens.append(st)

        if (ep+1) % 50 == 0:
            now = time.time()
            dt = now - lt
            de = ep+1-le
            # Fall back to a 10-episode average until 100 episodes exist.
            a100 = np.mean(rews[-100:]) if len(rews) >= 100 else np.mean(rews[-10:])
            spd = de/dt if dt > 0 else 0
            print(f'Ep {ep+1:5d}/{n_eps} | Rew:{er:6.1f} | Avg100:{a100:6.1f} | Eps:{agent.eps:.3f} | Len:{np.mean(lens[-50:]):.0f} | Spd:{spd:.1f}ep/s | K/D:{env.kills}/{env.deaths}')
            lt, le = now, ep+1
            if len(rews) >= 100 and (best is None or a100 > best):
                best = a100
                agent.save('dqn_best.pth')

        if (ep+1) % save_freq == 0:
            agent.save(f'dqn_ep{ep+1}.pth')

    agent.save('dqn_final.pth')
    agent.save('live_model.pth')

    total_time = time.time() - t0
    # Short runs never record a best average; say so instead of printing a sentinel.
    best_txt = 'n/a' if best is None else f'{best:.1f}'
    print(f'\n{"="*60}')
    print(f'✅ TRAINING COMPLETE!')
    print(f'⏱️  Time: {total_time/60:.1f} minutes')
    print(f'🏆 Best Avg100: {best_txt}')
    print(f'⚔️  Total K/D: {env.kills}/{env.deaths}')
    print(f'{"="*60}')
    return agent

# Start training! Runs as soon as this cell executes, using NUM_EPISODES and
# SAVE_FREQ from the configuration cell; the returned agent is kept in `agent`.
agent = train_dqn(NUM_EPISODES, SAVE_FREQ)


In [None]:
# 📥 DOWNLOAD TRAINED MODELS
# Run this cell to download your trained model!
import os

try:
    from google.colab import files
except ImportError:
    # Not running inside Colab: there is no browser download helper, so just
    # tell the user which files to look for.
    print("💾 Models saved locally:")
    print("  - live_model.pth (use this for live game)")
    print("  - dqn_best.pth (best performing model)")
    print("  - dqn_final.pth (final model)")
else:
    print("📥 Downloading models...")
    for _model_file in ('live_model.pth', 'dqn_best.pth'):
        # dqn_best.pth is only written once a 100-episode average exists, so
        # short runs may not have produced it.
        if os.path.exists(_model_file):
            files.download(_model_file)
        else:
            print(f"⚠️ Skipping {_model_file}: file not found")
    print("✅ Download started! Check your browser downloads.")


## 🎯 Using Your Trained Model

The trained model (`live_model.pth`) is **cross-compatible** with the live Agar.io game!

### State Space (24 dimensions)
```
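[0-2]   Player: x, y, radius (all -1 once the player is dead)
[3]     Score
[4-12]  3 nearest viruses (x, y, radius each) - always -1 in this trainer, which simulates no viruses
[13-21] 3 nearest players (x, y, radius each) - the 3 nearest live bots here, padded with -1
[22]    Food count - always 0 in this trainer
[23]    Game ended flag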
```

### Action Space (8 directions)
```
0: Up      1: Down    2: Left    3: Right
4: Up-Right   5: Down-Right   6: Up-Left   7: Down-Left
```

### Tips for Better Results
- **More episodes = better bot** (try 10,000+)
- **Use GPU** for faster training
- The bot learns to: avoid larger players, eat smaller ones, grow by eating food
