In [1]:
import torch
import os

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

from torch.utils.data import DataLoader
from torchvision import transforms, datasets


# Preprocessing applied to every image: resize to the 128x128 input the
# network is built for, then convert the image to a tensor.
transform = transforms.Compose(
    [transforms.Resize((128, 128)), transforms.ToTensor()]
)

# ImageFolder assigns one class per sub-directory of the dataset root.
dataset_root = os.path.join("data_to_use", "dataset_2025-11-03_17-45-40")
dataset = datasets.ImageFolder(root=dataset_root, transform=transform)

from torch.utils.data import random_split
# Hold out 20% of the images for evaluation.  The split is drawn from a seeded
# generator so that re-running the notebook yields the same train/test
# partition; with an unseeded split the accuracy is not comparable between
# runs, and a re-split can move former training images into the test set.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training batches are shuffled; the evaluation order is irrelevant.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

print(f"Classes found: {dataset.classes}")

Using device: cuda
Classes found: ['dark_center', 'dark_head', 'dark_none', 'dark_rock', 'dark_side', 'fire_center', 'fire_head', 'fire_none', 'fire_rock', 'fire_side', 'ice_center', 'ice_head', 'ice_none', 'ice_rock', 'ice_side', 'no_monster', 'none_center', 'none_head', 'none_none', 'none_rock', 'none_side', 'robot_center', 'robot_head', 'robot_none', 'robot_rock', 'robot_side']


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """
    Simple 3-layer CNN classifier for RGB images (128x128 by default).

    Each stage is a 3x3 convolution (padding 1, so height and width are kept)
    followed by ReLU and a 2x2 max-pool that halves height and width.  After
    the three stages a (3, H, W) input has become (64, H // 8, W // 8); that
    feature map is flattened and passed through two fully connected layers.

    Args:
        num_classes: Number of output logits.
        input_size: Spatial size of the input images, either a single int for
            square images or a ``(height, width)`` pair.  Defaults to 128,
            which yields the ``64 * 16 * 16`` FC input size.

    Raises:
        ValueError: If ``input_size`` is smaller than 8 in either dimension,
            because the three pooling stages would leave no features.
    """

    def __init__(self, num_classes, input_size=128):
        super().__init__()

        # Accept either one int (square images) or a (height, width) pair.
        if isinstance(input_size, int):
            height = width = input_size
        else:
            height, width = input_size

        # Each of the three 2x2 pools floor-halves height and width, so the
        # final feature map is (height // 8, width // 8).
        pooled_height, pooled_width = height // 8, width // 8
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                "input_size must be at least 8x8 to survive three 2x2 pools, "
                f"got {height}x{width}"
            )

        # Input: (3, H, W)
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        # Output: (16, H, W)
        self.pool1 = nn.MaxPool2d(2, 2)
        # Output after pool1: (16, H/2, W/2)

        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        # Output: (32, H/2, W/2)
        self.pool2 = nn.MaxPool2d(2, 2)
        # Output after pool2: (32, H/4, W/4)

        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        # Output: (64, H/4, W/4)
        self.pool3 = nn.MaxPool2d(2, 2)
        # Output after pool3: (64, H/8, W/8)

        self.dropout = nn.Dropout(0.3)

        # FC input size follows from the pooled feature map
        # (64 * 16 * 16 for the default 128x128 images).
        self.fc_input_size = 64 * pooled_height * pooled_width
        self.fc1 = nn.Linear(self.fc_input_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """
        Compute class scores for a batch of images.

        Args:
            x: Tensor of shape (batch, 3, H, W) with H, W matching the
                ``input_size`` the model was built with.

        Returns:
            Tensor of shape (batch, num_classes) holding raw logits; no
            softmax is applied here.
        """
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool3(F.relu(self.conv3(x)))
        # Flatten everything except the batch dimension for the FC layers.
        x = x.flatten(start_dim=1)
        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x)


# Shape sanity check: push one random image through the whole network and then
# through each conv/pool stage, printing the tensor shape after every step.
# Uses the shared `device` and the dataset's class count instead of repeating
# the device expression and a hard-coded 26.
model = SimpleCNN(num_classes=len(dataset.classes)).to(device)
model.eval()  # dropout off; only shapes are inspected here

with torch.no_grad():  # no gradients are needed for a shape check
    dummy = torch.randn(1, 3, 128, 128, device=device)
    out = model(dummy)
    print("Final output:", out.shape)

    x = torch.randn(1, 3, 128, 128, device=device)
    for layer in [model.conv1, model.pool1, model.conv2, model.pool2, model.conv3, model.pool3]:
        x = layer(x)
        print(x.shape)


Final output: torch.Size([1, 26])
torch.Size([1, 16, 128, 128])
torch.Size([1, 16, 64, 64])
torch.Size([1, 32, 64, 64])
torch.Size([1, 32, 32, 32])
torch.Size([1, 64, 32, 32])
torch.Size([1, 64, 16, 16])


In [10]:
from torchsummary import summary

# Summarise a throwaway instance under its own name: rebinding `model` here
# would replace the trained network with freshly initialised weights whenever
# this cell is run after training.
summary_model = SimpleCNN(num_classes=len(dataset.classes)).to(device)
summary(summary_model, input_size=(3, 128, 128))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 128, 128]             448
         MaxPool2d-2           [-1, 16, 64, 64]               0
            Conv2d-3           [-1, 32, 64, 64]           4,640
         MaxPool2d-4           [-1, 32, 32, 32]               0
            Conv2d-5           [-1, 64, 32, 32]          18,496
         MaxPool2d-6           [-1, 64, 16, 16]               0
            Linear-7                  [-1, 128]       2,097,280
           Dropout-8                  [-1, 128]               0
            Linear-9                   [-1, 26]           3,354
Total params: 2,124,218
Trainable params: 2,124,218
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.19
Forward/backward pass size (MB): 4.38
Params size (MB): 8.10
Estimated Total Size (MB): 12.67
--------------------------------------

In [3]:
import torch.optim as optim

# Fresh network with one output logit per class folder found in the dataset.
num_classes = len(dataset.classes)
model = SimpleCNN(num_classes=num_classes).to(device)

# Cross-entropy on the raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# optimizer = optim.Adam(model.parameters(), lr=0.01)

epochs = 10
for epoch in range(epochs):
    model.train()  # dropout active while training

    batch_losses = []
    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    # Mean of the per-batch losses seen during this epoch.
    avg_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch [{epoch+1}/{epochs}] Loss: {avg_loss:.4f}")


Epoch [1/10] Loss: 2.7244
Epoch [2/10] Loss: 2.6331
Epoch [3/10] Loss: 2.5133
Epoch [4/10] Loss: 2.4138
Epoch [5/10] Loss: 2.3325
Epoch [6/10] Loss: 2.1979
Epoch [7/10] Loss: 1.9840
Epoch [8/10] Loss: 1.9533
Epoch [9/10] Loss: 1.8642
Epoch [10/10] Loss: 1.7115


In [4]:
# Accuracy on the held-out split: count how often the highest-scoring class
# equals the label.
model.eval()  # dropout off for inference
correct = 0
total = 0
with torch.no_grad():
    for batch_images, batch_labels in test_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(batch_images)
        predicted = torch.max(outputs, dim=1).indices

        total += len(batch_labels)
        correct += (predicted == batch_labels).sum().item()

accuracy = 100 * correct / total
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 52.71%


# Save and reload the model weights

In [5]:
# Persist only the learned parameters (the state_dict) rather than the whole
# module object; they are loaded back into a freshly built SimpleCNN later.
save_path = "simplecnn_weights.pth"
torch.save(model.state_dict(), save_path)
print(f"✅ Model weights saved at {save_path}")

✅ Model weights saved at simplecnn_weights.pth


In [6]:
# Recreate the model with the same architecture the weights were saved from.
model = SimpleCNN(num_classes=len(dataset.classes)).to(device)

# Load the saved parameters.  weights_only=True restricts unpickling to
# tensors and plain containers, so a tampered checkpoint file cannot execute
# arbitrary code; map_location places the tensors on the current device.
state_dict = torch.load(save_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # set to eval mode for inference (dropout off)
print("✅ Model weights loaded successfully")

✅ Model weights loaded successfully


In [None]:
# Optional example (left disabled): classify a single image file with the
# trained model.  When uncommented it relies on `model`, `device` and
# `dataset` from the cells above and on an image existing at `img_path`.
# NOTE(review): it rebinds `transform` to the same Resize((128, 128)) +
# ToTensor() pipeline that is used for training.
# from PIL import Image
# import torchvision.transforms as transforms

# # Load and preprocess single image
# img_path = "test_image.jpg"
# image = Image.open(img_path).convert("RGB")

# transform = transforms.Compose([
#     transforms.Resize((128, 128)),
#     transforms.ToTensor(),
# ])

# image = transform(image).unsqueeze(0).to(device)  # Add batch dimension

# # Predict
# model.eval()
# with torch.no_grad():
#     output = model(image)
#     _, predicted = torch.max(output, 1)

# print("Predicted class index:", predicted.item())
# print("Predicted class name:", dataset.classes[predicted.item()])


Example: integrating the trained classifier into a bot with pyautogui
```python
import pyautogui
import torch
from PIL import Image
import torchvision.transforms as transforms
import time

# --- Rebuild the network and restore the trained weights ---
model = SimpleCNN(num_classes=len(dataset.classes))
model = model.to(device)
saved_weights = torch.load(save_path, map_location=device)
model.load_state_dict(saved_weights)
model.eval()  # inference mode: dropout disabled

# --- Preprocessing: same pipeline as used for training ---
resize_to_input = transforms.Resize((128, 128))
transform = transforms.Compose([resize_to_input, transforms.ToTensor()])

# --- Function to capture screen region and classify ---
def predict_screen(region=None):
    """
    Capture the screen (or a region of it) and classify it with the model.

    Args:
        region: ``(left, top, width, height)`` of the area to capture.
            If None, capture the whole screen.

    Returns:
        The predicted class name, looked up in ``dataset.classes``.
    """
    # Capture region
    screenshot = pyautogui.screenshot(region=region)

    # Screenshots may carry an alpha channel (RGBA) on some platforms, while
    # the network's first conv layer takes exactly 3 channels; force RGB.
    screenshot = screenshot.convert("RGB")

    # Convert to a (1, 3, 128, 128) batch on the model's device
    image = transform(screenshot).unsqueeze(0).to(device)

    # Predict: index of the highest-scoring class
    with torch.no_grad():
        output = model(image)
        predicted_index = output.argmax(dim=1).item()

    class_name = dataset.classes[predicted_index]
    print(f"🖼️ Detected class: {class_name}")
    return class_name


# --- Example usage ---
while True:
    detected = predict_screen(region=(100, 200, 300, 300))  # left, top, width, height

    # Example: act when the model sees something other than "no_monster".
    # The trained labels are names such as "fire_center" or "no_monster";
    # no class is called "monster", so comparing against that string would
    # never match and the key would never be pressed.
    # NOTE(review): assumes every class except "no_monster" means a monster
    # is on screen -- confirm against the dataset before relying on it.
    if detected != "no_monster":
        pyautogui.press("space")  # attack!

    time.sleep(1)  # wait before next capture
```