In [2]:
pip install ultralytics

Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from transformers import AutoImageProcessor, AutoModelForImageClassification
from torchvision import transforms
from PIL import Image
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from ultralytics import YOLO
from PIL import Image
import numpy as np
import cv2

In [4]:
object_det_model = YOLO("yolov8x.pt")

In [5]:
def rotate(img, angle):
    if angle == 0:
        return img
    h, w = img.size[1], img.size[0]
    m = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1)
    rad = np.radians(angle)
    nw = int(abs(np.sin(rad)) * h + abs(np.cos(rad)) * w)
    nh = int(abs(np.cos(rad)) * h + abs(np.sin(rad)) * w)
    m[0, 2] += (nw - w) / 2
    m[1, 2] += (nh - h) / 2
    return Image.fromarray(cv2.warpAffine(np.array(img), m, (nw, nh)))

In [6]:
def detect_best_class(model, img_path):
    img = Image.open(img_path).convert("RGB")
    angles = [0, 45, 90, 180]

    best = {"cls": None, "conf": 0}

    for a in angles:
        rimg = rotate(img, a)
        res = model(rimg)[0]

        if res.boxes:
            for b in res.boxes:
                conf = float(b.conf)
                cls = int(b.cls)

                if conf > best["conf"]:
                    best.update({"cls": cls, "conf": conf})

    if best["cls"] is None:
        return "None"

    return model.names[best["cls"]]

In [7]:
class AddGaussianNoise(object):
    def __init__(self, mean=0., std=0.05):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        noise = torch.randn(tensor.size()) * self.std + self.mean
        return torch.clamp(tensor + noise, 0., 1.)

In [8]:
train_transform = transforms.Compose([
    transforms.Resize((384, 384)),

    # 무작위 회전 (30, 45, 60도 중 하나)
    transforms.RandomChoice([
        transforms.RandomRotation(30),
        transforms.RandomRotation(45),
        transforms.RandomRotation(60),
        transforms.RandomRotation(75),
        transforms.RandomRotation(90),
        transforms.RandomRotation(120),
        transforms.RandomRotation(150),
        transforms.RandomRotation(180),
        transforms.RandomRotation(210),
        transforms.RandomRotation(240),
        transforms.RandomRotation(270),
        transforms.RandomRotation(300)
    ]),

    # 좌우 / 상하 반전
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),

    # 30% 확률로 흑백화
    transforms.RandomApply([transforms.Grayscale(num_output_channels=3)], p=0.3),

    transforms.ToTensor(),

    # 가우시안 노이즈 추가
    AddGaussianNoise(0., 0.05),
])

# 검증 및 테스트용은 변형 최소화 (노이즈, 회전 등 제외)
test_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor()
])

In [9]:
class CustomImageDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None, label_encoder=None):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform
        self.label_encoder = label_encoder

        if 'target' in self.data.columns and self.label_encoder is not None:
            self.data['target'] = self.label_encoder.transform(self.data['target'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = f"{self.img_dir}/{self.data.iloc[idx, 0]}"
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        if 'target' in self.data.columns:
            label = torch.tensor(self.data.iloc[idx, 1], dtype=torch.long)
            return image, label
        else:
            return image, -1

In [10]:
train_df = pd.read_csv("/home/realtheai/cv_competetion/data/train.csv")
le = LabelEncoder()
train_df['target'] = le.fit_transform(train_df['target'])

train_dataset = CustomImageDataset(
    csv_file="/home/realtheai/cv_competetion/data/train.csv",
    img_dir="/home/realtheai/cv_competetion/data/train",
    transform=train_transform,
    label_encoder=le
)

test_dataset = CustomImageDataset(
    csv_file="/home/realtheai/cv_competetion/data/sample_submission.csv",
    img_dir="/home/realtheai/cv_competetion/data/test",
    transform=test_transform,
    label_encoder=le
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


In [11]:
processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-384")
model = AutoModelForImageClassification.from_pretrained("facebook/convnext-base-384", num_labels=len(le.classes_), ignore_mismatched_sizes=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

preprocessor_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of ConvNextForImageClassification were not initialized from the model checkpoint at facebook/convnext-base-384 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([17, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([17]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/354M [00:00<?, ?B/s]

In [12]:
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")


Epoch 1/30 - Loss: 1.5998
Epoch 2/30 - Loss: 0.4883
Epoch 3/30 - Loss: 0.3340
Epoch 4/30 - Loss: 0.2543
Epoch 5/30 - Loss: 0.2066
Epoch 6/30 - Loss: 0.2002
Epoch 7/30 - Loss: 0.1656
Epoch 8/30 - Loss: 0.1484
Epoch 9/30 - Loss: 0.1207
Epoch 10/30 - Loss: 0.1162
Epoch 11/30 - Loss: 0.0910
Epoch 12/30 - Loss: 0.1016
Epoch 13/30 - Loss: 0.0816
Epoch 14/30 - Loss: 0.0831
Epoch 15/30 - Loss: 0.0751
Epoch 16/30 - Loss: 0.0642
Epoch 17/30 - Loss: 0.0510
Epoch 18/30 - Loss: 0.0612
Epoch 19/30 - Loss: 0.0680
Epoch 20/30 - Loss: 0.0492
Epoch 21/30 - Loss: 0.0477
Epoch 22/30 - Loss: 0.0265
Epoch 23/30 - Loss: 0.0516
Epoch 24/30 - Loss: 0.0212
Epoch 25/30 - Loss: 0.0477
Epoch 26/30 - Loss: 0.0177
Epoch 27/30 - Loss: 0.0318
Epoch 28/30 - Loss: 0.0407
Epoch 29/30 - Loss: 0.0234
Epoch 30/30 - Loss: 0.0190


In [None]:
model.eval()
all_preds = []

with torch.no_grad():
    for images, _ in test_loader:
        images = images.to(device)

        for img_tensor in images:
            img_np = (img_tensor.cpu().permute(1, 2, 0).numpy() * 255).astype(np.uint8)
            pil_img = Image.fromarray(img_np)

            yolo_res = object_det_model(pil_img)[0]

            detected_car = False

            if yolo_res.boxes:
                for box in yolo_res.boxes:
                    cls = int(box.cls[0])
                    class_name = object_det_model.names[cls]

                    if class_name == "car":
                        detected_car = True
                        break

            if detected_car:
                all_preds.append(16)
            else:
                img_batch = img_tensor.unsqueeze(0).to(device)
                outputs = model(img_batch).logits
                pred_class = outputs.argmax(dim=1).cpu().item()
                all_preds.append(pred_class)

pred_labels = le.inverse_transform(all_preds)

result = pd.read_csv('/home/realtheai/cv_competetion/data/sample_submission.csv')
result['target'] = pred_labels
result.to_csv('convnext_384_output.csv', index=False)
print("✅ 저장 완료: convnext_384_output.csv")


0: 640x640 (no detections), 132.5ms
Speed: 29.1ms preprocess, 132.5ms inference, 3.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 17.0ms
Speed: 3.0ms preprocess, 17.0ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 book, 20.7ms
Speed: 3.5ms preprocess, 20.7ms inference, 12.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 18.9ms
Speed: 2.8ms preprocess, 18.9ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 cell phone, 1 clock, 25.0ms
Speed: 3.9ms preprocess, 25.0ms inference, 3.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 20.1ms
Speed: 3.1ms preprocess, 20.1ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 39.2ms
Speed: 3.0ms preprocess, 39.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 41.4ms
Speed: 7.4ms preprocess, 41.4ms 

: 