# Pose Destimator Scratch

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import cv2
import os

class YoloPoseDataset(Dataset):
    def __init__(self, images_dir, labels_dir, img_size=(224,224)):
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.img_size = img_size
        self.img_files = sorted(os.listdir(images_dir))

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        img_file = self.img_files[idx]
        label_file = img_file.replace('.jpg', '.txt')

        # 이미지 로드
        img_path = os.path.join(self.images_dir, img_file)
        img = cv2.imread(img_path)
        img = cv2.resize(img, self.img_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = torch.tensor(img, dtype=torch.float32).permute(2,0,1) / 255.0

        # 레이블 로드
        label_path = os.path.join(self.labels_dir, label_file)
        with open(label_path, 'r') as f:
            line = f.readline().strip().split()
            keypoints = [float(x) for x in line]  

        keypoints = torch.tensor(keypoints, dtype=torch.float32)

        return img, keypoints

base = '/home/otter/dataset/pallet/dataset/train'
dataset = YoloPoseDataset(images_dir=os.path.join(base, 'images'), labels_dir=os.path.join(base, 'labels'))

In [42]:
base = '/home/otter/dataset/pallet/dataset/train'
dataset = YoloPoseDataset(images_dir=os.path.join(base, 'images'), labels_dir=os.path.join(base, 'labels'))

In [2]:
import torch.nn as nn

class SimplePoseEstimator(nn.Module):
    def __init__(self, num_keypoints):
        super(SimplePoseEstimator, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )

        self.fc = nn.Linear(128, num_keypoints * 2)

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


In [None]:
from tqdm import tqdm 
import torch.optim as optim



# DataLoader 사용 예시
base = '/home/otter/dataset/pallet/dataset/train'
dataset = YoloPoseDataset(images_dir=os.path.join(base, 'images'), labels_dir=os.path.join(base, 'labels'))
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


# 모델 생성 및 설정
num_keypoints = 6 # 실제 사용하는 keypoint 개수로 설정
model = SimplePoseEstimator(num_keypoints).to('cuda')
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 학습 루프
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for images, keypoints_gt in tqdm(dataloader):
        images, keypoints_gt = images.to('cuda'), keypoints_gt.to('cuda')
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, keypoints_gt)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{epochs}] - Avg Loss: {avg_loss:.4f}")

print("학습 완료!")

torch.save(model, 'checkpoints/pe.pt')

100%|██████████| 340/340 [00:29<00:00, 11.56it/s]


Epoch [1/10] - Avg Loss: 0.0265


100%|██████████| 340/340 [00:29<00:00, 11.64it/s]


Epoch [2/10] - Avg Loss: 0.0210


100%|██████████| 340/340 [00:29<00:00, 11.64it/s]


Epoch [3/10] - Avg Loss: 0.0166


100%|██████████| 340/340 [00:29<00:00, 11.51it/s]


Epoch [4/10] - Avg Loss: 0.0153


100%|██████████| 340/340 [00:29<00:00, 11.54it/s]


Epoch [5/10] - Avg Loss: 0.0143


100%|██████████| 340/340 [00:29<00:00, 11.48it/s]


Epoch [6/10] - Avg Loss: 0.0138


100%|██████████| 340/340 [00:29<00:00, 11.53it/s]


Epoch [7/10] - Avg Loss: 0.0128


100%|██████████| 340/340 [00:29<00:00, 11.41it/s]


Epoch [8/10] - Avg Loss: 0.0121


100%|██████████| 340/340 [00:29<00:00, 11.34it/s]


Epoch [9/10] - Avg Loss: 0.0110


100%|██████████| 340/340 [00:29<00:00, 11.46it/s]

Epoch [10/10] - Avg Loss: 0.0094
학습 완료!





In [None]:
from torchvision import transforms
model = torch.load('checkpoints/pe.pt').cuda()
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 모델 사용 예시
file_base = '/home/otter/workspace/Pallet/train_video'
files_list = os.listdir(file_base)
files = [os.path.join(file_base, file) for file in files_list if file.endswith('.mp4')]
for file in files:
    key = None
    cap = cv2.VideoCapture(file)
    while cap.isOpened():
        ret, color_frame = cap.read()
        if ret: 
            input_frame = transform(color_frame).to('cuda').unsqueeze(0)
            outputs = model(input_frame)
            xs = outputs[:, 0::2][0][2:] * 1920 / 224
            ys = outputs[:, 1::2][0][2:] * 1080 / 224
            x = xs.type(torch.int32).detach().cpu().numpy()
            y = ys.type(torch.int32).detach().cpu().numpy()
            for i in range(len(x)):
                cv2.circle(color_frame, (x[i], y[i]), 5, (0, 0, 255), -1)
            cv2.imshow('frame', color_frame)    
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q') or key == ord('p'): break
        else:
            break 
    if key == ord('q'):
        break

    cap.release()
cv2.destroyAllWindows()

  model = torch.load('pe.pt').cuda()


torch.Size([8, 7, 7, 5])


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.