In [4]:
import pandas as pd
import re

In [5]:
# 데이터 로드 및 전처리
data = pd.read_csv('/workspace/MIL/data/raw/naver_ocr.csv')
data['image_path'] = data['image_path'].str.replace('/content/xai_train', '/workspace/MIL/data/raw/csafe_version5_xai_train', regex=False)
data['repeat'] = data['image_path'].apply(lambda x: int(re.search(r'_(\d+)\.png', x).group(1)) if re.search(r'_(\d+)\.png', x) else None)

# 중요: 실제 이미지가 있는 200-299 라벨만 사용
data = data[data['label'].between(200, 299)].reset_index(drop=True)
print(f"총 데이터 개수: {len(data)}")
print(f"라벨 범위: {data['label'].min()} - {data['label'].max()}")
print(f"작성자 수: {data['label'].nunique()}")

data.head()

총 데이터 개수: 119661
라벨 범위: 200 - 299
작성자 수: 100


Unnamed: 0,image_path,label,text,repeat
0,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,yx,16
1,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,cond,20
2,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,yx,12
3,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,yx,15
4,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,yx,13


In [6]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import timm
import os

# 트랜스폼 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 모델 정의 - Autoencoder를 포함한 수정된 모델
class ModifiedViTWithAutoencoder(nn.Module):
    def __init__(self, base_model, latent_dim=128):
        super(ModifiedViTWithAutoencoder, self).__init__()
        self.base_model = base_model
        # 300차원을 128차원으로 압축하는 Autoencoder
        self.encoder = nn.Sequential(
            nn.Linear(300, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, latent_dim),
            nn.BatchNorm1d(latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 300),
            nn.BatchNorm1d(300)
        )
    
    def forward(self, x):
        # 베이스 모델로 특징 추출
        features = self.base_model(x)
        # Encoder 통과
        latent = self.encoder(features)
        # Decoder 통과
        reconstructed = self.decoder(latent)
        return reconstructed
    
    def get_latent_vector(self, x):
        # 베이스 모델로 특징 추출
        features = self.base_model(x)
        # Encoder만 통과하여 latent vector 반환
        latent = self.encoder(features)
        return latent

# 사전학습된 ViT 모델 로드
print("사전학습된 ViT 모델 로딩 중...")
checkpoint_path = '/workspace/MIL/data/raw/csafe_vit_300classes_best_model.pth'
checkpoint = torch.load(checkpoint_path, map_location='cpu')

# 체크포인트 키 확인
print("체크포인트 키:", checkpoint.keys())

# ViT 베이스 모델 생성
base_model = timm.create_model('vit_base_patch16_224', pretrained=False, num_classes=300)

# 체크포인트 키에 따라 로드 방식 결정
if 'model_state_dict' in checkpoint:
    base_model.load_state_dict(checkpoint['model_state_dict'])
elif 'state_dict' in checkpoint:
    base_model.load_state_dict(checkpoint['state_dict'])
else:
    # 체크포인트가 직접 state_dict인 경우
    base_model.load_state_dict(checkpoint)

base_model.eval()

# Autoencoder가 포함된 수정된 모델 생성
modified_model = ModifiedViTWithAutoencoder(base_model, latent_dim=128)

print("모델 준비 완료!")

사전학습된 ViT 모델 로딩 중...


  checkpoint = torch.load(checkpoint_path, map_location='cpu')


체크포인트 키: odict_keys(['cls_token', 'pos_embed', 'patch_embed.proj.weight', 'patch_embed.proj.bias', 'blocks.0.norm1.weight', 'blocks.0.norm1.bias', 'blocks.0.attn.qkv.weight', 'blocks.0.attn.qkv.bias', 'blocks.0.attn.proj.weight', 'blocks.0.attn.proj.bias', 'blocks.0.norm2.weight', 'blocks.0.norm2.bias', 'blocks.0.mlp.fc1.weight', 'blocks.0.mlp.fc1.bias', 'blocks.0.mlp.fc2.weight', 'blocks.0.mlp.fc2.bias', 'blocks.1.norm1.weight', 'blocks.1.norm1.bias', 'blocks.1.attn.qkv.weight', 'blocks.1.attn.qkv.bias', 'blocks.1.attn.proj.weight', 'blocks.1.attn.proj.bias', 'blocks.1.norm2.weight', 'blocks.1.norm2.bias', 'blocks.1.mlp.fc1.weight', 'blocks.1.mlp.fc1.bias', 'blocks.1.mlp.fc2.weight', 'blocks.1.mlp.fc2.bias', 'blocks.2.norm1.weight', 'blocks.2.norm1.bias', 'blocks.2.attn.qkv.weight', 'blocks.2.attn.qkv.bias', 'blocks.2.attn.proj.weight', 'blocks.2.attn.proj.bias', 'blocks.2.norm2.weight', 'blocks.2.norm2.bias', 'blocks.2.mlp.fc1.weight', 'blocks.2.mlp.fc1.bias', 'blocks.2.mlp.fc2.weigh

In [7]:
# GPU 상태 확인
import torch
print(f"CUDA 사용 가능: {torch.cuda.is_available()}")
print(f"사용 가능한 GPU 개수: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"  - 메모리: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f} GB")
    print(f"  - 현재 할당: {torch.cuda.memory_allocated(i) / 1024**3:.1f} GB")
    print(f"  - 캐시: {torch.cuda.memory_reserved(i) / 1024**3:.1f} GB")

CUDA 사용 가능: True
사용 가능한 GPU 개수: 5
GPU 0: NVIDIA GeForce RTX 3090
  - 메모리: 23.7 GB
  - 현재 할당: 0.0 GB
  - 캐시: 0.0 GB
GPU 1: NVIDIA GeForce RTX 3090
  - 메모리: 23.7 GB
  - 현재 할당: 0.0 GB
  - 캐시: 0.0 GB
GPU 2: NVIDIA GeForce RTX 3090
  - 메모리: 23.7 GB
  - 현재 할당: 0.0 GB
  - 캐시: 0.0 GB
GPU 3: NVIDIA GeForce RTX 3090
  - 메모리: 23.7 GB
  - 현재 할당: 0.0 GB
  - 캐시: 0.0 GB
GPU 4: NVIDIA GeForce RTX 3090
  - 메모리: 23.7 GB
  - 현재 할당: 0.0 GB
  - 캐시: 0.0 GB


In [8]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
from tqdm import tqdm
import os

# GPU 설정 - 5개 GPU 모두 활용
device_ids = [0, 1, 2, 3, 4]
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"사용 가능한 GPU 개수: {torch.cuda.device_count()}")
print(f"메인 디바이스: {device}")

# Custom Dataset 정의
class addfeature(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_path = self.df.iloc[idx]['image_path']
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, idx

# Dataset과 DataLoader 정의
dataset = addfeature(data, transform=transform)
# num_workers 추가하여 데이터 로딩 속도 향상
dataloader = DataLoader(dataset, batch_size=64, shuffle=False, num_workers=16, pin_memory=True)

# 멀티 GPU를 위한 DataParallel 설정
if torch.cuda.device_count() > 1:
    print(f"DataParallel 사용: {len(device_ids)}개 GPU")
    modified_model = nn.DataParallel(modified_model, device_ids=device_ids)
    
# 모델을 GPU로 이동
modified_model = modified_model.to(device)

# 모델을 이용한 벡터 추출
vector_list = []

with torch.no_grad():
    for images, indices in tqdm(dataloader, desc="Processing batches"):
        images = images.to(device)
        
        # DataParallel 사용 시 모듈 접근 방식 조정
        if isinstance(modified_model, nn.DataParallel):
            outputs = modified_model.module.get_latent_vector(images)
        else:
            outputs = modified_model.get_latent_vector(images)
            
        outputs = outputs.cpu().numpy()
        for i, idx in enumerate(indices):
            vector_list.append((idx, outputs[i]))

# 데이터프레임에 벡터 추가
vector_list = sorted(vector_list, key=lambda x: x[0])
vector_data = [x[1] for x in vector_list]
vector_columns = [f'var_{i+1}' for i in range(128)]
df_vectors = pd.DataFrame(vector_data, columns=vector_columns)

# 원본 데이터프레임에 합치기
data = pd.concat([data, df_vectors], axis=1)

# 원하는 열 순서
column_order = ['image_path', 'label', 'repeat', 'text']
# 나머지 열 자동 추가
column_order += [col for col in data.columns if col not in column_order]
# 열 순서 변경
data = data[column_order]
data = data.sort_values(by=['label', 'repeat','text']).reset_index(drop=True)

# GPU 메모리 정리
torch.cuda.empty_cache()

data

사용 가능한 GPU 개수: 5
메인 디바이스: cuda:0
DataParallel 사용: 5개 GPU


Processing batches: 100%|██████████| 1870/1870 [05:10<00:00,  6.03it/s]


Unnamed: 0,image_path,label,repeat,text,var_1,var_2,var_3,var_4,var_5,var_6,...,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128
0,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,1,a,0.463252,0.481082,1.062095,-0.367942,0.323362,-1.772118,...,-0.532931,1.275100,-1.458979,-1.473845,-0.411755,-1.558433,-1.052598,-0.047623,0.404223,0.177495
1,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,1,and,0.131122,-0.891745,0.193618,0.203714,-2.206375,1.400460,...,0.156004,0.717183,1.337669,-1.793418,0.019087,-0.196402,0.131149,-0.063505,-0.135851,-0.626591
2,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,1,away,0.642389,-0.383311,0.267690,-0.124094,-1.517472,-0.597316,...,0.660416,-2.752204,0.775014,-0.139299,1.085288,0.949715,-0.013237,-0.459833,0.287906,-1.374860
3,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,1,badasyou,1.555513,-1.271465,0.919383,1.237138,-0.428402,-0.380833,...,-0.686536,-0.640464,0.060657,-0.127544,1.193358,-1.175104,0.278152,-0.043623,-0.045278,0.164029
4,/workspace/MIL/data/raw/csafe_version5_xai_tra...,200,1,birds,1.933651,-1.809842,0.634431,-0.696290,-0.316514,0.087643,...,-1.222492,0.216802,1.580631,-0.515904,-0.117093,-0.518382,0.321472,-1.332392,-1.446169,-0.130970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119656,/workspace/MIL/data/raw/csafe_version5_xai_tra...,299,27,mouse,-0.222112,-1.110298,-0.810680,-0.821577,0.159712,-2.064002,...,1.240755,-0.218592,-0.673362,0.736480,-1.044209,0.465515,0.771495,0.730246,-0.030987,-0.649975
119657,/workspace/MIL/data/raw/csafe_version5_xai_tra...,299,27,second,-0.481328,-0.541618,0.548241,1.573464,1.193744,1.653575,...,-0.599495,0.203871,-1.560563,-0.673085,-0.013611,0.627045,-1.043341,-0.726055,-0.220593,1.631622
119658,/workspace/MIL/data/raw/csafe_version5_xai_tra...,299,27,the,-1.464403,-1.971323,1.486517,-0.868799,-0.031748,1.405624,...,0.283655,-0.224520,0.656601,0.802068,0.218527,-1.045422,-0.060966,0.003870,-1.026624,0.156493
119659,/workspace/MIL/data/raw/csafe_version5_xai_tra...,299,27,the,0.846941,1.307505,-1.867766,0.563033,-0.974710,-0.148046,...,0.451209,-0.262255,0.897709,0.290556,-0.319367,-0.876842,0.721452,2.265077,1.080170,3.062237


In [9]:
data_info = 'autoencoder_128d'

# 데이터프레임 생성 (200-299 라벨 기준으로 수정)
# 훈련: 200-259 (60명)
# 검증: 260-279 (20명) 
# 테스트: 280-299 (20명)
mil_train_data = data[data['label'].between(200, 259)].reset_index(drop=True)
mil_val_data = data[data['label'].between(260, 279)].reset_index(drop=True)
mil_test_data = data[data['label'].between(280, 299)].reset_index(drop=True)

print(f"훈련 데이터: {len(mil_train_data)} 샘플, {mil_train_data['label'].nunique()} 작성자")
print(f"검증 데이터: {len(mil_val_data)} 샘플, {mil_val_data['label'].nunique()} 작성자")
print(f"테스트 데이터: {len(mil_test_data)} 샘플, {mil_test_data['label'].nunique()} 작성자")

# 저장할 경로 - MIL 프로젝트 내부로 변경
output_dir = "/workspace/MIL/data/processed/embeddings"
os.makedirs(output_dir, exist_ok=True)

# 파일 이름에 data_info 추가
train_file = f"mil_{data_info}_train_data.csv"
val_file = f"mil_{data_info}_val_data.csv"
test_file = f"mil_{data_info}_test_data.csv"

# CSV로 저장
mil_train_data.to_csv(os.path.join(output_dir, train_file), index=False)
mil_val_data.to_csv(os.path.join(output_dir, val_file), index=False)
mil_test_data.to_csv(os.path.join(output_dir, test_file), index=False)

print(f"\nCSV 파일 저장 완료!")
print(f"저장 위치: {output_dir}")
print(f"파일: {train_file}, {val_file}, {test_file}")

훈련 데이터: 71464 샘플, 60 작성자
검증 데이터: 24243 샘플, 20 작성자
테스트 데이터: 23954 샘플, 20 작성자

CSV 파일 저장 완료!
저장 위치: /workspace/MIL/data/processed/embeddings
파일: mil_autoencoder_128d_train_data.csv, mil_autoencoder_128d_val_data.csv, mil_autoencoder_128d_test_data.csv
