In [1]:
import pandas as pd
import os

# Project root; the CSV and the image folder are both resolved from it.
ROOT_DIR = '/workspace/upstageailab-cv-classification-cv_7'
train_csv = pd.read_csv(f"{ROOT_DIR}/data/train.csv")

# Every ID listed in train.csv that has no matching file under data/train.
missing = [
    img_id
    for img_id in train_csv['ID']
    if not os.path.exists(f'{ROOT_DIR}/data/train/{img_id}')
]
print(f'누락 이미지 개수: {len(missing)}')
if len(missing) > 0:
    print(missing[:10])  # first few missing file names, as a sample

누락 이미지 개수: 0


In [2]:
import pandas as pd
import os
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import GroupKFold

# Project root; data paths in this cell are built from it.
ROOT_DIR = '/workspace/upstageailab-cv-classification-cv_7'

def load_dataframe():
    """Read ``data/train.csv`` under ``ROOT_DIR`` and return it as a DataFrame."""
    csv_path = "/".join((ROOT_DIR, "data", "train.csv"))
    return pd.read_csv(csv_path)

def get_feature_extractor():
    """Build the ResNet-18 feature extractor and its matching preprocessing.

    Returns:
        tuple: ``(model, transform)``.
            ``model`` is a pretrained ``resnet18`` in eval mode whose final
            ``fc`` layer is replaced by ``Identity``, so a forward pass
            returns the features that would otherwise feed the classifier.
            ``transform`` resizes a PIL image to 224x224, converts it to a
            tensor and applies ImageNet mean/std normalization.
    """
    # NOTE(review): `pretrained=True` is deprecated in newer torchvision in
    # favour of `weights=...`; kept because the installed version is unknown.
    model = models.resnet18(pretrained=True)
    model.fc = torch.nn.Identity()
    model.eval()
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        # The pretrained weights were trained on inputs normalized with these
        # ImageNet statistics; without this step the features are computed on
        # out-of-distribution inputs.
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    return model, transform

def get_feature(img_path, model, transform):
    """Compute the feature vector of a single image.

    Args:
        img_path: Path of the image file to read.
        model: Callable applied to the preprocessed batch (a torch module).
        transform: Callable turning an RGB PIL image into a tensor.

    Returns:
        The model output for the image as a NumPy array with singleton
        dimensions squeezed out, or ``None`` when ``img_path`` does not exist.
    """
    # Return None when the image file is missing.
    if not os.path.exists(img_path):
        return None
    # Open inside a context manager so the file handle is released right away;
    # convert() loads the pixel data and returns a new image, so the result
    # stays usable after the file is closed.
    with Image.open(img_path) as img:
        rgb_img = img.convert('RGB')
    tensor = transform(rgb_img).unsqueeze(0)  # add the batch dimension
    with torch.no_grad():
        feature = model(tensor).squeeze().numpy()
    return feature

def extract_features(train_df, model, transform, feature_dim=512):
    """Extract one feature vector per row of ``train_df``.

    Args:
        train_df: Table with an ``ID`` column holding image file names that
            live under ``<ROOT_DIR>/data/train``.
        model: Feature extractor passed through to ``get_feature``.
        transform: Preprocessing passed through to ``get_feature``.
        feature_dim: Length of the zero placeholder used for missing images
            and of the empty result; defaults to 512, the value the original
            code hard-coded.

    Returns:
        np.ndarray: Stacked features, one row per ID in input order. Missing
        images contribute an all-zero row. An empty input yields an array of
        shape ``(0, feature_dim)`` instead of raising.
    """
    features = []
    missing_imgs = []
    for img_id in train_df['ID']:
        img_path = os.path.join(ROOT_DIR, "data", "train", img_id)
        # get_feature already returns None for a missing file, so a single
        # check here also covers a file that vanishes mid-run.
        feature = get_feature(img_path, model, transform)
        if feature is None:
            missing_imgs.append(img_id)
            # Placeholder zero vector. float32 is assumed to match the model
            # output (torch's default dtype) so one missing image does not
            # upcast the whole matrix to float64.
            features.append(np.zeros(feature_dim, dtype=np.float32))
            continue
        features.append(feature)
    print(f"누락 이미지 개수(추출단계): {len(missing_imgs)}")
    if missing_imgs:
        print(missing_imgs[:10])
    if not features:
        # np.stack raises on an empty list; return a well-shaped empty matrix.
        return np.empty((0, feature_dim), dtype=np.float32)
    return np.stack(features)


In [3]:
# Build the extractor and load the table; the two steps are independent.
model, transform = get_feature_extractor()
df = load_dataframe()
# extract_features prints the number of missing images as part of its run.
features = extract_features(df, model, transform)



누락 이미지 개수(추출단계): 0


In [4]:
import pandas as pd
import os

# Project root; the split CSV and the image folder are resolved from it.
ROOT_DIR = '/workspace/upstageailab-cv-classification-cv_7'
group_csv = pd.read_csv(f'{ROOT_DIR}/data/train_group_kfold.csv')

# IDs from the group-k-fold table whose image file is absent from data/train.
missing = [
    img_id
    for img_id in group_csv['ID']
    if not os.path.exists(f'{ROOT_DIR}/data/train/{img_id}')
]
print(f'누락 이미지 개수 (group_kfold): {len(missing)}')
if len(missing) > 0:
    print(missing[:10])  # show at most ten of them


누락 이미지 개수 (group_kfold): 0


In [5]:
import glob
ROOT_DIR = '/workspace/upstageailab-cv-classification-cv_7'
# Trailing wildcard so the lookup matches the file whatever its extension is.
pattern = f'{ROOT_DIR}/data/train/00ec594b7cc89183*'
search = glob.glob(pattern)
print('검색된 파일:', search)

검색된 파일: ['/workspace/upstageailab-cv-classification-cv_7/data/train/00ec594b7cc89183.jpg']


In [1]:
import pandas as pd
# Load the group-k-fold table, then print its type, column names and head.
group_kfold_path = '/workspace/upstageailab-cv-classification-cv_7/data/train_group_kfold.csv'
df = pd.read_csv(group_kfold_path)
print(f"타입: {type(df)}")
print(f"컬럼: {df.columns.tolist()}")
print(f"샘플:\n{df.head()}")

타입: <class 'pandas.core.frame.DataFrame'>
컬럼: ['ID', 'target', 'group', 'kfold']
샘플:
                     ID  target  group  kfold
0  002f99746285dfdd.jpg      16     34      1
1  008ccd231e1fea5d.jpg      10     14      4
2  008f5911bfda7695.jpg      10     25      3
3  009235e4c9c07af5.jpg       4      8      2
4  00b2f44967580c74.jpg      16     37      1


In [2]:
# Fixed seed so the randomly drawn top-up rows are identical on every run.
BALANCE_SEED = 42

class_counts = df["target"].value_counts()
print(f"클래스 분포: {dict(class_counts)}")

# Classes with fewer rows than the median class size are oversampled up to
# it; all other classes are kept unchanged.
median_count = class_counts.median()
balanced_dfs = []

for target_class in class_counts.index:
    class_df = df[df["target"] == target_class]
    current_count = len(class_df)

    if current_count < median_count:
        # First add as many whole copies of the class as fit under the median.
        repeat_factor = int(median_count / current_count)
        balanced_dfs.extend([class_df] * repeat_factor)

        # Then top up with randomly drawn rows to cover the remainder.
        remaining = int(median_count % current_count)
        if remaining > 0:
            balanced_dfs.append(
                class_df.sample(remaining, replace=True, random_state=BALANCE_SEED)
            )
    else:
        balanced_dfs.append(class_df)

result_df = pd.concat(balanced_dfs, ignore_index=True)
print(f"밸런싱 후 분포: {dict(result_df['target'].value_counts())}")
print(f"반환값 타입 : {type(result_df)}")

클래스 분포: {16: 100, 10: 100, 0: 100, 3: 100, 12: 100, 8: 100, 2: 100, 11: 100, 7: 100, 9: 100, 15: 100, 5: 100, 4: 100, 6: 100, 13: 74, 14: 50, 1: 46}
밸런싱 후 분포: {16: 100, 9: 100, 14: 100, 13: 100, 6: 100, 4: 100, 5: 100, 15: 100, 7: 100, 10: 100, 11: 100, 2: 100, 8: 100, 12: 100, 3: 100, 0: 100, 1: 100}
반환값 타입 : <class 'pandas.core.frame.DataFrame'>


In [4]:
import os

# Confirm the ROOT_DIR value used by the code and the artifacts path under it.
ROOT_DIR = '/workspace/upstageailab-cv-classification-cv_7'
artifacts_dir = os.path.join(ROOT_DIR, 'artifacts')
print(f"ROOT_DIR: {ROOT_DIR}")
print(f"artifacts 경로: {artifacts_dir}")

ROOT_DIR: /workspace/upstageailab-cv-classification-cv_7
artifacts 경로: /workspace/upstageailab-cv-classification-cv_7/artifacts
