## 1. 라이브러리 불러오기

In [None]:
import sys
import glob
import cv2
import numpy as np
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms, utils, datasets, models
from torch.nn.modules.loss import BCEWithLogitsLoss
from torch.optim import lr_scheduler

from torch.autograd import Variable

from matplotlib import pyplot as plt
from time import time

import os
import time
import random

import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from torch.utils.data.dataloader import DataLoader, default_collate
import wandb

from augraphy import *

In [None]:
# Locations of the class metadata, the training labels and the submission template.
meta_path = '/data/ephemeral/home/data/meta.csv'
train_path = '/data/ephemeral/home/data/train.csv'
submission_path = '/data/ephemeral/home/data/sample_submission.csv'

# Read the three CSV files (in the same order as the paths above).
meta_data, df_train, df_submission = (
    pd.read_csv(csv_file) for csv_file in (meta_path, train_path, submission_path)
)

# Inner-join the training rows with the metadata.
merge = df_train.merge(meta_data, how='inner')

In [None]:
# Fix every random seed so that runs are repeatable.
SEED = 42
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only affects
# processes launched after this point, not the running interpreter.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN autotuning (benchmark=True) may select different kernels from run to
# run, which defeats the fixed seed. Disable it and request deterministic
# kernels instead (this can cost some speed).
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## 2. Custom Dataset

In [None]:
class ImageDataset(Dataset):
    """Dataset that reads ``(file name, target)`` rows from a CSV and loads the images.

    Args:
        csv: Path of a CSV file whose rows unpack into ``(image file name, target)``.
        path: Directory that contains the image files.
        album_transform: Optional callable invoked as ``album_transform(image=img)``;
            the ``'image'`` entry of its result becomes the sample.
        augraphy_transform: Optional callable applied to the NumPy image before
            ``album_transform``.
    """

    def __init__(self, csv, path, album_transform=None, augraphy_transform=None):
        # Keep the rows as a plain array so a row can be unpacked by position.
        self.df = pd.read_csv(csv).values
        self.path = path
        self.album_transform = album_transform
        self.augraphy_transform = augraphy_transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for row ``idx``.

        Returns ``(None, None)`` (after printing a message) when the image
        cannot be read, so callers must be prepared to skip such samples.
        """
        name, target = self.df[idx]
        img_path = os.path.join(self.path, name)

        try:
            # The context manager closes the file handle once the pixels are
            # copied. Converting to RGB gives every sample three channels, so
            # grayscale / RGBA / palette files do not break batching.
            with Image.open(img_path) as pil_img:
                img = np.array(pil_img.convert('RGB'))

            if self.augraphy_transform:
                img = self.augraphy_transform(img)

            if self.album_transform:
                img = self.album_transform(image=img)['image']

            return img, target
        except OSError:  # IOError is an alias of OSError in Python 3
            print(f"Cannot read image: {img_path}")
            return None, None

In [None]:
from timm.data.mixup import Mixup
from timm.loss import SoftTargetCrossEntropy
import torch.nn as nn

# Mixup settings (cutmix_alpha=0.0, so only mixup_alpha is non-zero).
mixup_fn = Mixup(
    num_classes=17,
    label_smoothing=0.1,
    mode='elem',
    prob=0.8,
    switch_prob=0.5,
    mixup_alpha=0.3,
    cutmix_alpha=0.0,
)

# With Mixup use SoftTargetCrossEntropy; otherwise fall back to the plain CrossEntropyLoss.
if mixup_fn is None:
    criterion = nn.CrossEntropyLoss()
else:
    criterion = SoftTargetCrossEntropy()

In [None]:
from torch.utils.data import Subset, ConcatDataset
import random

# Per-class oversampling multipliers for the 17 classes; 1.0 keeps the class size.
oversampling_factors = [
    {1: 2.0, 13: 1.35, 14: 2.0}.get(class_idx, 1.0)  # (100/50), (100/74), (100/50)
    for class_idx in range(17)
]

def oversample_subset_per_class(dataset, oversampling_factors):
    """Build a class-wise oversampled dataset.

    Every sample of ``dataset`` is read once to find its label. Each class is
    then resized to ``int(n * factor)`` rounded down to an even number. When
    that target is at least the class size, all original samples are kept and
    only the extra ones are drawn with replacement, so no original sample is
    lost. When the target is smaller, samples are drawn without replacement.

    Args:
        dataset: Indexable dataset whose items are ``(sample, label)`` pairs.
        oversampling_factors: Sequence indexed by label that gives the
            multiplier for each class.

    Returns:
        A ``ConcatDataset`` made of one ``Subset`` per class.
    """
    # Group the sample indices by label. Items whose label is None (the
    # unreadable-image marker used by ImageDataset) carry no class and are skipped.
    class_to_indices = {}
    for i in range(len(dataset)):
        _, label = dataset[i]
        if label is None:
            continue
        class_to_indices.setdefault(label, []).append(i)

    oversampled_datasets = []
    for label, indices in class_to_indices.items():
        # Target size, rounded down to an even count
        # (presumably so batches stay even for Mixup - TODO confirm).
        target_count = int(len(indices) * oversampling_factors[label]) // 2 * 2
        if target_count >= len(indices):
            extra = random.choices(indices, k=target_count - len(indices))
            chosen = indices + extra
        else:
            chosen = random.sample(indices, target_count)
        oversampled_datasets.append(Subset(dataset, chosen))

    oversampled_dataset = ConcatDataset(oversampled_datasets)
    print(f"Oversampled | {len(dataset)} -> {len(oversampled_dataset)}")
    return oversampled_dataset


## 3. Training Pipeline

In [None]:

def training(model, dataloader, device, criterion, optimizer, epoch, num_epochs, mixup_fn=None):
    """Train ``model`` for one epoch.

    Args:
        model: Module to train; it is switched to training mode.
        dataloader: Iterable that yields ``(images, labels)`` batches.
        device: Device the batches are moved to.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per processed batch.
        epoch: Zero-based epoch index (shown as ``epoch + 1`` in the progress bar).
        num_epochs: Total number of epochs (only used for the progress bar).
        mixup_fn: Optional callable applied as
            ``images, labels = mixup_fn(images, labels)`` before the forward pass.

    Returns:
        ``(model, metrics)`` where ``metrics`` has the keys ``'train_loss'``,
        ``'train_acc'`` and ``'train_f1'``. Accuracy and F1 are ``None`` when
        ``mixup_fn`` is given, because the labels are then no longer class indices.
    """
    model.train()
    train_loss = 0.0
    batch_count = 0  # batches that actually contributed to the loss
    preds_list = []
    targets_list = []

    tbar = tqdm(dataloader)
    for images, labels in tbar:
        # Skip batches flagged as unreadable.
        if images is None or labels is None:
            continue

        # Cast and move in one step; works on CPU as well as on CUDA.
        images = images.to(device, dtype=torch.float32)
        labels = labels.to(device)

        if mixup_fn is not None:
            # timm's Mixup (the mixup_fn built in this notebook) asserts an even
            # batch size, so drop the trailing sample of an odd batch.
            if images.size(0) % 2 == 1:
                images, labels = images[:-1], labels[:-1]
            if images.size(0) == 0:
                continue
            images, labels = mixup_fn(images, labels)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        batch_count += 1

        if mixup_fn is None:
            preds_list.extend(outputs.argmax(dim=1).detach().cpu().numpy())
            targets_list.extend(labels.detach().cpu().numpy())

        tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], Train Loss : {loss.item():.4f}")

    # Average over the processed batches only, so skipped ones do not dilute the loss.
    train_loss = train_loss / batch_count if batch_count > 0 else 0.0
    train_acc = None
    train_f1 = None

    if mixup_fn is None:
        train_acc = accuracy_score(targets_list, preds_list) if len(targets_list) > 0 else 0
        train_f1 = f1_score(targets_list, preds_list, average='macro') if len(targets_list) > 0 else 0

    metrics = {
        'train_loss': train_loss,
        'train_acc': train_acc,
        'train_f1': train_f1
    }

    return model, metrics



def evaluation(model, dataloader, dataset, device, criterion, epoch, num_epochs):
    """Evaluate ``model`` on a dataloader without computing gradients.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        dataloader: Iterable that yields ``(images, labels)`` batches.
        dataset: Not used by this function; kept so existing callers keep working.
        device: Device the batches are moved to.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        epoch: Zero-based epoch index (shown as ``epoch + 1`` in the progress bar).
        num_epochs: Total number of epochs (only used for the progress bar).

    Returns:
        ``(model, metrics)`` where ``metrics`` has the keys ``'valid_loss'``,
        ``'valid_acc'`` and ``'valid_f1'``. All three are ``None`` when no batch
        was processed.
    """
    model.eval()
    valid_loss = 0.0
    preds_list = []
    targets_list = []
    batch_count = 0

    with torch.no_grad():
        tbar = tqdm(dataloader)
        for images, labels in tbar:
            # Skip batches flagged as unreadable.
            if images is None or labels is None:
                continue

            # Cast and move in one step; works on CPU as well as on CUDA.
            images = images.to(device, dtype=torch.float32)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            valid_loss += loss.item()
            batch_count += 1  # count only the batches that were evaluated

            preds_list.extend(outputs.argmax(dim=1).cpu().numpy())
            targets_list.extend(labels.cpu().numpy())

            tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}] Valid Loss : {valid_loss/batch_count:.4f}")

    if batch_count > 0:
        valid_loss /= batch_count  # mean loss over the evaluated batches
        # Ground truth first, predictions second: the argument order sklearn
        # documents and the order used in training().
        valid_acc = accuracy_score(targets_list, preds_list)
        valid_f1 = f1_score(targets_list, preds_list, average='macro')
    else:
        # Every batch was skipped, so there is nothing to report.
        valid_loss = None
        valid_acc = None
        valid_f1 = None

    metrics = {
        'valid_loss': valid_loss,
        'valid_acc': valid_acc,
        'valid_f1': valid_f1
    }

    return model, metrics

def training_loop(model, train_dataloader, valid_dataloader, train_dataset, valid_dataset, criterion, optimizer, device, num_epochs, model_path, model_name, patience, run,scheduler):
    """Train with early stopping on the validation loss and checkpoint the best model.

    Each epoch runs ``training`` and ``evaluation`` and steps ``scheduler`` once.
    Whenever the validation loss improves, the model's ``state_dict`` is saved
    to ``{model_path}/model_{model_name}.pt``.

    Note:
        ``mixup_fn`` is read from module scope, not from a parameter. The same
        ``criterion`` is used for training (where Mixup may replace the labels)
        and for validation (original labels), so it must accept both.

    Args:
        model: Module to train.
        train_dataloader: Batches for ``training``.
        valid_dataloader: Batches for ``evaluation``.
        train_dataset: Not used by this function; kept for interface compatibility.
        valid_dataset: Forwarded to ``evaluation``.
        criterion: Loss callable.
        optimizer: Optimizer forwarded to ``training``.
        device: Device the batches are moved to.
        num_epochs: Maximum number of epochs.
        model_path: Directory for the checkpoint; created if it does not exist.
        model_name: Name embedded in the checkpoint file name.
        patience: Number of consecutive epochs without improvement that triggers
            early stopping.
        run: Logger exposing ``log(dict, step=...)`` and a ``summary`` mapping
            (the notebook uses a wandb run), or ``None`` to disable logging.
        scheduler: Learning-rate scheduler stepped once per epoch.

    Returns:
        ``(model, valid_max_accuracy, valid_max_f1)``: the model in its final
        state (not reloaded from the best checkpoint) and the highest validation
        accuracy / macro F1 seen; both stay ``-1`` if no epoch produced
        validation metrics.
    """
    best_valid_loss = float('inf')
    valid_max_accuracy = -1
    valid_max_f1 = -1
    early_stop_counter = 0

    # Fail early (or create the directory) instead of losing a trained epoch at save time.
    os.makedirs(model_path, exist_ok=True)

    for epoch in range(num_epochs):
        model, train_metrics = training(model, train_dataloader, device, criterion, optimizer, epoch, num_epochs, mixup_fn)
        model, valid_metrics = evaluation(model, valid_dataloader, valid_dataset, device, criterion, epoch, num_epochs)
        scheduler.step()

        monitoring_value = {
            'train_loss': train_metrics['train_loss'],
            'valid_loss': valid_metrics['valid_loss']
        }
        if run is not None:
            run.log(monitoring_value, step=epoch)

        # Track the best validation accuracy / F1 seen so far.
        if valid_metrics['valid_acc'] is not None:
            valid_max_accuracy = max(valid_max_accuracy, valid_metrics['valid_acc'])
        if valid_metrics['valid_f1'] is not None:
            valid_max_f1 = max(valid_max_f1, valid_metrics['valid_f1'])

        # Checkpointing and early stopping follow the validation loss.
        # A missing loss (no batch evaluated) counts as "no improvement".
        valid_loss = valid_metrics['valid_loss']
        if valid_loss is not None and valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            early_stop_counter = 0  # reset the patience counter
            torch.save(model.state_dict(), f"{model_path}/model_{model_name}.pt")
            if run is not None:
                run.summary['best_train_loss'] = train_metrics['train_loss']
                run.summary['best_valid_loss'] = valid_loss
        else:
            early_stop_counter += 1

        if early_stop_counter >= patience:
            print('Early Stopping!')
            break

    return model, valid_max_accuracy, valid_max_f1


## 4. Data Load

In [None]:
# Load the filtered training CSV, preview its first rows and show its row count.
img_csv_path = '/data/ephemeral/home/filtered_final.csv'
df_img = pd.read_csv(img_csv_path)
print(df_img.head())
len(df_img)




In [None]:
from albumentations.core.transforms_interface import ImageOnlyTransform


class NonLocalMeansDenoising(ImageOnlyTransform):
    """Albumentations image-only transform that applies OpenCV non-local-means denoising.

    Args:
        h: Filter strength, passed to OpenCV for both the ``h`` and ``hColor``
            arguments.
        templateWindowSize: Passed through to ``cv2.fastNlMeansDenoisingColored``.
        searchWindowSize: Passed through to ``cv2.fastNlMeansDenoisingColored``.
        always_apply: Forwarded to the albumentations base class.
        p: Probability of applying the transform, forwarded to the base class.
    """

    def __init__(self, h=10, templateWindowSize=7, searchWindowSize=21, always_apply=False, p=0.5):
        # Pass by keyword: the positional order of (always_apply, p) in the base
        # constructor is not the same in every albumentations release, and a
        # positional call can silently hand always_apply=False over as p.
        super().__init__(always_apply=always_apply, p=p)
        self.h = h
        self.templateWindowSize = templateWindowSize
        self.searchWindowSize = searchWindowSize

    def apply(self, image, **params):
        """Denoise an RGB image and return it in RGB order."""
        # OpenCV works in BGR channel order, so convert from RGB first.
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        # Apply non-local-means denoising.
        denoised_image = cv2.fastNlMeansDenoisingColored(image, None, self.h, self.h, self.templateWindowSize, self.searchWindowSize)
        # Convert the result back to RGB.
        denoised_image = cv2.cvtColor(denoised_image, cv2.COLOR_BGR2RGB)
        return denoised_image

    def get_transform_init_args_names(self):
        """Name the constructor arguments albumentations needs for repr/serialization."""
        return ("h", "templateWindowSize", "searchWindowSize")


In [None]:
img_path = '/data/ephemeral/home/lmj2'
test_img_path = '/data/ephemeral/home/data/test/'

# Image transforms: resize to 380x380, then convert to a tensor.
totensor_transform = A.Compose([A.Resize(380, 380), ToTensorV2()])
test_transform = A.Compose([
    A.Resize(380, 380),
    # NonLocalMeansDenoising(h=10, templateWindowSize=7, searchWindowSize=21, p=1.0),  # apply Non-Local Means; p=1.0 means always applied
    # A.GaussianBlur(blur_limit=(3, 7), p=0.5),  # apply Gaussian blur; p is the probability
    ToTensorV2(),
])

# Build the datasets.
train_dataset = ImageDataset(
    img_csv_path, img_path,
    album_transform=totensor_transform, augraphy_transform=None,
)
test_dataset = ImageDataset(
    submission_path, test_img_path,
    album_transform=test_transform, augraphy_transform=None,
)

# Report the dataset sizes.
print(f"Original train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Split into training and validation parts with random_split (80% / remainder).
train_size = int(0.8 * len(train_dataset))
valid_size = len(train_dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(
    train_dataset, [train_size, valid_size]
)

# def custom_collate_fn(batch):
#     # Drop the None items from the batch.
#     batch = [item for item in batch if item[0] is not None and item[1] is not None]
#     # If the batch size is odd, remove the last item.
#     if len(batch) % 2 != 0:
#         batch = batch[:-1]
#     # If the batch ends up empty, return a valid empty batch the DataLoader can handle.
#     if len(batch) == 0:
#         return torch.tensor([]), torch.tensor([])
#     return default_collate(batch)


# Build the dataloaders (only the training loader shuffles).
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # , collate_fn=custom_collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)  # , collate_fn=custom_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Report the dataset sizes and the number of batches per loader.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Valid dataset size: {len(valid_dataset)}")
print(f"Train dataloader size: {len(train_dataloader)}")
print(f"Valid dataloader size: {len(valid_dataloader)}")
print(f"Test dataloader size: {len(test_dataloader)}")


In [None]:
# # 오버샘플링용

# img_path = 'data/aug_dataset/aug_2'
# test_img_path = '/data/ephemeral/home/data/test/'
# totensor_transform = A.Compose([A.Resize(380, 380), ToTensorV2()])
# test_transform = A.Compose([
#     A.Resize(380, 380),
#     ToTensorV2()
# ])

# # 데이터셋 로드
# train_dataset = ImageDataset(img_csv_path, img_path, album_transform=totensor_transform, augraphy_transform=None)
# test_dataset = ImageDataset(submission_path, test_img_path, album_transform=test_transform, augraphy_transform=None)

# # 오버샘플링 적용 전 데이터셋 크기 출력
# print(f"Original train dataset size: {len(train_dataset)}")
# print(f"Test dataset size: {len(test_dataset)}")

# # 훈련 데이터와 검증 데이터로 분리 (random_split 사용)
# train_size = int(0.8 * len(train_dataset))
# valid_size = len(train_dataset) - train_size
# train_dataset, valid_dataset = torch.utils.data.random_split(train_dataset, [train_size, valid_size])

# # 오버샘플링 적용 (가정: oversample_subset_per_class 함수가 정의되어 있음)
# oversampled_train_dataset = oversample_subset_per_class(train_dataset, oversampling_factors)

# # 데이터로더 생성
# train_dataloader = DataLoader(oversampled_train_dataset, batch_size=32, shuffle=True)
# valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
# test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# # 오버샘플링 적용 후 데이터셋 크기 및 데이터로더 준비 상태 출력
# print(f"Oversampled train dataset size: {len(oversampled_train_dataset)}")
# print(f"Valid dataset size: {len(valid_dataset)}")
# print(f"Train dataloader size: {len(train_dataloader)}")
# print(f"Valid dataloader size: {len(valid_dataloader)}")
# print(f"Test dataloader size: {len(test_dataloader)}")




# # Original train dataset size: 23550
# # Test dataset size: 3140
# # Oversampled | 18840 -> 20300
# # Oversampled train dataset size: 20300
# # Valid dataset size: 4710
# # Train dataloader size: 635
# # Valid dataloader size: 148
# # Test dataloader size: 99

## 5. Train Model

In [None]:
# Pretrained EfficientNet-B4 backbone.
model = timm.create_model('efficientnet_b4', pretrained=True)
in_features = model.classifier.in_features

# Replacement head: three Linear -> BatchNorm1d -> SiLU -> Dropout(0.2) stages
# followed by a final 17-way Linear layer. The layers are created in the same
# order as a hand-written list, so the Sequential indices (and therefore the
# state_dict keys) are unchanged.
classifier = nn.Sequential(
    *[
        layer
        for fan_in, fan_out in ((in_features, 1024), (1024, 512), (512, 256))
        for layer in (
            nn.Linear(fan_in, fan_out),
            nn.BatchNorm1d(fan_out),
            nn.SiLU(),
            nn.Dropout(p=0.2),
        )
    ],
    nn.Linear(256, 17),
)

model.classifier = classifier

### Hyper Parameter 정의

In [None]:
class Cfg:
    # Training configuration, stored as class attributes (evaluated once, when
    # the class body runs).
    num_epochs = 100
    batch_size = 32
    model_path = '/data/ephemeral/home/models'

    # Use the GPU when available, otherwise the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The module-level `model`, moved to the chosen device.
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # StepLR with step_size=5, gamma=0.5.
    scheduler = StepLR(optimizer, step_size=5, gamma=0.5)

In [None]:
# #run = wandb.init(project='AIStage-CV', name='effb4_add_fc')

# device = Cfg.device
# model = Cfg.model
# criterion = Cfg.criterion
# optimizer = Cfg.optimizer 
# num_epochs = Cfg.num_epochs
# model_name = 'effb4-add_fc'
# model_path = Cfg.model_path

# #run.watch(model, criterion, log='all', log_graph=True)

# #model, valid_max_accuracy, valid_max_f1 = training_loop(model, train_dataloader, valid_dataloader, train_dataset, valid_dataset, criterion, optimizer, device, num_epochs, model_path, model_name, 10, run, Cfg.scheduler)

# #run.finish()



In [None]:
# from PIL import Image
# import os

# def check_corrupted_images(dataset_directory_path):
#     image_extensions = ['.jpg', '.jpeg', '.png', '.bmp']  # 이미지 확장자 리스트
#     corrupted_count = 0
#     total_count = 0

#     # 데이터셋 디렉토리 내의 모든 파일 순회
#     for root, dirs, files in os.walk(dataset_directory_path):
#         for file in files:
#             if any(file.lower().endswith(ext) for ext in image_extensions):
#                 total_count += 1
#                 file_path = os.path.join(root, file)
#                 try:
#                     with Image.open(file_path) as img:
#                         img.verify()  # 이미지 파일 검증
#                 except (IOError, SyntaxError) as e:
#                     print(f"손상된 파일 발견: {file_path}")
#                     corrupted_count += 1

#     return total_count, corrupted_count

# # 데이터셋 디렉토리 경로 설정
# dataset_directory_path = '/data/ephemeral/home/lmj'
# total_images, corrupted_images = check_corrupted_images(dataset_directory_path)

# print(f"전체 이미지 수: {total_images}, 손상된 이미지 수: {corrupted_images}")


In [None]:
# run = wandb.init(project='AIStage-CV', name='effb4_add_fc')

# Copy the training objects from Cfg into module-level names.
device, model, criterion = Cfg.device, Cfg.model, Cfg.criterion
optimizer, num_epochs, model_path = Cfg.optimizer, Cfg.num_epochs, Cfg.model_path
model_name = 'effb4-add_fc'

# run.watch(model, criterion, log='all', log_graph=True)

# model, valid_max_accuracy, valid_max_f1 = training_loop(model, train_dataloader, valid_dataloader, train_dataset, valid_dataset, criterion, optimizer, device, num_epochs, model_path, model_name, 10, run, Cfg.scheduler)

# run.finish()

In [None]:
import os
import pandas as pd

# CSV that lists the images, and the folder that should contain them.
csv_path = '/data/ephemeral/home/real_final.csv'
img_folder_path = '/data/ephemeral/home/data/aug_dataset/aug_2'

# Load the CSV.
df = pd.read_csv(csv_path)

# Column that holds the image file names.
image_column_name = 'ID'

# Collect every listed file that does not exist in the image folder.
missing_files = [
    file_name
    for file_name in df[image_column_name]
    if not os.path.exists(os.path.join(img_folder_path, file_name))
]

# Print the missing file names one per line, then their count.
if missing_files:
    print(*missing_files, sep="\n")
print(f"Total missing files: {len(missing_files)}")


In [None]:
# Rebuild the architecture used for training. pretrained=False: every weight is
# overwritten by the (strict) load_state_dict below, so downloading the
# ImageNet weights would be wasted work and would require network access.
effb4 = timm.create_model('efficientnet_b4', pretrained=False)
in_features = effb4.classifier.in_features
classifier = nn.Sequential(
    nn.Linear(in_features, 1024),
    nn.BatchNorm1d(1024),
    nn.SiLU(),  # changed from ReLU to Swish
    nn.Dropout(p=0.2),
    nn.Linear(1024, 512),
    nn.BatchNorm1d(512),
    nn.SiLU(),
    nn.Dropout(p=0.2),
    nn.Linear(512, 256),
    nn.BatchNorm1d(256),
    nn.SiLU(),
    nn.Dropout(p=0.2),
    nn.Linear(256, 17),
)

effb4.classifier = classifier
# map_location lets a checkpoint saved on a GPU load on whatever `device` is
# (including a CPU-only machine).
effb4.load_state_dict(
    torch.load('/data/ephemeral/home/models/model_effb4-add_fc.pt', map_location=device)
)
effb4 = effb4.to(device)
effb4.eval()

In [None]:
# original

# Predicted class index for every test image, in dataloader order.
preds_list = []

with torch.no_grad():
    for images, _labels in tqdm(test_dataloader):
        # Cast and move in one step; works on CPU as well as on CUDA
        # (torch.cuda.FloatTensor fails on a machine without a GPU).
        images = images.to(device, dtype=torch.float32)
        preds = effb4(images)
        preds_list.extend(preds.argmax(dim=1).cpu().numpy())

In [None]:
# import torch
# from torchvision import transforms
# from tqdm import tqdm
# from torch.nn import functional as F

# # 모델 평가 모드 설정
# model.eval()

# # TTA 변환 목록
# transformation_list = [
#     transforms.Compose([transforms.RandomHorizontalFlip(p=1)]),
#     transforms.Compose([transforms.RandomVerticalFlip(p=1)]),
#     transforms.Compose([transforms.RandomRotation(degrees=90)]),
#     transforms.Compose([transforms.RandomRotation(degrees=179)])
# ]

# # 결과 저장을 위한 딕셔너리
# preds_dict = {}
# probs_dict = {}

# # TTA가 적용된 이미지 수를 저장할 변수
# tta_applied_count = 0

# # test_dataloader의 각 배치에 대해 반복
# for batch_idx, (images, _) in enumerate(tqdm(test_dataloader)):
#     images = images.float().to(device)  # 이미지를 device로 이동 및 데이터 타입 변경
    
#     with torch.no_grad():
#         outputs = effb4(images)
#         probs = torch.softmax(outputs, dim=1)
#         max_probs, preds = torch.max(probs, dim=1)
        
#         # 각 이미지에 대하여
#         for i in range(images.size(0)):
#             img_index = batch_idx * test_dataloader.batch_size + i  # 이미지의 전역 인덱스 계산
#             if max_probs[i] >= 0.50:
#                 # 확률이 0.5 이상인 경우, 직접 결과 딕셔너리에 추가
#                 preds_dict[img_index] = preds[i].item()
#                 probs_dict[img_index] = max_probs[i].item()
#             else:
#                 # 확률이 0.5 미만인 경우, TTA 적용을 위해 별도 처리
#                 tta_probs = []
#                 for transformation in transformation_list:
#                     # 변환 적용
#                     transformed_image = transformation(images[i].unsqueeze(0)).to(device)
                    
#                     with torch.no_grad():
#                         tta_output = model(transformed_image)
#                         tta_prob = torch.softmax(tta_output, dim=1)
#                         tta_probs.append(tta_prob)
                
#                 # TTA 결과의 평균 확률 계산
#                 avg_tta_probs = torch.mean(torch.stack(tta_probs), dim=0)
#                 avg_max_prob, avg_pred = torch.max(avg_tta_probs, dim=1)
                
#                 # 평균 확률을 기반으로 최종 예측 결정
#                 preds_dict[img_index] = avg_pred.item()
#                 probs_dict[img_index] = avg_max_prob.item()  # TTA로 계산된 평균 확률 사용
#                 tta_applied_count += 1

# # 결과를 인덱스에 따라 정렬하여 최종 리스트 생성
# sorted_indices = sorted(preds_dict.keys())
# final_preds = [preds_dict[i] for i in sorted_indices]
# final_probs = [probs_dict[i] for i in sorted_indices]

# print(f"Number of predictions with TTA applied: {tta_applied_count}")
# print(f"Total predictions: {len(final_preds)}")


In [None]:
# Submission table: the rows of the test CSV with 'target' replaced by the predictions.
pred_df = pd.DataFrame(test_dataset.df, columns=['ID', 'target']).assign(target=preds_list)

In [None]:
# Reload the submission template and check that the prediction IDs match it row for row.
sample_submission_df = pd.read_csv(submission_path)
assert (sample_submission_df['ID'] == pred_df['ID']).all()

In [None]:
# Write the predictions to the submission CSV (without the index column).
pred_df.to_csv('/data/ephemeral/home/outputs/effb4-add_fc.csv', index=False)

In [None]:
# Display the prediction table.
pred_df

# ID	target
# 0	0008fdb22ddce0ce.jpg	[4]
# 1	00091bffdffd83de.jpg	[3]
# 2	00396fbc1f6cc21d.jpg	[3]
# 3	00471f8038d9c4b6.jpg	[3]
# 4	00901f504008d884.jpg	[4]

## 0.9434

# 	ID	target
# 0	0008fdb22ddce0ce.jpg	2
# 1	00091bffdffd83de.jpg	12
# 2	00396fbc1f6cc21d.jpg	5
# 3	00471f8038d9c4b6.jpg	12
# 4	00901f504008d884.jpg	2
# ...	...	...
# 3135	ffb4b6f619fb60ea.jpg	6
# 3136	ffb54299b1ad4159.jpg	10
# 3137	ffc2c91dff8cf2c0.jpg	8
# 3138	ffc4e330a5353a2a.jpg	0
# 3139	ffc71fed753d90c1.jpg	12

## 이상

# ID	target
# 0	0008fdb22ddce0ce.jpg	4
# 1	00091bffdffd83de.jpg	3
# 2	00396fbc1f6cc21d.jpg	3
# 3	00471f8038d9c4b6.jpg	3
# 4	00901f504008d884.jpg	4
# ...	...	...
# 3135	ffb4b6f619fb60ea.jpg	3
# 3136	ffb54299b1ad4159.jpg	3
# 3137	ffc2c91dff8cf2c0.jpg	3
# 3138	ffc4e330a5353a2a.jpg	4
# 3139	ffc71fed753d90c1.jpg	3

In [None]:
# Count how many test images were assigned to each predicted class.
target_distribution = pred_df['target'].value_counts()

print(target_distribution)

## 0.9434

# target
# 7     225
# 6     204
# 10    203
# 8     200
# 9     200
# 2     200
# 0     200
# 15    200
# 5     200
# 16    200
# 12    197
# 11    195
# 4     184
# 3     180
# 13    156
# 14    107
# 1      89
# Name: count, dtype: int64

In [None]:
import pandas as pd

# Read the three CSV files.
df1 = pd.read_csv("/data/ephemeral/home/1.csv")
df2 = pd.read_csv("/data/ephemeral/home/2.csv")
df3 = pd.read_csv("/data/ephemeral/home/data/sample_submission.csv")

# The frames may differ in length, so only the common leading rows are compared.
min_length = min(len(df1), len(df2), len(df3))

# Count the row positions at which all three files carry the same ID
# (zip stops at the shortest column, i.e. after min_length rows).
matching_count = sum(
    1
    for id1, id2, id3 in zip(df1["ID"], df2["ID"], df3["ID"])
    if id1 == id2 == id3
)

print(f"인덱스에 따른 ID값이 모두 같은 항목의 개수: {matching_count}")
