<a href="https://colab.research.google.com/github/DoItSon/playdata/blob/main/%EB%94%A5%EB%9F%AC%EB%8B%9D/13_%EB%8B%A4%EC%A4%91%EB%B6%84%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 경로를 변경하시오

In [None]:
# Folder on the mounted Google Drive that holds the dataset archive (meat.zip);
# change it to match your own Drive layout.
DATA_PATH = "/content/drive/MyDrive/data/"
# Seed shared by reset_seeds() and by the KFold splitter.
SEED = 42

In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random # 시드 고정을 위해
import os # 시드 고정을 위해
from PIL import Image

def reset_seeds(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ`` and switches cuDNN to
        deterministic mode with autotuning disabled.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this value is
    # only picked up by child processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # CPU generator
    # Seed every visible GPU, not just the current one; PyTorch ignores the
    # call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # only deterministic cuDNN kernels
    # Autotuning may select a different (possibly non-deterministic) algorithm
    # from run to run, so it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression so the notebook displays which device was selected.
device

'cuda'

# 구글 드라이브 마운트

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so that DATA_PATH (which lives under it) is readable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 데이터 복사 및 압축 풀기

In [None]:
# Skip the copy/extract step when a local "train" directory already exists,
# so re-running the cell does not repeat the work.
if not os.path.isdir("train"):
    # Copy the archive from DATA_PATH into the current working directory.
    !cp "{DATA_PATH}meat.zip" "meat.zip"
    # Extract quietly (-qq). Presumably the archive unpacks into train/ and
    # test/, since later cells read from both - TODO confirm.
    !unzip -qq "meat.zip" 

# 이미지 경로 및 정답 데이터셋

In [None]:
# Load the index table of each split from its class_info.csv.
train = pd.read_csv("train/class_info.csv")
test = pd.read_csv("test/class_info.csv")
# Display both shapes as a quick sanity check.
train.shape , test.shape

((1246, 2), (1020, 2))

# 정답값
- 0 : 신선한 고기
- 1 : 반 신선한 고기
- 2 : 상태가 안 좋은 고기

In [None]:
# Preview the training index table (columns: filename, target).
train

Unnamed: 0,filename,target
0,1074.jpg,0
1,1222.jpg,1
2,2105.jpg,1
3,81.jpg,2
4,1791.jpg,2
...,...,...
1241,1670.jpg,1
1242,1244.jpg,0
1243,776.jpg,2
1244,750.jpg,0


# transforms 객체

In [None]:
from torchvision import transforms

# Target (height, width) every image is resized to.
img_size = [224, 224]


def _make_steps():
    """Return a fresh ``[Resize, ToTensor, Normalize]`` list.

    Both pipelines apply exactly the same three steps, so they are built here
    once. The mean/std values match the commonly used ImageNet statistics.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    return [
        transforms.Resize(img_size),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]


# Training pipeline.
train_lst = _make_steps()
train_transform = transforms.Compose(train_lst)

# Validation / test pipeline (separate objects, identical steps).
test_lst = _make_steps()
test_transform = transforms.Compose(test_lst)

# 데이터셋

In [None]:
class MeatDataset(torch.utils.data.Dataset):
    """Image dataset driven by a table with a ``filename`` column.

    Args:
        transform: Callable applied to each loaded PIL image.
        x: DataFrame with a ``filename`` column and, unless ``is_inference``
            is true, a ``target`` column of integer class ids.
        is_inference: When true, images are read from ``test/`` and no label
            is returned; otherwise they are read from ``train/``.

    Attributes:
        x: NumPy array of image paths.
        y: NumPy array of labels (only set when ``is_inference`` is false).
    """

    def __init__(self, transform, x, is_inference=False):
        self.is_inference = is_inference
        self.transform = transform
        folder = "test/" if self.is_inference else "train/"
        # Keep plain NumPy arrays: __getitem__ receives positions, and a pandas
        # Series would look them up as index *labels*, which fails for any
        # frame whose index is not 0..n-1.
        self.x = (folder + x["filename"]).to_numpy()
        if not self.is_inference:
            # CrossEntropyLoss expects a 1-D vector of class ids.
            self.y = x["target"].to_numpy()

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return ``{"x": transformed image}`` plus ``"y"`` when labels exist."""
        item = {}
        # Close the file handle promptly, and force three channels so that
        # grayscale / RGBA / CMYK files still fit the 3-channel Normalize step.
        with Image.open(self.x[idx]) as raw:
            image = raw.convert("RGB")
        item["x"] = self.transform(image)
        if not self.is_inference:
            # CrossEntropyLoss needs int64 targets regardless of the platform's
            # default NumPy integer width.
            item["y"] = torch.tensor(self.y[idx], dtype=torch.long)
        return item

In [None]:
# Smoke test: wrap the full training table in a dataset, draw one shuffled
# batch of two samples and display it.
dt = MeatDataset(train_transform,train)
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=True)
next(iter(dl))

{'x': tensor([[[[-0.0972, -0.0287, -0.0287,  ...,  0.7591,  1.0331,  1.4783],
           [-0.1314, -0.0801, -0.0801,  ...,  0.7419,  0.9646,  1.2385],
           [-0.2342, -0.1657, -0.1486,  ...,  0.7762,  0.9817,  1.2043],
           ...,
           [ 1.4612,  1.3413,  1.3242,  ...,  1.0673,  1.1015,  1.1529],
           [ 1.4098,  1.3413,  1.2899,  ...,  1.0844,  1.0673,  1.0844],
           [ 1.3584,  1.3413,  1.2214,  ...,  0.6906,  0.9132,  0.9817]],
 
          [[ 0.1877,  0.2577,  0.2577,  ...,  1.3782,  1.4482,  1.7633],
           [ 0.1527,  0.2052,  0.2052,  ...,  1.3256,  1.4832,  1.7108],
           [ 0.0476,  0.1176,  0.1352,  ...,  1.2031,  1.5532,  1.8508],
           ...,
           [ 1.3431,  1.2206,  1.2031,  ...,  1.1681,  1.1856,  1.1856],
           [ 1.2906,  1.2206,  1.1681,  ...,  1.1856,  1.1506,  1.1331],
           [ 1.2381,  1.2206,  1.0980,  ...,  0.7829,  1.0105,  1.0455]],
 
          [[ 0.3219,  0.3916,  0.3916,  ...,  1.5768,  1.7860,  2.1520],
        

In [None]:
from torchvision.models import resnet50 , ResNet50_Weights 

In [None]:
class Net(torch.nn.Module):
    """ResNet-50 with ImageNet weights and a freshly initialised output layer.

    Args:
        num_classes: Number of output logits. Defaults to 3, the number of
            classes used in this notebook.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        self.pre_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        # Read the head's input width from the backbone rather than hard-coding it.
        in_features = self.pre_model.fc.in_features
        self.pre_model.fc = torch.nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return raw logits of shape (batch, num_classes)."""
        return self.pre_model(x)

In [None]:
# torchinfo provides the model summary shown in the next cell.
# NOTE(review): consider `%pip install` with a pinned version for reproducibility.
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torchinfo
# Instantiate the network and show a per-layer summary for an input of
# shape (32, 3, 224, 224).
model = Net()
torchinfo.summary(model,(32,3,224,224))  

Layer (type:depth-idx)                        Output Shape              Param #
Net                                           [32, 3]                   --
├─ResNet: 1-1                                 [32, 3]                   --
│    └─Conv2d: 2-1                            [32, 64, 112, 112]        9,408
│    └─BatchNorm2d: 2-2                       [32, 64, 112, 112]        128
│    └─ReLU: 2-3                              [32, 64, 112, 112]        --
│    └─MaxPool2d: 2-4                         [32, 64, 56, 56]          --
│    └─Sequential: 2-5                        [32, 256, 56, 56]         --
│    │    └─Bottleneck: 3-1                   [32, 256, 56, 56]         75,008
│    │    └─Bottleneck: 3-2                   [32, 256, 56, 56]         70,400
│    │    └─Bottleneck: 3-3                   [32, 256, 56, 56]         70,400
│    └─Sequential: 2-6                        [32, 512, 28, 28]         --
│    │    └─Bottleneck: 3-4                   [32, 512, 28, 28]         379,392

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of dict batches with keys ``"x"`` and ``"y"``.
        model: Module to optimise; it is put into training mode.
        loss_fn: Callable ``(predictions, targets) -> scalar loss``.
        optimizer: Optimizer bound to the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        inputs = batch["x"].to(device)
        logits = model(inputs)
        targets = batch["y"].to(device)
        loss = loss_fn(logits, targets)

        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

In [None]:
@torch.no_grad() 
def test_loop(dataloader,model,loss_fn,device): 
    epoch_loss = 0
    model.eval() 

    
    pred_list = []
    softmax = torch.nn.Softmax(dim=1) # 다중분류 예측 확률을 출력하기위해 softmax 사용

    for batch in dataloader:
        
        pred = model(batch["x"].to(device))
        if batch.get("y") is not None: 
            loss = loss_fn(pred, batch["y"].to(device))
            epoch_loss += loss.item()
        
        pred = softmax(pred)
        pred = pred.to("cpu").numpy() 
        pred_list.append(pred)

    epoch_loss /= len(dataloader)

    pred = np.concatenate(pred_list) 
    return epoch_loss , pred 

In [None]:
n_splits = 5  # number of folds handed to KFold
batch_size = 32  # mini-batch size for the train and validation DataLoaders
epochs = 100  # upper bound on epochs per fold (the training cell can stop earlier)
loss_fn = torch.nn.CrossEntropyLoss() # loss object for multi-class classification

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
# Shuffled K-fold splitter; random_state=SEED keeps the fold assignment reproducible.
# NOTE(review): plain KFold does not balance classes across folds - confirm that is intended.
cv = KFold(n_splits=n_splits,shuffle=True, random_state=SEED)

In [None]:
# When True, only the first fold is trained (a single hold-out split instead of full CV).
is_holdout = True
reset_seeds(SEED)
best_score_list = []  # best validation F1 of each trained fold
for i,(tri,vai) in enumerate(cv.split(train)):

    # Fresh model and optimizer for every fold.
    model = Net().to(device)
    optimizer = torch.optim.Adam(model.parameters())

    x_train = train.iloc[tri].reset_index(drop=True) # training data frame
    x_valid = train.iloc[vai].reset_index(drop=True) # validation data frame

    train_dt = MeatDataset(train_transform,x_train)
    valid_dt = MeatDataset(test_transform,x_valid)
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)
    # Validation order must stay fixed so predictions line up with valid_dt.y.
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size,shuffle=False)

    best_score = 0
    patience = 0  # consecutive epochs without an improvement in F1

    for epoch in tqdm(range(epochs)):

        train_loss = train_loop(train_dl, model, loss_fn,optimizer,device )
        valid_loss , pred = test_loop(valid_dl, model, loss_fn,device  )

        pred = np.argmax(pred, axis=1) # take the index with the highest probability as the predicted class
        true = valid_dt.y # the ground-truth labels are stored in the dataset's y attribute
        score = f1_score(true, pred , average="micro")
        print(train_loss,valid_loss,score)
        patience += 1
        # On a strictly better score: reset the counter and checkpoint the weights.
        if best_score < score:
            patience = 0
            best_score = score
            torch.save(model.state_dict(),f"model_{i}.pth")

        # Early stopping after 5 consecutive epochs without improvement.
        if patience == 5:
            break
    print(f"Fold ({i}), BEST F1: {best_score}")
    best_score_list.append(best_score)

    if is_holdout:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

0.3267693659872748 0.9896133318543434 0.712
0.1368025408446556 0.4902668185532093 0.884
0.13005199533654377 0.22788291051983833 0.908
0.21647235736600123 0.47630587965250015 0.804
0.1251756631245371 0.1417496760841459 0.952
0.03726591321901651 0.024825825297739357 0.992
0.09148947357107318 0.05835396994370967 0.98
0.15166734221566003 0.40860216692090034 0.832
0.029930003696790664 0.08427690247481223 0.984
0.012545705614684266 0.024681103986949893 0.992
0.026098519050265168 0.09977429690479767 0.976
Fold (0), BEST F1: 0.992
