# Setting

In [1]:
import os
import glob
import random
import time
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import copy, math

import torch.nn.functional as F
from torch.nn.modules.utils import _pair
from torch.nn.parameter import Parameter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from collections import Counter
from sklearn.preprocessing import LabelEncoder

from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split

import torchvision.transforms as T

from models.shared_perceiver import crop, patchify, get_patch_coords, ImageDataset, PerceiverBlock, Perceiver

In [2]:
def seed_everything(seed):
    torch.manual_seed(seed) #torch를 거치는 모든 난수들의 생성순서를 고정한다
    torch.cuda.manual_seed(seed) #cuda를 사용하는 메소드들의 난수시드는 따로 고정해줘야한다 
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True #딥러닝에 특화된 CuDNN의 난수시드도 고정 
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed) #numpy를 사용할 경우 고정
    random.seed(seed) #파이썬 자체 모듈 random 모듈의 시드 고정
seed_everything(42)

In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Functions

In [4]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, epochs, device, scheduler=None):
    best_model = None 
    best_val_score = 0
    model.train()
    train_losses = []
    val_accuracies = []
    start = time.perf_counter()
    
    for epoch in range(epochs):
        total_loss = 0.0
        for images, labels in train_loader:
            # GPU로 옮기기
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        train_losses.append(avg_loss)

        accuracy = evaluate_model(model, valid_loader, device=device, log_results=False)
        val_accuracies.append(accuracy)

        # Scheduler step 추가
        if scheduler:
            scheduler.step()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Val Accuracy: {accuracy:.2f}%")
        if accuracy > best_val_score:
            best_val_score = accuracy
            best_model_state = model.state_dict()  # 모델 상태 저장
            best_model = copy.deepcopy(model) 
            print(f"New best model found at epoch {epoch+1} with accuracy: {best_val_score:.2f}%")

    end = time.perf_counter()
    hour = (end-start) // 3600
    min = ((end-start) % 3600) // 60
    sec = int((end-start) % 60)
    print(f"Total Train time: {hour}h {min}m {sec}s")

    return train_losses, val_accuracies, best_model

def evaluate_model(model, data_loader, device, log_results=True):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        start = time.perf_counter()
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        end = time.perf_counter()
        hour = (end-start) // 3600
        min = ((end-start) % 3600) // 60
        sec = (end-start) % 60
        print(f"Elapsed time on CPU: {hour}h {min}m {sec}s")

    accuracy = 100.0 * correct / total
    if log_results:
        print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy


# Load Model 

In [5]:
data_dir = '/home/youlee/n24news/n24news/image'
crop_size = 0
patch_size = 16
batch_size = 32
epochs = 30
group_class = 3

In [6]:
#random.shuffle(target_classes)
target_classes = [ # 임의로 순서지정
    "Opinion", "Art & Design", "Television",
    "Music", "Travel", "Real Estate",
    "Books", "Theater", "Health",
    "Sports", "Science", "Food",
    "Fashion & Style", "Movies", "Technology",
    "Dance", "Media", "Style"
]
target_classes

['Opinion',
 'Art & Design',
 'Television',
 'Music',
 'Travel',
 'Real Estate',
 'Books',
 'Theater',
 'Health',
 'Sports',
 'Science',
 'Food',
 'Fashion & Style',
 'Movies',
 'Technology',
 'Dance',
 'Media',
 'Style']

In [7]:
output_path = '/home/Minju/Perceiver/shared_layer_model'
loader_path = '/home/Minju/Perceiver/shared_layer_loader'
group_class

3

# Train Loop 

In [8]:
for i in range(0, len(target_classes), group_class):  
    print(f"실험 {i//group_class + 1} 시작")
    selected_classes = target_classes[i:i+group_class]
    print(f"Selected Feature: {selected_classes}")

    filtered_dataset = ImageDataset(root_dir=data_dir, 
                                    transform=transform, 
                                    crop_size=crop_size, 
                                    patch_size=patch_size,
                                    selected_classes=selected_classes)
    all_labels = [label_idx for (_, label_idx) in filtered_dataset.data]

    # 1) 유니크 라벨과 개수
    unique_label_ids = np.unique(all_labels)
    print("Unique numeric labels:", unique_label_ids)
    print("Number of unique numeric labels:", len(unique_label_ids))

    # 2) 라벨별 개수 (분포)
    label_counts = Counter(all_labels)
    print("Label distribution (index: count):", label_counts)
    
    train_ratio = 0.8
    train_size = int(len(filtered_dataset) * train_ratio)
    valid_size = len(filtered_dataset) - train_size

    train_dataset, valid_dataset = random_split(filtered_dataset, [train_size, valid_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    print(f"train: {train_size}, valid: {valid_size}")
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    NUM_CLASSES = len(filtered_dataset.label_encoder.classes_)
    model = Perceiver(input_dim=(patch_size**2) * 3 + 2,
                        latent_dim=128, 
                        latent_size=64, 
                        num_classes=NUM_CLASSES, 
                        num_blocks=4, 
                        self_attn_layers_per_block=10).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)  # Learning rate decay 추가

    start = time.perf_counter()
    train_losses, val_accuracies, best_model = train_model(
        model, train_loader, valid_loader,
        criterion, optimizer, epochs,
        device=device,
        scheduler=scheduler  
    )
    
    final_acc = evaluate_model(best_model, valid_loader, device=device, log_results=True)
    end = time.perf_counter()
    hour = (end-start) // 3600
    min = ((end-start) % 3600) // 60
    sec = int((end-start) % 60)
    print(f"Train time: {hour}h {min}m {sec}s")
    print(f"Final Validation Accuracy: {final_acc:.2f}%")
    print("----------------------------------------------------------")
    
    torch.save(best_model, f'{output_path}/image_model_{i//group_class+1}.pkl')

    val_loader_save_path = f"{loader_path}/image_val_loader_{i//group_class+1}.pkl"
    with open(val_loader_save_path, 'wb') as f:
        pickle.dump(valid_dataset, f)

실험 1 시작
Selected Feature: ['Opinion', 'Art & Design', 'Television']
Unique numeric labels: [0 1 2]
Number of unique numeric labels: 3
Label distribution (index: count): Counter({np.int64(1): 2437, np.int64(0): 2431, np.int64(2): 2419})
train: 5829, valid: 1458
Elapsed time on CPU: 0.0h 0.0m 12.30782374786213s
Epoch 1/30, Loss: 1.0701, Val Accuracy: 41.77%
New best model found at epoch 1 with accuracy: 41.77%
Elapsed time on CPU: 0.0h 0.0m 8.824211569968611s
Epoch 2/30, Loss: 1.0562, Val Accuracy: 42.87%
New best model found at epoch 2 with accuracy: 42.87%
Elapsed time on CPU: 0.0h 0.0m 8.83511544414796s
Epoch 3/30, Loss: 1.0509, Val Accuracy: 42.94%
New best model found at epoch 3 with accuracy: 42.94%
Elapsed time on CPU: 0.0h 0.0m 9.063895086059347s
Epoch 4/30, Loss: 1.0536, Val Accuracy: 42.73%
Elapsed time on CPU: 0.0h 0.0m 8.718714248854667s
Epoch 5/30, Loss: 1.0516, Val Accuracy: 42.11%
Elapsed time on CPU: 0.0h 0.0m 8.727431236067787s
Epoch 6/30, Loss: 1.0517, Val Accuracy: 43.

In [9]:
# def clones(module, N):
#     return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [10]:
# def attention(query, key, value, mask=None, dropout=None):
#   d_k = query.size(-1)
#   scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
#   if mask is not None:
#     scores =scores.masked_fill(mask==0, -1e9)
#   p_attn = scores.softmax(dim=-1)
#   if dropout is not None:
#     p_attn = dropout(p_attn)
#   return torch.matmul(p_attn, value), p_attn

In [11]:
# class SharableMultiheadAttention(nn.Module):
#     def __init__(self, embed_dim, num_heads, dropout=0.1):
#         super(SharableMultiheadAttention, self).__init__()
#         self.embed_dim = embed_dim
#         self.num_heads = num_heads
#         self.d_k = embed_dim // num_heads
#         self.linears = clones(nn.Linear(embed_dim))
#         self.attn = None
#         self.dropout = nn.Dropout(dropout)

#         self.qkv_proj = nn.Linear(embed_dim, embed_dim * 3)
#         self.out_proj = nn.Linear(embed_dim, embed_dim)
#         self.scale = self.d_k  ** -0.5

#     def forward(self, query, key, value, mask=None):
#         if mask is not None:
#             mask = mask.unsqueeze(1)
#         batch_size, seq_length, embed_dim = query.shape
#         qkv = self.qkv_proj(torch.cat[query, key, value], dim=1)
#         qkv = qkv.view(batch_size, seq_length, 3, self.num_heads, self.head_dim)
#         q, k, v = qkv.permute(2, 0, 3, 1, 4)
#         self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)

#         x = (
#             x.transpose(1,2)
#             .contiguous()
#             .view(batch_size, -1, self.num_heads * self.d_k)
#         )
#         del query
#         del key
#         del value
#         return self.linears[-1](x)

In [12]:
# DEFAULT_THRESHOLD = 5e-3
# class SharableMultiheadAttention(nn.Module):
#     def __init__(self, embed_dim, num_heads, bias=True, 
#                  dropout=0., mask_init='1s', mask_scale=1e-2,
#                  threshold_fn='binarizer', threshold=None):
#         super(SharableMultiheadAttention, self).__init__()
#         self.embed_dim = embed_dim
#         self.num_heads = num_heads
#         self.dropout = dropout
#         self.mask_init= mask_init
#         self.mask_scale = mask_scale

#         if threshold is not None:
#             threshold = DEFAULT_THRESHOLD
#         self.info = {
#             'threshold_fn': threshold_fn,
#             'threshold': threshold,
#         }

#         self.weight = Parameter(torch.Tensor()) # 이 부분 맞춰주어야함함
#         if bias:
#             self.bias = Parameter(torch.Tensor())   # 이 부분 맞춰주어야함
#         else:
#             self.register_parameter()
#         self.piggymask = None

#         if threshold_fn == 'binarizer':
#             self.threshold_fn = Binarizer.apply
#         elif threshold_fn == 'tenarizer':
#             self.threshold_fn = Tenarizer(threshold=threshold)

#         self.q_proj = SharableLinear(embed_dim, embed_dim)
#         self.k_proj = SharableLinear(embed_dim, embed_dim)
#         self.v_proj = SharableLinear(embed_dim, embed_dim)
#         self.out_proj = SharableLinear(embed_dim, embed_dim)

#     def forward(self, query, key, value, attn_mask=None):
#         Q = self.q_proj(query)
#         K = self.k_proj(key)
#         V = self.v_proj(value)

#         if self.piggymask is not None:
#             mask_thresholded = self.threshold_fn(self.piggymask, self.info['threshold'])
#             weight = mask_thresholded * self.weight
#         else:
#             weight = self.weight ########일단 여기까지 따라침.

#         attn_output, _ = F.multi_head_attention_forward(
#             Q, K, V,
#             self.embed_dim, self.num_heads,
#             None, None, None,  # Scaling 및 Bias 없음
#             attn_mask,
#             dropout_p=self.dropout,
#             out_proj_weight=self.out_proj.weight * self.out_proj.mask,
#             out_proj_bias=self.out_proj.bias,
#             training=self.training,
#             need_weights=False
#         )
#         return attn_output