# Setting

In [1]:
import os
import glob
import random
import time
import math
import logging 
from tqdm import tqdm

import torch

import torch.nn as nn
import torch.optim as optim
import pickle
import copy, math
from torch.utils.data import DataLoader, random_split

from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder

import torch.nn.functional as F
from torch.nn.modules.utils import _pair
from torch.nn.parameter import Parameter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image

import torchvision.transforms as T

import models.shared_perceiver as sp
import models.layers as nl

In [2]:
device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def seed_everything(seed):
    torch.manual_seed(seed) #torch를 거치는 모든 난수들의 생성순서를 고정한다
    torch.cuda.manual_seed(seed) #cuda를 사용하는 메소드들의 난수시드는 따로 고정해줘야한다 
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True #딥러닝에 특화된 CuDNN의 난수시드도 고정 
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed) #numpy를 사용할 경우 고정
    random.seed(seed) #파이썬 자체 모듈 random 모듈의 시드 고정

def seed_worker(worker_id): #데이터로더 난수고정
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

seed_everything(42)
g = torch.Generator()
g.manual_seed(42)
NUM_WORKERS = 4 # 서브프로세스관리자 수. 난수생성과 관련있습니다. 일단은 4로 고정합니다.


## Params & Hyperparams

In [4]:
FINETUNE_EPOCHS = 50
NETWORK_WIDTH_MULTIPLIER = 1.0
MAX_NETWORK_WIDTH_MULTIPLIER = 2.0

LR = 1e-2
LR_MASK = 1e-4
WEIGHT_DECAY = 4e-5
BATCH_SIZE = 32
TOTAL_NUM_TASKS = 6

NUM_CLASSES = 3

EMBED_DIM = 128
LATENT_DIM = 64
LATENT_SIZE = 64
NUM_BLOCKS = 4


task_id = 1
target_id = 7

In [5]:
data_path = '/home/youlee/n24news/n24news'

file_path = '/home/Minju/Perceiver'

groups_path = data_path + '/captions_and_labels.csv'

model_path = file_path + '/shared_layer_model/'
loader_path = file_path + '/shared_layer_loader/'

save_folder = file_path + '/finetune/'
load_folder = file_path + f'/{task_id}/{target_id}'


In [6]:
groups_df = pd.read_csv(groups_path)

groups = [
    ["Opinion", "Art & Design", "Television"],
    ["Music", "Travel", "Real Estate"],
    ["Books", "Theater", "Health"],
    ["Sports", "Science", "Food"],
    ["Fashion & Style", "Movies", "Technology"],
    ["Dance", "Media", "Style"]
]

output_paths = []
for i, group_labels in enumerate(groups, 1):
    group_data = groups_df[groups_df['Label'].isin(group_labels)]
    output_path = file_path + f'/regroup/regroup_{i}.csv'
    group_data.to_csv(output_path, index=False)
    output_paths.append(output_path)

print("생성된 그룹별 CSV 파일 경로:")
for path in output_paths:
    print(path)

생성된 그룹별 CSV 파일 경로:
/home/Minju/Perceiver/regroup/regroup_1.csv
/home/Minju/Perceiver/regroup/regroup_2.csv
/home/Minju/Perceiver/regroup/regroup_3.csv
/home/Minju/Perceiver/regroup/regroup_4.csv
/home/Minju/Perceiver/regroup/regroup_5.csv
/home/Minju/Perceiver/regroup/regroup_6.csv


# Load Model

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LENGTH=128

# Functions

In [8]:
def train_epoch(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

def eval_epoch(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy


In [9]:
# checkpoint = torch.load(model_path + (f'text_model_{target_id%6}.pkl' if target_id < 6 else f'image_model_{target_id}_fold1.pkl'))
# #if isinstance(checkpoint, sp.CombinedModel):
    
# checkpoint = checkpoint.state_dict()

# model = sp.Perceiver(input_dim=770, latent_dim=64, 
#                      latent_size=64, num_classes=NUM_CLASSES,
#                    num_blocks=4, self_attn_layers_per_block=10)

# model = sp.CombinedModel(vocab_size=tokenizer.vocab_size, embed_dim=128, perceiver_model=model)

# model.load_state_dict(checkpoint)
# model = model.to(device)

In [10]:
model = torch.load(model_path + (f'text_model_{target_id%6}.pkl' if target_id < 6 else f'image_model_{target_id}_fold1.pkl'))
model = model.to(device)

  model = torch.load(model_path + (f'text_model_{target_id%6}.pkl' if target_id < 6 else f'image_model_{target_id}_fold1.pkl'))


# Load Data, Dataloader

In [11]:
# def collate_fn(batch):
#     """batch를 PyTorch Tensor로 변환"""
#     input_ids = torch.stack([torch.tensor(b['input_ids']) for b in batch])
#     attention_masks = torch.stack([torch.tensor(b['attention_mask']) for b in batch])
#     labels = torch.tensor([b['labels'] for b in batch])  # labels도 텐서로 변환
#     return {'input_ids': input_ids, 'attention_mask': attention_masks, 'labels': labels}

In [12]:
for idx, group_file in enumerate(output_paths, start=1):
    print(f"\ngroup {idx} 처리 중...")

    df = pd.read_csv(group_file)
    label_encoder = LabelEncoder()
    df['Label'] = label_encoder.fit_transform(df['Label'])
    num_classes = len(label_encoder.classes_)

    input_ids, attention_masks = sp.tokenize_data(df, tokenizer=tokenizer, MAX_LENGTH=MAX_LENGTH)
    labels = torch.tensor(df['Label'].values)

    dataset = sp.CustomDataset(input_ids, attention_masks, labels)
    
    indices = torch.randperm(len(dataset))

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size

    train_idx = indices[:train_size]
    test_idx = indices[train_size:]

    train_subset = torch.utils.data.Subset(dataset, train_idx)
    test_subset = torch.utils.data.Subset(dataset, test_idx)

    train_loader = DataLoader(train_subset, batch_size=32, shuffle=True,
                              num_workers=NUM_WORKERS, worker_init_fn=seed_worker, generator=g)
    test_loader = DataLoader(test_subset, batch_size=32, shuffle=False,
                             num_workers=NUM_WORKERS, worker_init_fn=seed_worker, generator=g)


group 1 처리 중...

group 2 처리 중...

group 3 처리 중...

group 4 처리 중...

group 5 처리 중...

group 6 처리 중...


In [13]:
# if task_id < 6:
#     dataset = sp.CustomDataset(input_ids, attention_masks, labels)
#     indices = torch.randperm(len(dataset))

#     train_size = int(0.8 * len(dataset))
#     test_size = len(dataset) - train_size

#     train_idx = indices[:train_size]
#     test_idx = indices[train_size:]

#     train_subset = torch.utils.data.Subset(dataset, train_idx)
#     test_subset = torch.utils.data.Subset(dataset, test_idx)

#     train_loader = DataLoader(train_subset, batch_size=32, shuffle=True,
#                               num_workers=NUM_WORKERS, worker_init_fn=seed_worker, generator=g)
#     test_loader = DataLoader(test_subset, batch_size=32, shuffle=False,
#                              num_workers=NUM_WORKERS, worker_init_fn=seed_worker, generator=g)

In [14]:
print(f"✅ Train dataset 샘플 예시: {train_subset}")

✅ Train dataset 샘플 예시: <torch.utils.data.dataset.Subset object at 0x719de0748f10>


In [15]:
# train_loader = dataset.load_cifar100_train(DATASET_DIR, batch_size)
# val_loader = dataset.load_cifar100_valid(VALIDSET_DIR, batch_size)

# Loss, Optimizer

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY, momentum=0.9)

# Fine-Tuning

In [17]:
best_acc = 0.0
for epoch in range(FINETUNE_EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = eval_epoch(model, test_loader, criterion)

    print(f"Epoch {epoch+1}/{FINETUNE_EPOCHS}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}")
    print(f"                  Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

    # 최고 정확도 모델 저장
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), os.path.join(save_folder, "fine_tuned_model.pth.tar"))

print(f"🔹 Fine-tuning 완료! 최고 정확도: {best_acc:.4f}")

ValueError: not enough values to unpack (expected 3, got 2)

In [28]:
# for epoch in range(FINETUNE_EPOCHS):
#     model.train()
#     running_loss = 0.0
#     correct = 0
#     total = 0

    # for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{FINETUNE_EPOCHS}"):
    #     inputs, labels = inputs.to(device), labels.to(device)
    #     optimizer.zero_grad()
    #     outputs = model(inputs)
    #     loss = criterion(outputs, labels)
    #     loss.backward()
    #     optimizer.step()

# for batch in dataloader:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)  
#         labels = batch['labels'].to(device)
#         #print(f"🚀 input_ids.shape: {input_ids.shape}") 
#         optimizer.zero_grad()
#         outputs = model(input_ids)  # CombinedModel의 forward 호출
#         loss = criterion(outputs, labels)
#         #print(f"🚀 outputs.shape: {outputs.shape}")
        
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()
#         _, predicted = torch.max(outputs, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

# for batch in train_loader:
#     inputs = batch['input_ids'].to(device)
#     labels = batch['labels'].to(device)

#     optimizer.zero_grad()
#     outputs = model(inputs)  # 🔹 attention_mask 제거
#     loss = criterion(outputs, labels)
#     loss.backward()
#     optimizer.step()

#     running_loss += loss.item()
#     _, predicted = outputs.max(1)
#     total += labels.size(0)
#     correct += predicted.eq(labels).sum().item()

#     train_acc = 100. * correct / total
#     logging.info(f"Epoch [{epoch+1}/{FINETUNE_EPOCHS}], Loss: {running_loss:.4f}, Train Acc: {train_acc:.2f}%")


# for batch in train_loader:
#     inputs = batch['input_ids'].to(device)
#     attention_masks = batch['attention_mask'].to(device)
#     labels = batch['labels'].to(device)

#     optimizer.zero_grad()
#     outputs = model(inputs)  # 모델 입력 방식에 따라 수정
#     loss = criterion(outputs, labels)
#     loss.backward()
#     optimizer.step()

#     running_loss += loss.item()
#     _, predicted = outputs.max(1)
#     total += labels.size(0)
#     correct += predicted.eq(labels).sum().item()

#     train_acc = 100. * correct / total
#     logging.info(f"Epoch [{epoch+1}/{FINETUNE_EPOCHS}], Loss: {running_loss:.4f}, Train Acc: {train_acc:.2f}%")


ValueError: not enough values to unpack (expected 3, got 2)

# Save models

In [None]:
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    torch.save({'state_dict': model.state_dict()}, os.path.join(save_folder, "fine_tuned_model.pth.tar"))