In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, Subset
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# CSV read below; its 'Label' column is used to split the rows into groups.
file_path = '/home/jisoo/n24news/n24news/captions_and_labels.csv'
data = pd.read_csv(file_path)

# Six groups of three category labels; each group is written to its own CSV.
groups = [
    ['Opinion', 'Food', 'Movies'],
    ['Art & Design', 'Science', 'Fashion & Style'],
    ['Television', 'Sports', 'Style'],
    ['Music', 'Health', 'Dance'],
    ['Real Estate', 'Books', 'Media'],
    ['Travel', 'Theater', 'Technology']
]

# Write one regroup_<n>.csv per group (n starts at 1) and remember each path
# so later cells can iterate over the groups in order.
output_paths = []
for i, group_labels in enumerate(groups, start=1):
    output_path = f'/home/jisoo/n24news/n24news/regroup_{i}.csv'
    in_group = data['Label'].isin(group_labels)
    group_data = data[in_group]
    group_data.to_csv(output_path, index=False)
    output_paths.append(output_path)

In [4]:
# Pretrained tokenizer for 'bert-base-uncased'; only the tokenizer class is
# imported in this notebook, the token ids feed a separately defined model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Length every caption is padded or truncated to in tokenize_data.
MAX_LENGTH = 128

def tokenize_data(df):
    """Turn the ``Caption`` column of ``df`` into a tensor of token ids.

    Missing captions are tokenized as the empty string, and the caller's
    DataFrame is left unmodified.

    Args:
        df: DataFrame with a ``Caption`` column. Non-string values are
            converted with ``str``.

    Returns:
        A tensor with one row per caption and ``MAX_LENGTH`` columns, built by
        stacking the tokenizer's ``input_ids`` for each caption. An empty
        frame yields an empty ``(0, MAX_LENGTH)`` integer tensor.
    """
    # fillna must run BEFORE astype(str): astype(str) turns NaN into the
    # literal text "nan", after which fillna has nothing left to replace and
    # the word "nan" would be tokenized as if it were a real caption.
    captions = df['Caption'].fillna("").astype(str)

    # torch.stack refuses an empty list, so handle the empty frame explicitly.
    if len(captions) == 0:
        return torch.empty((0, MAX_LENGTH), dtype=torch.long)

    input_ids = []
    for text in captions:
        encoded = tokenizer(
            text, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors="pt"
        )
        # return_tensors="pt" yields a leading batch axis of size 1; drop it.
        input_ids.append(encoded['input_ids'].squeeze(0))
    return torch.stack(input_ids)

In [5]:
class CustomDataset(Dataset):
    """Index-aligned pairing of token-id rows with their labels."""

    def __init__(self, input_ids, labels):
        self.input_ids, self.labels = input_ids, labels

    def __len__(self):
        # The dataset size is taken from the labels container.
        return len(self.labels)

    def __getitem__(self, idx):
        # Each sample is a dict with the same keys the train/eval loops read.
        sample = {}
        sample['input_ids'] = self.input_ids[idx]
        sample['labels'] = self.labels[idx]
        return sample

In [6]:
class PerceiverBlock(nn.Module):
    """
    One Perceiver stage.

    - Cross-attention: the latents query the input sequence ``x``.
    - Residual add followed by LayerNorm on the latents.
    - One or more ``nn.TransformerEncoderLayer`` passes over the latents
      (self-attention among the latents).

    Both attention modules are created with their default sequence-first
    layout, so tensors are passed as (sequence, batch, dim).
    """
    def __init__(self, latent_dim, n_heads=8, self_attn_layers=1):
        super().__init__()
        # Cross-attention and the LayerNorm applied after its residual add.
        self.cross_attn = nn.MultiheadAttention(embed_dim=latent_dim, num_heads=n_heads)
        self.cross_ln = nn.LayerNorm(latent_dim)

        # Stack of self-attention layers applied to the latents.
        encoder_layers = []
        for _ in range(self_attn_layers):
            encoder_layers.append(
                nn.TransformerEncoderLayer(d_model=latent_dim, nhead=n_heads)
            )
        self.self_attn_layers = nn.ModuleList(encoder_layers)

    def forward(self, latents, x):
        # 1) Cross-attention: latents are the query, x supplies keys and values.
        attended, _ = self.cross_attn(query=latents, key=x, value=x)

        # Residual connection, then LayerNorm.
        latents = self.cross_ln(latents + attended)

        # 2) Self-attention over the latents, one encoder layer at a time.
        for encoder_layer in self.self_attn_layers:
            latents = encoder_layer(latents)

        return latents

class Perceiver(nn.Module):
    """Perceiver classifier: a learned latent array repeatedly attends to the input.

    Args:
        input_dim: Feature size of the incoming sequence elements.
        latent_dim: Feature size of the latents (inputs are projected to it).
        latent_size: Number of latent vectors.
        num_classes: Size of the output logits.
        num_blocks: Number of stacked ``PerceiverBlock`` stages.
        self_attn_layers_per_block: Self-attention layers inside each block.
        n_heads: Attention heads used by every block. Defaults to 8, the value
            that used to be hard-coded. ``latent_dim`` must be divisible by it
            (enforced by the attention modules).
    """
    def __init__(self, input_dim, latent_dim, latent_size, num_classes,
                 num_blocks, self_attn_layers_per_block=1, n_heads=8):
        super().__init__()
        # Learned latent array shared by every sample in a batch.
        self.latents = nn.Parameter(torch.randn(latent_size, latent_dim))
        self.input_projection = nn.Linear(input_dim, latent_dim)

        # Stack of PerceiverBlocks, all using the same head count.
        self.blocks = nn.ModuleList([
            PerceiverBlock(
                latent_dim=latent_dim,
                n_heads=n_heads,
                self_attn_layers=self_attn_layers_per_block
            )
            for _ in range(num_blocks)
        ])

        self.output_layer = nn.Linear(latent_dim, num_classes)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (B, T, F) = (batch, sequence length, feature dim).

        Returns:
            Logits of shape (B, num_classes).

        Raises:
            ValueError: If ``x`` is not 3-dimensional.
        """
        if x.dim() != 3:
            raise ValueError(
                f"Perceiver expects a (batch, sequence, feature) tensor, got shape {tuple(x.shape)}"
            )
        batch_size = x.size(0)
        x = self.input_projection(x)                 # (B, T, latent_dim)

        # (latent_size, latent_dim) -> (B, latent_size, latent_dim), one copy per sample.
        latents = self.latents.unsqueeze(0).expand(batch_size, -1, -1)

        # The blocks work sequence-first, so move the batch axis to position 1.
        x = x.permute(1, 0, 2)              # (T, B, latent_dim)
        latents = latents.permute(1, 0, 2)  # (latent_size, B, latent_dim)

        for block in self.blocks:
            latents = block(latents, x)

        # Back to batch-first, then average over the latent axis.
        latents = latents.permute(1, 0, 2).mean(dim=1)  # (B, latent_dim)
        return self.output_layer(latents)

class CombinedModel(nn.Module):
    """Embeds token ids and hands the embedded sequence to a Perceiver model."""

    def __init__(self, vocab_size, embed_dim, perceiver_model):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.perceiver = perceiver_model

    def forward(self, input_ids):
        # (B, T) token ids -> (B, T, embed_dim) vectors.
        token_vectors = self.embedding(input_ids)
        # The wrapped model turns the embedded sequence into the final output.
        logits = self.perceiver(token_vectors)
        return logits

In [7]:
class PackNet(nn.Module):
    """Wraps a model with per-task binary weight masks (magnitude pruning).

    ``masks`` maps a task id to ``{parameter name: 0/1 tensor}``. It is a
    plain dict attribute, so it is NOT part of ``state_dict()``; only the
    (already zeroed) weights are persisted when the module is saved.
    """

    def __init__(self, model):
        super(PackNet, self).__init__()
        self.model = model
        self.masks = {}
        self.current_task = None

    def set_task(self, task_id):
        """Select ``task_id``, creating an all-ones mask set on first use."""
        self.current_task = task_id
        if task_id not in self.masks:
            self.masks[task_id] = {
                name: torch.ones_like(param, device=param.device)
                for name, param in self.model.named_parameters()
                if param.requires_grad
            }

    def _apply_current_masks(self):
        """Zero every weight the current task's masks mark as pruned."""
        task_masks = self.masks[self.current_task]
        with torch.no_grad():
            for name, param in self.model.named_parameters():
                if not param.requires_grad:
                    continue
                mask = task_masks[name]
                # Masks are created on the device the model lived on when
                # set_task() ran; follow the parameter if it was moved since.
                if mask.device != param.device:
                    mask = mask.to(param.device)
                    task_masks[name] = mask
                param.data *= mask

    def prune(self, sparsity=0.2):
        """Mask out the smallest-magnitude weights of every trainable parameter.

        For each parameter the ``sparsity`` quantile of ``|w|`` is used as a
        threshold; entries strictly below it are set to 0 in the current
        task's mask. The masks are applied to the weights immediately, so a
        ``state_dict()`` taken right after this call is already pruned.

        Args:
            sparsity: Quantile in [0, 1] passed to ``torch.quantile``.

        Raises:
            RuntimeError: If no task has been selected with ``set_task``.
        """
        if self.current_task not in self.masks:
            raise RuntimeError(
                "PackNet.prune() called before set_task(): no mask exists for the current task."
            )
        task_masks = self.masks[self.current_task]
        # Pruning is bookkeeping, not part of the computation graph.
        with torch.no_grad():
            for name, param in self.model.named_parameters():
                # Empty tensors have no quantile; nothing to prune there.
                if not param.requires_grad or param.numel() == 0:
                    continue
                magnitudes = param.abs()
                threshold = torch.quantile(magnitudes, sparsity)
                mask = task_masks[name]
                if mask.device != param.device:
                    mask = mask.to(param.device)
                    task_masks[name] = mask
                mask[magnitudes < threshold] = 0
        self._apply_current_masks()

    def forward(self, input_ids, **kwargs):
        """Re-apply the current task's masks, then run the wrapped model."""
        if self.current_task in self.masks:
            self._apply_current_masks()
        return self.model(input_ids, **kwargs)

In [8]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Each batch is a dict with 'input_ids' and 'labels'. The model is put in
    training mode and updated once per batch.

    Returns:
        (mean of the per-batch losses, fraction of correctly classified samples).
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in dataloader:
        tokens = batch['input_ids'].to(device)
        targets = batch['labels'].to(device)

        # Standard step: clear grads, forward, backward, update.
        optimizer.zero_grad()
        logits = model(tokens)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predictions = torch.max(logits, 1).indices
        n_seen += targets.size(0)
        n_correct += (predictions == targets).sum().item()

    # Loss is averaged over batches, accuracy over samples.
    mean_loss = running_loss / len(dataloader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

def eval_epoch(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating it.

    Each batch is a dict with 'input_ids' and 'labels'. The model is put in
    eval mode and gradients are disabled for the whole pass.

    Returns:
        (mean of the per-batch losses, fraction of correctly classified samples).
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in dataloader:
            tokens = batch['input_ids'].to(device)
            targets = batch['labels'].to(device)

            logits = model(tokens)
            running_loss += criterion(logits, targets).item()

            predictions = torch.max(logits, 1).indices
            n_seen += targets.size(0)
            n_correct += (predictions == targets).sum().item()

    # Loss is averaged over batches, accuracy over samples.
    mean_loss = running_loss / len(dataloader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

In [9]:
# Hyperparameters shared by the training cell below.
EPOCHS = 15  # training epochs run for every fold
BATCH_SIZE = 32  # batch size of both the train and the test DataLoader
K_FOLDS = 5  # number of KFold splits per group
EMBED_DIM = 128  # token embedding size; also the Perceiver's input_dim
LATENT_DIM = 64  # feature size of each Perceiver latent vector
LATENT_SIZE = 64  # number of Perceiver latent vectors
NUM_BLOCKS = 4  # number of stacked PerceiverBlocks

In [10]:
def apply_pruning_with_intervals(packnet_model, test_loader, criterion, device, start_sparsity, end_sparsity, pruning_ratio):
    """Prune at increasing sparsity levels and evaluate after each level.

    Levels are ``start_sparsity, start_sparsity + pruning_ratio, ...`` up to
    and including ``end_sparsity``. Nothing happens when ``start_sparsity`` is
    greater than ``end_sparsity``.

    Args:
        packnet_model: Object exposing ``prune(sparsity=...)``, evaluated with
            ``eval_epoch``.
        test_loader: Batches passed to ``eval_epoch``.
        criterion: Loss passed to ``eval_epoch``.
        device: Device passed to ``eval_epoch``.
        start_sparsity: First sparsity level.
        end_sparsity: Last sparsity level (inclusive).
        pruning_ratio: Step between consecutive levels; must be positive.

    Raises:
        ValueError: If ``pruning_ratio`` is not positive (the schedule would
            never terminate).
    """
    if pruning_ratio <= 0:
        raise ValueError(f"pruning_ratio must be positive, got {pruning_ratio}")
    if start_sparsity > end_sparsity:
        return

    # Count the levels up front instead of accumulating floats: repeatedly
    # adding 0.1 gives 0.30000000000000004, which fails `<= 0.3` and would
    # silently drop the final level. The small epsilon absorbs that error.
    num_levels = int((end_sparsity - start_sparsity) / pruning_ratio + 1e-9) + 1

    for level in range(num_levels):
        # Clamp so rounding can never push the last level past end_sparsity.
        current_sparsity = min(start_sparsity + level * pruning_ratio, end_sparsity)

        print(f"Applying pruning with sparsity: {current_sparsity:.2f}")
        packnet_model.prune(sparsity=current_sparsity)

        print("Evaluating after pruning...")
        pruned_test_loss, pruned_test_acc = eval_epoch(packnet_model, test_loader, criterion, device)
        print(f"Pruned Test Loss: {pruned_test_loss:.4f}, Test Accuracy: {pruned_test_acc:.4f}")

In [14]:
# def save_final_model_after_kfold(results, all_learning_curves, checkpoint_path):
#     print("Saving final model after K-Fold training...")

#     # Assuming the last fold's model represents the final model
#     final_model_state_dict = results[-1]['Fold Results'][-1]['model_state_dict']

#     # Save final model checkpoint
#     torch.save({
#         'model_state_dict': final_model_state_dict,
#         'learning_curves': all_learning_curves,
#         'results': results
#     }, checkpoint_path)

#     print(f"Final model saved at {checkpoint_path}")

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [None]:
# Directory for the trained (pre-pruning) checkpoints. This is the location the
# pruning cell (load_checkpoint_and_prune) reads from; create it if missing so
# torch.save cannot fail on a non-existent directory.
CHECKPOINT_DIR = "/home/jisoo/Perceiver/Perceiver/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

results = []
all_learning_curves = []

for idx, group_file in enumerate(output_paths, start=1):
    print(f"\nGroup {idx} 처리 중...")

    # Load the group's rows and map category names to integer class ids.
    df = pd.read_csv(group_file)
    label_encoder = LabelEncoder()
    df['Label'] = label_encoder.fit_transform(df['Label'])
    num_classes = len(label_encoder.classes_)
    class_ids = list(range(num_classes))

    input_ids = tokenize_data(df)
    labels = torch.tensor(df['Label'].values)

    dataset = CustomDataset(input_ids, labels)
    kfold = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

    fold_results = []
    fold_learning_curves = []

    for fold, (train_idx, test_idx) in enumerate(kfold.split(dataset), start=1):
        print(f"\n  Fold {fold}/{K_FOLDS} 처리 중...")

        train_subset = Subset(dataset, train_idx)
        test_subset = Subset(dataset, test_idx)

        train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
        test_loader = DataLoader(test_subset, batch_size=BATCH_SIZE, shuffle=False)

        # Fresh model for every fold.
        perceiver = Perceiver(
            input_dim=EMBED_DIM,
            latent_dim=LATENT_DIM,
            latent_size=LATENT_SIZE,
            num_classes=num_classes,
            num_blocks=NUM_BLOCKS,
            self_attn_layers_per_block=1
        )

        combined_model = CombinedModel(
            vocab_size=tokenizer.vocab_size,
            embed_dim=EMBED_DIM,
            perceiver_model=perceiver
        )

        # Move to the device BEFORE set_task so the masks are created there.
        packnet_model = PackNet(combined_model)
        packnet_model.to(device)
        packnet_model.set_task(f"task_{idx}_{fold}")

        optimizer = optim.Adam(packnet_model.parameters(), lr=1e-4)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
        criterion = nn.CrossEntropyLoss()

        train_losses, test_losses = [], []
        train_accuracies, test_accuracies = [], []

        # Baseline: this evaluates the freshly initialised (untrained) model.
        print("Pruning 이전 성능:")
        initial_test_loss, initial_test_acc = eval_epoch(packnet_model, test_loader, criterion, device)
        print(f"  Test Loss: {initial_test_loss:.4f}, Test Accuracy: {initial_test_acc:.4f}")

        for epoch in range(EPOCHS):
            train_loss, train_acc = train_epoch(packnet_model, train_loader, criterion, optimizer, device)
            test_loss, test_acc = eval_epoch(packnet_model, test_loader, criterion, device)

            train_losses.append(train_loss)
            test_losses.append(test_loss)
            train_accuracies.append(train_acc)
            test_accuracies.append(test_acc)

            scheduler.step()

            if (epoch + 1) % 5 == 0 or epoch == 0:
                print(f"epoch {epoch+1}/{EPOCHS}: train loss {train_loss:.4f}, train acc {train_acc:.4f}")
                print(f"                         test loss {test_loss:.4f}, test acc {test_acc:.4f}")

        # Save the trained, not yet pruned model. The weights are stored
        # together with the two sizes needed to rebuild the architecture,
        # which is the layout the pruning cell reads back
        # ('model_state_dict', 'num_classes', 'vocab_size').
        checkpoint_path = f"{CHECKPOINT_DIR}/group_{idx}_fold_{fold}_pre_pruning.pth.tar"
        torch.save(
            {
                'model_state_dict': packnet_model.state_dict(),
                'num_classes': num_classes,
                'vocab_size': tokenizer.vocab_size,
            },
            checkpoint_path,
        )
        print(f"Pre-pruning model checkpoint saved at {checkpoint_path}")

        fold_learning_curves.append({
            "Fold": fold,
            "train_losses": train_losses,
            "test_losses": test_losses,
            "train_accuracies": train_accuracies,
            "test_accuracies": test_accuracies
        })

        # Collect predictions on the held-out fold for the confusion matrix.
        y_true, y_pred = [], []
        packnet_model.eval()
        with torch.no_grad():
            for batch in test_loader:
                input_ids_batch = batch['input_ids'].to(device)

                outputs = packnet_model(input_ids_batch)
                _, predicted = torch.max(outputs, 1)
                y_true.extend(batch['labels'].tolist())
                y_pred.extend(predicted.cpu().tolist())

        # Passing labels= keeps the matrix num_classes x num_classes even when
        # a class is absent from this fold, so it lines up with the tick labels.
        cm = confusion_matrix(y_true, y_pred, labels=class_ids)
        report = classification_report(y_true, y_pred, output_dict=True)

        # Exactly one entry per fold (a second placeholder entry with a None
        # matrix used to be appended as well, duplicating every fold).
        fold_results.append({
            "Fold": fold,
            "Test Accuracy": test_acc,
            "Confusion Matrix": cm,
            "Classification Report": report
        })

    avg_accuracy = np.mean([fr["Test Accuracy"] for fr in fold_results])
    results.append({
        "Group": idx,
        "Average Test Accuracy": avg_accuracy,
        "Fold Results": fold_results
    })

    all_learning_curves.append({
        "Group": idx,
        "Fold Learning Curves": fold_learning_curves
    })

    print(f"\n그룹 {idx}의 {K_FOLDS} 폴드 평균 테스트 정확도: {avg_accuracy:.4f}")

    # Loss and accuracy curves, one pair of figures per fold.
    for curve in fold_learning_curves:
        fold_idx = curve["Fold"]

        plt.figure(figsize=(10, 6))
        plt.plot(range(1, EPOCHS + 1), curve["train_losses"], label="Train Loss")
        plt.plot(range(1, EPOCHS + 1), curve["test_losses"], label="Test Loss")
        plt.title(f"Group {idx} - Fold {fold_idx} Learning Curve (Loss)")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.grid(True)
        plt.show()

        plt.figure(figsize=(10, 6))
        plt.plot(range(1, EPOCHS + 1), curve["train_accuracies"], label="Train Accuracy")
        plt.plot(range(1, EPOCHS + 1), curve["test_accuracies"], label="Test Accuracy")
        plt.title(f"Group {idx} - Fold {fold_idx} Learning Curve (Accuracy)")
        plt.xlabel("Epoch")
        plt.ylabel("Accuracy")
        plt.legend()
        plt.grid(True)
        plt.show()

    # Confusion-matrix heatmap per fold, labelled with the original category names.
    for fold_result in fold_results:
        fold_idx = fold_result["Fold"]
        cm = fold_result["Confusion Matrix"]

        if cm is not None and cm.ndim == 2:
            plt.figure(figsize=(10, 8))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=label_encoder.classes_,
                        yticklabels=label_encoder.classes_)
            plt.title(f"Group {idx} - Fold {fold_idx} Confusion Matrix")
            plt.xlabel("Predicted")
            plt.ylabel("Actual")
            plt.show()
        else:
            print(f"Confusion Matrix for Fold {fold_idx} is invalid or missing.")


Group 1 처리 중...

  Fold 1/5 처리 중...
Pruning 이전 성능:
  Test Loss: 1.1096, Test Accuracy: 0.3897


KeyboardInterrupt: 

In [None]:
def _build_fold_test_loader(group_idx, fold, num_folds):
    """Rebuild the held-out DataLoader for one (group, fold) pair.

    Uses the same KFold settings as the training cell (shuffle=True,
    random_state=42), so with ``num_folds == K_FOLDS`` the returned split is
    the one the fold's model was evaluated on during training.
    """
    group_df = pd.read_csv(output_paths[group_idx - 1])
    group_df['Label'] = LabelEncoder().fit_transform(group_df['Label'])
    group_dataset = CustomDataset(tokenize_data(group_df), torch.tensor(group_df['Label'].values))

    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    for current_fold, (_, held_out_idx) in enumerate(splitter.split(group_dataset), start=1):
        if current_fold == fold:
            return DataLoader(Subset(group_dataset, held_out_idx), batch_size=BATCH_SIZE, shuffle=False)
    raise ValueError(f"fold must be between 1 and {num_folds}, got {fold}")


def load_checkpoint_and_prune(group_idx, num_folds, start_sparsity, end_sparsity, pruning_ratio, device,
                              test_loader=None, criterion=None,
                              checkpoint_dir="/home/jisoo/Perceiver/Perceiver/checkpoints"):
    """Load a group's last-fold checkpoint, prune it step by step, and save it.

    Args:
        group_idx: 1-based group number used in the checkpoint file name.
        num_folds: Fold number of the checkpoint to load (the last fold).
        start_sparsity: First sparsity level.
        end_sparsity: Last sparsity level (inclusive).
        pruning_ratio: Step between consecutive levels; must be positive.
        device: Device the model is loaded onto.
        test_loader: Optional batches to evaluate on after every pruning
            level. When omitted, evaluation is skipped. It used to be read
            from a notebook global, i.e. whatever fold was trained last.
        criterion: Loss used for evaluation; defaults to CrossEntropyLoss.
        checkpoint_dir: Directory holding the ``*_pre_pruning.pth.tar`` files.

    Raises:
        ValueError: If ``pruning_ratio`` is not positive.
    """
    if pruning_ratio <= 0:
        raise ValueError(f"pruning_ratio must be positive, got {pruning_ratio}")

    checkpoint_path = f"{checkpoint_dir}/group_{group_idx}_fold_{num_folds}_pre_pruning.pth.tar"
    print(f"Loading checkpoint from {checkpoint_path}...")

    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Accept both layouts: a dict carrying metadata next to the weights, or a
    # bare state_dict, in which case the two sizes are read off the weights.
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
        num_classes = checkpoint['num_classes']
        vocab_size = checkpoint['vocab_size']
    else:
        state_dict = checkpoint
        num_classes = state_dict['model.perceiver.output_layer.weight'].shape[0]
        vocab_size = state_dict['model.embedding.weight'].shape[0]

    # Rebuild the architecture with the same hyperparameters as training.
    perceiver = Perceiver(
        input_dim=EMBED_DIM,
        latent_dim=LATENT_DIM,
        latent_size=LATENT_SIZE,
        num_classes=num_classes,
        num_blocks=NUM_BLOCKS,
        self_attn_layers_per_block=1
    )

    combined_model = CombinedModel(
        vocab_size=vocab_size,
        embed_dim=EMBED_DIM,
        perceiver_model=perceiver
    )

    packnet_model = PackNet(combined_model)
    packnet_model.load_state_dict(state_dict)
    packnet_model.to(device)
    # prune() needs a mask set for the current task; masks are not stored in
    # the checkpoint, so create them here (after the move to `device`).
    packnet_model.set_task(f"task_{group_idx}_{num_folds}")
    print("Model successfully loaded from checkpoint.")

    if test_loader is not None and criterion is None:
        criterion = nn.CrossEntropyLoss()

    # Count the levels up front instead of accumulating floats: 0.1+0.1+0.1 is
    # 0.30000000000000004, which fails `<= 0.3` and would drop the last level.
    if start_sparsity > end_sparsity:
        num_levels = 0
    else:
        num_levels = int((end_sparsity - start_sparsity) / pruning_ratio + 1e-9) + 1

    for level in range(num_levels):
        current_sparsity = min(start_sparsity + level * pruning_ratio, end_sparsity)
        print(f"Applying pruning at sparsity {current_sparsity:.2f}...")
        packnet_model.prune(sparsity=current_sparsity)

        if test_loader is None:
            print("No test_loader provided; skipping evaluation.")
            continue
        pruned_test_loss, pruned_test_acc = eval_epoch(packnet_model, test_loader, criterion, device)
        print(f"After pruning (sparsity {current_sparsity:.2f}): Test Loss: {pruned_test_loss:.4f}, Test Acc: {pruned_test_acc:.4f}")

    # Make sure the saved weights really are pruned even when no forward pass
    # (which is what normally applies the masks) ran after the last prune().
    task_masks = packnet_model.masks[packnet_model.current_task]
    with torch.no_grad():
        for name, param in packnet_model.model.named_parameters():
            if param.requires_grad:
                param.data *= task_masks[name].to(param.device)

    # Save in the same layout the function accepts for pre-pruning checkpoints.
    pruned_checkpoint_path = f"{checkpoint_dir}/group_{group_idx}_fold_{num_folds}_post_pruning.pth.tar"
    torch.save(
        {
            'model_state_dict': packnet_model.state_dict(),
            'num_classes': num_classes,
            'vocab_size': vocab_size,
        },
        pruned_checkpoint_path,
    )
    print(f"Pruned model saved at {pruned_checkpoint_path}.")


# Named group_indices so the label lists stored in `groups` by the data
# preparation cell are not overwritten.
group_indices = [1, 2, 3, 4, 5, 6]
num_folds = K_FOLDS  # must equal the training K_FOLDS so the rebuilt split matches
start_sparsity = 0.1
end_sparsity = 0.3
pruning_ratio = 0.1

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

for group_idx in group_indices:
    load_checkpoint_and_prune(
        group_idx=group_idx,
        num_folds=num_folds,
        start_sparsity=start_sparsity,
        end_sparsity=end_sparsity,
        pruning_ratio=pruning_ratio,
        device=device,
        # Evaluate each model on its own group's held-out fold.
        test_loader=_build_fold_test_loader(group_idx, num_folds, num_folds),
        criterion=nn.CrossEntropyLoss()
    )

 