In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = '/home/youlee/n24news/n24news/captions_and_labels.csv'

categories = [
    "Opinion", "Art & Design", "Television", "Music", "Travel",
    "Real Estate", "Books", "Theater", "Health", "Sports",
    "Science", "Food", "Fashion & Style", "Movies", "Technology",
    "Dance", "Media", "Style"
]

data = pd.read_csv(file_path)
filtered_data = data[data['Label'].isin(categories)]


filtered_data_sampled = filtered_data.groupby('Label').apply(
    lambda x: x.sample(n=min(2100, len(x)), random_state=42)
).reset_index(drop=True)


label_groups = [categories[i:i + 3] for i in range(0, len(categories), 3)]
group_files = []
for i, group in enumerate(label_groups, start=1):
    group_data = filtered_data_sampled[filtered_data_sampled['Label'].isin(group)]
    output_file_path = f'/home/youlee/n24news/n24news/filtered_group_{i}.csv'
    group_data.to_csv(output_file_path, index=False)
    group_files.append(output_file_path)

print("Grouped data files saved:")
for file in group_files:
    print(file)

Grouped data files saved:
/home/youlee/n24news/n24news/filtered_group_1.csv
/home/youlee/n24news/n24news/filtered_group_2.csv
/home/youlee/n24news/n24news/filtered_group_3.csv
/home/youlee/n24news/n24news/filtered_group_4.csv
/home/youlee/n24news/n24news/filtered_group_5.csv
/home/youlee/n24news/n24news/filtered_group_6.csv


  filtered_data_sampled = filtered_data.groupby('Label').apply(


In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LENGTH = 128

def tokenize_data(df):
    input_ids, attention_masks = [], []
    
    df['Caption'] = df['Caption'].astype(str).fillna("")

    for text in df['Caption']:
        encoded = tokenizer(
            text, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors="pt"
        )
        input_ids.append(encoded['input_ids'].squeeze(0))
        attention_masks.append(encoded['attention_mask'].squeeze(0))
    return torch.stack(input_ids), torch.stack(attention_masks)


In [12]:
class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

In [13]:
class CrossAttention(nn.Module):
    def __init__(self, d_in, d_out_kq, d_out_v):
        super(CrossAttention, self).__init__()
        self.key_proj = nn.Linear(d_in, d_out_kq)
        self.query_proj = nn.Linear(d_in, d_out_kq)
        self.value_proj = nn.Linear(d_in, d_out_v)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, latent):
        keys = self.key_proj(x)
        queries = self.query_proj(latent)
        values = self.value_proj(x)

        attention_scores = torch.matmul(queries, keys.transpose(-2, -1))
        attention_probs = self.softmax(attention_scores)

        attended_values = torch.matmul(attention_probs, values)
        return attended_values

class LatentTransformer(nn.Module):
    def __init__(self, latent_dim, num_heads, num_layers, embed_dim):
        super(LatentTransformer, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, latent):
        latent = latent.permute(1, 0, 2)
        latent = self.transformer(latent)
        return latent.permute(1, 0, 2)

class Averaging(nn.Module):
    def forward(self, latent):
        return latent.mean(dim=1)

class Perceiver(nn.Module):
    def __init__(self, vocab_size, embed_dim, latent_dim, num_heads, num_layers, num_classes):
        super(Perceiver, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.input_proj = nn.Linear(embed_dim, embed_dim)

        self.latents = nn.Parameter(torch.randn(1, latent_dim, embed_dim))
        self.cross_attention = CrossAttention(d_in=embed_dim, d_out_kq=embed_dim, d_out_v=embed_dim)
        self.latent_transformer = LatentTransformer(latent_dim=latent_dim, num_heads=num_heads,
                                                    num_layers=num_layers, embed_dim=embed_dim)
        self.averaging = Averaging()
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        x = self.input_proj(x)

        batch_size = x.size(0)
        latent = self.latents.repeat(batch_size, 1, 1)
        latent = self.cross_attention(x, latent)
        latent = self.latent_transformer(latent)
        latent_avg = self.averaging(latent)
        logits = self.classifier(latent_avg)
        return logits

In [14]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

In [15]:
def eval_epoch(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

In [16]:
EPOCHS = 10
BATCH_SIZE = 32
results = []
VOCAB_SIZE = tokenizer.vocab_size
EMBED_DIM = 128
LATENT_DIM = 64
NUM_HEADS = 8
NUM_LAYERS = 4

In [17]:
for idx, group_file in enumerate(group_files, start=1):
    print(f"\nProcessing Group {idx}...")

    df = pd.read_csv(group_file)
    df['Label'] = LabelEncoder().fit_transform(df['Label'])

    input_ids, attention_masks = tokenize_data(df)
    labels = torch.tensor(df['Label'].values)

    dataset = CustomDataset(input_ids, attention_masks, labels)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Perceiver(vocab_size=VOCAB_SIZE, embed_dim=EMBED_DIM, latent_dim=LATENT_DIM,
                      num_heads=NUM_HEADS, num_layers=NUM_LAYERS, num_classes=len(df['Label'].unique()))
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(EPOCHS):
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = eval_epoch(model, test_loader, criterion, device)
        print(f'  Group {idx} Epoch {epoch+1}/{EPOCHS}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'                             Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

    y_true, y_pred = [], []
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    report = classification_report(y_true, y_pred, output_dict=True)
    results.append({
        "Group": idx,
        "Test Accuracy": test_acc,
        "Classification Report": report
    })

for result in results:
    print(f"\nGroup {result['Group']} Results:")
    print(f"Test Accuracy: {result['Test Accuracy']:.4f}")
    print(pd.DataFrame(result['Classification Report']).transpose())


Processing Group 1...




  Group 1 Epoch 1/10: Train Loss: 0.7258, Train Acc: 0.6944
                             Test Loss: 0.6151, Test Acc: 0.7540
  Group 1 Epoch 2/10: Train Loss: 0.5311, Train Acc: 0.7935
                             Test Loss: 0.5451, Test Acc: 0.7802
  Group 1 Epoch 3/10: Train Loss: 0.4459, Train Acc: 0.8306
                             Test Loss: 0.5186, Test Acc: 0.7976
  Group 1 Epoch 4/10: Train Loss: 0.3894, Train Acc: 0.8514
                             Test Loss: 0.4974, Test Acc: 0.8095
  Group 1 Epoch 5/10: Train Loss: 0.3404, Train Acc: 0.8764
                             Test Loss: 0.4798, Test Acc: 0.8119
  Group 1 Epoch 6/10: Train Loss: 0.2971, Train Acc: 0.8933
                             Test Loss: 0.5036, Test Acc: 0.8103
  Group 1 Epoch 7/10: Train Loss: 0.2664, Train Acc: 0.9032
                             Test Loss: 0.4897, Test Acc: 0.8286
  Group 1 Epoch 8/10: Train Loss: 0.2245, Train Acc: 0.9177
                             Test Loss: 0.4962, Test Acc: 0.8310




  Group 2 Epoch 1/10: Train Loss: 0.9137, Train Acc: 0.5712
                             Test Loss: 0.8192, Test Acc: 0.6460
  Group 2 Epoch 2/10: Train Loss: 0.6840, Train Acc: 0.7173
                             Test Loss: 0.6870, Test Acc: 0.7079
  Group 2 Epoch 3/10: Train Loss: 0.5825, Train Acc: 0.7704
                             Test Loss: 0.6554, Test Acc: 0.7389
  Group 2 Epoch 4/10: Train Loss: 0.4976, Train Acc: 0.8091
                             Test Loss: 0.6345, Test Acc: 0.7413
  Group 2 Epoch 5/10: Train Loss: 0.4412, Train Acc: 0.8276
                             Test Loss: 0.6053, Test Acc: 0.7548
  Group 2 Epoch 6/10: Train Loss: 0.3815, Train Acc: 0.8528
                             Test Loss: 0.5938, Test Acc: 0.7706
  Group 2 Epoch 7/10: Train Loss: 0.3166, Train Acc: 0.8813
                             Test Loss: 0.6390, Test Acc: 0.7635
  Group 2 Epoch 8/10: Train Loss: 0.2756, Train Acc: 0.9018
                             Test Loss: 0.6130, Test Acc: 0.7730




  Group 3 Epoch 1/10: Train Loss: 0.7393, Train Acc: 0.6823
                             Test Loss: 0.5993, Test Acc: 0.7524
  Group 3 Epoch 2/10: Train Loss: 0.5414, Train Acc: 0.7786
                             Test Loss: 0.5549, Test Acc: 0.7778
  Group 3 Epoch 3/10: Train Loss: 0.4438, Train Acc: 0.8310
                             Test Loss: 0.5002, Test Acc: 0.7905
  Group 3 Epoch 4/10: Train Loss: 0.3923, Train Acc: 0.8429
                             Test Loss: 0.4559, Test Acc: 0.8127
  Group 3 Epoch 5/10: Train Loss: 0.3293, Train Acc: 0.8760
                             Test Loss: 0.4494, Test Acc: 0.8222
  Group 3 Epoch 6/10: Train Loss: 0.2998, Train Acc: 0.8899
                             Test Loss: 0.4632, Test Acc: 0.8238
  Group 3 Epoch 7/10: Train Loss: 0.2666, Train Acc: 0.9006
                             Test Loss: 0.5293, Test Acc: 0.8095
  Group 3 Epoch 8/10: Train Loss: 0.2298, Train Acc: 0.9163
                             Test Loss: 0.5202, Test Acc: 0.8206




  Group 4 Epoch 1/10: Train Loss: 0.9199, Train Acc: 0.5605
                             Test Loss: 0.7402, Test Acc: 0.6857
  Group 4 Epoch 2/10: Train Loss: 0.6595, Train Acc: 0.7218
                             Test Loss: 0.6317, Test Acc: 0.7444
  Group 4 Epoch 3/10: Train Loss: 0.5324, Train Acc: 0.7833
                             Test Loss: 0.5453, Test Acc: 0.7730
  Group 4 Epoch 4/10: Train Loss: 0.4396, Train Acc: 0.8252
                             Test Loss: 0.5118, Test Acc: 0.7952
  Group 4 Epoch 5/10: Train Loss: 0.3676, Train Acc: 0.8601
                             Test Loss: 0.4944, Test Acc: 0.8000
  Group 4 Epoch 6/10: Train Loss: 0.2911, Train Acc: 0.8925
                             Test Loss: 0.4633, Test Acc: 0.8206
  Group 4 Epoch 7/10: Train Loss: 0.2263, Train Acc: 0.9169
                             Test Loss: 0.5120, Test Acc: 0.8183
  Group 4 Epoch 8/10: Train Loss: 0.1813, Train Acc: 0.9349
                             Test Loss: 0.5645, Test Acc: 0.8040




  Group 5 Epoch 1/10: Train Loss: 0.7709, Train Acc: 0.6450
                             Test Loss: 0.5872, Test Acc: 0.7603
  Group 5 Epoch 2/10: Train Loss: 0.5623, Train Acc: 0.7716
                             Test Loss: 0.5316, Test Acc: 0.7897
  Group 5 Epoch 3/10: Train Loss: 0.4726, Train Acc: 0.8103
                             Test Loss: 0.4749, Test Acc: 0.8119
  Group 5 Epoch 4/10: Train Loss: 0.4171, Train Acc: 0.8395
                             Test Loss: 0.4721, Test Acc: 0.8206
  Group 5 Epoch 5/10: Train Loss: 0.3695, Train Acc: 0.8579
                             Test Loss: 0.4233, Test Acc: 0.8325
  Group 5 Epoch 6/10: Train Loss: 0.3352, Train Acc: 0.8756
                             Test Loss: 0.4164, Test Acc: 0.8397
  Group 5 Epoch 7/10: Train Loss: 0.2811, Train Acc: 0.8942
                             Test Loss: 0.4360, Test Acc: 0.8389
  Group 5 Epoch 8/10: Train Loss: 0.2421, Train Acc: 0.9121
                             Test Loss: 0.4671, Test Acc: 0.8294




  Group 6 Epoch 1/10: Train Loss: 0.8641, Train Acc: 0.5871
                             Test Loss: 0.6871, Test Acc: 0.6968
  Group 6 Epoch 2/10: Train Loss: 0.6735, Train Acc: 0.6966
                             Test Loss: 0.6952, Test Acc: 0.6841
  Group 6 Epoch 3/10: Train Loss: 0.5948, Train Acc: 0.7401
                             Test Loss: 0.6202, Test Acc: 0.7238
  Group 6 Epoch 4/10: Train Loss: 0.5306, Train Acc: 0.7726
                             Test Loss: 0.5713, Test Acc: 0.7675
  Group 6 Epoch 5/10: Train Loss: 0.4729, Train Acc: 0.8038
                             Test Loss: 0.6446, Test Acc: 0.7222
  Group 6 Epoch 6/10: Train Loss: 0.4369, Train Acc: 0.8234
                             Test Loss: 0.5462, Test Acc: 0.7730
  Group 6 Epoch 7/10: Train Loss: 0.3864, Train Acc: 0.8433
                             Test Loss: 0.5572, Test Acc: 0.7730
  Group 6 Epoch 8/10: Train Loss: 0.3355, Train Acc: 0.8663
                             Test Loss: 0.6047, Test Acc: 0.7746
