In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = '/home/youlee/n24news/n24news/captions_and_labels.csv'

categories = [
    "Opinion", "Art & Design", "Television", "Music", "Travel",
    "Real Estate", "Books", "Theater", "Health", "Sports",
    "Science", "Food", "Fashion & Style", "Movies", "Technology",
    "Dance", "Media", "Style"
]

data = pd.read_csv(file_path)
filtered_data = data[data['Label'].isin(categories)]


label_groups = [categories[i:i + 3] for i in range(0, len(categories), 3)]
group_files = []
for i, group in enumerate(label_groups, start=1):
    group_data = filtered_data[filtered_data['Label'].isin(group)]
    output_file_path = f'/home/youlee/n24news/n24news/group_{i}.csv'
    group_data.to_csv(output_file_path, index=False)
    group_files.append(output_file_path)

print("Grouped data files saved:")
for file in group_files:
    print(file)

Grouped data files saved:
/home/youlee/n24news/n24news/group_1.csv
/home/youlee/n24news/n24news/group_2.csv
/home/youlee/n24news/n24news/group_3.csv
/home/youlee/n24news/n24news/group_4.csv
/home/youlee/n24news/n24news/group_5.csv
/home/youlee/n24news/n24news/group_6.csv


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LENGTH = 128

def tokenize_data(df):
    input_ids, attention_masks = [], []
    
    df['Caption'] = df['Caption'].astype(str).fillna("")

    for text in df['Caption']:
        encoded = tokenizer(
            text, padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors="pt"
        )
        input_ids.append(encoded['input_ids'].squeeze(0))
        attention_masks.append(encoded['attention_mask'].squeeze(0))
    return torch.stack(input_ids), torch.stack(attention_masks)


In [4]:
class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

In [5]:
class CrossAttention(nn.Module):
    def __init__(self, d_in, d_out_kq, d_out_v):
        super(CrossAttention, self).__init__()
        self.key_proj = nn.Linear(d_in, d_out_kq)
        self.query_proj = nn.Linear(d_in, d_out_kq)
        self.value_proj = nn.Linear(d_in, d_out_v)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, latent):
        keys = self.key_proj(x)
        queries = self.query_proj(latent)
        values = self.value_proj(x)

        attention_scores = torch.matmul(queries, keys.transpose(-2, -1))
        attention_probs = self.softmax(attention_scores)

        attended_values = torch.matmul(attention_probs, values)
        return attended_values

class LatentTransformer(nn.Module):
    def __init__(self, latent_dim, num_heads, num_layers, embed_dim):
        super(LatentTransformer, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, latent):
        latent = latent.permute(1, 0, 2)
        latent = self.transformer(latent)
        return latent.permute(1, 0, 2)

class Averaging(nn.Module):
    def forward(self, latent):
        return latent.mean(dim=1)

class Perceiver(nn.Module):
    def __init__(self, vocab_size, embed_dim, latent_dim, num_heads, num_layers, num_classes):
        super(Perceiver, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.input_proj = nn.Linear(embed_dim, embed_dim)

        self.latents = nn.Parameter(torch.randn(1, latent_dim, embed_dim))
        self.cross_attention = CrossAttention(d_in=embed_dim, d_out_kq=embed_dim, d_out_v=embed_dim)
        self.latent_transformer = LatentTransformer(latent_dim=latent_dim, num_heads=num_heads,
                                                    num_layers=num_layers, embed_dim=embed_dim)
        self.averaging = Averaging()
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        x = self.input_proj(x)

        batch_size = x.size(0)
        latent = self.latents.repeat(batch_size, 1, 1)
        latent = self.cross_attention(x, latent)
        latent = self.latent_transformer(latent)
        latent_avg = self.averaging(latent)
        logits = self.classifier(latent_avg)
        return logits

In [6]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

In [7]:
def eval_epoch(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

In [8]:
EPOCHS = 10
BATCH_SIZE = 32
results = []
VOCAB_SIZE = tokenizer.vocab_size
EMBED_DIM = 128
LATENT_DIM = 64
NUM_HEADS = 8
NUM_LAYERS = 4

In [9]:
for idx, group_file in enumerate(group_files, start=1):
    print(f"\nProcessing Group {idx}...")

    df = pd.read_csv(group_file)
    df['Label'] = LabelEncoder().fit_transform(df['Label'])

    input_ids, attention_masks = tokenize_data(df)
    labels = torch.tensor(df['Label'].values)

    dataset = CustomDataset(input_ids, attention_masks, labels)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Perceiver(vocab_size=VOCAB_SIZE, embed_dim=EMBED_DIM, latent_dim=LATENT_DIM,
                      num_heads=NUM_HEADS, num_layers=NUM_LAYERS, num_classes=len(df['Label'].unique()))
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(EPOCHS):
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        test_loss, test_acc = eval_epoch(model, test_loader, criterion, device)
        print(f'  Group {idx} Epoch {epoch+1}/{EPOCHS}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'                             Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

    y_true, y_pred = [], []
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    report = classification_report(y_true, y_pred, output_dict=True)
    results.append({
        "Group": idx,
        "Test Accuracy": test_acc,
        "Classification Report": report
    })

for result in results:
    print(f"\nGroup {result['Group']} Results:")
    print(f"Test Accuracy: {result['Test Accuracy']:.4f}")
    print(pd.DataFrame(result['Classification Report']).transpose())


Processing Group 1...




  Group 1 Epoch 1/10: Train Loss: 0.7289, Train Acc: 0.6979
                             Test Loss: 0.5983, Test Acc: 0.7778
  Group 1 Epoch 2/10: Train Loss: 0.5293, Train Acc: 0.7991
                             Test Loss: 0.5231, Test Acc: 0.7908
  Group 1 Epoch 3/10: Train Loss: 0.4550, Train Acc: 0.8298
                             Test Loss: 0.5147, Test Acc: 0.8114
  Group 1 Epoch 4/10: Train Loss: 0.3917, Train Acc: 0.8607
                             Test Loss: 0.4681, Test Acc: 0.8278
  Group 1 Epoch 5/10: Train Loss: 0.3525, Train Acc: 0.8755
                             Test Loss: 0.4898, Test Acc: 0.8217
  Group 1 Epoch 6/10: Train Loss: 0.3051, Train Acc: 0.8924
                             Test Loss: 0.5371, Test Acc: 0.8176
  Group 1 Epoch 7/10: Train Loss: 0.2752, Train Acc: 0.9015
                             Test Loss: 0.5252, Test Acc: 0.8224
  Group 1 Epoch 8/10: Train Loss: 0.2359, Train Acc: 0.9194
                             Test Loss: 0.5131, Test Acc: 0.8285




  Group 2 Epoch 1/10: Train Loss: 0.9197, Train Acc: 0.5693
                             Test Loss: 0.8336, Test Acc: 0.6211
  Group 2 Epoch 2/10: Train Loss: 0.7159, Train Acc: 0.7024
                             Test Loss: 0.9288, Test Acc: 0.6094
  Group 2 Epoch 3/10: Train Loss: 0.6236, Train Acc: 0.7471
                             Test Loss: 0.6342, Test Acc: 0.7467
  Group 2 Epoch 4/10: Train Loss: 0.5277, Train Acc: 0.7916
                             Test Loss: 0.5748, Test Acc: 0.7757
  Group 2 Epoch 5/10: Train Loss: 0.4651, Train Acc: 0.8212
                             Test Loss: 0.5920, Test Acc: 0.7688
  Group 2 Epoch 6/10: Train Loss: 0.4118, Train Acc: 0.8448
                             Test Loss: 0.5829, Test Acc: 0.7750
  Group 2 Epoch 7/10: Train Loss: 0.3671, Train Acc: 0.8641
                             Test Loss: 0.6339, Test Acc: 0.7764
  Group 2 Epoch 8/10: Train Loss: 0.3321, Train Acc: 0.8761
                             Test Loss: 0.5860, Test Acc: 0.7874




  Group 3 Epoch 1/10: Train Loss: 0.7049, Train Acc: 0.6852
                             Test Loss: 0.5894, Test Acc: 0.7545
  Group 3 Epoch 2/10: Train Loss: 0.4881, Train Acc: 0.8013
                             Test Loss: 0.5089, Test Acc: 0.8008
  Group 3 Epoch 3/10: Train Loss: 0.4200, Train Acc: 0.8352
                             Test Loss: 0.4721, Test Acc: 0.8140
  Group 3 Epoch 4/10: Train Loss: 0.3620, Train Acc: 0.8650
                             Test Loss: 0.4688, Test Acc: 0.8313
  Group 3 Epoch 5/10: Train Loss: 0.3186, Train Acc: 0.8830
                             Test Loss: 0.4483, Test Acc: 0.8292
  Group 3 Epoch 6/10: Train Loss: 0.2797, Train Acc: 0.9023
                             Test Loss: 0.4844, Test Acc: 0.8278
  Group 3 Epoch 7/10: Train Loss: 0.2517, Train Acc: 0.9108
                             Test Loss: 0.4652, Test Acc: 0.8340
  Group 3 Epoch 8/10: Train Loss: 0.2193, Train Acc: 0.9244
                             Test Loss: 0.4797, Test Acc: 0.8382




  Group 4 Epoch 1/10: Train Loss: 0.9534, Train Acc: 0.5269
                             Test Loss: 0.7733, Test Acc: 0.6623
  Group 4 Epoch 2/10: Train Loss: 0.6885, Train Acc: 0.7036
                             Test Loss: 0.6692, Test Acc: 0.7117
  Group 4 Epoch 3/10: Train Loss: 0.5625, Train Acc: 0.7684
                             Test Loss: 0.6177, Test Acc: 0.7458
  Group 4 Epoch 4/10: Train Loss: 0.4610, Train Acc: 0.8142
                             Test Loss: 0.5410, Test Acc: 0.7967
  Group 4 Epoch 5/10: Train Loss: 0.3789, Train Acc: 0.8522
                             Test Loss: 0.5716, Test Acc: 0.7869
  Group 4 Epoch 6/10: Train Loss: 0.3179, Train Acc: 0.8767
                             Test Loss: 0.5314, Test Acc: 0.8127
  Group 4 Epoch 7/10: Train Loss: 0.2495, Train Acc: 0.9070
                             Test Loss: 0.4594, Test Acc: 0.8336
  Group 4 Epoch 8/10: Train Loss: 0.2114, Train Acc: 0.9201
                             Test Loss: 0.4744, Test Acc: 0.8370




  Group 5 Epoch 1/10: Train Loss: 0.7131, Train Acc: 0.6865
                             Test Loss: 0.6116, Test Acc: 0.7516
  Group 5 Epoch 2/10: Train Loss: 0.5252, Train Acc: 0.7962
                             Test Loss: 0.5015, Test Acc: 0.7999
  Group 5 Epoch 3/10: Train Loss: 0.4339, Train Acc: 0.8361
                             Test Loss: 0.4639, Test Acc: 0.8153
  Group 5 Epoch 4/10: Train Loss: 0.3816, Train Acc: 0.8583
                             Test Loss: 0.4897, Test Acc: 0.8090
  Group 5 Epoch 5/10: Train Loss: 0.3391, Train Acc: 0.8728
                             Test Loss: 0.4590, Test Acc: 0.8293
  Group 5 Epoch 6/10: Train Loss: 0.2962, Train Acc: 0.8954
                             Test Loss: 0.4687, Test Acc: 0.8293
  Group 5 Epoch 7/10: Train Loss: 0.2499, Train Acc: 0.9108
                             Test Loss: 0.4727, Test Acc: 0.8244
  Group 5 Epoch 8/10: Train Loss: 0.2187, Train Acc: 0.9244
                             Test Loss: 0.5047, Test Acc: 0.8279




  Group 6 Epoch 1/10: Train Loss: 0.8411, Train Acc: 0.5975
                             Test Loss: 0.7238, Test Acc: 0.6800
  Group 6 Epoch 2/10: Train Loss: 0.6242, Train Acc: 0.7187
                             Test Loss: 0.6761, Test Acc: 0.7171
  Group 6 Epoch 3/10: Train Loss: 0.5418, Train Acc: 0.7647
                             Test Loss: 0.6024, Test Acc: 0.7404
  Group 6 Epoch 4/10: Train Loss: 0.5019, Train Acc: 0.7900
                             Test Loss: 0.5945, Test Acc: 0.7542
  Group 6 Epoch 5/10: Train Loss: 0.4523, Train Acc: 0.8127
                             Test Loss: 0.5802, Test Acc: 0.7615
  Group 6 Epoch 6/10: Train Loss: 0.4071, Train Acc: 0.8342
                             Test Loss: 0.5884, Test Acc: 0.7578
  Group 6 Epoch 7/10: Train Loss: 0.3706, Train Acc: 0.8553
                             Test Loss: 0.5685, Test Acc: 0.7636
  Group 6 Epoch 8/10: Train Loss: 0.3188, Train Acc: 0.8787
                             Test Loss: 0.6020, Test Acc: 0.7695
