# Question4


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import wandb
from torch.utils.data import DataLoader, Subset
import numpy as np

# Very Simple CNN Model
class TinyCNN(nn.Module):
    def __init__(self, num_classes=10):
        super(TinyCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

def load_data(dataset_name, batch_size=64, fraction=0.1):
    """Load CIFAR dataset with only a fraction of training data"""
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    if dataset_name == 'CIFAR10':
        trainset = torchvision.datasets.CIFAR10(
            root='./data', train=True, download=True, transform=transform)
        testset = torchvision.datasets.CIFAR10(
            root='./data', train=False, download=True, transform=transform)
    else:  # CIFAR100
        trainset = torchvision.datasets.CIFAR100(
            root='./data', train=True, download=True, transform=transform)
        testset = torchvision.datasets.CIFAR100(
            root='./data', train=False, download=True, transform=transform)

    # Use only a fraction of training data
    num_train = len(trainset)
    indices = np.random.choice(num_train, int(num_train * fraction), replace=False)
    trainset = Subset(trainset, indices)

    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

    return trainloader, testloader

def evaluate(model, testloader, device):
    """Evaluate model accuracy"""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    return accuracy

def train_epoch(model, trainloader, criterion, optimizer, device):
    """Train for one epoch"""
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in trainloader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(trainloader)
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

def sequential_training(first_dataset, second_dataset, experiment_name, epochs=20):
    """Train on two datasets sequentially with reduced epochs"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Initialize W&B
    wandb.init(
        project="cifar-sequential-fast",
        name=experiment_name,
        config={
            "first_dataset": first_dataset,
            "second_dataset": second_dataset,
            "epochs_per_task": epochs,
            "learning_rate": 0.01,
            "batch_size": 64,
            "architecture": "TinyCNN",
            "data_fraction": 0.1
        }
    )

    # Load datasets (10% of training data)
    num_classes_first = 10 if first_dataset == 'CIFAR10' else 100
    num_classes_second = 10 if second_dataset == 'CIFAR10' else 100

    train_loader_1, test_loader_1 = load_data(first_dataset, fraction=0.1)
    train_loader_2, test_loader_2 = load_data(second_dataset, fraction=0.1)

    # Initialize model for first task
    model = TinyCNN(num_classes=num_classes_first).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    print(f"\nPhase 1: Training on {first_dataset}...")

    # Train on first dataset
    for epoch in range(epochs):
        train_loss, train_acc = train_epoch(model, train_loader_1, criterion, optimizer, device)
        test_acc_1 = evaluate(model, test_loader_1, device)

        wandb.log({
            "phase": 1,
            "epoch": epoch + 1,
            f"{first_dataset}_train_loss": train_loss,
            f"{first_dataset}_train_acc": train_acc,
            f"{first_dataset}_test_acc": test_acc_1,
        })

        if (epoch + 1) % 5 == 0:
            print(f"  Epoch {epoch+1}/{epochs}: Test Acc = {test_acc_1:.2f}%")

    # Save accuracy on first task after training
    acc_task1_after_task1 = evaluate(model, test_loader_1, device)
    print(f"  ✓ Final accuracy: {acc_task1_after_task1:.2f}%")

    # Save first task model state for later evaluation
    first_task_state = {
        'features': model.features.state_dict(),
        'classifier': model.classifier.state_dict()
    }

    print(f"\n{'='*60}")
    print(f"Phase 2: Training on {second_dataset} (10% data)")
    print(f"{'='*60}")

    # Modify classifier for second task
    model.classifier = nn.Sequential(
        nn.Flatten(),
        nn.Linear(64 * 8 * 8, 128),
        nn.ReLU(),
        nn.Linear(128, num_classes_second)
    ).to(device)

    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    # Train on second dataset and track forgetting
    for epoch in range(epochs):
        train_loss, train_acc = train_epoch(model, train_loader_2, criterion, optimizer, device)
        test_acc_2 = evaluate(model, test_loader_2, device)

        # Check forgetting every 5 epochs
        if (epoch + 1) % 5 == 0:
            # Temporarily restore first task classifier
            temp_classifier = model.classifier
            model.classifier = nn.Sequential(
                nn.Flatten(),
                nn.Linear(64 * 8 * 8, 128),
                nn.ReLU(),
                nn.Linear(128, num_classes_first)
            ).to(device)
            model.classifier.load_state_dict(first_task_state['classifier'])

            acc_task1_current = evaluate(model, test_loader_1, device)

            # Restore second task classifier
            model.classifier = temp_classifier

            wandb.log({
                "phase": 2,
                "epoch": epochs + epoch + 1,
                f"{second_dataset}_train_loss": train_loss,
                f"{second_dataset}_train_acc": train_acc,
                f"{second_dataset}_test_acc": test_acc_2,
                f"{first_dataset}_test_acc_during_task2": acc_task1_current,
                "forgetting": acc_task1_after_task1 - acc_task1_current
            })

            print(f"Epoch [{epoch+1}/{epochs}] - Loss: {train_loss:.4f}, "
                  f"Task2: {test_acc_2:.2f}%, Task1 Retention: {acc_task1_current:.2f}%")
        else:
            wandb.log({
                "phase": 2,
                "epoch": epochs + epoch + 1,
                f"{second_dataset}_train_loss": train_loss,
                f"{second_dataset}_train_acc": train_acc,
                f"{second_dataset}_test_acc": test_acc_2,
            })

    # Final evaluation on both tasks
    acc_task2_after_task2 = evaluate(model, test_loader_2, device)

    # Restore first task classifier for final measurement
    model.classifier = nn.Sequential(
        nn.Flatten(),
        nn.Linear(64 * 8 * 8, 128),
        nn.ReLU(),
        nn.Linear(128, num_classes_first)
    ).to(device)
    model.classifier.load_state_dict(first_task_state['classifier'])
    acc_task1_after_task2 = evaluate(model, test_loader_1, device)

    forgetting = acc_task1_after_task1 - acc_task1_after_task2

    print(f"\n{'='*60}")
    print(f"FINAL RESULTS - {experiment_name}")
    print(f"{'='*60}")
    print(f"{first_dataset} accuracy AFTER Phase 1: {acc_task1_after_task1:.2f}%")
    print(f"{first_dataset} accuracy AFTER Phase 2: {acc_task1_after_task2:.2f}%")
    print(f"→ Forgetting: {forgetting:.2f}% ({'CATASTROPHIC' if forgetting > 30 else 'MODERATE'})")
    print(f"\n{second_dataset} accuracy AFTER Phase 2: {acc_task2_after_task2:.2f}%")

    wandb.log({
        f"final_{first_dataset}_after_task1": acc_task1_after_task1,
        f"final_{first_dataset}_after_task2": acc_task1_after_task2,
        f"final_{second_dataset}_acc": acc_task2_after_task2,
        "final_forgetting": forgetting
    })

    wandb.finish()

    return {
        'task1_after_task1': acc_task1_after_task1,
        'task1_after_task2': acc_task1_after_task2,
        'task2_acc': acc_task2_after_task2,
        'forgetting': forgetting
    }

# Main execution
if __name__ == "__main__":
    # Set random seed for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    print("EXPERIMENT 1: CIFAR-100 → CIFAR-10")
    print("="*60)
    results_exp1 = sequential_training('CIFAR100', 'CIFAR10',
                                       'exp1_C100→C10',
                                       epochs=100)

    print("\n\n")
    print("EXPERIMENT 2: CIFAR-10 → CIFAR-100")
    print("="*60)
    results_exp2 = sequential_training('CIFAR10', 'CIFAR100',
                                       'exp2_C10→C100',
                                       epochs=100)

    print("\n\n")
    print("="*60)
    print("COMPARATIVE ANALYSIS")
    print("="*60)
    print(f"\nExperiment 1 (CIFAR-100 → CIFAR-10):")
    print(f"  Task 1 (C100) after training: {results_exp1['task1_after_task1']:.2f}%")
    print(f"  Task 1 (C100) after Task 2:  {results_exp1['task1_after_task2']:.2f}%")
    print(f"  Task 2 (C10) final:           {results_exp1['task2_acc']:.2f}%")

    print(f"\nExperiment 2 (CIFAR-10 → CIFAR-100):")
    print(f"  Task 1 (C10) after training:  {results_exp2['task1_after_task1']:.2f}%")
    print(f"  Task 1 (C10) after Task 2:    {results_exp2['task1_after_task2']:.2f}%")
    print(f"  Task 2 (C100) final:          {results_exp2['task2_acc']:.2f}%")


EXPERIMENT 1: CIFAR-100 → CIFAR-10
Using device: cuda


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m112201026[0m ([33m112201026-indian-institute-of-technology-palakkad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 169M/169M [00:25<00:00, 6.64MB/s]
100%|██████████| 170M/170M [00:14<00:00, 12.0MB/s]



Phase 1: Training on CIFAR100...
  Epoch 5/100: Test Acc = 10.92%
  Epoch 10/100: Test Acc = 17.20%
  Epoch 15/100: Test Acc = 18.24%
  Epoch 20/100: Test Acc = 17.90%
  Epoch 25/100: Test Acc = 17.73%
  Epoch 30/100: Test Acc = 18.33%
  Epoch 35/100: Test Acc = 19.08%
  Epoch 40/100: Test Acc = 19.92%
  Epoch 45/100: Test Acc = 19.93%
  Epoch 50/100: Test Acc = 19.87%
  Epoch 55/100: Test Acc = 19.89%
  Epoch 60/100: Test Acc = 19.86%
  Epoch 65/100: Test Acc = 19.82%
  Epoch 70/100: Test Acc = 19.84%
  Epoch 75/100: Test Acc = 19.84%
  Epoch 80/100: Test Acc = 19.88%
  Epoch 85/100: Test Acc = 19.91%
  Epoch 90/100: Test Acc = 19.88%
  Epoch 95/100: Test Acc = 19.86%
  Epoch 100/100: Test Acc = 19.83%
  ✓ Final accuracy: 19.83%

Phase 2: Training on CIFAR10 (10% data)
Epoch [5/100] - Loss: 0.5023, Task2: 53.04%, Task1 Retention: 17.90%
Epoch [10/100] - Loss: 0.0347, Task2: 52.57%, Task1 Retention: 18.48%
Epoch [15/100] - Loss: 0.0022, Task2: 53.30%, Task1 Retention: 18.81%
Epoch [20

0,1
CIFAR100_test_acc,▁▂▄▄▆▇▇▆▇▆▆▆▇▇▇█████████████████████████
CIFAR100_test_acc_during_task2,▁▅▇██████▇██████████
CIFAR100_train_acc,▁▂▂▂▃▄▅▆▆▆██████████████████████████████
CIFAR100_train_loss,██▇▇▆▆▅▅▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
CIFAR10_test_acc,▁▇▆▅▅███████████████████████████████████
CIFAR10_train_acc,▁███████████████████████████████████████
CIFAR10_train_loss,█▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇███
final_CIFAR100_after_task1,▁
final_CIFAR100_after_task2,▁

0,1
CIFAR100_test_acc,19.83
CIFAR100_test_acc_during_task2,18.94
CIFAR100_train_acc,100
CIFAR100_train_loss,0.00019
CIFAR10_test_acc,53.45
CIFAR10_train_acc,100
CIFAR10_train_loss,0.00015
epoch,200
final_CIFAR100_after_task1,19.83
final_CIFAR100_after_task2,18.94





EXPERIMENT 2: CIFAR-10 → CIFAR-100
Using device: cuda



Phase 1: Training on CIFAR10...
  Epoch 5/100: Test Acc = 45.83%
  Epoch 10/100: Test Acc = 53.07%
  Epoch 15/100: Test Acc = 55.28%
  Epoch 20/100: Test Acc = 56.01%
  Epoch 25/100: Test Acc = 56.93%
  Epoch 30/100: Test Acc = 56.95%
  Epoch 35/100: Test Acc = 56.98%
  Epoch 40/100: Test Acc = 56.90%
  Epoch 45/100: Test Acc = 56.94%
  Epoch 50/100: Test Acc = 56.83%
  Epoch 55/100: Test Acc = 56.90%
  Epoch 60/100: Test Acc = 56.87%
  Epoch 65/100: Test Acc = 56.82%
  Epoch 70/100: Test Acc = 56.83%
  Epoch 75/100: Test Acc = 56.86%
  Epoch 80/100: Test Acc = 56.88%
  Epoch 85/100: Test Acc = 56.74%
  Epoch 90/100: Test Acc = 56.70%
  Epoch 95/100: Test Acc = 56.76%
  Epoch 100/100: Test Acc = 56.72%
  ✓ Final accuracy: 56.72%

Phase 2: Training on CIFAR100 (10% data)
Epoch [5/100] - Loss: 1.9876, Task2: 18.54%, Task1 Retention: 53.40%
Epoch [10/100] - Loss: 0.3358, Task2: 19.41%, Task1 Retention: 53.55%
Epoch [15/100] - Loss: 0.0327, Task2: 20.12%, Task1 Retention: 52.58%
Epoch [20

0,1
CIFAR100_test_acc,▁▆▇▇▇▇██████████████████████████████████
CIFAR100_train_acc,▁▂▄▇▇███████████████████████████████████
CIFAR100_train_loss,█▅▄▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
CIFAR10_test_acc,▁▄▅▅▆▇▇▇▇▇██████████████████████████████
CIFAR10_test_acc_during_task2,▇█▁▆▆▆▆▆▆▆▆▆▆▆▆▇▆▆▆▆
CIFAR10_train_acc,▁▃▄▅▆▇██████████████████████████████████
CIFAR10_train_loss,█▇▆▆▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
final_CIFAR100_acc,▁
final_CIFAR10_after_task1,▁

0,1
CIFAR100_test_acc,20.93
CIFAR100_train_acc,100
CIFAR100_train_loss,0.00027
CIFAR10_test_acc,56.72
CIFAR10_test_acc_during_task2,53.31
CIFAR10_train_acc,100
CIFAR10_train_loss,0.00022
epoch,200
final_CIFAR100_acc,20.93
final_CIFAR10_after_task1,56.72





COMPARATIVE ANALYSIS

Experiment 1 (CIFAR-100 → CIFAR-10):
  Task 1 (C100) after training: 19.83%
  Task 1 (C100) after Task 2:  18.94%
  Task 2 (C10) final:           53.45%
  → Forgetting:                 0.89%

Experiment 2 (CIFAR-10 → CIFAR-100):
  Task 1 (C10) after training:  56.72%
  Task 1 (C10) after Task 2:    53.31%
  Task 2 (C100) final:          20.93%
  → Forgetting:                 3.41%


# Question 2

In [None]:
import wandb
import pandas as pd
from datasets import load_dataset
from snorkel.labeling import labeling_function, PandasLFApplier

ABSTAIN, DATE_MISC, ORG = -1, 0, 1

@labeling_function()
def lf_year_detection(x):
    try:
        y = int(x.token)
        if 1900 <= y <= 2099:
            return DATE_MISC
    except ValueError:
        return ABSTAIN
    return ABSTAIN

@labeling_function()
def lf_org_suffix(x):
    suffixes = ["Inc.", "Corp.", "Ltd.", "LLC", "Group", "Co."]
    return ORG if any(x.token.endswith(s) for s in suffixes) else ABSTAIN

# Initialize W&B
wandb.init(project="Q2-weak-supervision-ner-conll2003", name="lf_analysis")

# Load dataset
ds = load_dataset("eriktks/conll2003", revision="convert/parquet")["train"]

# Flatten token-level data
rows = [
    {
        "token": token,
        "true_label": DATE_MISC if tag in [7, 8] else ORG if tag in [3, 4] else ABSTAIN
    }
    for ex in ds
    for token, tag in zip(ex["tokens"], ex["ner_tags"])
]
df = pd.DataFrame(rows)
print(f"Dataset contains {len(df)} tokens.")

# Apply labeling functions
lfs = [lf_year_detection, lf_org_suffix]
applier = PandasLFApplier(lfs)
L = applier.apply(df)

# Compute simple coverage and accuracy
for lf, col in zip(lfs, L.T):
    coverage = (col != ABSTAIN).mean()
    accuracy = (col == df["true_label"]).mean()
    print(f"{lf.name}: Coverage = {coverage:.4f}, Accuracy = {accuracy:.4f}")
    wandb.log({
        f"{lf.name}_coverage": coverage,
        f"{lf.name}_accuracy": accuracy
    })
    wandb.run.summary[f"{lf.name}_coverage"] = coverage
    wandb.run.summary[f"{lf.name}_accuracy"] = accuracy

wandb.finish()


Dataset contains 203621 tokens.


100%|██████████| 203621/203621 [00:08<00:00, 25377.82it/s]


lf_year_detection: Coverage = 0.0027, Accuracy = 0.9256
lf_org_suffix: Coverage = 0.0002, Accuracy = 0.9284


0,1
lf_org_suffix_accuracy,▁
lf_org_suffix_coverage,▁
lf_year_detection_accuracy,▁
lf_year_detection_coverage,▁

0,1
lf_org_suffix_accuracy,0.92835
lf_org_suffix_coverage,0.00023
lf_year_detection_accuracy,0.92558
lf_year_detection_coverage,0.00267


# Question 3

In [None]:
import wandb
import pandas as pd
import re
from datasets import load_dataset
from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.model import MajorityLabelVoter

ABSTAIN = -1
ORGANIZATION = 0
MISC = 1

@labeling_function()
def lf_year_misc(x):
    return MISC if re.fullmatch(r"(19|20)\d{2}", x.token) else ABSTAIN

@labeling_function()
def lf_org_suffixes(x):
    suffixes = {"Inc.", "Corp.", "Ltd."}
    return ORGANIZATION if any(x.token.endswith(s) for s in suffixes) else ABSTAIN

# Initialize W&B
run = wandb.init(project="Q3", job_type="majority_voter_eval")

# Load dataset (mirrored, parquet version)
dataset = load_dataset("eriktks/conll2003", revision="convert/parquet", split="train")

# Flatten token-level data
rows = [
    {
        "token": token,
        "true_label": ORGANIZATION if tag in [3, 4] else MISC if tag in [7, 8] else ABSTAIN
    }
    for ex in dataset
    for token, tag in zip(ex["tokens"], ex["ner_tags"])
]
df = pd.DataFrame(rows)

# Apply labeling functions
lfs = [lf_year_misc, lf_org_suffixes]
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df)

# Majority Label Voter
majority_model = MajorityLabelVoter()
has_label = (L_train != ABSTAIN).any(axis=1)

L_train_labeled = L_train[has_label]
y_true_labeled = df["true_label"].values[has_label]

score = majority_model.score(L=L_train_labeled, Y=y_true_labeled, tie_break_policy="random")
accuracy = score["accuracy"]

print(f"Majority Label Voter Accuracy: {accuracy:.4f}")

wandb.summary["majority_voter_accuracy"] = accuracy
wandb.log({"majority_voter_accuracy": accuracy})

run.finish()


100%|██████████| 203621/203621 [00:04<00:00, 41532.09it/s]


Majority Label Voter Accuracy: 0.9615


0,1
majority_voter_accuracy,▁

0,1
majority_voter_accuracy,0.96154


# Question 1

In [1]:
import wandb
from collections import Counter
from datasets import load_dataset
from tabulate import tabulate

# 1. Initialize W&B Run
run = wandb.init(project="Q1-weak-supervision-ner", job_type="data_analysis")

# 2. Load Dataset
dataset = load_dataset("conll2003")

# 3. Get Entity Names
tag_names = dataset['train'].features['ner_tags'].feature.names

# Containers for stats
split_stats = {}
total_counts = {
    "B": Counter(),
    "I": Counter(),
    "combined": Counter(),
}

# 4. Iterate over splits
for split_name, split_data in dataset.items():
    num_samples = len(split_data)

    # Counters for each tag type
    b_counts = Counter()
    i_counts = Counter()
    combined_counts = Counter()

    for sample in split_data:
        for tag_id in sample["ner_tags"]:
            tag = tag_names[tag_id]
            if tag.startswith("B-"):
                entity = tag[2:]
                b_counts[entity] += 1
                combined_counts[entity] += 1
            elif tag.startswith("I-"):
                entity = tag[2:]
                i_counts[entity] += 1
                combined_counts[entity] += 1

    # Store results
    split_stats[split_name] = {
        "B": dict(b_counts),
        "I": dict(i_counts),
        "combined": dict(combined_counts),
    }

    # Update totals
    total_counts["B"].update(b_counts)
    total_counts["I"].update(i_counts)
    total_counts["combined"].update(combined_counts)

    # Log stats to W&B
    wandb.summary[f"{split_name}_samples"] = num_samples
    for tag_type, counter in [("B", b_counts), ("I", i_counts), ("combined", combined_counts)]:
        for entity, count in counter.items():
            wandb.summary[f"{split_name}_{tag_type}_entity_count_{entity}"] = count

# 5. Log total stats
total_samples = sum(len(split) for split in dataset.values())
wandb.summary["total_samples"] = total_samples

for tag_type, counter in total_counts.items():
    for entity, count in counter.items():
        wandb.summary[f"total_{tag_type}_entity_count_{entity}"] = count

run.finish()

# 6. Print Results (Formatted)
print("\n" + "=" * 50)
print("DATASET STATISTICS")
print("=" * 50)

for split_name, stats in split_stats.items():
    print(f"\nSplit: {split_name.upper()}")
    print(f"Number of samples: {len(dataset[split_name])}\n")

    headers = ["Tag Type", "ORG", "MISC", "PER", "LOC"]
    table = [
        ["B-Tags",
         stats["B"].get("ORG", 0),
         stats["B"].get("MISC", 0),
         stats["B"].get("PER", 0),
         stats["B"].get("LOC", 0)],
        ["I-Tags",
         stats["I"].get("ORG", 0),
         stats["I"].get("MISC", 0),
         stats["I"].get("PER", 0),
         stats["I"].get("LOC", 0)],
        ["Combined (B+I)",
         stats["combined"].get("ORG", 0),
         stats["combined"].get("MISC", 0),
         stats["combined"].get("PER", 0),
         stats["combined"].get("LOC", 0)],
    ]
    print(tabulate(table, headers=headers, tablefmt="grid"))
    print(f"\nTotal Samples: {len(dataset[split_name])}")

# Print totals across all splits
print("\n" + "=" * 50)
print("TOTAL ENTITY COUNTS (Across All Splits)")
print("=" * 50)

headers = ["Tag Type", "ORG", "MISC", "PER", "LOC"]
total_table = [
    ["B-Tags",
     total_counts["B"].get("ORG", 0),
     total_counts["B"].get("MISC", 0),
     total_counts["B"].get("PER", 0),
     total_counts["B"].get("LOC", 0)],
    ["I-Tags",
     total_counts["I"].get("ORG", 0),
     total_counts["I"].get("MISC", 0),
     total_counts["I"].get("PER", 0),
     total_counts["I"].get("LOC", 0)],
    ["Combined (B+I)",
     total_counts["combined"].get("ORG", 0),
     total_counts["combined"].get("MISC", 0),
     total_counts["combined"].get("PER", 0),
     total_counts["combined"].get("LOC", 0)],
]
print(tabulate(total_table, headers=headers, tablefmt="grid"))
print(f"\nTotal Samples Across All Splits: {total_samples}")


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m112201026[0m ([33m112201026-indian-institute-of-technology-palakkad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

conll2003.py: 0.00B [00:00, ?B/s]

RuntimeError: Dataset scripts are no longer supported, but found conll2003.py