In [1]:
import kagglehub

# Fetch the newest published version of the dataset through kagglehub and
# keep the directory it returns in `path`.
path = kagglehub.dataset_download(
    "vaclavpechtor/rvl-cdip-small-200"
)

# Report where the files ended up.
print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/vaclavpechtor/rvl-cdip-small-200?dataset_version_number=1...


100%|██████████| 294M/294M [00:15<00:00, 20.2MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/vaclavpechtor/rvl-cdip-small-200/versions/1


In [2]:
import os
import torch
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from torch.optim import lr_scheduler

In [8]:
import copy
import time

In [3]:
# Preprocessing applied to every image before it reaches the network:
# resize to 224x224, replicate the grey channel three times, convert to a
# tensor, then normalise each channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.Grayscale(num_output_channels=3),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

In [4]:
# Build the split directories from the location kagglehub reported in `path`
# instead of repeating the cache path as a literal, so the notebook keeps
# working when the cache directory or the dataset version changes.
dataset_root = os.path.join(path, 'rvl-cdip-small-200')
train_dir = os.path.join(dataset_root, 'train')
val_dir = os.path.join(dataset_root, 'val')

# ImageFolder assigns class indices from the sub-directory names of each split.
train_data = ImageFolder(train_dir, transform=transform)
val_data = ImageFolder(val_dir, transform=transform)

# Only the training split is shuffled; validation order is left as-is.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

In [5]:
import torch.nn as nn
from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights

# Pick the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Start from the default pretrained EfficientNet-B3 checkpoint.
weights = EfficientNet_B3_Weights.DEFAULT
model = efficientnet_b3(weights=weights)

# Replace the final linear layer with a fresh 16-way head, keeping the
# original layer's input width.
model.classifier[1] = nn.Linear(
    in_features=model.classifier[1].in_features,
    out_features=16,
)

# Place the whole network (including the new head) on the chosen device.
model = model.to(device)

Downloading: "https://download.pytorch.org/models/efficientnet_b3_rwightman-b3899882.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b3_rwightman-b3899882.pth
100%|██████████| 47.2M/47.2M [00:00<00:00, 150MB/s]


In [6]:
# Cross-entropy with label smoothing of 0.1 as the training objective.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Adam over every parameter that still requires gradients.
optimizer = optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=1e-4,
    weight_decay=1e-4,
)

# Halve the learning rate once the monitored metric (to be maximised) has
# failed to improve for more than 3 consecutive scheduler steps.
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch
# releases - confirm against the installed version before upgrading.
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3,
    verbose=True,
)



In [9]:
# Fine-tune `model` for up to `num_epochs` epochs. After every epoch the
# validation accuracy drives both the LR scheduler and early stopping, and the
# best-scoring weights are restored once training ends.
num_epochs = 20
patience = 5
best_val_acc = 0.0
epochs_no_improve = 0
# Seed the checkpoint with the starting weights so the restore step after the
# loop always has something to load, even if validation accuracy never rises
# above the initial 0.0.
best_model_wts = copy.deepcopy(model.state_dict())

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    # Training phase
    model.train()
    running_loss = 0.0
    running_corrects = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch average is
        # per-sample even when the last batch is smaller.
        running_loss += loss.item() * inputs.size(0)
        preds = outputs.argmax(dim=1)
        # Keep the counters as plain Python ints rather than device tensors.
        running_corrects += (preds == labels).sum().item()
        total += labels.size(0)

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    epoch_loss = running_loss / max(total, 1)
    epoch_acc = running_corrects / max(total, 1)
    print(f"Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

    # Validation phase
    model.eval()
    val_running_corrects = 0
    val_total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = outputs.argmax(dim=1)
            val_running_corrects += (preds == labels).sum().item()
            val_total += labels.size(0)

    val_acc = val_running_corrects / max(val_total, 1)
    print(f"Val Accuracy: {val_acc:.4f}")

    # Scheduler step (the scheduler monitors validation accuracy)
    scheduler.step(val_acc)

    # Check for improvement
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_wts = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
        print("Validation accuracy improved, saving model.")
    else:
        epochs_no_improve += 1
        print(f"No improvement in val acc for {epochs_no_improve} epoch(s).")

    # `>=` rather than `==` so the stop still triggers if the counter ever
    # starts at or beyond the threshold.
    if epochs_no_improve >= patience:
        print(f"Early stopping after {epoch+1} epochs.")
        break

# Load best model weights
model.load_state_dict(best_model_wts)
print(f"Best validation accuracy: {best_val_acc:.4f}")


Epoch 1/20
Train Loss: 2.0359 Acc: 0.5043
Val Accuracy: 0.5469
Validation accuracy improved, saving model.

Epoch 2/20
Train Loss: 1.6297 Acc: 0.6070
Val Accuracy: 0.5953
Validation accuracy improved, saving model.

Epoch 3/20
Train Loss: 1.3897 Acc: 0.6969
Val Accuracy: 0.6391
Validation accuracy improved, saving model.

Epoch 4/20
Train Loss: 1.2065 Acc: 0.7727
Val Accuracy: 0.6516
Validation accuracy improved, saving model.

Epoch 5/20
Train Loss: 1.0349 Acc: 0.8395
Val Accuracy: 0.6609
Validation accuracy improved, saving model.

Epoch 6/20
Train Loss: 0.9107 Acc: 0.8953
Val Accuracy: 0.6656
Validation accuracy improved, saving model.

Epoch 7/20
Train Loss: 0.8169 Acc: 0.9297
Val Accuracy: 0.6531
No improvement in val acc for 1 epoch(s).

Epoch 8/20
Train Loss: 0.7814 Acc: 0.9441
Val Accuracy: 0.6719
Validation accuracy improved, saving model.

Epoch 9/20
Train Loss: 0.7292 Acc: 0.9652
Val Accuracy: 0.6672
No improvement in val acc for 1 epoch(s).

Epoch 10/20
Train Loss: 0.7065 

In [10]:
# Persist only the parameters/buffers (state dict), not the full module
# object, to a file in the current working directory.
torch.save(
    obj=model.state_dict(),
    f='efficientnet_b3_rvl_cdip_small_200.pth',
)

In [19]:
import torch
from torchvision import models, transforms
from PIL import Image
from torchvision.models import EfficientNet_B3_Weights # Import weights
import torch.nn.functional as F

# Single-image inference: rebuild the trained architecture, load the saved
# checkpoint, and print the three most probable class indices for one image.

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Recreate the EfficientNet-B3 architecture used for training. weights=None
# skips downloading the pretrained checkpoint: load_state_dict below (strict by
# default) replaces every parameter and buffer anyway.
model = models.efficientnet_b3(weights=None)

# Modify the classifier for 16 classes (must match training)
num_classes = 16
model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, num_classes)

# Load the fine-tuned weights from the same relative path the training cell
# saved them to. weights_only=True restricts unpickling to tensor data, which
# is all a state dict contains.
state_dict = torch.load(
    'efficientnet_b3_rvl_cdip_small_200.pth',
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# Define the same transform as training
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),  # convert grayscale to 3-channel
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# Load and transform the image
img_path = '/content/lengthy-itemised-hospital-bill-listing-every-item-used-and-cost-for-E0HCXN.jpg'
# Normalise the file to RGB first so palette/CMYK/alpha images are handled the
# same way before the grayscale step.
img = Image.open(img_path).convert('RGB')
img_tensor = transform(img).unsqueeze(0).to(device)  # add batch dimension and move to device

# Prediction
with torch.no_grad():
    outputs = model(img_tensor)
    probabilities = F.softmax(outputs, dim=1)
    top_p, top_class = probabilities.topk(3, dim=1)
# Drop the batch dimension and convert to plain Python lists.
top_p = top_p.squeeze().tolist()
top_class = top_class.squeeze().tolist()

print("Top 3 Predicted Classes and Confidence Scores:")
for class_index, confidence in zip(top_class, top_p):
    print(f"Class Index: {class_index}, Confidence: {confidence:.4f}")

Top 3 Predicted Classes and Confidence Scores:
Class Index: 6, Confidence: 0.2941
Class Index: 1, Confidence: 0.1018
Class Index: 0, Confidence: 0.0736


In [13]:
from torchvision import datasets

# Open the training split without any transform; it is only used here to read
# the class-name -> index mapping that ImageFolder builds.
train_dataset = datasets.ImageFolder(
    "/root/.cache/kagglehub/datasets/vaclavpechtor/rvl-cdip-small-200/versions/1/rvl-cdip-small-200/train"
)
print(train_dataset.class_to_idx)

{'advertisement': 0, 'budget': 1, 'email': 2, 'file_folder': 3, 'form': 4, 'handwritten': 5, 'invoice': 6, 'letter': 7, 'memo': 8, 'news_article': 9, 'presentation': 10, 'questionnaire': 11, 'resume': 12, 'scientific_publication': 13, 'scientific_report': 14, 'specification': 15}


In [20]:
# Invert the name -> index mapping so predicted indices can be turned back
# into class names.
idx_to_class = dict(
    (index, name) for name, index in train_dataset.class_to_idx.items()
)

# Resolve the three highest-ranked predictions, best first.
predicted_label, predicted_label_1, predicted_label_2 = (
    idx_to_class[top_class[rank]] for rank in range(3)
)

# One label per line, in rank order.
print(predicted_label, predicted_label_1, predicted_label_2, sep="\n")

invoice
budget
advertisement
