In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("vaclavpechtor/rvl-cdip-small-200")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/vaclavpechtor/rvl-cdip-small-200?dataset_version_number=1...


100%|██████████| 294M/294M [00:10<00:00, 28.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/vaclavpechtor/rvl-cdip-small-200/versions/1


In [11]:
import os
import torch
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from torch.optim import lr_scheduler

In [3]:
import copy
import time

In [19]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),  # convert 1 channel to 3 channels
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)  # normalize for 3 channels
])

In [16]:
# train_transforms = transforms.Compose([
#     transforms.Grayscale(num_output_channels=3),
#     transforms.RandomResizedCrop(224),
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomRotation(10),
#     transforms.ToTensor(),
#     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
# ])

# val_transforms = transforms.Compose([
#     transforms.Grayscale(num_output_channels=3),
#     transforms.Resize(256),
#     transforms.CenterCrop(224),
#     transforms.ToTensor(),
#     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
# ])

In [20]:
train_dir = '/root/.cache/kagglehub/datasets/vaclavpechtor/rvl-cdip-small-200/versions/1/rvl-cdip-small-200/train'
val_dir = '/root/.cache/kagglehub/datasets/vaclavpechtor/rvl-cdip-small-200/versions/1/rvl-cdip-small-200/val'

train_data = ImageFolder(train_dir, transform=transform)
val_data = ImageFolder(val_dir, transform=transform)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

In [9]:
import torch.nn as nn
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights

# Load pretrained model
weights = EfficientNet_B0_Weights.DEFAULT
model = efficientnet_b0(weights=weights)

# Modify the classifier for 16 classes
model.classifier[1] = nn.Linear(model.classifier[1].in_features, 16)

# Move model to device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [12]:
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.0001, weight_decay=1e-4)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3, verbose=True)

In [18]:
num_epochs = 20
patience = 5
best_val_acc = 0.0
epochs_no_improve = 0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    model.train()
    running_loss = 0.0
    running_corrects = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        _, preds = torch.max(outputs, 1)
        running_corrects += torch.sum(preds == labels.data)
        total += labels.size(0)

    epoch_loss = running_loss / total
    epoch_acc = running_corrects.double() / total
    print(f"Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

    # Validation phase
    model.eval()
    val_running_corrects = 0
    val_total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            val_running_corrects += torch.sum(preds == labels.data)
            val_total += labels.size(0)

    val_acc = val_running_corrects.double() / val_total
    print(f"Val Accuracy: {val_acc:.4f}")

    # Scheduler step
    scheduler.step(val_acc)

    # Check for improvement
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_wts = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
        print("Validation accuracy improved, saving model.")
    else:
        epochs_no_improve += 1
        print(f"No improvement in val acc for {epochs_no_improve} epoch(s).")

    if epochs_no_improve == patience:
        print(f"Early stopping after {epoch+1} epochs.")
        break

# Load best model weights
model.load_state_dict(best_model_wts)
print(f"Best validation accuracy: {best_val_acc:.4f}")


Epoch 1/20
Train Loss: 1.8261 Acc: 0.5215
Val Accuracy: 0.6031
Validation accuracy improved, saving model.

Epoch 2/20
Train Loss: 1.6333 Acc: 0.5813
Val Accuracy: 0.6531
Validation accuracy improved, saving model.

Epoch 3/20
Train Loss: 1.5899 Acc: 0.6043
Val Accuracy: 0.6531
No improvement in val acc for 1 epoch(s).

Epoch 4/20
Train Loss: 1.5068 Acc: 0.6230
Val Accuracy: 0.6594
Validation accuracy improved, saving model.

Epoch 5/20
Train Loss: 1.5079 Acc: 0.6379
Val Accuracy: 0.6531
No improvement in val acc for 1 epoch(s).

Epoch 6/20
Train Loss: 1.4438 Acc: 0.6508
Val Accuracy: 0.6594
No improvement in val acc for 2 epoch(s).

Epoch 7/20
Train Loss: 1.4354 Acc: 0.6602
Val Accuracy: 0.6625
Validation accuracy improved, saving model.

Epoch 8/20
Train Loss: 1.4422 Acc: 0.6586
Val Accuracy: 0.6641
Validation accuracy improved, saving model.

Epoch 9/20
Train Loss: 1.4276 Acc: 0.6586
Val Accuracy: 0.6734
Validation accuracy improved, saving model.

Epoch 10/20
Train Loss: 1.3903 Ac

In [14]:
torch.save(model.state_dict(), 'efficientnet_b0_rvl_cdip_small_200.pth')

In [15]:
from sklearn.metrics import classification_report
import torch
import numpy as np

# Set model to evaluation mode
model.eval()

# To store true and predicted labels
all_preds = []
all_labels = []

# Disable gradient calculation
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert to numpy arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Generate classification report
target_names = [str(i) for i in range(16)]  # assuming 16 document classes in RVL-CDIP
report = classification_report(all_labels, all_preds, target_names=target_names)

print("📄 Classification Report:\n")
print(report)

📄 Classification Report:

              precision    recall  f1-score   support

           0       0.77      0.75      0.76        40
           1       0.69      0.55      0.61        40
           2       0.95      0.88      0.91        40
           3       0.84      0.90      0.87        40
           4       0.48      0.62      0.54        40
           5       0.76      0.93      0.83        40
           6       0.51      0.50      0.51        40
           7       0.67      0.70      0.68        40
           8       0.73      0.80      0.76        40
           9       0.97      0.72      0.83        40
          10       0.63      0.65      0.64        40
          11       0.57      0.53      0.55        40
          12       0.74      0.93      0.82        40
          13       0.80      0.70      0.75        40
          14       0.34      0.30      0.32        40
          15       0.80      0.70      0.75        40

    accuracy                           0.70       640


In [52]:
import torch
from torchvision import models, transforms
from PIL import Image

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained model architecture exactly like during training
model = models.efficientnet_b0(pretrained=False)  # pretrained=False because you load weights manually

# Replace the final layer if you did during training (adjust number of classes accordingly)
num_classes = 16  # change to your number of classes
model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, num_classes)

# Load your saved weights
model.load_state_dict(torch.load('/content/efficientnet_b0_rvl_cdip_small_200.pth', map_location=device))
model.to(device)
model.eval()

# Define the same transform as training
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),  # convert grayscale to 3-channel
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# Load and transform the image
img_path = '/content/Sample-handwritten-text-from-CVL-Database.png'
img = Image.open(img_path).convert('L')  # convert to grayscale if original is not
img_tensor = transform(img).unsqueeze(0).to(device)  # add batch dimension and move to device

# Prediction
with torch.no_grad():
    outputs = model(img_tensor)
    pred_class = torch.argmax(outputs, dim=1).item()

print(f"Predicted class: {pred_class}")



Predicted class: 10


In [53]:
from torchvision import datasets

train_dataset = datasets.ImageFolder(root="/root/.cache/kagglehub/datasets/vaclavpechtor/rvl-cdip-small-200/versions/1/rvl-cdip-small-200/train")
print(train_dataset.class_to_idx)


{'advertisement': 0, 'budget': 1, 'email': 2, 'file_folder': 3, 'form': 4, 'handwritten': 5, 'invoice': 6, 'letter': 7, 'memo': 8, 'news_article': 9, 'presentation': 10, 'questionnaire': 11, 'resume': 12, 'scientific_publication': 13, 'scientific_report': 14, 'specification': 15}


In [49]:
idx_to_class = {v: k for k, v in train_dataset.class_to_idx.items()}

In [54]:
predicted_label = idx_to_class[pred_class]

In [55]:
print(predicted_label)

presentation
