In [3]:
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import os

import torch
from torch.utils.data import Dataset
import torch.optim as optim
import torch.nn as nn
from torchvision import transforms

from sklearn.metrics import classification_report, f1_score
from sklearn import svm, metrics

import pandas as pd

In [None]:
#This cell contains the code to create the dataset

import os
from pypdfium2 import PdfDocument
from PIL import Image
import matplotlib.pyplot as plt

pdf_dir = "./datasets/extractor_classifier/slides"
relevant_dir = "./datasets/extractor_classifier/dataset_images/relevant"
not_relevant_dir = "./datasets/extractor_classifier/dataset_images/not_relevant"

for filename in os.listdir(pdf_dir):
    if filename.endswith(".pdf"):
        pdf_file = open(os.path.join(pdf_dir, filename), "rb")
        pdf_document = PdfDocument(pdf_file)

        for page_index, page_content in enumerate(pdf_document, 0):
            bitmap = page_content.render(scale=2)
            page_image = bitmap.to_pil()
            plt.imshow(page_image)
            plt.show()
            input_str = input("Is this image relevant? (y/n)")

            if input_str.lower() == "n":
                image_path = os.path.join(not_relevant_dir, f"{filename}_{page_index}.png")
            else:
                image_path = os.path.join(relevant_dir, f"{filename}_{page_index}.png")
            page_image.save(image_path)

        pdf_file.close()

In [22]:
# Create train, validation and test data from dataset
# Only do this once, further method to load data from the created directories has to be implemented

import shutil
import random

root_dir = "../../../datasets/extractor_classifier/dataset_images/"

# Define the percentage of data to use for each set
train_percent = 0.7
val_percent = 0.15
test_percent = 0.15

# Create a list of class names (assumes each class is a subfolder of root_dir)
class_names = sorted(os.listdir(root_dir))

if ".DS_Store" in class_names:
    class_names.remove(".DS_Store")

# Define the output directories for the saved datasets
train_output_dir = "../../../datasets/extractor_classifier/train/"
val_output_dir = "../../../datasets/extractor_classifier/validation/"
test_output_dir = "../../../datasets/extractor_classifier/test/"

# Create the output directories if they don't already exist
os.makedirs(train_output_dir, exist_ok=True)
os.makedirs(val_output_dir, exist_ok=True)
os.makedirs(test_output_dir, exist_ok=True)

# Create train, validation, and test list
train_list = []
validation_list = []
test_list = []

# Split the data for each class into train, validation, and test sets
for class_name in class_names:
    # Get a list of all images for this class
    images = os.listdir(root_dir + class_name)
    random.Random(42).shuffle(images)

    # Split the images into train, validation, and test sets
    num_images = len(images)
    num_train = int(train_percent * num_images)
    num_val = int(val_percent * num_images)

    train_images = images[:num_train]
    val_images = images[num_train:num_train+num_val]
    test_images = images[num_train+num_val:]

    for image in train_images:
        src_path = root_dir + class_name + "/" + image
        label = class_names.index(class_name)
        train_list.append((Image.open(src_path), label))

    for image in val_images:
        src_path = root_dir + class_name + "/" + image
        label = class_names.index(class_name)
        validation_list.append((Image.open(src_path), label))

    for image in test_images:
        src_path = root_dir + class_name + "/" + image
        label = class_names.index(class_name)
        test_list.append((Image.open(src_path), label))

# Save the train dataset
for image, label in train_list:
    class_name = class_names[label]
    output_path = os.path.join(train_output_dir, class_name)
    os.makedirs(output_path, exist_ok=True)
    image_filename = os.path.splitext(os.path.basename(image.filename))[0] + ".jpg"
    shutil.copyfile(image.filename, os.path.join(output_path, image_filename))

# Save the validation dataset
for image, label in validation_list:
    class_name = class_names[label]
    output_path = os.path.join(val_output_dir, class_name)
    os.makedirs(output_path, exist_ok=True)
    image_filename = os.path.splitext(os.path.basename(image.filename))[0] + ".jpg"
    shutil.copyfile(image.filename, os.path.join(output_path, image_filename))

# Save the test dataset
for image, label in test_list:
    class_name = class_names[label]
    output_path = os.path.join(test_output_dir, class_name)
    os.makedirs(output_path, exist_ok=True)
    image_filename = os.path.splitext(os.path.basename(image.filename))[0] + ".jpg"
    shutil.copyfile(image.filename, os.path.join(output_path, image_filename))

In [23]:
import torchvision.transforms as T

In [None]:
# WIP data augmentation

for image, label in train_list:
        image_filename = os.path.splitext(os.path.basename(image.filename))[0] + "_resized.jpg"
        output_path = os.path.join(train_augmented_output_dir, label, image_filename)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        augmented_image.save(output_path)

resized_images = [T.Resize(size=size)(the_image) for size in [32,128,224]]

In [16]:
class ExtractorClassifierDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.to_tensor = transforms.ToTensor()
        self.resize_image = transforms.Resize((256, 256), antialias=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image, label = self. data[idx]
        image_tensor = self.to_tensor(image)
        image_tensor = self.resize_image(image_tensor)
        image_tensor = image_tensor/255
        label_tensor = torch.zeros(4)
        label_tensor[label] = 1
        return image_tensor, label_tensor

# Define the train/validation/test datasets
train_data = ExtractorClassifierDataset(train_list)
validation_data = ExtractorClassifierDataset(validation_list)
test_data = ExtractorClassifierDataset(test_list)

# Define the dataloaders for each dataset
train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True, )
validation_loader = torch.utils.data.DataLoader(validation_data, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=True)

In [4]:
from torchvision.models import resnet50

class Resnet50Model(nn.Module):
    def __init__(self, pretrained=True):
        super(Resnet50Model, self).__init__()
        self.resnet_model = resnet50(pretrained=pretrained)
        self.fc = nn.Linear(in_features=1000, out_features=4)

    def forward(self, x):
        x = self.resnet_model(x)
        x = self.fc(x)
        return x

In [19]:
device="mps"
def train(model, train_loader, valid_loader, test_loader, criterion, optimizer, epochs):
    # Move the model to the device
    model.to(device)

    # Define variables to track the best validation accuracy and the corresponding model state
    best_valid_acc = 0.0
    best_model_state = None
    train_loss_values = []
    valid_loss_values = []
    train_acc_values = []
    valid_acc_values = []

    for epoch in range(epochs):
        # Train the model for one epoch
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            _, labels = torch.max(labels.data, 1)
            train_correct += (predicted == labels).sum().item()
            train_total += labels.size(0)
        train_loss = train_loss / len(train_loader.dataset)
        train_loss_values.append(train_loss)
        train_acc = train_correct / train_total
        train_acc_values.append(train_acc)

        # Evaluate the model on the validation set
        valid_loss = 0.0
        valid_correct = 0
        valid_total = 0
        model.eval()
        with torch.no_grad():
            for images, labels in valid_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                valid_loss += loss.item() * images.size(0)
                _, predicted = torch.max(outputs.data, 1)
                _, labels = torch.max(labels.data, 1)
                valid_correct += (predicted == labels).sum().item()
                valid_total += labels.size(0)
        valid_loss /= len(valid_loader.dataset)
        valid_acc = valid_correct / valid_total
        valid_loss_values.append(valid_loss)
        valid_acc_values.append(valid_acc)

        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {train_loss:.4f} - Train Acc: {train_acc:.4f} - "
              f"Valid Loss: {valid_loss:.4f} - Valid Acc: {valid_acc:.4f}")

        validate_model(model)

        # Save the model state if the current validation accuracy is better than the previous best
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_model_state = model.state_dict()

    # Load the best model state and evaluate on the test set
    model.load_state_dict(best_model_state)
    model.eval()
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            _, labels = torch.max(labels.data, 1)
            test_correct += (predicted == labels).sum().item()
            test_total += labels.size(0)
    test_acc = test_correct / test_total

    print(f"Test Acc: {test_acc:.4f}")

    return model, train_loss_values, train_acc_values, valid_loss_values, valid_acc_values

def validate_model(model):
    model.to(device)
    y_validation = []
    y_pred = []
    model.eval()
    with torch.no_grad():
        for images, labels in validation_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            _, labels = torch.max(labels.data, 1)
            y_validation.extend(labels.detach().cpu().numpy())
            y_pred.extend(predicted.detach().cpu().numpy())
    print(f"Accuracy: {metrics.accuracy_score(y_validation, y_pred)}")
    print(classification_report(y_validation, y_pred))

In [20]:
resnet_model = Resnet50Model()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(resnet_model.parameters(), lr=0.001)

if os.path.isfile("model_results/resnet50_model_50.pt"):
    resnet_model.load_state_dict(torch.load("model_results/resnet50_model_50.pt", map_location=torch.device(device)))
    validate_model(resnet_model)
else:
    print("Train model")
    model, train_loss_values, train_accuracy_values, validation_loss_values, validation_accuracy_values = train(resnet_model, train_loader, validation_loader, test_loader, criterion, optimizer, 50)
    torch.save(model.state_dict(), "model_results/resnet50_model_50.pt")
    np.save("model_results/resnet50_model_50_train_loss", train_loss_values)
    np.save("model_results/resnet50_model_50_train_acc", train_accuracy_values)
    np.save("model_results/resnet50_model_50_valid_loss", validation_loss_values)
    np.save("model_results/resnet50_model_50_valid_acc", validation_accuracy_values)



Train model
Epoch 1/50 - Train Loss: 0.6196 - Train Acc: 0.8291 - Valid Loss: 1.5464 - Valid Acc: 0.1854
Accuracy: 0.185378590078329
              precision    recall  f1-score   support

           0       0.19      1.00      0.31        71
           1       0.00      0.00      0.00       312

    accuracy                           0.19       383
   macro avg       0.09      0.50      0.16       383
weighted avg       0.03      0.19      0.06       383



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/50 - Train Loss: 0.2870 - Train Acc: 0.9090 - Valid Loss: 3.6650 - Valid Acc: 0.3003
Accuracy: 0.3002610966057441
              precision    recall  f1-score   support

           0       0.20      0.96      0.34        71
           1       0.94      0.15      0.26       312

    accuracy                           0.30       383
   macro avg       0.57      0.55      0.30       383
weighted avg       0.80      0.30      0.27       383

Epoch 3/50 - Train Loss: 0.2922 - Train Acc: 0.9028 - Valid Loss: 0.5551 - Valid Acc: 0.6997
Accuracy: 0.6997389033942559
              precision    recall  f1-score   support

           0       0.37      0.90      0.53        71
           1       0.97      0.65      0.78       312

    accuracy                           0.70       383
   macro avg       0.67      0.78      0.65       383
weighted avg       0.86      0.70      0.73       383

Epoch 4/50 - Train Loss: 0.2511 - Train Acc: 0.9140 - Valid Loss: 0.4389 - Valid Acc: 0.8877
Accuracy:

RuntimeError: Parent directory model_results does not exist.