In [9]:
import os
from pypdfium2 import PdfDocument
import pypdfium2
from PIL import Image
import matplotlib.pyplot as plt
import shutil
import random
import os
import random
import cv2
from tqdm import tqdm
import pandas as pd

In [8]:
!pip install opencv-python

Defaulting to user installation because normal site-packages is not writeable
Collecting opencv-python
  Downloading opencv_python-4.8.0.74-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (61.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.7/61.7 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: opencv-python
Successfully installed opencv-python-4.8.0.74

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# This cell contains the code to label the dataset. Every page of every PDF in
# pdf_dir is rendered and shown, and the user is asked whether the page/image is
# relevant or not. The rendered page is then saved as a PNG in the matching folder.

pdf_dir = "../../../datasets/extractor_classifier/slides" # Slides to label
relevant_dir = "../../../datasets/extractor_classifier/dataset_images/relevant"
not_relevant_dir = "../../../datasets/extractor_classifier/dataset_images/not_relevant"

# Saving an image fails if its target folder is missing, so create both up front.
os.makedirs(relevant_dir, exist_ok=True)
os.makedirs(not_relevant_dir, exist_ok=True)

# Sorted so that the labeling order does not depend on the file system.
for filename in sorted(os.listdir(pdf_dir)):
    if not filename.endswith(".pdf"):
        continue

    # `with` closes the file even if rendering fails or the labeling is interrupted.
    with open(os.path.join(pdf_dir, filename), "rb") as pdf_file:
        pdf_document = PdfDocument(pdf_file)
        try:
            for page_index, page_content in enumerate(pdf_document):
                page_image = page_content.render(scale=2).to_pil()
                plt.imshow(page_image)
                plt.show()
                input_str = input("Is this image relevant? (y/n)")

                # Only an explicit "n" marks the page as not relevant; every
                # other answer (including just pressing Enter) counts as relevant.
                if input_str.strip().lower() == "n":
                    target_dir = not_relevant_dir
                else:
                    target_dir = relevant_dir
                page_image.save(os.path.join(target_dir, f"{filename}_{page_index}.png"))
        finally:
            # Release the document before the file it reads from gets closed.
            pdf_document.close()

In [8]:
# Load the semicolon-separated master labeling sheet and preview its first rows.
ankinator_master_data = pd.read_csv(
    "../../../datasets/anki_data/Ankinator_Master_Labeling.csv",
    sep=";",
)
ankinator_master_data.head(n=5)

Unnamed: 0,PDF-Name,Topic,Page Number,Marked for processing,Includes Image Data,Includes formula,Question 1,Question 2,Question 3,Title of the slide,Type of Question,Comment
0,ase_combined.pdf,Agile Software Engineering,1,No,,,,,,,,
1,ase_combined.pdf,Agile Software Engineering,2,No,,,,,,,,
2,ase_combined.pdf,Agile Software Engineering,3,No,,,,,,,,
3,ase_combined.pdf,Agile Software Engineering,4,No,,,,,,,,
4,ase_combined.pdf,Agile Software Engineering,5,No,No,No,,,,Cost of Software Failures,,


In [9]:
# Report the row count before and after removing the rows that carry no
# "Marked for processing" label. The frame is filtered in place.
print("DataFrame length:", len(ankinator_master_data))
ankinator_master_data.dropna(subset=["Marked for processing"], inplace=True)
data_length = len(ankinator_master_data)
print("DataFrame length:", data_length)

DataFrame length: 5739
DataFrame length: 3482


In [10]:
# Absolute and relative frequency of each "Marked for processing" value.
counts = ankinator_master_data["Marked for processing"].value_counts()
total_count = counts.sum()
percentage = counts.div(total_count).mul(100)

print("Counts:", counts, "\nPercentages:", percentage, sep="\n")

Counts:
Yes    2181
No     1301
Name: Marked for processing, dtype: int64

Percentages:
Yes    62.636416
No     37.363584
Name: Marked for processing, dtype: float64


In [13]:
# Render every labeled page of the master sheet to a PNG and store it in the
# folder that matches its "Marked for processing" label.
pdf_dir = "../../../datasets/anki_data/pdf_files"
relevant_dir_image = "../../../datasets/extractor_classifier/dataset_images/relevant"
not_relevant_dir_image = "../../../datasets/extractor_classifier/dataset_images/not_relevant"

# Saving an image fails if its target folder is missing, so create both up front.
os.makedirs(relevant_dir_image, exist_ok=True)
os.makedirs(not_relevant_dir_image, exist_ok=True)

# Group the rows by PDF so that each document is opened and parsed only once
# instead of once per labeled page.
for pdf_name, pdf_rows in ankinator_master_data.groupby("PDF-Name", sort=False):
    pdf_filename = os.path.join(pdf_dir, pdf_name)

    # `with` closes the file even if a page fails to render or save.
    with open(pdf_filename, "rb") as pdf_file:
        pdf_document = pypdfium2.PdfDocument(pdf_file)
        try:
            for _, row in pdf_rows.iterrows():
                page_number = int(row["Page Number"])
                page = pdf_document.get_page(page_number - 1)  # Pages are 0-indexed
                image = page.render(scale=1).to_pil()

                if row["Marked for processing"] == "Yes":
                    target_dir = relevant_dir_image
                else:
                    target_dir = not_relevant_dir_image

                image.save(os.path.join(target_dir, f"{pdf_name}_page_{page_number}.png"))
        finally:
            # Release the document before the file it reads from gets closed.
            pdf_document.close()

In [15]:
# Create train, validation and test data from dataset
# Only do this once

root_dir = "../../../datasets/extractor_classifier/dataset_images/"

# Define the percentage of data to use for each set
train_percent = 0.7
val_percent = 0.10
test_percent = 0.20  # implicit: the test set receives everything left after train and validation

# The three fractions are meant to describe a full partition of the data.
assert abs(train_percent + val_percent + test_percent - 1.0) < 1e-9, "split percentages must sum to 1"

# Entries that can show up next to the data but are not part of it
ignored_entries = {".DS_Store", ".ipynb_checkpoints"}

# Create a list of class names (each class is a subfolder of root_dir)
class_names = sorted(
    entry
    for entry in os.listdir(root_dir)
    if entry not in ignored_entries and os.path.isdir(os.path.join(root_dir, entry))
)

# Define the output directories for the saved datasets
train_output_dir = "../../../datasets/extractor_classifier/train/"
val_output_dir = "../../../datasets/extractor_classifier/validation/"
test_output_dir = "../../../datasets/extractor_classifier/test/"

# Create the output directories if they don't already exist
os.makedirs(train_output_dir, exist_ok=True)
os.makedirs(val_output_dir, exist_ok=True)
os.makedirs(test_output_dir, exist_ok=True)


def _split_image_names(image_names, train_fraction, val_fraction, seed=42):
    """Shuffle the names deterministically and cut them into three parts.

    Args:
        image_names: file names of one class.
        train_fraction: share of the names that goes into the first part.
        val_fraction: share of the names that goes into the second part.
        seed: seed of the private random generator used for shuffling.

    Returns:
        Tuple ``(train, validation, test)`` of lists; the third list holds
        every name that did not go into the first two.
    """
    # Sort first: os.listdir gives no ordering guarantee, so without this the
    # seeded shuffle would still produce a different split on another machine.
    shuffled = sorted(image_names)
    random.Random(seed).shuffle(shuffled)

    num_train = int(train_fraction * len(shuffled))
    num_val = int(val_fraction * len(shuffled))
    return (
        shuffled[:num_train],
        shuffled[num_train:num_train + num_val],
        shuffled[num_train + num_val:],
    )


def _copy_split(items, output_dir):
    """Copy every ``(src_path, label)`` item into ``output_dir/<class name>/``."""
    for src_path, label in items:
        class_output_dir = os.path.join(output_dir, class_names[label])
        os.makedirs(class_output_dir, exist_ok=True)
        # NOTE: the file is copied byte-for-byte and only renamed to ".jpg"; it is
        # not re-encoded. The name is kept so existing consumers keep working.
        image_filename = os.path.splitext(os.path.basename(src_path))[0] + ".jpg"
        shutil.copyfile(src_path, os.path.join(class_output_dir, image_filename))


# Create train, validation, and test list. Each item is (source path, label);
# only paths are stored so that no image file has to be kept open.
train_list = []
validation_list = []
test_list = []

# Split the data for each class into train, validation, and test sets
for label, class_name in enumerate(class_names):
    class_dir = root_dir + class_name

    # Drop non-image entries BEFORE splitting so they do not distort the split sizes.
    images = [name for name in os.listdir(class_dir) if name not in ignored_entries]
    train_images, val_images, test_images = _split_image_names(images, train_percent, val_percent)

    train_list.extend((class_dir + "/" + name, label) for name in train_images)
    validation_list.extend((class_dir + "/" + name, label) for name in val_images)
    test_list.extend((class_dir + "/" + name, label) for name in test_images)

# Save the train, validation and test datasets
_copy_split(train_list, train_output_dir)
_copy_split(validation_list, val_output_dir)
_copy_split(test_list, test_output_dir)

In [10]:
import numpy as np

# Data augmentation



# Path to the folder containing the images
data_dir = "../../../datasets/extractor_classifier/"

# Define the directories where the images are stored
relevant_images_dir = os.path.join(data_dir, "train/relevant")
not_relevant_images_dir = os.path.join(data_dir, "train/not_relevant")

# Path to the output folder for augmented images
train_data_augmentation_dir = os.path.join(data_dir, "train_data_augmentation")

# Create the output folder and one subfolder per class. Each subfolder is created
# on its own so that a partially existing output folder gets completed as well.
for class_subdir in ("relevant", "not_relevant"):
    os.makedirs(os.path.join(train_data_augmentation_dir, class_subdir), exist_ok=True)

def blur_augmentation(image_dir, relevance, kernel_size=(7, 7)):
    """Write a Gaussian-blurred copy of every image in ``image_dir``.

    The copies are stored as ``<train_data_augmentation_dir>/<relevance>/<file name>_blur.png``.

    Args:
        image_dir: folder whose files are read as images.
        relevance: name of the class subfolder the copies are written to.
        kernel_size: Gaussian kernel size passed to ``cv2.GaussianBlur``.
    """
    output_dir = os.path.join(train_data_augmentation_dir, relevance)
    os.makedirs(output_dir, exist_ok=True)

    for image_file in tqdm(os.listdir(image_dir)):
        # Read the image
        image = cv2.imread(os.path.join(image_dir, image_file))
        if image is None:
            # cv2.imread returns None for entries it cannot decode (e.g. hidden
            # folders or non-image files); skip them instead of crashing.
            tqdm.write(f"Skipping unreadable file: {image_file}")
            continue

        # Apply augmentation
        augmented_image = cv2.GaussianBlur(image, kernel_size, 0)

        # Save the augmented image
        save_path = os.path.join(train_data_augmentation_dir, f"{relevance}/{image_file}_blur.png")
        cv2.imwrite(save_path, augmented_image)

def add_random_boxes(img, n_k, size=32):
    """Return a copy of ``img`` with ``n_k`` randomly placed boxes set to 0.

    Args:
        img: image as an array-like whose first two axes are indexed as (row, column).
        n_k: number of boxes to draw; boxes may overlap.
        size: edge length of each box in pixels. A box is clipped to the image
            if the image is smaller than ``size`` along an axis.

    Returns:
        A new ``numpy`` array; the input is left untouched.
    """
    # Copy so that neither the caller's array is modified nor a read-only
    # array view breaks the assignment below.
    img = np.array(img)
    img_h, img_w = img.shape[:2]

    # Never ask for a box larger than the image itself.
    box_h = min(size, img_h)
    box_w = min(size, img_w)

    for _ in range(n_k):
        # Height bounds the row offset and width the column offset, so every
        # box lies fully inside the image regardless of its aspect ratio.
        y = np.random.randint(0, img_h - box_h + 1)
        x = np.random.randint(0, img_w - box_w + 1)
        img[y:y + box_h, x:x + box_w] = 0
    return img

def noise_augmentation(image_dir, relevance, n_boxes=30, box_size=128):
    """Write a copy with random black boxes for every image in ``image_dir``.

    The copies are stored as
    ``<train_data_augmentation_dir>/<relevance>/<file name>_random_blocks.png``.

    Args:
        image_dir: folder whose files are read as images.
        relevance: name of the class subfolder the copies are written to.
        n_boxes: number of boxes drawn on each image.
        box_size: edge length of each box in pixels.
    """
    output_dir = os.path.join(train_data_augmentation_dir, relevance)
    os.makedirs(output_dir, exist_ok=True)

    for image_file in tqdm(os.listdir(image_dir)):
        # Read the image
        image = cv2.imread(os.path.join(image_dir, image_file))
        if image is None:
            # cv2.imread returns None for entries it cannot decode (e.g. hidden
            # folders or non-image files); skip them instead of crashing.
            tqdm.write(f"Skipping unreadable file: {image_file}")
            continue

        noisy_image = add_random_boxes(image, n_boxes, box_size)

        # Save the augmented image
        save_path = os.path.join(train_data_augmentation_dir, f"{relevance}/{image_file}_random_blocks.png")
        cv2.imwrite(save_path, noisy_image)

# Apply both augmentations to both classes of the training set:
# first the random boxes for each class, then the blur for each class.
for augment in (noise_augmentation, blur_augmentation):
    for images_dir, relevance in (
        (relevant_images_dir, "relevant"),
        (not_relevant_images_dir, "not_relevant"),
    ):
        augment(images_dir, relevance)


100%|██████████| 1527/1527 [00:49<00:00, 30.60it/s]
100%|██████████| 910/910 [00:29<00:00, 30.72it/s]
100%|██████████| 1527/1527 [00:20<00:00, 74.14it/s]
100%|██████████| 910/910 [00:11<00:00, 78.04it/s]
