# Notebook to convert the master labeling .csv to a compatible format for the Apple-VQG model

In [1]:
import pandas as pd
from pypdfium2 import PdfDocument

In [2]:
# Load the master labelling sheet: one row per slide page, with the questions
# that were annotated for that page (if any).
# The CSV's first column has no header, so pandas shows it as "Unnamed: 0".
master_csv_path = "../../../datasets/master/Dataset_Master.csv"
data_df = pd.read_csv(master_csv_path)
data_df

Unnamed: 0,PDF-Name,Topic,Page Number,Marked for processing,Includes Image Data,Includes formula,Question 1,Question 2,Question 3,Title of the slide,Type of Question,Comment
0,ase_combined.pdf,Agile Software Engineering,1,No,,,,,,,,
1,ase_combined.pdf,Agile Software Engineering,2,No,,,,,,,,
2,ase_combined.pdf,Agile Software Engineering,3,No,,,,,,,,
3,ase_combined.pdf,Agile Software Engineering,4,No,,,,,,,,
4,ase_combined.pdf,Agile Software Engineering,5,No,No,No,,,,Cost of Software Failures,,
...,...,...,...,...,...,...,...,...,...,...,...,...
5734,it-security-all-slides_no_duplicates.pdf,IT-Security,592,Yes,Yes,No,What is the intuition of Differential Privacy?,,,Differential Privacy\nIntuition,,
5735,it-security-all-slides_no_duplicates.pdf,IT-Security,593,Yes,Yes,Yes,What is the definition of Differential Privacy?,,,Differential Privacy\nDefinition (Simplified V...,,
5736,it-security-all-slides_no_duplicates.pdf,IT-Security,594,Yes,Yes,Yes,What is the parameter 𝜖 and the composition t...,,,On the Parameter 𝜺,,
5737,it-security-all-slides_no_duplicates.pdf,IT-Security,595,Yes,Yes,Yes,What is the Privacy Budget? What are challenges?,,,Privacy Budget,,


In [3]:
# List the distinct slide decks referenced by the labelling sheet,
# in order of first appearance.
pd.unique(data_df["PDF-Name"])

array(['ase_combined.pdf', 'corporate_knowledge_management_combined.pdf',
       'data_mining_combined.pdf', 'database_dhbw_combined.pdf',
       'decision_support_combined.pdf',
       'information_retrieval_combined.pdf', 'large_scale_combined.pdf',
       'leadership_combined.pdf', 'machine_learning_combined.pdf',
       'management_enterprise_combined.pdf',
       'network_science_combined.pdf', 'project_management_combined.pdf',
       'it-security-all-slides_no_duplicates.pdf'], dtype=object)

In [4]:
# Keep only the slides that have at least one labelled question and drop the
# bookkeeping columns that are not needed downstream.
#
# `subset` is passed as a list because a bare string is only accepted by newer
# pandas releases. `errors="ignore"` makes the cell safe to re-run: `data_df`
# is overwritten here, so on a second execution the columns are already gone
# and a plain `drop` would raise a KeyError.
data_df = data_df.dropna(subset=["Question 1"]).drop(
    columns=[
        "Topic",
        "Marked for processing",
        "Includes Image Data",
        "Includes formula",
        "Type of Question",
        "Comment",
    ],
    errors="ignore",
)
data_df

Unnamed: 0,PDF-Name,Page Number,Question 1,Question 2,Question 3,Title of the slide
5,ase_combined.pdf,6,What is the definition of Verification and Val...,What are the objectives of Verification and Va...,,Verification and Validation (V&V)
6,ase_combined.pdf,7,What are the goals of verification and validat...,On what does verification and validation depend?,,V&V Goals
7,ase_combined.pdf,8,What is static and what is dynamic V&V?,How does static V&V differentiate from dynamic...,,Static versus Dynamic V&V
8,ase_combined.pdf,9,How the relative cost of fixing defects behave...,,,Relative Cost of Fixing Defects
9,ase_combined.pdf,10,What is model-driven development?,,,Model-Driven Development
...,...,...,...,...,...,...
5734,it-security-all-slides_no_duplicates.pdf,592,What is the intuition of Differential Privacy?,,,Differential Privacy\nIntuition
5735,it-security-all-slides_no_duplicates.pdf,593,What is the definition of Differential Privacy?,,,Differential Privacy\nDefinition (Simplified V...
5736,it-security-all-slides_no_duplicates.pdf,594,What is the parameter 𝜖 and the composition t...,,,On the Parameter 𝜺
5737,it-security-all-slides_no_duplicates.pdf,595,What is the Privacy Budget? What are challenges?,,,Privacy Budget


In [None]:
from src.image_to_text.data_preprocessing.util import extract_text
from tqdm import tqdm
import os

# Render every labelled slide page to a PNG and record its image id in
# `data_df["image_id"]` (format: "<pdf name without extension>_<page number>").

# TODO: Kernel dies after slide-deck 9
# NOTE(review): each PdfDocument is now closed and the extracted contents are
# released after every deck, which should lower peak memory. Whether that is
# the actual cause of the kernel crash still needs to be confirmed.

MASTER_DIR = "../../../datasets/master/"
IMAGES_DIR = os.path.join(MASTER_DIR, "images")
os.makedirs(IMAGES_DIR, exist_ok=True)  # saving would fail if the folder is missing

# (PDF-Name, Page Number) -> image id. Keying by page instead of relying on
# list position keeps the ids aligned with the rows of `data_df` even if pages
# are skipped, come back in a different order, or are labelled more than once.
image_id_by_page = {}

for pdf_name in data_df["PDF-Name"].unique():
    pdf_path = os.path.join(MASTER_DIR, pdf_name)
    deck_stem = os.path.splitext(pdf_name)[0]
    # Set for O(1) membership tests inside the page loop.
    allowed_page_numbers = set(
        data_df.loc[data_df["PDF-Name"] == pdf_name, "Page Number"].to_list()
    )

    pdf = PdfDocument(pdf_path)
    try:
        extracted_contents = extract_text(pdf.raw)

        # NOTE(review): assumes each item is a tuple-like whose element 0 is the
        # page number (in the same numbering as the CSV) and whose element 3 is
        # an image object with a `.save()` method -- confirm against `extract_text`.
        for page_content in tqdm(extracted_contents, desc=pdf_name):
            page_number = page_content[0]
            if page_number not in allowed_page_numbers:
                continue
            image_id = f"{deck_stem}_{page_number}"
            page_content[3].save(os.path.join(IMAGES_DIR, f"{image_id}.png"))
            image_id_by_page[(pdf_name, page_number)] = image_id
    finally:
        # Drop the references to this deck's contents and close the document
        # before moving on, so at most one deck is held in memory at a time.
        extracted_contents = page_content = None
        pdf.close()

# Look the id up per row; pages that were not extracted get None instead of
# making the whole assignment fail with a length mismatch.
data_df["image_id"] = [
    image_id_by_page.get((pdf_name, page_number))
    for pdf_name, page_number in zip(data_df["PDF-Name"], data_df["Page Number"])
]

missing_count = int(data_df["image_id"].isna().sum())
if missing_count:
    print(f"WARNING: no image was extracted for {missing_count} labelled page(s)")

data_df

100%|██████████| 244/244 [00:05<00:00, 41.21it/s]
100%|██████████| 455/455 [00:12<00:00, 35.07it/s]
100%|██████████| 313/313 [00:07<00:00, 41.73it/s] 
100%|██████████| 375/375 [00:15<00:00, 24.98it/s]
100%|██████████| 548/548 [00:18<00:00, 29.65it/s]
100%|██████████| 182/182 [00:07<00:00, 25.67it/s]
100%|██████████| 479/479 [00:03<00:00, 157.39it/s]
100%|██████████| 755/755 [00:01<00:00, 470.02it/s]
100%|██████████| 596/596 [00:42<00:00, 14.02it/s]
