# Notebook to convert the master labeling .csv to a compatible format for the Apple-VQG model
- The format is shown in src/image_to_text/vqg-multimodal-assistant/data/apple/apple_dev_all_keyword.csv
- Besides converting the data, the dataset class, which takes care of data loading, had to be changed: the original dataset used images stored on Flickr that were loaded from the web, whereas it now loads the images from disk

In [1]:
import pandas as pd
from pypdfium2 import PdfDocument

In [2]:
# Load the master labeling sheet into a DataFrame and display it
master_csv_path = "./datasets/master/Dataset_Master.csv"
data_df = pd.read_csv(master_csv_path)
data_df

Unnamed: 0,PDF-Name,Topic,Page Number,Marked for processing,Includes Image Data,Includes formula,Question 1,Question 2,Question 3,Title of the slide,Type of Question,Comment
0,ase_combined.pdf,Agile Software Engineering,1,No,,,,,,,,
1,ase_combined.pdf,Agile Software Engineering,2,No,,,,,,,,
2,ase_combined.pdf,Agile Software Engineering,3,No,,,,,,,,
3,ase_combined.pdf,Agile Software Engineering,4,No,,,,,,,,
4,ase_combined.pdf,Agile Software Engineering,5,No,No,No,,,,Cost of Software Failures,,
...,...,...,...,...,...,...,...,...,...,...,...,...
5734,it-security-all-slides_no_duplicates.pdf,IT-Security,592,Yes,Yes,No,What is the intuition of Differential Privacy?,,,Differential Privacy\nIntuition,,
5735,it-security-all-slides_no_duplicates.pdf,IT-Security,593,Yes,Yes,Yes,What is the definition of Differential Privacy?,,,Differential Privacy\nDefinition (Simplified V...,,
5736,it-security-all-slides_no_duplicates.pdf,IT-Security,594,Yes,Yes,Yes,What is the parameter 𝜖 and the composition t...,,,On the Parameter 𝜺,,
5737,it-security-all-slides_no_duplicates.pdf,IT-Security,595,Yes,Yes,Yes,What is the Privacy Budget? What are challenges?,,,Privacy Budget,,


In [3]:
# List the distinct PDF files referenced in the sheet to see which PDFs are needed
data_df.loc[:, "PDF-Name"].unique()

array(['ase_combined.pdf', 'corporate_knowledge_management_combined.pdf',
       'data_mining_combined.pdf', 'database_dhbw_combined.pdf',
       'decision_support_combined.pdf',
       'information_retrieval_combined.pdf', 'large_scale_combined.pdf',
       'leadership_combined.pdf', 'machine_learning_combined.pdf',
       'management_enterprise_combined.pdf',
       'network_science_combined.pdf', 'project_management_combined.pdf',
       'it-security-all-slides_no_duplicates.pdf'], dtype=object)

In [3]:
# Keep only the samples that have at least one question, then drop the
# columns that are not needed for the conversion
unused_columns = [
    "Topic",
    "Marked for processing",
    "Includes Image Data",
    "Includes formula",
    "Type of Question",
    "Comment",
]
data_df = data_df.dropna(subset=["Question 1"])
data_df = data_df.drop(columns=unused_columns)
data_df

Unnamed: 0,PDF-Name,Page Number,Question 1,Question 2,Question 3,Title of the slide
5,ase_combined.pdf,6,What is the definition of Verification and Val...,What are the objectives of Verification and Va...,,Verification and Validation (V&V)
6,ase_combined.pdf,7,What are the goals of verification and validat...,On what does verification and validation depend?,,V&V Goals
7,ase_combined.pdf,8,What is static and what is dynamic V&V?,How does static V&V differentiate from dynamic...,,Static versus Dynamic V&V
8,ase_combined.pdf,9,How the relative cost of fixing defects behave...,,,Relative Cost of Fixing Defects
9,ase_combined.pdf,10,What is model-driven development?,,,Model-Driven Development
...,...,...,...,...,...,...
5734,it-security-all-slides_no_duplicates.pdf,592,What is the intuition of Differential Privacy?,,,Differential Privacy\nIntuition
5735,it-security-all-slides_no_duplicates.pdf,593,What is the definition of Differential Privacy?,,,Differential Privacy\nDefinition (Simplified V...
5736,it-security-all-slides_no_duplicates.pdf,594,What is the parameter 𝜖 and the composition t...,,,On the Parameter 𝜺
5737,it-security-all-slides_no_duplicates.pdf,595,What is the Privacy Budget? What are challenges?,,,Privacy Budget


In [4]:
from src.image_to_text.data_preprocessing.util import extract_text
from tqdm import tqdm
import os

# Extract the images from the pdf slides. Every kept slide is associated with a
# unique id, which consists of the pdf name (without the ".pdf" extension) and
# the page number of the slide, and is saved under that id in the image folder.

image_dir = "./datasets/master/images"
os.makedirs(image_dir, exist_ok=True)  # make sure the target folder exists before images are written into it

pdf_paths = [os.path.join("./datasets/master/", pdf_slides) for pdf_slides in data_df["PDF-Name"].unique()]
image_ids = []
for pdf_path in pdf_paths:

    pdf_name = os.path.split(pdf_path)[-1]
    # page numbers of the current pdf which should be kept; stored as a set so
    # that the membership test inside the page loop is a constant-time lookup
    allowed_page_numbers = set(data_df.loc[data_df["PDF-Name"] == pdf_name, "Page Number"].to_list())

    # extract images
    pdf = PdfDocument(pdf_path)
    extracted_contents = extract_text(pdf.raw)

    # filter the unnecessary pdf pages
    # NOTE(review): every entry is used as (page number at index 0, saveable image at index 3) —
    # confirm against the return format of extract_text
    for t in tqdm(extracted_contents):
        if t[0] in allowed_page_numbers:
            image_id = f"{pdf_name[:-4]}_{t[0]}"
            image_ids.append(image_id)
            t[3].save(f"{image_dir}/{image_id}.png")

    # free memory to prevent kernel dying
    del extracted_contents
    del pdf

100%|██████████| 244/244 [00:07<00:00, 31.27it/s]
100%|██████████| 455/455 [00:18<00:00, 24.73it/s]
100%|██████████| 313/313 [00:05<00:00, 54.42it/s] 
100%|██████████| 375/375 [00:21<00:00, 17.73it/s]
100%|██████████| 548/548 [00:25<00:00, 21.51it/s]
100%|██████████| 182/182 [00:10<00:00, 17.62it/s]
100%|██████████| 479/479 [00:04<00:00, 106.64it/s]
100%|██████████| 755/755 [00:02<00:00, 314.97it/s]
100%|██████████| 596/596 [00:34<00:00, 17.19it/s]


In [5]:
data_df.reset_index(inplace=True, drop=True) # reset the index

# Associate every entry in the csv with its corresponding image_id. The id is
# rebuilt from the row's own pdf name and page number (same scheme as used when
# the images were saved), so the assignment does not depend on the extraction
# order matching the row order.
row_image_ids = [
    f"{pdf_name[:-4]}_{page_number}"
    for pdf_name, page_number in zip(data_df["PDF-Name"], data_df["Page Number"])
]

# Fail loudly if a labeled slide has no extracted image instead of producing
# rows that point to files which do not exist
missing_image_ids = sorted(set(row_image_ids) - set(image_ids))
if missing_image_ids:
    raise ValueError(
        f"No image was extracted for {len(missing_image_ids)} labeled slide(s), e.g. {missing_image_ids[:5]}"
    )

data_df["image_id"] = row_image_ids
data_df

Unnamed: 0,PDF-Name,Page Number,Question 1,Question 2,Question 3,Title of the slide,image_id
0,ase_combined.pdf,6,What is the definition of Verification and Val...,What are the objectives of Verification and Va...,,Verification and Validation (V&V),ase_combined_6
1,ase_combined.pdf,7,What are the goals of verification and validat...,On what does verification and validation depend?,,V&V Goals,ase_combined_7
2,ase_combined.pdf,8,What is static and what is dynamic V&V?,How does static V&V differentiate from dynamic...,,Static versus Dynamic V&V,ase_combined_8
3,ase_combined.pdf,9,How the relative cost of fixing defects behave...,,,Relative Cost of Fixing Defects,ase_combined_9
4,ase_combined.pdf,10,What is model-driven development?,,,Model-Driven Development,ase_combined_10
...,...,...,...,...,...,...,...
2177,it-security-all-slides_no_duplicates.pdf,592,What is the intuition of Differential Privacy?,,,Differential Privacy\nIntuition,it-security-all-slides_no_duplicates_592
2178,it-security-all-slides_no_duplicates.pdf,593,What is the definition of Differential Privacy?,,,Differential Privacy\nDefinition (Simplified V...,it-security-all-slides_no_duplicates_593
2179,it-security-all-slides_no_duplicates.pdf,594,What is the parameter 𝜖 and the composition t...,,,On the Parameter 𝜺,it-security-all-slides_no_duplicates_594
2180,it-security-all-slides_no_duplicates.pdf,595,What is the Privacy Budget? What are challenges?,,,Privacy Budget,it-security-all-slides_no_duplicates_595


In [6]:
# Map the column names of the master sheet onto the names used by the apple format
column_names = {"Title of the slide": "keyword"}
data_df.rename(mapper=column_names, axis="columns", inplace=True)

# Persist the intermediate result (the index is written as the first column)
data_df.to_csv(path_or_buf="tmp.csv")

In [3]:
import pandas as pd

# Reload the intermediate result; the first csv column holds the saved index
data_df = pd.read_csv(filepath_or_buffer="tmp.csv", index_col=0)
data_df

Unnamed: 0,PDF-Name,Page Number,Question 1,Question 2,Question 3,keyword,image_id
0,ase_combined.pdf,6,What is the definition of Verification and Val...,What are the objectives of Verification and Va...,,Verification and Validation (V&V),ase_combined_6
1,ase_combined.pdf,7,What are the goals of verification and validat...,On what does verification and validation depend?,,V&V Goals,ase_combined_7
2,ase_combined.pdf,8,What is static and what is dynamic V&V?,How does static V&V differentiate from dynamic...,,Static versus Dynamic V&V,ase_combined_8
3,ase_combined.pdf,9,How the relative cost of fixing defects behave...,,,Relative Cost of Fixing Defects,ase_combined_9
4,ase_combined.pdf,10,What is model-driven development?,,,Model-Driven Development,ase_combined_10
...,...,...,...,...,...,...,...
2177,it-security-all-slides_no_duplicates.pdf,592,What is the intuition of Differential Privacy?,,,Differential Privacy\nIntuition,it-security-all-slides_no_duplicates_592
2178,it-security-all-slides_no_duplicates.pdf,593,What is the definition of Differential Privacy?,,,Differential Privacy\nDefinition (Simplified V...,it-security-all-slides_no_duplicates_593
2179,it-security-all-slides_no_duplicates.pdf,594,What is the parameter 𝜖 and the composition t...,,,On the Parameter 𝜺,it-security-all-slides_no_duplicates_594
2180,it-security-all-slides_no_duplicates.pdf,595,What is the Privacy Budget? What are challenges?,,,Privacy Budget,it-security-all-slides_no_duplicates_595


In [4]:
# Build the image urls / paths from the image ids, such that the dataloader
# can load the images from disk later on
def _image_path(image_id):
    """Return the on-disk path of the image that belongs to *image_id*."""
    return f"datasets/master/images/{image_id}.png"


data_df["image_url"] = list(map(_image_path, data_df["image_id"]))
data_df

Unnamed: 0,PDF-Name,Page Number,Question 1,Question 2,Question 3,keyword,image_id,image_url
0,ase_combined.pdf,6,What is the definition of Verification and Val...,What are the objectives of Verification and Va...,,Verification and Validation (V&V),ase_combined_6,datasets/master/images/ase_combined_6.png
1,ase_combined.pdf,7,What are the goals of verification and validat...,On what does verification and validation depend?,,V&V Goals,ase_combined_7,datasets/master/images/ase_combined_7.png
2,ase_combined.pdf,8,What is static and what is dynamic V&V?,How does static V&V differentiate from dynamic...,,Static versus Dynamic V&V,ase_combined_8,datasets/master/images/ase_combined_8.png
3,ase_combined.pdf,9,How the relative cost of fixing defects behave...,,,Relative Cost of Fixing Defects,ase_combined_9,datasets/master/images/ase_combined_9.png
4,ase_combined.pdf,10,What is model-driven development?,,,Model-Driven Development,ase_combined_10,datasets/master/images/ase_combined_10.png
...,...,...,...,...,...,...,...,...
2177,it-security-all-slides_no_duplicates.pdf,592,What is the intuition of Differential Privacy?,,,Differential Privacy\nIntuition,it-security-all-slides_no_duplicates_592,datasets/master/images/it-security-all-slides_...
2178,it-security-all-slides_no_duplicates.pdf,593,What is the definition of Differential Privacy?,,,Differential Privacy\nDefinition (Simplified V...,it-security-all-slides_no_duplicates_593,datasets/master/images/it-security-all-slides_...
2179,it-security-all-slides_no_duplicates.pdf,594,What is the parameter 𝜖 and the composition t...,,,On the Parameter 𝜺,it-security-all-slides_no_duplicates_594,datasets/master/images/it-security-all-slides_...
2180,it-security-all-slides_no_duplicates.pdf,595,What is the Privacy Budget? What are challenges?,,,Privacy Budget,it-security-all-slides_no_duplicates_595,datasets/master/images/it-security-all-slides_...


In [8]:
# Print the full (untruncated) first question of the first sample, if there is one
first_entry = next(data_df.iterrows(), None)
if first_entry is not None:
    print(first_entry[1]["Question 1"])

What is the definition of Verification and Validation and what are the objectives?


In [5]:
# Merge the up to three questions of a slide into one "---"-separated string,
# as specified in the apple vqg dataset format
def _merge_questions(row):
    """Return "Question 1" followed by every further question that is a string."""
    merged = row["Question 1"]
    for extra_column in ("Question 2", "Question 3"):
        extra_question = row[extra_column]
        # empty cells are not strings and are skipped
        if isinstance(extra_question, str):
            merged = merged + f"---{extra_question}"
    return merged


questions = [_merge_questions(row) for _, row in data_df.iterrows()]

data_df["questions"] = questions
data_df

Unnamed: 0,PDF-Name,Page Number,Question 1,Question 2,Question 3,keyword,image_id,image_url,questions
0,ase_combined.pdf,6,What is the definition of Verification and Val...,What are the objectives of Verification and Va...,,Verification and Validation (V&V),ase_combined_6,datasets/master/images/ase_combined_6.png,What is the definition of Verification and Val...
1,ase_combined.pdf,7,What are the goals of verification and validat...,On what does verification and validation depend?,,V&V Goals,ase_combined_7,datasets/master/images/ase_combined_7.png,What are the goals of verification and validat...
2,ase_combined.pdf,8,What is static and what is dynamic V&V?,How does static V&V differentiate from dynamic...,,Static versus Dynamic V&V,ase_combined_8,datasets/master/images/ase_combined_8.png,What is static and what is dynamic V&V?---How ...
3,ase_combined.pdf,9,How the relative cost of fixing defects behave...,,,Relative Cost of Fixing Defects,ase_combined_9,datasets/master/images/ase_combined_9.png,How the relative cost of fixing defects behave...
4,ase_combined.pdf,10,What is model-driven development?,,,Model-Driven Development,ase_combined_10,datasets/master/images/ase_combined_10.png,What is model-driven development?
...,...,...,...,...,...,...,...,...,...
2177,it-security-all-slides_no_duplicates.pdf,592,What is the intuition of Differential Privacy?,,,Differential Privacy\nIntuition,it-security-all-slides_no_duplicates_592,datasets/master/images/it-security-all-slides_...,What is the intuition of Differential Privacy?
2178,it-security-all-slides_no_duplicates.pdf,593,What is the definition of Differential Privacy?,,,Differential Privacy\nDefinition (Simplified V...,it-security-all-slides_no_duplicates_593,datasets/master/images/it-security-all-slides_...,What is the definition of Differential Privacy?
2179,it-security-all-slides_no_duplicates.pdf,594,What is the parameter 𝜖 and the composition t...,,,On the Parameter 𝜺,it-security-all-slides_no_duplicates_594,datasets/master/images/it-security-all-slides_...,What is the parameter 𝜖 and the composition t...
2180,it-security-all-slides_no_duplicates.pdf,595,What is the Privacy Budget? What are challenges?,,,Privacy Budget,it-security-all-slides_no_duplicates_595,datasets/master/images/it-security-all-slides_...,What is the Privacy Budget? What are challenges?


In [6]:
# Persist the intermediate result, this time without writing the index column
data_df.to_csv(path_or_buf="tmp.csv", index=False)

In [7]:
# Remove the columns that were only needed to build image_id, image_url and questions
obsolete_columns = [
    "Question 1",
    "Question 2",
    "Question 3",
    "PDF-Name",
    "Page Number",
]
data_df.drop(columns=obsolete_columns, inplace=True)
data_df

Unnamed: 0,keyword,image_id,image_url,questions
0,Verification and Validation (V&V),ase_combined_6,datasets/master/images/ase_combined_6.png,What is the definition of Verification and Val...
1,V&V Goals,ase_combined_7,datasets/master/images/ase_combined_7.png,What are the goals of verification and validat...
2,Static versus Dynamic V&V,ase_combined_8,datasets/master/images/ase_combined_8.png,What is static and what is dynamic V&V?---How ...
3,Relative Cost of Fixing Defects,ase_combined_9,datasets/master/images/ase_combined_9.png,How the relative cost of fixing defects behave...
4,Model-Driven Development,ase_combined_10,datasets/master/images/ase_combined_10.png,What is model-driven development?
...,...,...,...,...
2177,Differential Privacy\nIntuition,it-security-all-slides_no_duplicates_592,datasets/master/images/it-security-all-slides_...,What is the intuition of Differential Privacy?
2178,Differential Privacy\nDefinition (Simplified V...,it-security-all-slides_no_duplicates_593,datasets/master/images/it-security-all-slides_...,What is the definition of Differential Privacy?
2179,On the Parameter 𝜺,it-security-all-slides_no_duplicates_594,datasets/master/images/it-security-all-slides_...,What is the parameter 𝜖 and the composition t...
2180,Privacy Budget,it-security-all-slides_no_duplicates_595,datasets/master/images/it-security-all-slides_...,What is the Privacy Budget? What are challenges?


In [8]:
# Filter misplaced \n: replace line breaks inside the keywords with a space.
# regex=False makes this a plain literal replacement, so the result does not
# depend on the pandas version's default for the `regex` parameter.
# NOTE(review): only the keyword column is cleaned here; the questions column
# is left untouched — confirm that this is intended.
data_df["keyword"] = data_df["keyword"].str.replace("\n", " ", regex=False)
data_df.to_csv("tmp.csv", index=False)

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Reload the converted data and bring the columns into the order of the apple format
apple_columns = ["image_id", "image_url", "questions", "keyword"]
data_df = pd.read_csv("tmp.csv")
data_df = data_df[apple_columns]

# Create the train / test / validation split: 70% of all samples go into train,
# the remaining 30% are split again into test (70% of the rest) and val (30% of the rest)
split_seed = 42
train, tmp = train_test_split(data_df, test_size=0.3, random_state=split_seed)
test, val = train_test_split(tmp, test_size=0.3, random_state=split_seed)

train

Unnamed: 0,image_id,image_url,questions,keyword
1823,it-security-all-slides_no_duplicates_112,datasets/master/images/it-security-all-slides_...,What are the problems of storing passwords in ...,Storing Passwords on Server Plaintexts
439,decision_support_combined_89,datasets/master/images/decision_support_combin...,How is the expected discounted reward value of...,Value of a Policy
271,corporate_knowledge_management_combined_238,datasets/master/images/corporate_knowledge_man...,What is clustering in the context of text mini...,Functionalities of text mining: 5) Clustering
244,corporate_knowledge_management_combined_200,datasets/master/images/corporate_knowledge_man...,"What is the difference between structured, sem...",Data continuum
1919,it-security-all-slides_no_duplicates_258,datasets/master/images/it-security-all-slides_...,What is the Buffer Overflow?,Buffer Overflow Attack Idea
...,...,...,...,...
1638,network_science_combined_102,datasets/master/images/network_science_combine...,What are the problems in proving the idea of s...,6 DEGREES OF SEPARATION
1095,large_scale_combined_410,datasets/master/images/large_scale_combined_41...,Explain the centralized two phase locking---Ho...,Centralized 2PL
1130,large_scale_combined_453,datasets/master/images/large_scale_combined_45...,How does the HBase data model work?---Why is H...,HBase data model
1294,leadership_combined_107,datasets/master/images/leadership_combined_107...,What is the relationship between the different...,Types of motivation on desirable and undesirab...


In [3]:
# Look up the training sample that belongs to one specific slide
is_inspected_slide = train["image_id"].eq("leadership_combined_172")
train[is_inspected_slide]

Unnamed: 0,image_id,image_url,questions,keyword
1355,leadership_combined_172,datasets/master/images/leadership_combined_172...,What are universally desirable leadership\natt...,Universally desirable leadership attributes


In [4]:
# Write every split to its own ';'-separated csv file (without the index column)
split_files = (
    ("datasets/master/apple_vqg_train.csv", train),
    ("datasets/master/apple_vqg_test.csv", test),
    ("datasets/master/apple_vqg_val.csv", val),
)
for split_path, split_df in split_files:
    split_df.to_csv(split_path, index=False, sep=';')
test

Unnamed: 0,image_id,image_url,questions,keyword
937,large_scale_combined_226,datasets/master/images/large_scale_combined_22...,How can words be counted with MapReduce?,Counting words: Map
1018,large_scale_combined_321,datasets/master/images/large_scale_combined_32...,Describe the RDD workflow---How does the RDD w...,Working with RDDs (1)
1211,leadership_combined_16,datasets/master/images/leadership_combined_16.png,What are key aspects and differences of the ki...,Which is the Kindergarten Report Card? Which i...
916,large_scale_combined_194,datasets/master/images/large_scale_combined_19...,What are the communication costs for semi join...,Communication cost (1)
1196,large_scale_combined_542,datasets/master/images/large_scale_combined_54...,What is a Merkle tree?---Why is the Merkle tre...,Merkle tree
...,...,...,...,...
1029,large_scale_combined_332,datasets/master/images/large_scale_combined_33...,Which transformations exist in spark?,Transformations (2)
450,decision_support_combined_137,datasets/master/images/decision_support_combin...,What is the internal structure of the formula ...,Internal structure of the Formula
65,ase_combined_112,datasets/master/images/ase_combined_112.png,What are the two approaches to Input Domain Mo...,Two Approaches to Input Domain Modeling
634,information_retrieval_combined_172,datasets/master/images/information_retrieval_c...,How does the ranking work when relevance judge...,Binary independence model


## Running the model on the dataset
- Running it like this in the notebook is not advised, as it logs too much info. It should thus only be run from the terminal, as described in the model's readme.md
- It is important to first create the correct config.yml for the run, which specifies all needed parameters and settings. For more info on the yml, look at the training-config.yaml in src/image_to_text/vqg-multimodal-assistant/config/

In [None]:
from src.apple_vqg import run

class arguments:
    """Plain container for the two settings handed to ``run.main``.

    Attributes are stored exactly as passed in:
        model_dir: value given for the model directory.
        c: value given for the config file path.
    """

    def __init__(self, model_dir, c):
        self.model_dir, self.c = model_dir, c

# Bundle the model directory and the config path, then start the run
run_arguments = arguments(
    model_dir="./model",
    c="src/apple_vqg/config/training-config-Master.yaml",
)
run.main(run_arguments)

2023-07-28 15:09:44.271503: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-28 15:09:44.319974: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'datasets': {'embedding_file': 'src/apple_vqg/data/glove_words.txt',
              'keyword': 'NO',
              'max_train_size': 3000,
              'name': 'master',
              'test_file': 'datasets/master/apple_vqg_test.csv',
              'train_file': 'datasets/master/apple_vqg_train.csv',
              'validation_file': 'datasets/master/apple_vqg_val.csv'},
 'is_training': 'YES',
 'logging_level': 'i',
 'model_parameters': {'decoder': {'algorithm': 'greedy',
                                  'beam_size': 5,
                                  'hidden_dim': 256},
                      'image_encoder': {'algorithm': 'DenseNet',
                                        'image_embedding_dim': 94080,
                                        'image_height': 224,
                                        'image_width': 224},
                      'inference': {'model_file': 'apple/bert/densenet/model_121.h5',
                                    'user_input': 'NO'},
                   

2023-07-28 15:09:51.595770: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30916 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:3b:00.0, compute capability: 7.0


[2023-07-28 15:09:55,838] {datasets.py:87} INFO - DenseNet model loaded
[2023-07-28 15:09:55,867] {run.py:167} INFO - Max question len: 34
[2023-07-28 15:09:55,867] {run.py:168} INFO - Max training samples: 3000
[2023-07-28 15:09:55,868] {run.py:169} INFO - Vocabulary: 2291
[2023-07-28 15:09:55,868] {question_generation_model.py:54} INFO - Initialize model
[2023-07-28 15:09:56,255] {question_generation_model.py:86} INFO - Embedding_index: 400000
[2023-07-28 15:09:56,309] {datasets.py:184} INFO - Done creating unique training question set. Size: 1830


2023-07-28 15:10:00.202143: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8700


[2023-07-28 15:10:11,745] {datasets.py:192} INFO - Processing image id # : 100
[2023-07-28 15:10:20,804] {datasets.py:192} INFO - Processing image id # : 200
[2023-07-28 15:10:30,034] {datasets.py:192} INFO - Processing image id # : 300
[2023-07-28 15:10:39,197] {datasets.py:192} INFO - Processing image id # : 400
[2023-07-28 15:10:44,633] {datasets.py:207} INFO - Test data loaded
[2023-07-28 15:10:53,806] {datasets.py:223} INFO - Processing image id # : 100
[2023-07-28 15:11:02,989] {datasets.py:223} INFO - Processing image id # : 200
[2023-07-28 15:11:12,119] {datasets.py:223} INFO - Processing image id # : 300
[2023-07-28 15:11:21,162] {datasets.py:223} INFO - Processing image id # : 400
[2023-07-28 15:11:30,309] {datasets.py:223} INFO - Processing image id # : 500
[2023-07-28 15:11:39,513] {datasets.py:223} INFO - Processing image id # : 600
[2023-07-28 15:11:49,632] {datasets.py:223} INFO - Processing image id # : 700
[2023-07-28 15:11:59,008] {datasets.py:223} INFO - Processing i