In [33]:
# Extracción de texto 
data_path = "../data/Databricks Certified Associate Developer for Apache Spark.pdf"
pattern = r'(?m)^\s*\d+\s*$' # eliminar líneas con sólo números (numero de página del libro)

import fitz
import re
import os
import json

extract_plain_text = False
export_json_questions = True

def delete_lines_w_digits_only(text):
    return re.sub(pattern, '', text) 

def delete_strange_characters(text):
    """Son caracteres que salen de los bloques de código, queremos eliminarlos"""
    return re.sub("\u202f", "", text)

def extract_cleaned_text(data_path, start_page, end_page):
    doc = fitz.open(data_path)
    selected_text = ""

    for n_page in range(start_page, end_page+1):
        page = doc.load_page(n_page)
        text_from_page = page.get_text()

        # Clean text
        cleaned_text = delete_lines_w_digits_only(text_from_page)
        cleaned_text = delete_strange_characters(cleaned_text)

        # Specific noise:
        cleaned_text = re.sub(r"(Mock Test\s*\d+|Questions)", "", cleaned_text, flags=re.IGNORECASE)
        cleaned_text = re.sub(r"\n{2,}", "\n", cleaned_text)
        
        # Add important text
        if cleaned_text.strip():
            selected_text += cleaned_text + "\n"
    
    return selected_text

def extract_options(text_block): 
    """
    Extraer las opciones de respuesta A-E manteniendo las indexaciones, para casos donde haya un listado dentro de la opción de respuesta
    del tipo i. ii. iii. iv. v. etc.
    """

    split = re.split(
        r'\n(?=([A-E])\.\s)', # splittea por A. B. C. D. E. 
        text_block.strip()
    )

    results = {} # resultado final
    current_key = None # inicializamos en None, pero irá recogiendo A, B, C, etc.
    buffer = [] # el contenido de la opción

    for part in split: 
        m = re.match(r'^([A-E])\.\s+(.*)', part, re.DOTALL) 
        if m: # si la parte es el comienzo de una opción
            if current_key: # si hay ya un indicador de A, B, C
                results[current_key] = "\n".join(buffer).strip() # le añadimos lo que haya más el buffer que tendrá la siguiente indexación
            current_key = m.group(1) # guardará el primer grupo de captura que corresponde a  "([A-E])" # actualiza la clave con la nueva opción
            buffer = [m.group(2)] # guardará el segundo grupo de captura que corresponde a el texto de la opción 
        elif current_key:
            buffer.append(part) # si no es un inicio de opción con A, B, etc entonces es otra línea de la anterior opción, lo guardamos en buffer.

    if current_key: # cuando terminamos de ver todas las partes, guardamos el último buffer en el current_key si este existiera.
        # desp del último match no hay una opción que guarde el último buffer procesado, entonces se añade
        results[current_key] = "\n".join(buffer).strip()
    return results

def extract_questions_from_text(cleaned_text: str, test_number: int):
    # sep questions
    pattern = re.compile(
        r"^Question\s+(\d+):\s*((?:.*(?:\n(?!Question\s+\d+:|\nAnswers|\Z).*)*))",
        re.MULTILINE
    )
    
    # sep options  -- ahora las opciones se procesan con una nueva función
    # option_splitter = re.compile(r"\n([A-E])\.\s+")
    
    # sep answers by "id"
    answers_pattern = re.compile(r"\n\s*(\d+)\.\s*([A-E])")
    answers = {int(q): a for q, a in answers_pattern.findall(cleaned_text)}

    questions = []
    i = 0
    for match in pattern.finditer(cleaned_text):
        q_num = int(match.group(1)) # "Question X"
        q_text = match.group(2).strip() # text from question
        i+=1
        # Dividir el enunciado de bloque de opciones:
        # Extraer todas las opciones
        options = extract_options(q_text)
        
        # Extraer el texto anterior a la primera opción encontrada
        if options:
            # Buscar la posición exacta de la opción A. (no su contenido)
            option_a_match = re.search(r"\nA\.\s", q_text)
            if option_a_match:
                question_text = q_text[:option_a_match.start()].strip()
            else:
                question_text = q_text.strip()
        else:
            question_text = q_text.strip()

        #parts = option_splitter.split(q_text) # sep text
        #if len(parts) > 3:
        #    question_text = parts[0].strip()
        #   options = {
        #        parts[i]: parts[i + 1].strip()
        #        for i in range(1, len(parts) - 1, 2)
        #    }
        #else: # if it is not a questions save only text
        #    question_text = q_text
        #    options = {}

        question_id = f"Test_{test_number}_{q_num}"  # create a ID
        correct = answers.get(q_num, None) # get the answer by num of question

        questions.append({ # save info
            "question_id": question_id,
            "question": question_text,
            "options": options,
            "correct_answer": correct
        })

    return questions

# limits of tests: (pages)
mock_test_1_pages = [211, 234]
mock_test_2_pages = [235, 258]

# Extraer las preguntas de forma ordenada

In [34]:
# paths for cleaned_texts
plain_text_1_path = "../data/exams_test2/plain_text/clean_text_test1.txt"
plain_text_2_path = "../data/exams_test2/plain_text/clean_text_test2.txt"

if extract_plain_text:
    text_test1 = extract_cleaned_text(data_path, *mock_test_1_pages)
    text_test2 = extract_cleaned_text(data_path, *mock_test_2_pages)

    os.makedirs("../data/exams_testsv2/plain_text", exist_ok=True)
    with open(plain_text_1_path, "w", encoding="utf-8") as f:
        f.write(text_test1)
    with open(plain_text_2_path, "w", encoding="utf-8") as f:
        f.write(text_test2)
else:
    with open(plain_text_1_path, "r", encoding="utf-8") as f:
        text_test1 = f.read()
    with open(plain_text_2_path, "r", encoding="utf-8") as f:
        text_test2 = f.read()

In [35]:
# --- Extracción estructurada ---
questions_test1 = extract_questions_from_text(text_test1, test_number=1)
questions_test2 = extract_questions_from_text(text_test2, test_number=2)
all_questions = questions_test1 + questions_test2

# --- Exportación JSON ---
if export_json_questions:
    os.makedirs("../data/exams_tests2/clean_questions/", exist_ok=True)
    with open("../data/exams_tests2/clean_questions/questions_all_tests.json", "w", encoding="utf-8") as f:
        json.dump(all_questions, f, ensure_ascii=False, indent=4)
    print(f"✅ Exportadas {len(all_questions)} preguntas en JSON.")
else:
    print(f"✅ Procesadas {len(all_questions)} preguntas (sin exportar).")

✅ Exportadas 120 preguntas en JSON.


In [36]:
all_questions[20]

{'question_id': 'Test_1_21',
 'question': 'Which of the following code blocks performs an inner join of the salarydf and employeedf \nDataFrames for columns employeeSalaryID and employeeID, respectively?',
 'options': {'A': 'salarydf.join(employeedf, salarydf.employeeID == employeedf.\nemployeeSalaryID)\n\n\nB',
  'B': 'i.\t\nSalarydf.createOrReplaceTempView(salarydf)\nii.\t employeedf.createOrReplaceTempView(\'employeedf\')\niii.\t spark.sql("SELECT * FROM salarydf CROSS JOIN employeedf ON \nemployeeSalaryID ==employeeID")\nC',
  'C': 'i.\t\nsalarydf\nii.\t .join(employeedf, col(employeeID)==col(employeeSalaryID))\nD',
  'D': "i.\t\nSalarydf.createOrReplaceTempView(salarydf)\nii.\t employeedf.createOrReplaceTempView('employeedf')\niii.\t SELECT * FROM salarydf\niv.\t INNER JOIN employeedf\nv.\t\nON salarydf.employeeSalaryID == employeedf. employeeID"},
 'correct_answer': 'D'}

In [37]:
all_questions[24]

{'question_id': 'Test_1_25',
 'question': 'The following code block should return a df DataFrame, where the employeeID column is converted \ninto an integer. Choose the answer that correctly fills the blanks in the code block to accomplish this:\ndf.__1__(__2__.__3__(__4__))',
 'options': {'A': 'i.\t\nselect\nii.\t col("employeeID")\niii.\t as\niv.\t IntegerType\nB',
  'B': 'i.\t\nselect\nii.\t col("employeeID")\niii.\t as\niv.\t Integer\n\n\nC',
  'C': 'i.\t\ncast\nii.\t "employeeID"\niii.\t as\niv.\t IntegerType()\nD',
  'D': 'i.\t\nselect\nii.\t col("employeeID")\niii.\t cast\niv.\t IntegerType()'},
 'correct_answer': 'D'}

In [38]:
all_questions[37]

{'question_id': 'Test_1_38',
 'question': 'The following code block should write the df DataFrame as a Parquet file to the filePath path, \nreplacing any existing file. Choose the answer that correctly fills the blanks in the code block to \naccomplish this:\ndf.__1__.format("parquet").__2__(__3__).__4__(filePath)',
 'options': {'A': 'i.\t\nsave\nii.\t mode\niii.\t "ignore"\niv.\t path\n\n\nB',
  'B': 'i.\t\nstore\nii.\t with\niii.\t "replace"\niv.\t path\nC',
  'C': 'i.\t\nwrite\nii.\t mode\niii.\t "overwrite"\niv.\t save\nD',
  'D': 'i.\t\nsave\nii.\t mode\niii.\t "overwrite"\niv.\t path'},
 'correct_answer': 'C'}