# Testing Notebook for Cvlac Page Type Detection.
- Author: David Santiago Barreto Mora
- Last updated: 08/01/2024

---

In [17]:
# Import Python libraries
from typing import List, NamedTuple
from enum import Enum
import traceback

In [18]:
# Import third party modules
from bs4 import BeautifulSoup
# import knime.scripting.io as knio

In [19]:
# Enumeration of the categories a scraped CvLAC page can be labelled with.
class CvlacType(Enum):
    """Labels assigned to a scraped CvLAC page.

    Each member's value is the Spanish text that
    ``parse_and_identify_page_type`` returns (it returns ``member.value``,
    not the member itself).
    """

    # Page containing the "Formación Académica" text.
    NORMAL = "Normal"
    # Page whose blockquote carries the "not available by request of the
    # researcher" notice.
    PRIVATE = "Privado"
    # Page without the "Formación Académica" text.
    EMPTY = "Vacio"
    # Presumably meant for pages matching none of the known layouts.
    UNKNOWN = "Tipo desconocido"
    # Returned when an exception is raised while classifying the page.
    # NOTE(review): "processando" is misspelled (Spanish: "procesando"); the
    # value is emitted as data, so confirm downstream consumers before fixing.
    ERROR = "Error processando la URL"


In [20]:
# Named tuple definition for type hinting.
# Data structure describing one row of the dataframes handled by this notebook.
class ParsedCvlac(NamedTuple):
    """Typed record pairing a CvLAC page with its detected type.

    NOTE(review): nothing in the visible part of this notebook constructs this
    tuple; it appears to exist only for type hinting.
    """

    # URL of the CvLAC page.
    url_cvlac: str
    # HTML of the page as a string (presumably the raw scraped document).
    html_document: str
    # Detected page category.
    # NOTE(review): parse_and_identify_page_type returns the enum member's
    # string value rather than a CvlacType member; confirm which is intended.
    cvlac_type: CvlacType

In [21]:
# Definition of the detection function
def parse_and_identify_page_type(df_row) -> str:
    """Classify a scraped CvLAC page and return its type label.

    Args:
        df_row: Mapping-like row (presumably a pandas Series) exposing the
            keys ``"Document"`` (the page HTML as a string) and
            ``"url_cvlac"`` (the page URL, used only in the error message).

    Returns:
        The ``value`` of one ``CvlacType`` member:

        * ``PRIVATE`` when the first ``<blockquote>`` contains exactly the
          "information not available" notice.
        * ``EMPTY`` when the text "Formación Académica" is absent.
        * ``NORMAL`` when that text is present.
        * ``ERROR`` when any exception is raised while parsing or
          inspecting the document.

    Raises:
        KeyError: If ``df_row`` lacks one of the two expected keys (the row
            is read before the guarded section so the URL can be reported).
    """
    html_string = df_row["Document"]
    cvlac_url = df_row["url_cvlac"]

    try:
        # Parse inside the guarded section so a malformed document (e.g. a
        # missing/NaN cell) is reported as ERROR instead of aborting the run.
        soup = BeautifulSoup(html_string, "html.parser")

        # 1. Identification of private pages. They are the simplest ones.
        private_page_string = "La información de este currículo no está disponible por solicitud del investigador"
        blockquote_element = soup.find("blockquote")
        # find() returns None when the page has no <blockquote>; such a page
        # is simply not private. strip() must be *called*: comparing against
        # the bound method is always False.
        if (
            blockquote_element is not None
            and blockquote_element.text.strip() == private_page_string
        ):
            return CvlacType.PRIVATE.value

        # 2. Identification of empty vs. normal pages: a normal page contains
        #    the "Formación Académica" text, an empty one does not.
        academic_formation_string = "Formación Académica"
        if soup.find(string=academic_formation_string) is None:
            return CvlacType.EMPTY.value
        return CvlacType.NORMAL.value

        # NOTE: the two checks above are exhaustive, so CvlacType.UNKNOWN is
        # never produced by this function.
    except Exception:
        # Best-effort classification: report the failure and label the row,
        # but let KeyboardInterrupt/SystemExit propagate.
        print(f"Error ocurred in processing of cvlac: {cvlac_url}")
        print(traceback.format_exc())
        return CvlacType.ERROR.value


In [22]:
# Helper that loads an HTML file from disk and hands it back as one string.
def read_html_file(file_path: str):
    """Return the full text of the file at ``file_path``.

    The file is opened in text mode and decoded as latin-1; it is closed
    before the function returns.
    """
    with open(file_path, mode="r", encoding="latin-1") as html_file:
        return html_file.read()


---

## Private page sampling

In [23]:
# Read the two private-page samples into memory as HTML strings.
# The paths are relative, so they resolve against the current working
# directory; read_html_file decodes each file as latin-1.
private_sample_1 = read_html_file("../sample_scraped_cvlacs/private_sample_1.html")
private_sample_2 = read_html_file("../sample_scraped_cvlacs/private_sample_2.html")


---

## Empty page sampling

In [24]:
# Read the two empty-page samples into memory as HTML strings.
# The paths are relative, so they resolve against the current working
# directory; read_html_file decodes each file as latin-1.
empty_sample_1 = read_html_file("../sample_scraped_cvlacs/empty_sample_1.html")
empty_sample_2  = read_html_file("../sample_scraped_cvlacs/empty_sample_2.html")


---

## Normal page sampling

In [25]:
# Read the two normal-page samples into memory as HTML strings.
# The paths are relative, so they resolve against the current working
# directory; read_html_file decodes each file as latin-1.
normal_sample_1 = read_html_file("../sample_scraped_cvlacs/normal_sample_1.html")
normal_sample_2 = read_html_file("../sample_scraped_cvlacs/normal_sample_2.html")
