# Testing Notebook for Cvlac Page Type Detection.
- Author: David Santiago Barreto Mora
- Last updated: 08/01/2024

---

In [74]:
# Import Python libraries
from typing import List, NamedTuple
from enum import Enum
import traceback

In [75]:
# Import third party modules
from bs4 import BeautifulSoup
# import knime.scripting.io as knio

In [76]:
# Cvlac type enum definition
class CvlacType(Enum):
    """Labels that a scraped CvLAC page can be classified with.

    The ``value`` of each member is the string handed back by the page type
    detection functions.
    """

    # Page in which the "Formación Académica" text was found.
    NORMAL = "Normal"

    # Page showing the notice that the researcher hid the curriculum.
    PRIVATE = "Privado"

    # Page in which the "Formación Académica" text was not found.
    EMPTY = "Vacio"

    # Fallback label for a page matching none of the known types.
    UNKNOWN = "Tipo desconocido"

    # Label used when an exception is raised while processing the page.
    ERROR = "Error processando la URL"


In [77]:
# Named tuple definition for type hinting
# Data structure for each row of all dataframes.
class ParsedCvlac(NamedTuple):
    """Typed record describing one dataframe row of scraped CvLAC data."""

    # Address of the CvLAC page.
    url_cvlac: str

    # HTML of the page, kept as a string.
    html_document: str

    # Classification assigned to the page.
    cvlac_type: CvlacType


In [78]:
# Definition of detection function
def parse_and_identify_page_type(df_row) -> str:
    """Classify the CvLAC page stored in one dataframe row.

    Args:
        df_row: Row object indexable by column name. The page HTML is read
            from ``"Document"`` and the page address from ``"url_cvlac"``.
            Both lookups happen before the ``try`` block, so an error raised
            by a missing column propagates to the caller.

    Returns:
        The ``value`` of a ``CvlacType`` member:

        * ``PRIVATE`` when the second ``<blockquote>`` contains the
          "not available" notice;
        * ``EMPTY`` when no text node equal to "Formación Académica" exists;
        * ``NORMAL`` when that text node exists;
        * ``ERROR`` when parsing or inspecting the document raises.
    """
    html_string = df_row["Document"]
    cvlac_url = df_row["url_cvlac"]

    try:
        # Parsing is done inside the try block so that an unusable document
        # (e.g. a missing value) is reported as ERROR instead of crashing.
        soup = BeautifulSoup(html_string, "html.parser")

        # 1. Identification of private pages. They are the simplest ones.
        private_msg_baseline = "La información de este currículo no está disponible por solicitud del investigador"
        blockquotes = soup.find_all("blockquote")
        # Only inspect the second blockquote when it exists. A page with
        # fewer blockquotes is simply not private and must still go through
        # the remaining checks rather than fail with an IndexError.
        if len(blockquotes) > 1 and blockquotes[1].text.strip() == private_msg_baseline:
            return CvlacType.PRIVATE.value

        # 2. Identification of empty pages. Second simplest.
        academic_formation_string = "Formación Académica"
        if soup.find(string=academic_formation_string) is None:
            return CvlacType.EMPTY.value

        # 3. The academic formation section is present: normal page.
        return CvlacType.NORMAL.value
    except Exception:
        # Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit from being swallowed while processing many rows.
        print(f"Error ocurred in processing of cvlac: {cvlac_url}")
        print(traceback.format_exc())
        return CvlacType.ERROR.value


In [79]:
# Definition of detection function (test variant that takes the raw HTML string)
def parse_and_identify_page_type_test(html_string) -> str:
    """Classify a CvLAC page given directly as an HTML string.

    Args:
        html_string: Markup of the page to classify.

    Returns:
        The ``value`` of a ``CvlacType`` member:

        * ``PRIVATE`` when the second ``<blockquote>`` contains the
          "not available" notice;
        * ``EMPTY`` when no text node equal to "Formación Académica" exists;
        * ``NORMAL`` when that text node exists;
        * ``ERROR`` when parsing or inspecting the document raises.
    """
    try:
        # Parsing is done inside the try block so that an unusable document
        # (e.g. ``None``) is reported as ERROR instead of crashing.
        soup = BeautifulSoup(html_string, "html.parser")

        # 1. Identification of private pages. They are the simplest ones.
        private_msg_baseline = "La información de este currículo no está disponible por solicitud del investigador"
        blockquotes = soup.find_all("blockquote")
        # Only inspect the second blockquote when it exists. A page with
        # fewer blockquotes is simply not private and must still go through
        # the remaining checks rather than fail with an IndexError.
        if len(blockquotes) > 1 and blockquotes[1].text.strip() == private_msg_baseline:
            return CvlacType.PRIVATE.value

        # 2. Identification of empty pages. Second simplest.
        academic_formation_string = "Formación Académica"
        if soup.find(string=academic_formation_string) is None:
            return CvlacType.EMPTY.value

        # 3. The academic formation section is present: normal page.
        return CvlacType.NORMAL.value
    except Exception:
        # Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit from being swallowed.
        print(traceback.format_exc())
        return CvlacType.ERROR.value


In [80]:
# Function for reading HTML files as strings
# Read the entire contents of an HTML file into a string
def read_html_file(file_path: str):
    """Return the whole content of ``file_path`` as text decoded as latin-1."""
    with open(file_path, mode="r", encoding="latin-1") as html_handle:
        return html_handle.read()


---

## Private page sampling

In [81]:
# Read the two private-page sample files as strings
# (read_html_file decodes them as latin-1).
private_sample_1 = read_html_file("../sample_scraped_cvlacs/private_sample_1.html")
private_sample_2 = read_html_file("../sample_scraped_cvlacs/private_sample_2.html")


In [82]:
# Parse the first private sample into a soup object and define the notice
# text that the scraped page is compared against.
soup = BeautifulSoup(private_sample_1 ,"html.parser")
private_page_string = "La información de este currículo no está disponible por solicitud del investigador"

In [83]:
# Dump the parsed private sample as indented markup for visual inspection.
print(soup.prettify())

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   CvLAC - RG
  </title>
  <script>
   var texto="";
        var NS4 = (document.layers); // Which browser?
        var IE4 = (document.all);
        var win = window; // window to search.
        var n = 0;

        function resaltar(valor){
            var encontro=0;
            var donde=0;
            //valor=document.getElementById('tt').value;
            reemplazar=RegExp(valor,"i");
            por="<span style=\u0022background-Color:yellow;\u0022>"+valor+"</span>";
            if(texto==""){texto=document.body.innerHTML};
            txt=texto.split(">");
            for (x=0;x<txt.length;x++){
                desde=(txt[x].indexOf("<")!=-1)?txt[x].indexOf("<"):0;
                tempP=txt[x].slice(0,desde);
                tempU=txt[x].slice(desde);
                tempP

In [84]:
# Take the text of the second <blockquote> inside <body>, then strip the
# surrounding whitespace.
private_string_dirty = soup.body.find_all("blockquote")[1].text
private_string_clean = private_string_dirty.strip()

In [85]:
# Check that the scraped text equals the expected private-page notice.
private_string_clean == private_page_string

True

In [86]:
# repr of the scraped notice: makes escaped or non-printable characters visible.
repr(soup.body.find_all("blockquote")[1].text.strip())

"'La información de este currículo no está disponible por solicitud del investigador'"

In [87]:
# repr of the expected notice, to compare against the repr of the scraped text.
repr(private_page_string)


"'La información de este currículo no está disponible por solicitud del investigador'"

In [88]:
# Print the stripped text of the second <blockquote> as plain text.
print(soup.body.find_all("blockquote")[1].text.strip())

La información de este currículo no está disponible por solicitud del investigador


In [89]:
# Run the detection function on the raw HTML of the first private sample.
parse_and_identify_page_type_test(private_sample_1)

'Privado'

---

## Empty page sampling

In [90]:
# Read the two empty-page sample files as strings
# (read_html_file decodes them as latin-1).
empty_sample_1 = read_html_file("../sample_scraped_cvlacs/empty_sample_1.html")
empty_sample_2  = read_html_file("../sample_scraped_cvlacs/empty_sample_2.html")


---

## Normal page sampling

In [91]:
# Read the two normal-page sample files as strings
# (read_html_file decodes them as latin-1).
normal_sample_1 = read_html_file("../sample_scraped_cvlacs/normal_sample_1.html")
normal_sample_2 = read_html_file("../sample_scraped_cvlacs/normal_sample_2.html")


In [None]:
# Parse the first normal sample; this rebinds `soup`, which until now held
# the parsed private sample.
soup = BeautifulSoup(normal_sample_1, "html.parser")
