# Testing Notebook for Cvlac Page Type Detection.
- Author: David Santiago Barreto Mora
- Last updated: 08/01/2024

---

In [303]:
# Import Python libraries
from typing import List, NamedTuple
from enum import Enum
import traceback

In [304]:
# Import third party modules
from bs4 import BeautifulSoup
# import knime.scripting.io as knio

In [305]:
# Cvlac type enum definition
class CvlacType(Enum):
    """Categories a scraped CvLAC page can be classified into.

    The detection functions in this notebook return the ``value`` string of
    one of these members (e.g. ``CvlacType.PRIVATE.value``).
    """

    # A sibling row after the green-message row holds more than one element.
    NORMAL = "Normal"
    # The page shows the "not available by request of the researcher" notice.
    PRIVATE = "Privado"
    # No sibling row after the green-message row holds more than one element.
    EMPTY = "Vacio"
    # None of the known page layouts matched.
    UNKNOWN = "Tipo desconocido"
    # An exception was raised while processing the page.
    # NOTE(review): "processando" looks like a typo for "procesando"; left
    # unchanged because consumers may match on this exact string - confirm.
    ERROR = "Error processando la URL"


In [306]:
# Named tuple definition for type hinting
# Data structure for each row of all dataframes.
class ParsedCvlac(NamedTuple):
    """Typed record describing one dataframe row for a processed CvLAC page.

    Exists for type hinting; nothing in the visible part of the notebook
    instantiates it.
    """

    # URL of the CvLAC page.
    url_cvlac: str
    # presumably the raw HTML of the page - TODO confirm; the row-based
    # detection function reads the HTML from a "Document" key instead.
    html_document: str
    # Detected page category.
    # NOTE(review): the detection functions return CvlacType.<member>.value
    # (a str), not the enum member itself - confirm which one is stored here.
    cvlac_type: CvlacType


In [307]:
# Definition of detection function
def parse_and_identify_page_type(df_row) -> str:
    """Classify the CvLAC page stored in a dataframe row.

    Only private-page detection is implemented so far; any other page that
    parses without error is reported as unknown.

    Args:
        df_row: Mapping-like row exposing the page HTML under ``"Document"``
            and the page URL under ``"url_cvlac"``.

    Returns:
        The ``value`` string of a ``CvlacType`` member: ``PRIVATE`` when the
        second ``<blockquote>`` holds the privacy notice, ``ERROR`` when the
        detection raises (for instance when the page has fewer than two
        ``<blockquote>`` elements), and ``UNKNOWN`` otherwise.

    Raises:
        KeyError: If the row lacks the ``"Document"`` or ``"url_cvlac"`` key
            (for mapping-like rows; the lookups happen outside the ``try``).
    """
    html_string = df_row["Document"]
    cvlac_url = df_row["url_cvlac"]

    soup = BeautifulSoup(html_string, "html.parser")

    try:
        # 1. Identification of private pages. They are the simplest ones.
        private_msg_baseline = "La información de este currículo no está disponible por solicitud del investigador"
        private_msg_scraped = soup.find_all("blockquote")[1].text.strip()
        if private_msg_scraped == private_msg_baseline:
            return CvlacType.PRIVATE.value

        # 2. Identification of empty pages. Second simplest.

        # 4. If none of the if statements were entered, this is an unknown page.
        return CvlacType.UNKNOWN.value
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit are not swallowed and reported as a
        # page-processing error.
        print(f"Error ocurred in processing of cvlac: {cvlac_url}")
        print(traceback.format_exc())
        return CvlacType.ERROR.value


In [308]:
# Definition of detection function (test version that takes the HTML string directly)
def parse_and_identify_page_type_test(html_string) -> str:
    """Classify a CvLAC HTML document as normal, empty, private or unknown.

    The green-message tick image (a 15px x 15px ``<img>``) is the anchor.
    When it is present, the rows following its enclosing row decide between
    a normal and an empty page. When it is absent, the page is checked for
    the privacy notice instead.

    Args:
        html_string: Full HTML source of the page.

    Returns:
        The ``value`` string of a ``CvlacType`` member:

        * ``NORMAL``  - some following row has a ``<td>`` with more than one
          element child.
        * ``EMPTY``   - the anchor exists but no following row has that.
        * ``ERROR``   - a following sibling element contains no ``<td>``.
        * ``PRIVATE`` - no usable anchor, and the second ``<blockquote>``
          holds the privacy notice.
        * ``UNKNOWN`` - none of the above. Earlier versions returned ``None``
          or raised ``IndexError`` here.
    """
    soup = BeautifulSoup(html_string, "html.parser")

    # 2 - 3. Identification of empty and normal pages. The detection for both goes hand in hand.
    # Firstly, the image element in the green message is searched. It serves as the initial reference point.
    green_msg_tick_img = soup.find("img", {"height": "15px", "width": "15px"})

    # Climb three levels up from the image (img -> blockquote -> td -> tr in
    # the sample pages). A missing image or a too-shallow tree yields None
    # instead of relying on an AttributeError for control flow.
    green_msg_row = green_msg_tick_img
    for _ in range(3):
        if green_msg_row is None:
            break
        green_msg_row = green_msg_row.parent

    if green_msg_row is not None:
        # Keep only element siblings; text nodes have no tag name.
        sibling_rows = [element for element in green_msg_row.next_siblings if element.name is not None]

        for sibling_tr in sibling_rows:
            sibling_td = sibling_tr.td
            if sibling_td is None:
                # A sibling without any <td> does not match the expected layout.
                print("Error in parsing sibling <tr>s.")
                print(sibling_tr)
                return CvlacType.ERROR.value

            # Inside each sibling <tr>, check whether its <td> has more than 1 element child.
            # If it does, the page has content; with 1 or 0 children the page is empty so far.
            td_children = [element for element in sibling_td.children if element.name is not None]
            if len(td_children) > 1:
                return CvlacType.NORMAL.value

        # No row held more than one element: the page is empty.
        return CvlacType.EMPTY.value

    # 1. Identification of private pages. Reached only when the green message is absent.
    private_msg_baseline = "La información de este currículo no está disponible por solicitud del investigador"
    blockquotes = soup.find_all("blockquote")
    if len(blockquotes) > 1 and blockquotes[1].text.strip() == private_msg_baseline:
        return CvlacType.PRIVATE.value

    # 4. If none of the checks matched, this is an unknown page.
    return CvlacType.UNKNOWN.value


In [309]:
# Function for reading HTML files as strings
# Read the entire contents of an HTML file into a string
def read_html_file(file_path: str, encoding: str = "latin-1") -> str:
    """Read the entire contents of an HTML file into a string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``"latin-1"``, the encoding that used to be hard-coded here, so
            existing callers are unaffected.

    Returns:
        The decoded contents of the whole file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()


---

## Private page sampling

In [310]:
# Read private samples as strings
private_sample_1 = read_html_file("../sample_scraped_cvlacs/private_sample_1.html")
private_sample_2 = read_html_file("../sample_scraped_cvlacs/private_sample_2.html")


In [311]:
# Creation of soup object
soup = BeautifulSoup(private_sample_1 ,"html.parser")
private_page_string = "La información de este currículo no está disponible por solicitud del investigador"

In [312]:
print(soup.prettify())

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   CvLAC - RG
  </title>
  <script>
   var texto="";
        var NS4 = (document.layers); // Which browser?
        var IE4 = (document.all);
        var win = window; // window to search.
        var n = 0;

        function resaltar(valor){
            var encontro=0;
            var donde=0;
            //valor=document.getElementById('tt').value;
            reemplazar=RegExp(valor,"i");
            por="<span style=\u0022background-Color:yellow;\u0022>"+valor+"</span>";
            if(texto==""){texto=document.body.innerHTML};
            txt=texto.split(">");
            for (x=0;x<txt.length;x++){
                desde=(txt[x].indexOf("<")!=-1)?txt[x].indexOf("<"):0;
                tempP=txt[x].slice(0,desde);
                tempU=txt[x].slice(desde);
                tempP

In [313]:
private_string_dirty = soup.body.find_all("blockquote")[1].text
private_string_clean = private_string_dirty.strip()

In [314]:
private_string_clean == private_page_string

True

In [315]:
repr(soup.body.find_all("blockquote")[1].text.strip())

"'La información de este currículo no está disponible por solicitud del investigador'"

In [316]:
repr(private_page_string)


"'La información de este currículo no está disponible por solicitud del investigador'"

In [317]:
print(soup.body.find_all("blockquote")[1].text.strip())

La información de este currículo no está disponible por solicitud del investigador


In [318]:
parse_and_identify_page_type_test(private_sample_1)

'Privado'

In [319]:
parse_and_identify_page_type_test(private_sample_2)

'Privado'

---

## Empty page sampling

In [320]:
# Read empty samples as strings
empty_sample_1 = read_html_file("../sample_scraped_cvlacs/empty_sample_1.html")
empty_sample_2  = read_html_file("../sample_scraped_cvlacs/empty_sample_2.html")

In [321]:
soup = BeautifulSoup(empty_sample_1, 'html.parser')

In [322]:
print(soup.body.prettify())

<body>
 <div class="wrapper">
  <div class="header">
   <div class="menubar">
    <ul class="mmenu" style="height:23px;padding:2px;">
     <li>
      <a class="menu" href="#" style="width:100px">
       Datos generales
      </a>
      <ul>
       <li>
        <a class="menu" href="#datos_generales">
         Identificación
        </a>
       </li>
       <li>
        <a class="menu" href="#redes_identificadores">
         Redes sociales académicas
        </a>
       </li>
       <li>
        <a class="menu" href="#red_identificadores">
         Identificadores de autor
        </a>
       </li>
       <li>
        <a class="menu" href="#formacion_acad">
         Formación académica
        </a>
       </li>
       <li>
        <a class="menu" href="#formacion_comp">
         Formación complementaria
        </a>
       </li>
       <li>
        <a class="menu" href="#estancias_posdoctorales">
         Estancias posdoctorales
        </a>
       </li>
       <li>
        <a class="me

In [323]:
# Firstly, the image element in the green message is searched. It serves as the initial reference point.
green_msg_tick_img = soup.find("img", {"height": "15px", "width": "15px"})
green_msg_tick_img

<img height="15px" src="/cvlac/images/chulo.jpg" width="15px"/>

In [324]:
# Now we go 3 levels up from the image (img -> blockquote -> td -> tr). Despite the variable name, the result sits at <tr> level: its next siblings, shown below, are <tr> elements.
green_msg_parent_td = green_msg_tick_img.parent.parent.parent
green_msg_parent_td_siblings = [element for element in green_msg_parent_td.next_siblings if element.name is not None]
green_msg_parent_td_siblings

[<tr>
 <td width="100%"><a name="asesorias"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="curso_corta_du"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="trabajos_dirigi"></a></td>
 </tr>,
 <tr>
 <td align="right" width="100%"> </td>
 </tr>,
 <tr>
 <td width="100%"><a name="jurado"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="comite"></a></td>
 </tr>,
 <tr>
 <td><a name="par"></a></td>
 </tr>,
 <tr>
 <td width="100%"></td>
 </tr>,
 <tr>
 <td width="100%"><a name="edicion"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="evento"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="re_co"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="pr_ex_re"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="asc_for"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="asc_tra"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="asc_gen"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="asc_cad"></a></td>
 </tr>,
 <tr>
 <td width="100%"><a name="div_ced"></a></td>
 </tr>,
 <tr>
 <td wi

In [325]:
# We are now on the same level as all other <tr>s (which contain the products themselves), so we iterate over all next siblings.
is_table_empty: bool = True

for sibling_tr in green_msg_parent_td_siblings:
    # Inside each sibling <tr>, check whether its child <td> has more than 1 element child.
    # If it does, the page is not empty; with 1 or 0 children the page is empty so far.
    sibling_td = sibling_tr.td
    if sibling_td is None:
        # A sibling without a <td> does not match the expected layout; report it and keep scanning.
        print("Error parsing.")
        print(sibling_tr)
        continue

    # Count the children of the <td>, not of the <tr> (a <tr> normally holds a single <td>,
    # so counting its children could never detect a populated row).
    sibling_td_children = [element for element in sibling_td.children if element.name is not None]
    if len(sibling_td_children) > 1:
        is_table_empty = False
        # One populated row is enough to decide; stop scanning.
        break

# Print exactly one verdict instead of always printing the "empty" label.
print(CvlacType.EMPTY.value if is_table_empty else CvlacType.NORMAL.value)


Vacio


In [326]:
parse_and_identify_page_type_test(empty_sample_1)

<tr>
 <td width="100%">
  <a name="asesorias">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="curso_corta_du">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="trabajos_dirigi">
  </a>
 </td>
</tr>

<tr>
 <td align="right" width="100%">
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="jurado">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="comite">
  </a>
 </td>
</tr>

<tr>
 <td>
  <a name="par">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="edicion">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="evento">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="re_co">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="pr_ex_re">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="asc_for">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="asc_tra">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="asc_gen">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="asc_cad">
  </a>
 </t

'Vacio'

In [327]:
parse_and_identify_page_type_test(empty_sample_2)

<tr>
 <td width="100%">
  <a name="asesorias">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="curso_corta_du">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="trabajos_dirigi">
  </a>
 </td>
</tr>

<tr>
 <td align="right" width="100%">
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="jurado">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="comite">
  </a>
 </td>
</tr>

<tr>
 <td>
  <a name="par">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="edicion">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="evento">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="re_co">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="pr_ex_re">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="asc_for">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="asc_tra">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="asc_gen">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="asc_cad">
  </a>
 </t

'Vacio'

---

## Normal page sampling

In [328]:
# Read normal samples as strings
normal_sample_1 = read_html_file("../sample_scraped_cvlacs/normal_sample_1.html")
normal_sample_2 = read_html_file("../sample_scraped_cvlacs/normal_sample_2.html")

In [329]:
parse_and_identify_page_type_test(normal_sample_1)

<tr>
 <td width="100%">
  <a name="asesorias">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="curso_corta_du">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="trabajos_dirigi">
  </a>
  <table style="border:#999 1px solid" width="100%">
   <tr>
    <td>
     <h3>
      Trabajos dirigidos/tutorías
     </h3>
    </td>
   </tr>
   <tr>
    <td>
     <li>
      <b>
       Trabajos dirigidos/Tutorías - Trabajos de grado de pregrado
      </b>
     </li>
    </td>
   </tr>
   <tr>
    <td>
     <blockquote>
      CARLOS LUIS DEL CAIRO SILVA,

                                ¿Naturaleza para producir o para conservar? : una aproximación a los procesos turísticos comunitarios de las veredas de Piñalito y Maracaibo en Vista Hermosa, Meta 
                                PONTIFICIA UNIVERSIDAD JAVERIANA 
                                Estado: Tesis concluida 
                                ANTROPOLOGIA,
                                 2021.
      <i>
       Dirigió como:
   

'Normal'

In [330]:
parse_and_identify_page_type_test(normal_sample_2)

<tr>
 <td width="100%">
  <a name="asesorias">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="curso_corta_du">
  </a>
 </td>
</tr>

<tr>
 <td width="100%">
  <a name="trabajos_dirigi">
  </a>
  <table style="border:#999 1px solid" width="100%">
   <tr>
    <td>
     <h3>
      Trabajos dirigidos/tutorías
     </h3>
    </td>
   </tr>
   <tr>
    <td>
     <li>
      <img height="15px" src="/cvlac/images/chulo.jpg" width="15px"/>
      <b>
       Trabajos dirigidos/Tutorías - Trabajos de grado de pregrado
      </b>
     </li>
    </td>
   </tr>
   <tr>
    <td>
     <blockquote>
      GLORIA STELLA BARRERA JURADO,

                                El estrado como referente de función y ámbito, reflejado en la forma del mueble actual 
                                PONTIFICIA UNIVERSIDAD JAVERIANA 
                                Estado: Tesis concluida 
                                DISEÑO INDUSTRIAL,
                                 2004.
      <i>
       Dirigió como:
    

'Normal'