# Testing Notebook for Cvlac Page Type Detection.
- Author: David Santiago Barreto Mora
- Last updated: 08/01/2024

---

In [300]:
# Import Python libraries
from typing import List, NamedTuple
from enum import Enum
import traceback

In [301]:
# Import third party modules
from bs4 import BeautifulSoup
# import knime.scripting.io as knio

In [302]:
# Closed set of classifications that can be assigned to a scraped CvLAC page.
class CvlacType(Enum):
    """Classification labels for a CvLAC page.

    The member values are Spanish display strings; the detector functions in
    this notebook return ``member.value`` rather than the member itself.
    """

    # Page that exposes curriculum content.
    NORMAL = "Normal"
    # Page that shows the "not available by request of the researcher" notice.
    PRIVATE = "Privado"
    # Page whose inspected table rows carry no content.
    EMPTY = "Vacio"
    # Fallback label ("unknown type"); not returned by any detector visible here.
    UNKNOWN = "Tipo desconocido"
    # Returned when detection itself raised an exception.
    # NOTE(review): "processando" looks like a typo for "procesando"; left
    # unchanged because this value may already be stored or compared downstream.
    ERROR = "Error processando la URL"


In [303]:
# Named tuple used for type hinting.
# Describes the data carried by each row of the dataframes.
class ParsedCvlac(NamedTuple):
    """One CvLAC page together with its detected classification."""

    # URL of the CvLAC page.
    url_cvlac: str
    # HTML of the page as a string (presumably the raw scraped document - confirm).
    html_document: str
    # Detected page type.
    # NOTE(review): the detectors in this notebook return ``CvlacType.value``
    # strings, not members - confirm which of the two is stored here.
    cvlac_type: CvlacType


In [304]:
def detect_private_pages(bs4Soup) -> str:
    """Classify a parsed CvLAC page as private or normal.

    The stripped text of the second ``<blockquote>`` in the document is
    compared with the notice shown when a researcher has asked for the
    curriculum to be hidden.

    Args:
        bs4Soup: Parsed page exposing ``find_all("blockquote")`` (a
            BeautifulSoup object in this notebook).

    Returns:
        ``CvlacType.PRIVATE.value`` when the notice matches exactly,
        ``CvlacType.NORMAL.value`` otherwise. ``None`` is never returned.

    Raises:
        IndexError: If the page has fewer than two ``<blockquote>`` elements.
            The callers in this notebook rely on this to fall back to
            ``detect_empty_and_normal_pages``.
    """
    private_msg_baseline = "La información de este currículo no está disponible por solicitud del investigador"
    blockquotes = bs4Soup.find_all("blockquote")
    # Intentionally unguarded: a missing second blockquote must raise so the
    # caller can switch to the empty/normal detector.
    private_msg_scraped = blockquotes[1].text.strip()
    if private_msg_scraped == private_msg_baseline:
        return CvlacType.PRIVATE.value
    return CvlacType.NORMAL.value


In [305]:
# Empty and normal pages are told apart by the same table scan, so a single
# function handles both.
def detect_empty_and_normal_pages(bs4Soup) -> str:
    """Classify a parsed CvLAC page as empty or normal.

    Finds the first ``<img>`` whose ``height`` and ``width`` are both
    ``"15px"`` (presumably the green tick icon - confirm), climbs three
    parents from it and inspects every element sibling that follows. A page
    is normal as soon as the first ``<td>`` of one of those siblings has more
    than one element child; otherwise it is empty.

    Args:
        bs4Soup: Parsed page (a BeautifulSoup object in this notebook).

    Returns:
        ``CvlacType.NORMAL.value``, ``CvlacType.EMPTY.value``, or
        ``CvlacType.ERROR.value`` when anything in the scan raises (for
        example when the image is not found). The error and its traceback
        are printed in that case.
    """
    try:
        green_msg_tick_img = bs4Soup.find("img", {"height": "15px", "width": "15px"})
        # Three levels up from the icon; the following siblings are the rows
        # to inspect. Text nodes (name is None) are skipped.
        green_msg_container = green_msg_tick_img.parent.parent.parent
        following_rows = [
            element
            for element in green_msg_container.next_siblings
            if element.name is not None
        ]

        for sibling_tr in following_rows:
            cell_elements = [
                element for element in sibling_tr.td.children if element.name is not None
            ]
            # More than one element inside the cell means real content.
            if len(cell_elements) > 1:
                return CvlacType.NORMAL.value

        return CvlacType.EMPTY.value

    except Exception as e:
        # Best-effort boundary: report the failure and label the page as an
        # error instead of aborting the caller.
        print(f"Error in detect_empty_and_normal_pages: {e}")
        print(traceback.format_exc())
        return CvlacType.ERROR.value



In [306]:
# Row-level entry point. This is the function meant to be carried over into
# the main script.
def parse_and_identify_page_type(df_row) -> str:
    """Parse the HTML stored in a dataframe row and classify the page.

    Args:
        df_row: Mapping-like row; its ``"Document"`` entry must hold the
            page HTML as a string. No other key is read.

    Returns:
        The ``CvlacType`` value string produced by ``detect_private_pages``
        or, when that detector raises, by ``detect_empty_and_normal_pages``.

    Raises:
        KeyError: If ``df_row`` has no ``"Document"`` entry (for dict-like
            rows).
    """
    html_string = df_row["Document"]

    soup = BeautifulSoup(html_string, "html.parser")
    try:
        return detect_private_pages(soup)
    except Exception:
        # The private detector raises (typically IndexError) when the page
        # lacks the expected blockquotes. ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return detect_empty_and_normal_pages(soup)



In [307]:
def parse_and_identify_page_type_test(html_string) -> str:
    """Classify a page given directly as an HTML string.

    Notebook-side counterpart of ``parse_and_identify_page_type`` that skips
    the dataframe row and takes the document itself.

    Args:
        html_string: Page HTML as a string.

    Returns:
        The ``CvlacType`` value string produced by ``detect_private_pages``
        or, when that detector raises, by ``detect_empty_and_normal_pages``.
    """
    soup = BeautifulSoup(html_string, "html.parser")
    try:
        return detect_private_pages(soup)
    except Exception:
        # The private detector raises (typically IndexError) when the page
        # lacks the expected blockquotes. ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return detect_empty_and_normal_pages(soup)

In [308]:
# Load an HTML file from disk and hand back its complete text.
def read_html_file(file_path):
    """Return the full content of *file_path*, decoded as Latin-1."""
    with open(file_path, mode="r", encoding="latin-1") as html_file:
        return html_file.read()


---

## Private page sampling

In [309]:
# Load the two private-page sample files as strings (read_html_file decodes
# them as Latin-1).
private_sample_1 = read_html_file("../sample_scraped_cvlacs/private_sample_1.html")
private_sample_2 = read_html_file("../sample_scraped_cvlacs/private_sample_2.html")


In [310]:
# Parse the first private sample with the built-in "html.parser" backend.
soup = BeautifulSoup(private_sample_1 ,"html.parser")
# Expected privacy notice; same text detect_private_pages uses as its baseline.
private_page_string = "La información de este currículo no está disponible por solicitud del investigador"

In [311]:
# Dump the whole parsed document, indented, to inspect its structure.
print(soup.prettify())

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   CvLAC - RG
  </title>
  <script>
   var texto="";
        var NS4 = (document.layers); // Which browser?
        var IE4 = (document.all);
        var win = window; // window to search.
        var n = 0;

        function resaltar(valor){
            var encontro=0;
            var donde=0;
            //valor=document.getElementById('tt').value;
            reemplazar=RegExp(valor,"i");
            por="<span style=\u0022background-Color:yellow;\u0022>"+valor+"</span>";
            if(texto==""){texto=document.body.innerHTML};
            txt=texto.split(">");
            for (x=0;x<txt.length;x++){
                desde=(txt[x].indexOf("<")!=-1)?txt[x].indexOf("<"):0;
                tempP=txt[x].slice(0,desde);
                tempU=txt[x].slice(desde);
                tempP

In [312]:
# repr() of the second <blockquote> inside <body>, stripped, so hidden
# whitespace or unexpected characters become visible.
repr(soup.body.find_all("blockquote")[1].text.strip())

"'La información de este currículo no está disponible por solicitud del investigador'"

In [313]:
# repr() of the expected notice, for a character-level comparison with the
# scraped one.
repr(private_page_string)


"'La información de este currículo no está disponible por solicitud del investigador'"

In [314]:
# Scraped notice as plain text.
print(soup.body.find_all("blockquote")[1].text.strip())

La información de este currículo no está disponible por solicitud del investigador


In [315]:
# Classify the first private sample end to end; the recorded output is 'Privado'.
parse_and_identify_page_type_test(private_sample_1)

'Privado'

In [316]:
# Classify the second private sample end to end; the recorded output is 'Privado'.
parse_and_identify_page_type_test(private_sample_2)

'Privado'

---

## Empty page sampling

In [317]:
# Load the two empty-page sample files as strings (read_html_file decodes
# them as Latin-1).
empty_sample_1 = read_html_file("../sample_scraped_cvlacs/empty_sample_1.html")
empty_sample_2  = read_html_file("../sample_scraped_cvlacs/empty_sample_2.html")

In [318]:
# Rebind soup to the parsed first empty sample (replaces the private-sample soup).
soup = BeautifulSoup(empty_sample_1, 'html.parser')

In [319]:
# Dump only the <body> of the empty sample, indented, to inspect its structure.
print(soup.body.prettify())

<body>
 <div class="wrapper">
  <div class="header">
   <div class="menubar">
    <ul class="mmenu" style="height:23px;padding:2px;">
     <li>
      <a class="menu" href="#" style="width:100px">
       Datos generales
      </a>
      <ul>
       <li>
        <a class="menu" href="#datos_generales">
         Identificación
        </a>
       </li>
       <li>
        <a class="menu" href="#redes_identificadores">
         Redes sociales académicas
        </a>
       </li>
       <li>
        <a class="menu" href="#red_identificadores">
         Identificadores de autor
        </a>
       </li>
       <li>
        <a class="menu" href="#formacion_acad">
         Formación académica
        </a>
       </li>
       <li>
        <a class="menu" href="#formacion_comp">
         Formación complementaria
        </a>
       </li>
       <li>
        <a class="menu" href="#estancias_posdoctorales">
         Estancias posdoctorales
        </a>
       </li>
       <li>
        <a class="me

In [320]:
# Classify the first empty sample end to end; the recorded output is 'Vacio'.
parse_and_identify_page_type_test(empty_sample_1)

'Vacio'

In [321]:
# Classify the second empty sample end to end; the recorded output is 'Vacio'.
parse_and_identify_page_type_test(empty_sample_2)

'Vacio'

---

## Normal page sampling

In [322]:
# Load the two normal-page sample files as strings (read_html_file decodes
# them as Latin-1).
normal_sample_1 = read_html_file("../sample_scraped_cvlacs/normal_sample_1.html")
normal_sample_2 = read_html_file("../sample_scraped_cvlacs/normal_sample_2.html")

In [323]:
# Classify the first normal sample end to end; the recorded output is 'Normal'.
parse_and_identify_page_type_test(normal_sample_1)

'Normal'

In [324]:
# Classify the second normal sample end to end; the recorded output is 'Normal'.
parse_and_identify_page_type_test(normal_sample_2)

'Normal'