In [1]:
len("Intervention__HASH__Vaccine-preventable-disease__HASH__Rotaviru")

63

In [4]:
def map_user_selection_to_column(user_selection, structured_data):
    """
    Maps a user's selection to the corresponding column name for database search.

    :param user_selection: The term or keyword selected by the user (e.g., "efficacy").
    :param structured_data: The structured dictionary containing mappings of display values and synonyms.
    :return: The original column name from filtered_columns or None if no match is found.
    """
    # Ensure user_selection is lowercased for case-insensitive comparison
    user_selection_lower = user_selection.lower()
    print(f"User selection: {user_selection_lower}")  # Debugging user input

    for category, subgroups in structured_data.items():
        if not isinstance(subgroups, dict):
            print(f"Skipping invalid subgroups in category {category}")
            continue
        for subgroup, values in subgroups.items():
            if not isinstance(values, dict):
                print(f"Skipping invalid values in subgroup {subgroup}")
                continue
            for column_name, details in values.items():
                if not isinstance(details, dict) or "display" not in details or "synonyms" not in details:
                    print(f"Skipping invalid details for column {column_name}")
                    continue

                # Debugging comparisons
                print(f"Checking column: {column_name}")
                print(f"Display: {details['display']}, Synonyms: {details['synonyms']}")

                # Match against display
                if user_selection_lower == details["display"].lower():
                    print(f"Matched display: {details['display']}")
                    return f"{category}__HASH__{subgroup}__HASH__{column_name}"

                # Match against synonyms
                if user_selection_lower in [syn.lower() for syn in details["synonyms"]]:
                    print(f"Matched synonym: {user_selection_lower}")
                    return f"{category}__HASH__{subgroup}__HASH__{column_name}"

    # If no match is found
    print(f"No match found for user selection: {user_selection_lower}")
    return None


user_selection = "efficacy"
structured_data = {'Population': {'AgeGroup': {'Newborn_0-1': {'display': 'Newborn_0-1', 'synonyms': ['newborn', 'babies', 'baby', 'infant', 'toddlers', 'young ones', 'youngsters', 'small children', 'Newborn_0-1'], 'additional_context': None}, 'Children_2-9': {'display': 'Children_2-9', 'synonyms': ['child', 'children', 'Children_2-9'], 'additional_context': None}, 'Adolescents_10-17': {'display': 'Adolescents_10-17', 'synonyms': ['adolescents', 'adolescent', 'young adults', 'Adolescents_10-17'], 'additional_context': None}, 'Adults_18-64': {'display': 'Adults_18-64', 'synonyms': ['adults', 'adult', 'Adults_18-64'], 'additional_context': None}, 'OlderAdults_65-10000': {'display': 'OlderAdults_65-10000', 'synonyms': ['elderly', 'older adults', 'OlderAdults_65-10000'], 'additional_context': None}}, 'SpecificGroup': {'HealthcareWorkers': {'display': 'HealthcareWorkers', 'synonyms': ['Physician', 'Nurse', 'Surgeon', 'Dentist', 'Pharmacist', 'Physical Therapist', 'Occupational Therapist', 'Medical Laboratory Technologist', 'Radiologist', 'Dietitian/Nutritionist', 'Respiratory Therapist', 'Speech-Language Pathologist', 'Physician Assistant', 'Nurse Practitioner', 'Certified Nursing Assistant (CNA)', 'Medical Assistant', 'Paramedic/EMT', 'Midwife', 'Psychologist', 'Social Worker (Clinical)', 'Hospital Administrator', 'Medical Researcher', 'Health Educator', 'Orthopedic Technician', 'Optometrist', 'Podiatrist', 'Anesthesiologist', 'Neurologist', 'Cardiologist', 'Gastroenterologist', 'HealthcareWorkers'], 'additional_context': None}, 'PregnantWomen': {'display': 'PregnantWomen', 'synonyms': ['pregnant', 'pregnant women', 'PregnantWomen'], 'additional_context': None}, 'Travellers': {'display': 'Travellers', 'synonyms': ['traveller', 'Travellers'], 'additional_context': None}, 'ParentsCaregivers': {'display': 'ParentsCaregivers', 'synonyms': ['parents', 'caregivers', 'ParentsCaregivers'], 'additional_context': None}}, 'ImmuneStatus': {'Immunocompromised': {'display': 'Immunocompromised', 'synonyms': ['immunocompromised', 'Immunocompromised'], 'additional_context': None}, 'Healthy': {'display': 'Healthy', 'synonyms': ['healthy', 'Healthy'], 'additional_context': None}}}, 'Topic': {'Efficacy-Effectiveness': {'Efficacy-Effectiveness': {'display': 'Efficacy-Effectiveness', 'synonyms': ['effectiveness', 'impact of', 'effectiveness of', 'efficacy', 'Efficacy-Effectiveness'], 'additional_context': None}}, 'Safety': {'Safety': {'display': 'Safety', 'synonyms': ['safety', 'adverse effects', 'adverse events', 'Safety'], 'additional_context': None}}, 'Risk-Factor': {'Risk-Factor': {'display': 'Risk-Factor', 'synonyms': ['risk factor', 'risk', 'Risk-Factor'], 'additional_context': None}}, 'Administration': {'Administration': {'display': 'Administration', 'synonyms': ['administration', 'vaccine types', 'dose schedules', 'vaccine types and dose schedules', 'different dose schedules', 'Two doses of', 'Administration'], 'additional_context': None}}, 'Economic-Aspects': {'Economic-Aspects': {'display': 'Economic-Aspects', 'synonyms': ['economic', 'cost', 'financial', 'economic impact', 'cost effectiveness', 'cost-effectiveness', 'cost', 'cost effectiveness', 'economic evaluation', 'Cost-effectiveness of HPV vaccination strategies', 'Economic-Aspects'], 'additional_context': None}}, 'Acceptance': {'Acceptance': {'display': 'Acceptance', 'synonyms': ['acceptance', 'Barrier', 'vaccine barriers', 'knowledge', 'vaccination willingness and intentions', 'HPV vaccine acceptability, acceptability', 'Awareness and knowledge', 'Awareness', 'facilitators of and barriers', 'awareness,knowledge, acceptability, and intention', 'knowledge and acceptability', 'knowledge and awareness', 'attitudes and beliefs', 'Knowledge and Attitude', 'attitude', 'knowledge, awareness, and attitude', 'Acceptance'], 'additional_context': None}}, 'Modeling': {'Modeling': {'display': 'Modeling', 'synonyms': ['modeling', 'Modeling'], 'additional_context': None}}, 'Ethical-Issues': {'Ethical-Issues': {'display': 'Ethical-Issues', 'synonyms': ['racial', 'ethnic', 'ethnic minority', 'racial minority', 'racial/ethnic', 'racial/ethnic minority', 'racial disparity', 'ethnic disparity', 'minority', 'minority population', 'Ethical-Issues'], 'additional_context': None}}, 'Coverage': {'Coverage': {'display': 'Coverage', 'synonyms': ['coverage', 'uptake', 'the uptake', 'actual uptake', 'vaccine uptake', 'Coverage'], 'additional_context': None}}}, 'Outcome': {'Infection': {'Infection': {'display': 'Infection', 'synonyms': ['infection', 'Infection'], 'additional_context': None}}, 'ICU': {'ICU': {'display': 'ICU', 'synonyms': ['ICU', 'intensive care unit', 'intensive care'], 'additional_context': None}}, 'Death': {'Death': {'display': 'Death', 'synonyms': ['death', 'mortality', 'overall mortality', 'cancer related mortality', 'on overall and cancer mortality', 'Death'], 'additional_context': None}}, 'Hospitalization': {'Hospitalization': {'display': 'Hospitalization', 'synonyms': ['hospitalization', 'Hospitalization'], 'additional_context': None}}}, 'Reviews': {'Reviews': {'review': {'display': 'review', 'synonyms': ['systematic review', 'Literature Review', 'review', 'Meta-Analysis', 'Critical Review', 'Peer Review', 'Book Review', 'Editorial Review', 'Review Article'], 'additional_context': None}}}, 'Studies': {'NoOfStudies': {'number_of_studies': {'display': 'number_of_studies', 'synonyms': ['studies', 'studies', 'number_of_studies'], 'additional_context': None}}, 'RCT': {'RCT_terms': {'display': 'RCT_terms', 'synonyms': ['brandomized controlled trial', 'RCT', 'brandomised controlled trial', 'brandomized trial', 'brandomised trial', 'RCT_terms'], 'additional_context': None}}}, 'Intervention': {'Vaccine-preventable-disease': {'COVID-19': {'display': 'COVID-19', 'synonyms': ['COVID-19', 'COVID', 'COVID 19'], 'additional_context': None}, 'Influenza': {'display': 'Influenza', 'synonyms': ['influenza', 'Influenza'], 'additional_context': None}, 'Dengue': {'display': 'Dengue', 'synonyms': ['dengue', 'Dengue'], 'additional_context': None}, 'Rotavirus': {'display': 'Rotavirus', 'synonyms': ['rotavirus', 'Rotavirus'], 'additional_context': None}}, 'Vaccine-Options': {'Live': {'display': 'Live', 'synonyms': ['live', 'Live'], 'additional_context': None}, 'Adjuvants': {'display': 'Adjuvants', 'synonyms': ['adjuvants', 'Adjuvants'], 'additional_context': None}, 'Non-Live': {'display': 'Non-Live', 'synonyms': ['non-live', 'Non-Live'], 'additional_context': None}}}}
result = map_user_selection_to_column(user_selection, structured_data)
print(f"Result: {result}")

User selection: efficacy
Checking column: Newborn_0-1
Display: Newborn_0-1, Synonyms: ['newborn', 'babies', 'baby', 'infant', 'toddlers', 'young ones', 'youngsters', 'small children', 'Newborn_0-1']
Checking column: Children_2-9
Display: Children_2-9, Synonyms: ['child', 'children', 'Children_2-9']
Checking column: Adolescents_10-17
Display: Adolescents_10-17, Synonyms: ['adolescents', 'adolescent', 'young adults', 'Adolescents_10-17']
Checking column: Adults_18-64
Display: Adults_18-64, Synonyms: ['adults', 'adult', 'Adults_18-64']
Checking column: OlderAdults_65-10000
Display: OlderAdults_65-10000, Synonyms: ['elderly', 'older adults', 'OlderAdults_65-10000']
Checking column: HealthcareWorkers
Display: HealthcareWorkers, Synonyms: ['Physician', 'Nurse', 'Surgeon', 'Dentist', 'Pharmacist', 'Physical Therapist', 'Occupational Therapist', 'Medical Laboratory Technologist', 'Radiologist', 'Dietitian/Nutritionist', 'Respiratory Therapist', 'Speech-Language Pathologist', 'Physician Assista

In [18]:
import re
from collections import defaultdict
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import requests

class DocumentParser:
    def __init__(self):
        self.sections = defaultdict(dict)

    def extract_text_from_html(self, html_content):
        """
        Extracts and cleans text from an HTML document.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator="\n")
        return text.strip()

    def extract_text_from_pdf(self, pdf_path):
        """
        Extracts text from a PDF file using PyPDF2 with enhanced text normalization.
        Handles multi-column layouts and embedded images more robustly.
        """
        text = []
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                # Normalize spaces and handle multi-column artifacts
                cleaned_text = re.sub(r"\s+", " ", page_text).strip()
                text.append(cleaned_text)
            else:
                text.append("[Page content could not be extracted]")
        return "\n".join(text).strip()

    def extract_text_from_url(self, url):
        """
        Extracts text from a URL, handling both HTML and PDF resources.
        """
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            content_type = response.headers.get("Content-Type", "")
            if "text/html" in content_type:
                return self.extract_text_from_html(response.text)
            elif "application/pdf" in content_type:
                with open("temp.pdf", "wb") as f:
                    f.write(response.content)
                return self.extract_text_from_pdf("temp.pdf")
        else:
            raise ValueError(f"Failed to fetch URL. Status code: {response.status_code}")

    def identify_sections_with_subsections(self, text):
        """
        Dynamically identifies sections and their subsections in a document using flexible patterns.
        """
        # Define regex patterns for sections and subsections
        section_pattern = re.compile(r"^(?:\d+\.\s*)?(Abstract|Introduction|Methods|Materials|Data Collection|Results|Discussion|Conclusion|Acknowledgments|Appendix):?$", re.IGNORECASE | re.MULTILINE)
        subsection_pattern = re.compile(r"^\d+\.\d+\s+[A-Za-z0-9 \-]+:?")

        self.sections = defaultdict(dict)  # Reset sections
        current_section = None
        current_subsection = None

        for line in text.split("\n"):
            line = line.strip()
            if not line:
                continue

            # Match sections
            section_match = section_pattern.match(line)
            if section_match:
                current_section = section_match.group(1).strip()
                # Exclude "References" section explicitly
                if "reference" not in current_section.lower():
                    self.sections[current_section] = {"content": [], "subsections": {}}
                current_subsection = None
                continue

            # Match subsections
            subsection_match = subsection_pattern.match(line)
            if subsection_match and current_section:
                current_subsection = subsection_match.group(0).strip()
                self.sections[current_section]["subsections"][current_subsection] = []
                continue

            # Append content to the appropriate section or subsection
            if current_subsection:
                self.sections[current_section]["subsections"][current_subsection].append(line)
            elif current_section and "reference" not in current_section.lower():
                self.sections[current_section]["content"].append(line)

        # Combine subsection content
        for section in self.sections:
            for subsection in self.sections[section]["subsections"]:
                self.sections[section]["subsections"][subsection] = "\n".join(self.sections[section]["subsections"][subsection]).strip()
            self.sections[section]["content"] = "\n".join(self.sections[section]["content"]).strip()

    def get_section_content(self, key):
        """
        Retrieves content for a given section or subsection key.

        Args:
            key (str): The section or subsection key to retrieve content for.

        Returns:
            dict or str: The content of the section or subsection. If it has subsections, a dictionary is returned.
        """
        # Check for top-level section
        if key in self.sections:
            return {
                "content": self.sections[key]["content"],
                "subsections": self.sections[key]["subsections"]
            }

        # Check for subsection
        for section, details in self.sections.items():
            if key in details["subsections"]:
                return details["subsections"][key]

        return None  # Key not found

    def has_subsections(self, key):
        """
        Checks if a given section has subsections.

        Args:
            key (str): The section key to check.

        Returns:
            bool: True if the section has subsections, False otherwise.
        """
        if key in self.sections and self.sections[key]["subsections"]:
            return True
        return False

    def get_all_section_content(self):
        """
        Retrieves content of all sections and their subsections, excluding references.

        Returns:
            dict: A dictionary containing all sections and their content, excluding "References".
        """
        all_content = {}
        for section, details in self.sections.items():
            if "reference" in section.lower():
                continue
            all_content[section] = {
                "content": details["content"],
                "subsections": details["subsections"]
            }
        return all_content

    def save_content_to_file(self, content_dict, output_file):
        """
        Save extracted content to a file, excluding sections named 'References' or similar.

        Parameters:
        - content_dict (dict): Dictionary with sections and subsections.
        - output_file (str): Path to save the content.
        """
        with open(output_file, "w", encoding="utf-8") as file:
            for section, content in content_dict.items():
                file.write(f"--- {section} ---\n")
                file.write("Content:\n")
                file.write(content["content"] + "\n")
                file.write("Subsections:\n")
                for subsection, subsection_content in content["subsections"].items():
                    file.write(f"  - {subsection}: {subsection_content}\n")
                file.write("\n")

# Example usage
parser = DocumentParser()

# Test with a URL for a PDF
pdf_url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC10278373/pdf/johm-8-S3-93.pdf"
pdf_text = parser.extract_text_from_url(pdf_url)
parser.identify_sections_with_subsections(pdf_text)
pdf_content = parser.get_all_section_content()
parser.save_content_to_file(pdf_content, "extracted_pdf_content.txt")
print("PDF content saved to extracted_pdf_content.txt, excluding references.")

# Test with a URL for HTML
html_url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC10278373/"
html_text = parser.extract_text_from_url(html_url)
parser.identify_sections_with_subsections(html_text)
html_content = parser.get_all_section_content()
parser.save_content_to_file(html_content, "extracted_html_content.txt")
print("HTML content saved to extracted_html_content.txt, excluding references.")


PDF content saved to extracted_pdf_content.txt, excluding references.
HTML content saved to extracted_html_content.txt, excluding references.


In [13]:
import requests
from PyPDF2 import PdfReader

def save_content_to_file(content_dict, output_file):
    """
    Save extracted content to a file, excluding sections named 'References' or similar.
    
    Parameters:
    - content_dict (dict): Dictionary with sections and subsections.
    - output_file (str): Path to save the content.
    """
    with open(output_file, "w", encoding="utf-8") as file:
        for section, content in content_dict.items():
            # Skip sections named "References" or similar
            if "reference" in section.lower():
                continue
            
            file.write(f"--- {section} ---\n")
            file.write("Content:\n")
            file.write(content["content"] + "\n")
            file.write("Subsections:\n")
            for subsection, subsection_content in content["subsections"].items():
                file.write(f"  - {subsection}: {subsection_content}\n")
            file.write("\n")

def save_pdf_content_to_file(pdf_content, output_file):
    """
    Save extracted PDF content to a file.
    
    Parameters:
    - pdf_content (list): List of strings, one per page.
    - output_file (str): Path to save the content.
    """
    with open(output_file, "w", encoding="utf-8") as file:
        for page_number, text in enumerate(pdf_content, start=1):
            file.write(f"--- Page {page_number} ---\n")
            file.write(text + "\n")

def fetch_and_process_url(url):
    """
    Fetch content from a URL, process based on content type (HTML or PDF),
    and save extracted content to a file.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        content_type = response.headers.get("Content-Type", "")
        
        if "text/html" in content_type:
            # Handle HTML content
            html_content = response.text
            parser = DocumentParser()

            # Extract text from the HTML
            document_text = parser.extract_text_from_html(html_content)
            
            # Identify sections and subsections
            parser.identify_sections_with_subsections(document_text)
            
            # Retrieve all sections
            all_content = parser.get_all_section_content()
            
            # Save content to a file
            output_file = "extracted_content_html.txt"
            save_content_to_file(all_content, output_file)
            print(f"HTML content saved to {output_file}, excluding references.")

        elif "application/pdf" in content_type:
            # Handle PDF content using PyPDF2
            output_file = "extracted_content_pdf.txt"
            
            with open("temp.pdf", "wb") as temp_pdf:
                temp_pdf.write(response.content)

            reader = PdfReader("temp.pdf")
            pdf_content = [page.extract_text() for page in reader.pages]
            
            save_pdf_content_to_file(pdf_content, output_file)
            print(f"PDF content saved to {output_file}.")
        else:
            print("Unsupported content type.")
    else:
        print(f"Failed to fetch the URL. Status code: {response.status_code}")

# Example usage
url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC10278373/pdf/johm-8-S3-93.pdf"  # Replace with your URL
fetch_and_process_url(url)


PDF content saved to extracted_content_pdf.txt.


In [2]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage to scrape
url = "https://cidades.ibge.gov.br/brasil/se/aracaju/panorama"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    print(response.content)
    # Extract specific information (e.g., population data)
    # Adjust the selectors based on the webpage structure
    population = soup.find('span', class_='population-number')  # Example selector
    if population:
        print(f"Population: {population.text.strip()}")
    else:
        print("Population data not found.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


b'\xef\xbb\xbf<!DOCTYPE html>\r\n<html lang="pt-br">\r\n\r\n<head>\r\n<meta charset="utf-8">\r\n  <meta http-equiv="x-ua-compatible" content="ie=edge">\r\n  <meta http-equiv="cache-control" content="no-transform" />\r\n  <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">\r\n\r\n  <!-- Google tag (gtag.js) --> <script async src="https://www.googletagmanager.com/gtag/js?id=G-YLYPPRQS7S"></script> <script> window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-YLYPPRQS7S\'); </script>\r\n\r\n  <link rel="icon" href="data:;base64,iVBORw0KGgo=">\r\n  <link rel="stylesheet" href="/css/global.css">\r\n\r\n  <!--font awesome-->\r\n  <link rel="stylesheet" href="/css/font-awesome/font-awesome.min.css">\r\n\r\n  <base href="/">\r\n\r\n</head>\r\n\r\n<body class="estado">\r\n\r\n  <!-- Componente principal -->\r\n  <app></app>\r\n\r\n  <!-- integrity gerado por https://www.srihash.org