In [3]:
import altair as alt
import pandas as pd
from vega_datasets import data

# Load TopoJSON countries
countries = alt.topo_feature(data.world_110m.url, 'countries')

# Your country data (using ISO numeric codes)
source = pd.DataFrame({
    'id': [840, 124, 356],  # USA, Canada, India
    'country': ['United States', 'Canada', 'India'],
    'value': [300, 100, 1200]
})

# Choropleth with fallback value
choropleth = alt.Chart(countries).mark_geoshape().encode(
    color='value:Q'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(source, 'id', ['value'])
).transform_calculate(
    value="datum.value !== null ? datum.value : 0"
).project(
    type='naturalEarth1'
).properties(
    width=800,
    height=400
)

choropleth


In [4]:
def map_user_selection_to_column(user_selection, structured_data):
    """
    Maps a user's selection to the corresponding column name for database search.

    :param user_selection: The term or keyword selected by the user (e.g., "efficacy").
    :param structured_data: The structured dictionary containing mappings of display values and synonyms.
    :return: The original column name from filtered_columns or None if no match is found.
    """
    # Ensure user_selection is lowercased for case-insensitive comparison
    user_selection_lower = user_selection.lower()
    print(f"User selection: {user_selection_lower}")  # Debugging user input

    for category, subgroups in structured_data.items():
        if not isinstance(subgroups, dict):
            print(f"Skipping invalid subgroups in category {category}")
            continue
        for subgroup, values in subgroups.items():
            if not isinstance(values, dict):
                print(f"Skipping invalid values in subgroup {subgroup}")
                continue
            for column_name, details in values.items():
                if not isinstance(details, dict) or "display" not in details or "synonyms" not in details:
                    print(f"Skipping invalid details for column {column_name}")
                    continue

                # Debugging comparisons
                print(f"Checking column: {column_name}")
                print(f"Display: {details['display']}, Synonyms: {details['synonyms']}")

                # Match against display
                if user_selection_lower == details["display"].lower():
                    print(f"Matched display: {details['display']}")
                    return f"{category}__HASH__{subgroup}__HASH__{column_name}"

                # Match against synonyms
                if user_selection_lower in [syn.lower() for syn in details["synonyms"]]:
                    print(f"Matched synonym: {user_selection_lower}")
                    return f"{category}__HASH__{subgroup}__HASH__{column_name}"

    # If no match is found
    print(f"No match found for user selection: {user_selection_lower}")
    return None


user_selection = "efficacy"
structured_data = {'Population': {'AgeGroup': {'Newborn_0-1': {'display': 'Newborn_0-1', 'synonyms': ['newborn', 'babies', 'baby', 'infant', 'toddlers', 'young ones', 'youngsters', 'small children', 'Newborn_0-1'], 'additional_context': None}, 'Children_2-9': {'display': 'Children_2-9', 'synonyms': ['child', 'children', 'Children_2-9'], 'additional_context': None}, 'Adolescents_10-17': {'display': 'Adolescents_10-17', 'synonyms': ['adolescents', 'adolescent', 'young adults', 'Adolescents_10-17'], 'additional_context': None}, 'Adults_18-64': {'display': 'Adults_18-64', 'synonyms': ['adults', 'adult', 'Adults_18-64'], 'additional_context': None}, 'OlderAdults_65-10000': {'display': 'OlderAdults_65-10000', 'synonyms': ['elderly', 'older adults', 'OlderAdults_65-10000'], 'additional_context': None}}, 'SpecificGroup': {'HealthcareWorkers': {'display': 'HealthcareWorkers', 'synonyms': ['Physician', 'Nurse', 'Surgeon', 'Dentist', 'Pharmacist', 'Physical Therapist', 'Occupational Therapist', 'Medical Laboratory Technologist', 'Radiologist', 'Dietitian/Nutritionist', 'Respiratory Therapist', 'Speech-Language Pathologist', 'Physician Assistant', 'Nurse Practitioner', 'Certified Nursing Assistant (CNA)', 'Medical Assistant', 'Paramedic/EMT', 'Midwife', 'Psychologist', 'Social Worker (Clinical)', 'Hospital Administrator', 'Medical Researcher', 'Health Educator', 'Orthopedic Technician', 'Optometrist', 'Podiatrist', 'Anesthesiologist', 'Neurologist', 'Cardiologist', 'Gastroenterologist', 'HealthcareWorkers'], 'additional_context': None}, 'PregnantWomen': {'display': 'PregnantWomen', 'synonyms': ['pregnant', 'pregnant women', 'PregnantWomen'], 'additional_context': None}, 'Travellers': {'display': 'Travellers', 'synonyms': ['traveller', 'Travellers'], 'additional_context': None}, 'ParentsCaregivers': {'display': 'ParentsCaregivers', 'synonyms': ['parents', 'caregivers', 'ParentsCaregivers'], 'additional_context': None}}, 'ImmuneStatus': {'Immunocompromised': {'display': 'Immunocompromised', 'synonyms': ['immunocompromised', 'Immunocompromised'], 'additional_context': None}, 'Healthy': {'display': 'Healthy', 'synonyms': ['healthy', 'Healthy'], 'additional_context': None}}}, 'Topic': {'Efficacy-Effectiveness': {'Efficacy-Effectiveness': {'display': 'Efficacy-Effectiveness', 'synonyms': ['effectiveness', 'impact of', 'effectiveness of', 'efficacy', 'Efficacy-Effectiveness'], 'additional_context': None}}, 'Safety': {'Safety': {'display': 'Safety', 'synonyms': ['safety', 'adverse effects', 'adverse events', 'Safety'], 'additional_context': None}}, 'Risk-Factor': {'Risk-Factor': {'display': 'Risk-Factor', 'synonyms': ['risk factor', 'risk', 'Risk-Factor'], 'additional_context': None}}, 'Administration': {'Administration': {'display': 'Administration', 'synonyms': ['administration', 'vaccine types', 'dose schedules', 'vaccine types and dose schedules', 'different dose schedules', 'Two doses of', 'Administration'], 'additional_context': None}}, 'Economic-Aspects': {'Economic-Aspects': {'display': 'Economic-Aspects', 'synonyms': ['economic', 'cost', 'financial', 'economic impact', 'cost effectiveness', 'cost-effectiveness', 'cost', 'cost effectiveness', 'economic evaluation', 'Cost-effectiveness of HPV vaccination strategies', 'Economic-Aspects'], 'additional_context': None}}, 'Acceptance': {'Acceptance': {'display': 'Acceptance', 'synonyms': ['acceptance', 'Barrier', 'vaccine barriers', 'knowledge', 'vaccination willingness and intentions', 'HPV vaccine acceptability, acceptability', 'Awareness and knowledge', 'Awareness', 'facilitators of and barriers', 'awareness,knowledge, acceptability, and intention', 'knowledge and acceptability', 'knowledge and awareness', 'attitudes and beliefs', 'Knowledge and Attitude', 'attitude', 'knowledge, awareness, and attitude', 'Acceptance'], 'additional_context': None}}, 'Modeling': {'Modeling': {'display': 'Modeling', 'synonyms': ['modeling', 'Modeling'], 'additional_context': None}}, 'Ethical-Issues': {'Ethical-Issues': {'display': 'Ethical-Issues', 'synonyms': ['racial', 'ethnic', 'ethnic minority', 'racial minority', 'racial/ethnic', 'racial/ethnic minority', 'racial disparity', 'ethnic disparity', 'minority', 'minority population', 'Ethical-Issues'], 'additional_context': None}}, 'Coverage': {'Coverage': {'display': 'Coverage', 'synonyms': ['coverage', 'uptake', 'the uptake', 'actual uptake', 'vaccine uptake', 'Coverage'], 'additional_context': None}}}, 'Outcome': {'Infection': {'Infection': {'display': 'Infection', 'synonyms': ['infection', 'Infection'], 'additional_context': None}}, 'ICU': {'ICU': {'display': 'ICU', 'synonyms': ['ICU', 'intensive care unit', 'intensive care'], 'additional_context': None}}, 'Death': {'Death': {'display': 'Death', 'synonyms': ['death', 'mortality', 'overall mortality', 'cancer related mortality', 'on overall and cancer mortality', 'Death'], 'additional_context': None}}, 'Hospitalization': {'Hospitalization': {'display': 'Hospitalization', 'synonyms': ['hospitalization', 'Hospitalization'], 'additional_context': None}}}, 'Reviews': {'Reviews': {'review': {'display': 'review', 'synonyms': ['systematic review', 'Literature Review', 'review', 'Meta-Analysis', 'Critical Review', 'Peer Review', 'Book Review', 'Editorial Review', 'Review Article'], 'additional_context': None}}}, 'Studies': {'NoOfStudies': {'number_of_studies': {'display': 'number_of_studies', 'synonyms': ['studies', 'studies', 'number_of_studies'], 'additional_context': None}}, 'RCT': {'RCT_terms': {'display': 'RCT_terms', 'synonyms': ['brandomized controlled trial', 'RCT', 'brandomised controlled trial', 'brandomized trial', 'brandomised trial', 'RCT_terms'], 'additional_context': None}}}, 'Intervention': {'Vaccine-preventable-disease': {'COVID-19': {'display': 'COVID-19', 'synonyms': ['COVID-19', 'COVID', 'COVID 19'], 'additional_context': None}, 'Influenza': {'display': 'Influenza', 'synonyms': ['influenza', 'Influenza'], 'additional_context': None}, 'Dengue': {'display': 'Dengue', 'synonyms': ['dengue', 'Dengue'], 'additional_context': None}, 'Rotavirus': {'display': 'Rotavirus', 'synonyms': ['rotavirus', 'Rotavirus'], 'additional_context': None}}, 'Vaccine-Options': {'Live': {'display': 'Live', 'synonyms': ['live', 'Live'], 'additional_context': None}, 'Adjuvants': {'display': 'Adjuvants', 'synonyms': ['adjuvants', 'Adjuvants'], 'additional_context': None}, 'Non-Live': {'display': 'Non-Live', 'synonyms': ['non-live', 'Non-Live'], 'additional_context': None}}}}
result = map_user_selection_to_column(user_selection, structured_data)
print(f"Result: {result}")

User selection: efficacy
Checking column: Newborn_0-1
Display: Newborn_0-1, Synonyms: ['newborn', 'babies', 'baby', 'infant', 'toddlers', 'young ones', 'youngsters', 'small children', 'Newborn_0-1']
Checking column: Children_2-9
Display: Children_2-9, Synonyms: ['child', 'children', 'Children_2-9']
Checking column: Adolescents_10-17
Display: Adolescents_10-17, Synonyms: ['adolescents', 'adolescent', 'young adults', 'Adolescents_10-17']
Checking column: Adults_18-64
Display: Adults_18-64, Synonyms: ['adults', 'adult', 'Adults_18-64']
Checking column: OlderAdults_65-10000
Display: OlderAdults_65-10000, Synonyms: ['elderly', 'older adults', 'OlderAdults_65-10000']
Checking column: HealthcareWorkers
Display: HealthcareWorkers, Synonyms: ['Physician', 'Nurse', 'Surgeon', 'Dentist', 'Pharmacist', 'Physical Therapist', 'Occupational Therapist', 'Medical Laboratory Technologist', 'Radiologist', 'Dietitian/Nutritionist', 'Respiratory Therapist', 'Speech-Language Pathologist', 'Physician Assista

In [18]:
import re
from collections import defaultdict
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import requests

class DocumentParser:
    def __init__(self):
        self.sections = defaultdict(dict)

    def extract_text_from_html(self, html_content):
        """
        Extracts and cleans text from an HTML document.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator="\n")
        return text.strip()

    def extract_text_from_pdf(self, pdf_path):
        """
        Extracts text from a PDF file using PyPDF2 with enhanced text normalization.
        Handles multi-column layouts and embedded images more robustly.
        """
        text = []
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                # Normalize spaces and handle multi-column artifacts
                cleaned_text = re.sub(r"\s+", " ", page_text).strip()
                text.append(cleaned_text)
            else:
                text.append("[Page content could not be extracted]")
        return "\n".join(text).strip()

    def extract_text_from_url(self, url):
        """
        Extracts text from a URL, handling both HTML and PDF resources.
        """
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            content_type = response.headers.get("Content-Type", "")
            if "text/html" in content_type:
                return self.extract_text_from_html(response.text)
            elif "application/pdf" in content_type:
                with open("temp.pdf", "wb") as f:
                    f.write(response.content)
                return self.extract_text_from_pdf("temp.pdf")
        else:
            raise ValueError(f"Failed to fetch URL. Status code: {response.status_code}")

    def identify_sections_with_subsections(self, text):
        """
        Dynamically identifies sections and their subsections in a document using flexible patterns.
        """
        # Define regex patterns for sections and subsections
        section_pattern = re.compile(r"^(?:\d+\.\s*)?(Abstract|Introduction|Methods|Materials|Data Collection|Results|Discussion|Conclusion|Acknowledgments|Appendix):?$", re.IGNORECASE | re.MULTILINE)
        subsection_pattern = re.compile(r"^\d+\.\d+\s+[A-Za-z0-9 \-]+:?")

        self.sections = defaultdict(dict)  # Reset sections
        current_section = None
        current_subsection = None

        for line in text.split("\n"):
            line = line.strip()
            if not line:
                continue

            # Match sections
            section_match = section_pattern.match(line)
            if section_match:
                current_section = section_match.group(1).strip()
                # Exclude "References" section explicitly
                if "reference" not in current_section.lower():
                    self.sections[current_section] = {"content": [], "subsections": {}}
                current_subsection = None
                continue

            # Match subsections
            subsection_match = subsection_pattern.match(line)
            if subsection_match and current_section:
                current_subsection = subsection_match.group(0).strip()
                self.sections[current_section]["subsections"][current_subsection] = []
                continue

            # Append content to the appropriate section or subsection
            if current_subsection:
                self.sections[current_section]["subsections"][current_subsection].append(line)
            elif current_section and "reference" not in current_section.lower():
                self.sections[current_section]["content"].append(line)

        # Combine subsection content
        for section in self.sections:
            for subsection in self.sections[section]["subsections"]:
                self.sections[section]["subsections"][subsection] = "\n".join(self.sections[section]["subsections"][subsection]).strip()
            self.sections[section]["content"] = "\n".join(self.sections[section]["content"]).strip()

    def get_section_content(self, key):
        """
        Retrieves content for a given section or subsection key.

        Args:
            key (str): The section or subsection key to retrieve content for.

        Returns:
            dict or str: The content of the section or subsection. If it has subsections, a dictionary is returned.
        """
        # Check for top-level section
        if key in self.sections:
            return {
                "content": self.sections[key]["content"],
                "subsections": self.sections[key]["subsections"]
            }

        # Check for subsection
        for section, details in self.sections.items():
            if key in details["subsections"]:
                return details["subsections"][key]

        return None  # Key not found

    def has_subsections(self, key):
        """
        Checks if a given section has subsections.

        Args:
            key (str): The section key to check.

        Returns:
            bool: True if the section has subsections, False otherwise.
        """
        if key in self.sections and self.sections[key]["subsections"]:
            return True
        return False

    def get_all_section_content(self):
        """
        Retrieves content of all sections and their subsections, excluding references.

        Returns:
            dict: A dictionary containing all sections and their content, excluding "References".
        """
        all_content = {}
        for section, details in self.sections.items():
            if "reference" in section.lower():
                continue
            all_content[section] = {
                "content": details["content"],
                "subsections": details["subsections"]
            }
        return all_content

    def save_content_to_file(self, content_dict, output_file):
        """
        Save extracted content to a file, excluding sections named 'References' or similar.

        Parameters:
        - content_dict (dict): Dictionary with sections and subsections.
        - output_file (str): Path to save the content.
        """
        with open(output_file, "w", encoding="utf-8") as file:
            for section, content in content_dict.items():
                file.write(f"--- {section} ---\n")
                file.write("Content:\n")
                file.write(content["content"] + "\n")
                file.write("Subsections:\n")
                for subsection, subsection_content in content["subsections"].items():
                    file.write(f"  - {subsection}: {subsection_content}\n")
                file.write("\n")

# Example usage
parser = DocumentParser()

# Test with a URL for a PDF
pdf_url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC10278373/pdf/johm-8-S3-93.pdf"
pdf_text = parser.extract_text_from_url(pdf_url)
parser.identify_sections_with_subsections(pdf_text)
pdf_content = parser.get_all_section_content()
parser.save_content_to_file(pdf_content, "extracted_pdf_content.txt")
print("PDF content saved to extracted_pdf_content.txt, excluding references.")

# Test with a URL for HTML
html_url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC10278373/"
html_text = parser.extract_text_from_url(html_url)
parser.identify_sections_with_subsections(html_text)
html_content = parser.get_all_section_content()
parser.save_content_to_file(html_content, "extracted_html_content.txt")
print("HTML content saved to extracted_html_content.txt, excluding references.")


PDF content saved to extracted_pdf_content.txt, excluding references.
HTML content saved to extracted_html_content.txt, excluding references.


In [13]:
import requests
from PyPDF2 import PdfReader

def save_content_to_file(content_dict, output_file):
    """
    Save extracted content to a file, excluding sections named 'References' or similar.
    
    Parameters:
    - content_dict (dict): Dictionary with sections and subsections.
    - output_file (str): Path to save the content.
    """
    with open(output_file, "w", encoding="utf-8") as file:
        for section, content in content_dict.items():
            # Skip sections named "References" or similar
            if "reference" in section.lower():
                continue
            
            file.write(f"--- {section} ---\n")
            file.write("Content:\n")
            file.write(content["content"] + "\n")
            file.write("Subsections:\n")
            for subsection, subsection_content in content["subsections"].items():
                file.write(f"  - {subsection}: {subsection_content}\n")
            file.write("\n")

def save_pdf_content_to_file(pdf_content, output_file):
    """
    Save extracted PDF content to a file.
    
    Parameters:
    - pdf_content (list): List of strings, one per page.
    - output_file (str): Path to save the content.
    """
    with open(output_file, "w", encoding="utf-8") as file:
        for page_number, text in enumerate(pdf_content, start=1):
            file.write(f"--- Page {page_number} ---\n")
            file.write(text + "\n")

def fetch_and_process_url(url):
    """
    Fetch content from a URL, process based on content type (HTML or PDF),
    and save extracted content to a file.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        content_type = response.headers.get("Content-Type", "")
        
        if "text/html" in content_type:
            # Handle HTML content
            html_content = response.text
            parser = DocumentParser()

            # Extract text from the HTML
            document_text = parser.extract_text_from_html(html_content)
            
            # Identify sections and subsections
            parser.identify_sections_with_subsections(document_text)
            
            # Retrieve all sections
            all_content = parser.get_all_section_content()
            
            # Save content to a file
            output_file = "extracted_content_html.txt"
            save_content_to_file(all_content, output_file)
            print(f"HTML content saved to {output_file}, excluding references.")

        elif "application/pdf" in content_type:
            # Handle PDF content using PyPDF2
            output_file = "extracted_content_pdf.txt"
            
            with open("temp.pdf", "wb") as temp_pdf:
                temp_pdf.write(response.content)

            reader = PdfReader("temp.pdf")
            pdf_content = [page.extract_text() for page in reader.pages]
            
            save_pdf_content_to_file(pdf_content, output_file)
            print(f"PDF content saved to {output_file}.")
        else:
            print("Unsupported content type.")
    else:
        print(f"Failed to fetch the URL. Status code: {response.status_code}")

# Example usage
url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC10278373/pdf/johm-8-S3-93.pdf"  # Replace with your URL
fetch_and_process_url(url)


PDF content saved to extracted_content_pdf.txt.


In [2]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage to scrape
url = "https://cidades.ibge.gov.br/brasil/se/aracaju/panorama"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    print(response.content)
    # Extract specific information (e.g., population data)
    # Adjust the selectors based on the webpage structure
    population = soup.find('span', class_='population-number')  # Example selector
    if population:
        print(f"Population: {population.text.strip()}")
    else:
        print("Population data not found.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


b'\xef\xbb\xbf<!DOCTYPE html>\r\n<html lang="pt-br">\r\n\r\n<head>\r\n<meta charset="utf-8">\r\n  <meta http-equiv="x-ua-compatible" content="ie=edge">\r\n  <meta http-equiv="cache-control" content="no-transform" />\r\n  <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">\r\n\r\n  <!-- Google tag (gtag.js) --> <script async src="https://www.googletagmanager.com/gtag/js?id=G-YLYPPRQS7S"></script> <script> window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-YLYPPRQS7S\'); </script>\r\n\r\n  <link rel="icon" href="data:;base64,iVBORw0KGgo=">\r\n  <link rel="stylesheet" href="/css/global.css">\r\n\r\n  <!--font awesome-->\r\n  <link rel="stylesheet" href="/css/font-awesome/font-awesome.min.css">\r\n\r\n  <base href="/">\r\n\r\n</head>\r\n\r\n<body class="estado">\r\n\r\n  <!-- Componente principal -->\r\n  <app></app>\r\n\r\n  <!-- integrity gerado por https://www.srihash.org

In [2]:
import pandas as pd

# Sample DataFrames
df1 = pd.read_csv('./Data/data-1741088817841-Medline-from-OVID.csv')
df2 = pd.read_csv('./Data/MedlineData/medline_results.csv')

# Finding PMIDs in df1 that are NOT in df2 based on different column names
pmid_not_in_df2 = df1[~df1['verification_id'].isin(df2['pmid'])]

# Converting to list
pmid_list = pmid_not_in_df2['verification_id'].tolist()

print(pmid_list)


[34448061, 34409510, 34406258, 34398033, 34397701, 34390554, 34389021, 34380606, 34334291, 34293142, 34281461, 34280318, 34278928, 34271590, 34251034, 34227927, 34215337, 34212499, 34193261, 34187854, 34183114, 34180541, 34163109, 34155795, 32914222, 32792167, 32729494, 32882868, 32831118, 32706618, 32702822, 32503874, 32413914, 32392223, 33350746, 32849612, 30900514, 34394213, 33415957, 33375455, 33371042, 33355022, 33293627, 33290426, 33285720, 33263539, 33215698, 33194969, 33112875, 33079042, 33053757, 33023604, 32996342, 35088108, 35189336, 35005777, 34953911, 34931002, 35148313, 35164069, 35262185, 35390077, 35385087, 35381324, 35379242, 35583528, 35550190, 35545195, 35534064, 35533977, 35533820, 35514131, 35490790, 35482791, 35475801, 35440456, 35065702, 35024992, 32991604, 32965785, 32951718, 32925078, 32887515, 32845580, 32829177, 32797762, 32763206, 32758175, 32723442, 32717028, 32709768, 32696862, 32645099, 32622723, 32617764, 34143581, 34126302, 34124973, 34118455, 34116205,

In [3]:
verification_ids = [34448061, 34409510, 34406258, 34398033, 34397701, 34390554, 34389021, 34380606, 34334291, 34293142, 34281461, 34280318, 34278928, 34271590, 34251034, 34227927, 34215337, 34212499, 34193261, 34187854, 34183114, 34180541, 34163109, 34155795, 32914222, 32792167, 32729494, 32882868, 32831118, 32706618, 32702822, 32503874, 32413914, 32392223, 33350746, 32849612, 30900514, 34394213, 33415957, 33375455, 33371042, 33355022, 33293627, 33290426, 33285720, 33263539, 33215698, 33194969, 33112875, 33079042, 33053757, 33023604, 32996342, 35088108, 35189336, 35005777, 34953911, 34931002, 35148313, 35164069, 35262185, 35390077, 35385087, 35381324, 35379242, 35583528, 35550190, 35545195, 35534064, 35533977, 35533820, 35514131, 35490790, 35482791, 35475801, 35440456, 35065702, 35024992, 32991604, 32965785, 32951718, 32925078, 32887515, 32845580, 32829177, 32797762, 32763206, 32758175, 32723442, 32717028, 32709768, 32696862, 32645099, 32622723, 32617764, 34143581, 34126302, 34124973, 34118455, 34116205, 34103346, 34092969, 34082822, 34074544, 34056627, 34048911, 34046307, 34046032, 34015063, 34008187, 33945214, 33921297, 33920974, 33909610, 33881157, 33879659, 32787752, 32728946, 32616087, 32529235, 32513515, 32854869, 32813276, 32911778, 32892060, 33382764, 32777301, 32828990, 32827490, 32416028, 32410266, 33170174, 33320875, 33202670, 33147729, 33038930, 32842497, 32821086, 32575078, 32481690, 34787261, 34727115, 34717586, 34715754, 34714888, 34711208, 34711198, 34689810, 34689007, 34675033, 34663683, 34660516, 34643720, 34607398, 34591924, 34529507, 34480379, 34471261, 34469561, 34452492, 24427261, 24373292, 24304274, 23932722, 23902720, 23949423, 24992803, 24626555, 24331071, 24331070, 24325413, 24291849, 24290365, 24237786, 24228011, 24226506, 22077219, 22028840, 21991313, 21902863, 21861105, 21824381, 21791124, 21738449, 21737179, 21671988, 21668312, 21667067, 21664876, 21660385, 21576068, 21575812, 21575193, 21534638, 21496359, 21483258, 21482560, 21478555, 21438669, 21375760, 21371656, 21318608, 21309171, 21182987, 21181055, 21167301, 21131643, 21099431, 21094268, 21093952, 21091799, 21087687, 21075400, 21074471, 21059168, 21056436, 21030451, 20978734, 20950645, 20950412, 20933258, 20819147, 20473886, 24139883, 24136148, 24103432, 24096986, 24076706, 23973651, 23961854, 23929253, 23908082, 23895136, 23894436, 23816527, 23805224, 23776599, 23761216, 23750731, 23728690, 23675927, 39757959, 39788159, 39930651, 39951087, 39951070, 39951072, 39930478, 39936665, 39928631, 39923010, 39918006, 39913513, 39861798, 39910591, 39232246, 34394299, 34376550, 34788344, 34270559, 34762250, 34681727, 34484202, 34452974, 34448865, 34414937, 34090396, 34070882, 34042592, 34642315, 34339628, 35132594, 34957025, 34940978, 34930233, 34898480, 34893071, 34852166, 34852079, 34813628, 34797952, 34797303, 34795554, 33412917, 33397308, 33356651, 33342308, 33319339, 33307158, 33249937, 33243663, 33213320, 33212031, 33198609, 33191633, 33081670, 33065238, 33057888, 33041248, 32996257, 32996196, 32979069, 32978651, 32901936, 32886331, 32882158, 32563940, 32562207, 32549334, 32533989, 32504662, 32498762, 32496473, 32481460, 32478627, 32466392, 32463373, 32423441, 32412918, 32385189, 32358619, 32150561, 31808223, 32332221, 32019797, 35339379, 35298107, 35296886, 35294748, 35286025, 35285767, 35278387, 35269752, 35266523, 35263648, 35247015, 35246288, 35240312, 35227863, 35224760, 35218424, 35193695, 35184718, 35180563, 39530931, 39486798, 39454228, 39459416, 39441204, 38734965, 38607726, 39390439, 39240908, 39467652, 39377125, 38287745, 36411974, 38365415, 39369502, 39134221, 39034014, 39377857, 39367313, 39284784, 39351227, 39179150, 22409368, 21592627, 22247330, 22258956, 23002356, 22463370, 22957088, 23394756, 23278779, 23247940, 23235578, 23139836, 23116704, 23114004, 23046688, 22995852, 22972063, 22962237, 22955857, 22913779, 26880356, 27716549, 27832195, 27643596, 27724950, 27999275, 27320215, 27331659, 27167347, 28130920, 27470177, 27912886, 27792776, 27768704, 28294048, 28279679, 28276942, 28270112, 28249638, 28249256, 28173739, 28128861, 28124442, 28056035, 27837654, 27810155, 27666813, 27592304, 27528364, 23639601, 23562352, 23561006, 23522098, 23465404, 23445751, 23394936, 23291098, 23265575, 23173567, 23158472, 23068425, 22907663, 22897897, 39136085, 39225854, 39591452, 39481165, 39637904, 39609763, 39593006, 39410814, 39082141, 39377138, 39097395, 39564365, 39527765, 39450582, 39576841, 39509362, 39279016, 39498113, 39487391, 39201386, 35178772, 35171096, 35156555, 35150885, 35144879, 35132841, 35124258, 35124205, 35113445, 35094397, 35086975, 35058332, 35057437, 35033141, 35030267, 34997606, 34968773, 34931724, 34929178, 34919974, 34919291, 32205287, 32339832, 32294922, 32203504, 32201449, 32138748, 32218667, 32341087, 32301919, 32247892, 32199484, 32157552, 32100260, 32084177, 32068617, 32049702, 32014711, 33757612, 33563075, 33196834, 34454704, 34406962, 34111207, 34830179, 34308537, 34057201, 34870327, 34009478, 34264565, 34684596, 31338835, 31796479, 31685019, 31145773, 31095414, 30367972, 31045799, 31422772, 31383779, 31367943, 31340325, 31303294, 31276186, 31271816, 31269150, 39907423, 39900427, 39903770, 38954503, 39891054, 39612926, 39154171, 39482559, 39860998, 39648831, 39485772, 39375278, 39815259, 39792714, 39601904, 39099185, 36767517, 36609748, 36519386, 36634119, 36242537, 36100820, 35882473, 36044136, 37222292, 36214966, 37207448, 36436722, 36600489, 37079486, 36737358, 36668818, 37204441, 37175571, 22840968, 22828243, 22713438, 22707613, 22592689, 22570701, 22497584, 22491335, 22490872, 22439809, 22438265, 22431528, 22422631, 22418039, 22411451, 22393352, 22392170, 22262586, 22256779, 22249298, 22244869, 22134594, 21969136, 21735402, 20381206, 21366869, 21212434, 21554768, 21715571, 21651833, 25546897, 23250288, 22155902, 37126363, 37118717, 37111124, 37076543, 37054386, 37042856, 37015800, 37013490, 37004038, 36988896, 36978166, 36951832, 36949170, 36948364, 36946625, 36946287, 36933562, 36927284, 36926664, 39093565, 38796166, 39768896, 39756312, 39719283, 39622563, 39115612, 39774607, 39762751, 39480211, 39731473, 39526433, 39378018, 38509766, 39657032, 39631509, 39707867, 39446164, 32012320, 31985170, 31941488, 31911163, 31843272, 31784293, 31609922, 31603999, 31574337, 31518547, 30608986, 31643081, 31486661, 34913814, 34840314, 34768006, 34719440, 34687377, 34636434, 34617409, 34582072, 34522970, 34481951, 34353257, 34341310, 34296276, 34225540, 34056681, 34034598, 39016354, 39297593, 39166402, 39281686, 39105848, 38817046, 39269761, 39186717, 38636536, 39116701, 39199321, 38233256, 39088540, 39193975, 39002399, 37461882, 39099418, 39150180, 39121048, 39123161, 38901886, 38652046, 39077433, 39066294, 39035352, 38954400, 38268412, 37940941, 37336265, 37517000, 37643989, 37848293, 37916817, 37811673, 37649897, 37489818, 37824507, 37729043, 37907290, 37543809, 37326928, 37840452, 37365505, 37813543, 37560816, 37434326, 37291382, 37415976, 37348256, 37243259, 37967291, 37339345, 37314064, 37260086, 38005864, 37379775, 37691952, 36899311, 36897899, 36876704, 36876352, 36857972, 36814257, 36801619, 36795574, 36790832, 36753461, 36737741, 36731926, 36730054, 36729186, 36723288, 36715243, 36707043, 36656099, 26752783, 26739959, 26689276, 26683034, 26659461, 26614486, 26607813, 26565396, 26545940, 26545825, 26501470, 26485216, 26143136, 25100307, 25681862, 26667696, 25662901, 26671418, 26437454, 26344479, 25886390, 26421849, 26500036, 29068485, 29063560, 28994899, 28984764, 28945755, 28937963, 28895659, 28890016, 28864138, 28859915, 28843576, 28836955, 28804053, 28786997, 28760796, 28745220, 28738456, 28738364, 28712416, 28703537, 28681432, 28675017, 28637906, 28604080, 28577297, 28541811, 28529116, 28501259, 28484110, 28484063, 28416263, 28414969, 28407780, 28372531, 28302925, 26467099, 26682048, 26629725, 26614181, 26606712, 26602892, 26545988, 26527017, 26423651, 26419360, 26411952, 26411658, 26398196, 26360527, 26354800, 26335699, 26334607, 26289787, 36600225, 36585944, 36576013, 36528329, 36515814, 36422466, 36400319, 36385404, 37641523, 37437914, 37676146, 37858583, 37994839, 37957723, 38058963, 37796104, 38019122, 37574113, 37386209, 37269693, 37944357, 38025679, 38019849, 37986340, 37893427, 37891440, 37880112, 37853477, 37780134, 37738270, 38319447, 38206763, 37989477, 38804437, 38790006, 38777029, 38621245, 38521534, 38195300, 38691664, 37001813, 38958049, 38767209, 31235861, 31219079, 31174598, 31155366, 31152501, 31148958, 31131102, 31118091, 31116241, 31102782, 31067425, 31062139, 31022383, 31016626, 31014351, 31014341, 31002716, 30950921, 30928928, 30920131, 30901044, 30887157, 35780088, 36076271, 35985590, 35594413, 35488305, 35507224, 36323422, 36519226, 36554727, 35714803, 35654349, 36444634, 36402953, 36361269, 36355774, 35878897, 35785439, 36127666, 35973747, 36062247, 35943669, 35948944, 37338314, 37331759, 37317497, 37315264, 37304269, 37282529, 37259032, 37254143, 37062862, 36346483, 36940962, 37170886, 37194007, 36901164, 36473552, 28604054, 28461161, 31832252, 29149907, 28901005, 27256033, 28891235, 29083344, 28205208, 28024955, 29034866, 28431558, 28284227, 29302319, 29268704, 29202939, 29182688, 29117324, 29117322, 29096710, 27170307, 27148808, 27144393, 27121635, 27103114, 27098823, 27011229, 27008464, 26984008, 26980094, 26946216, 26907218, 26845348, 26828512, 26822522, 26782158, 27633910, 27575957, 27556659, 27538880, 27523606, 27501085, 27442424, 27442227, 27372293, 27369644, 27352628, 27329249, 27322016, 27311404, 27277138, 27269963, 27230737, 29558894, 29547616, 29530388, 29516900, 29427804, 29411951, 29410012, 29381647, 29380694, 29309934, 29273516, 29248196, 29238906, 29206909, 29188363, 37734676, 37721345, 37714919, 37673449, 37649300, 37634322, 37629112, 37596678, 37583888, 37579698, 37572572, 37569757, 37531338, 37506267, 37482014, 37469290, 37446247, 37433256, 37432384, 37419502, 37409466, 37406073, 37343960, 36289100, 36260234, 36259471, 36226344, 36197867, 36177994, 36169764, 36162724, 36087236, 35950255, 35894465, 35678261, 35581701, 35476141, 35272523, 35085475, 36600327, 36405527, 35978635, 35996832, 36459045, 35655401, 35964613, 30867101, 30813166, 30808593, 30702545, 30688349, 30670267, 30660981, 30646588, 30624761, 30601847, 30573285, 30562840, 30561092, 30515574, 33725996, 33724440, 33694179, 33682323, 33647033, 33620505, 33600346, 33597073, 33581526, 33578206, 33538407, 33530866, 33524777, 33508989, 36112128, 36111784, 36107855, 36098097, 36089149, 36052843, 36044120, 36041271, 36007752, 36002211, 35992748, 35986958, 35982184, 35977596, 35973976, 35961786, 35961504, 35955119, 35929975, 35927752, 35898117, 38496394, 38360130, 38382894, 38102230, 38783184, 38697269, 38323419, 38696415, 38714091, 38140557, 38765932, 37758179, 38700304, 38724898, 37327873, 38142791, 38459000, 30499259, 30499246, 30439545, 30376041, 30327274, 30276425, 30192393, 30109720, 30029997, 29160642, 31868655, 31808383, 31766548, 31731985, 31647842, 26132432, 26084515, 26061370, 26053385, 25902619, 25889398, 25882127, 25881523, 25875167, 25871949, 25821066, 25819310, 25797423, 25703280, 25687791, 25665986, 25578513, 25548931, 25410372, 25156655, 25099824, 24670899, 25243777, 24902967, 25153531, 24618584, 25089265, 24416267, 31617076, 31581258, 31565734, 31561111, 31473618, 31445795, 31426825, 31400670, 30517341, 29996785, 30440022, 29995152, 29699534, 29388206, 29214413, 29158102, 30307935, 28922786, 28559000, 37957052, 37861207, 38183123, 38372809, 38305628, 38294896, 38149977, 37770654, 38240249, 37748752, 38231101, 37979645, 38172669, 38131876, 38006819, 37951460, 35875040, 35867217, 35850685, 35841663, 35836262, 35820399, 35816990, 35812486, 35799187, 35793756, 35778340, 35757771, 35713300, 35712857, 35703623, 35681165, 35676891, 35654646, 35643432, 35641955, 35638315, 35634954, 35593343, 33740483, 33227532, 33181111, 33561093, 33137200, 33341909, 33248481, 33872520, 33410879, 33271483, 33870878, 33853490, 33844952, 33829888, 33797819, 33793885, 33787418, 33785275, 33785117, 33779460, 33775644, 33768442, 33764569, 33761591, 38616385, 38547496, 38365414, 38353841, 38340606, 38309460, 37594090, 38102985, 37688482, 38449013, 37646204, 38084876, 38592606, 38235934, 38968496, 38928901, 38727991, 37279074, 38380588, 38903206, 38578180, 38880893, 38857639, 38796394, 38597099, 38858211, 38555274, 38064012, 36457038, 36452989, 36441785, 36410119, 36384255, 36380373, 36368311, 36309355, 36301825, 36301760, 36284392, 36273121, 36266486, 36260970, 36242521, 36230934, 36135095, 25053215, 25041535, 24988414, 24981041, 24952233, 24938376, 24935861, 24918128, 24916088, 24861032, 24830699, 24798424, 24793547, 24767811, 24729996, 24702735, 24657969, 24606701, 24551265, 24459000, 24456561, 29974131, 29961604, 29922039, 29887221, 29873906, 29801753, 29778396, 29766776, 29722445, 29698769, 29667473, 29664167, 29661109, 29635417, 29582403, 30684347, 30657071, 30526505, 30388992, 30344281, 30319131, 30288799, 30280768, 30269303, 30215678, 30179565, 30032674, 29996864, 25518427, 25455992, 25439690, 25422237, 25412506, 25404529, 25375222, 25305206, 25287997, 25244650, 25236586, 25231591, 25199532, 25124771, 25106759, 25092624, 35777055, 35742404, 35617363, 35476297, 35564714, 35550674, 36323488, 36229989, 36066176, 36058544, 36757247, 36613810, 36600349, 36576399, 36554279, 36541296, 36534668, 36481750, 38016805, 38773588, 38093369, 38193593, 38743575, 38232612, 38442123, 38400053, 38515023, 38064215, 38714075, 38447388, 38545706, 38228087, 38185405, 38419461]

In [5]:
df1[df1["verification_id"].isin(verification_ids)].to_csv("NotFound-on-Medline-using-our-Query.csv", index=False)

In [7]:
import pandas as pd

# Sample DataFrames
df1 = pd.read_csv('./Data/data-1741088817841-Medline-from-OVID.csv')
df2 = pd.read_csv('./Data/MedlineData/medline_results.csv')

# Finding common PMIDs (present in both DataFrames)
common_pmids = df1[df1['verification_id'].isin(df2['pmid'])]

# Convert to list
common_pmids_list = len(common_pmids['verification_id'].tolist())

print(common_pmids_list)


5842


In [8]:
5842+1047

6889

In [1]:
import spacy
import json
import re
from typing import Dict, List, Optional


class PRISMAEntityExtractor:
    def __init__(self, model_name: str = "en_core_web_sm", keyword_dict: Optional[Dict[str, List[str]]] = None):
        self.nlp = spacy.load(model_name)
        self.keyword_dict = keyword_dict or {}
        self.results = {}

    def extract(self, text: str) -> Dict[str, List[Dict]]:
        doc = self.nlp(text)
        self.results = {key: [] for key in self.keyword_dict}

        for sent in doc.sents:
            sent_lower = sent.text.lower()

            for key, patterns in self.keyword_dict.items():
                if any(kw in sent_lower for kw in patterns):
                    for ent in sent.ents:
                        if ent.label_ in {"CARDINAL", "QUANTITY"} and ent.text.strip():
                            self.results[key].append({
                                "value": ent.text,
                                "sentence": sent.text.strip()
                            })
        return self.results

    def extract_from_file(self, path: str) -> Dict[str, List[Dict]]:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        return self.extract(text)

    def save_to_json(self, data: Dict[str, List[Dict]], output_path: str):
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        print(f"✅ Saved extracted PRISMA data to {output_path}")

    def get_combined_sentences(self) -> str:
        """
        Combines all extracted sentences across PRISMA categories into one clean text.
        Deduplicates sentences and sorts them for readability.

        :return: A single string with all relevant PRISMA-related sentences.
        """
        all_sentences = set()

        for entries in self.results.values():
            for entry in entries:
                sentence = entry.get("sentence", "").strip()
                if sentence:
                    all_sentences.add(sentence)

        # Sort sentences (optional: by length or alphabetically)
        combined_text = "\n".join(sorted(all_sentences))
        return combined_text

    def get_best_value(self, key: str, prefer_max: bool = True, min_val: int = 5) -> Optional[Dict]:
        def clean_number(val: str):
            try:
                return int(re.sub(r"[^\d]", "", val))
            except:
                return None

        def score_entry(entry: Dict[str, str], keywords: List[str]):
            sentence = entry["sentence"].lower()
            return sum(1 for kw in keywords if kw in sentence)

        entries = self.results.get(key, [])
        if not entries:
            return None

        scored = []
        for entry in entries:
            val = clean_number(entry["value"])
            if val is None or val < min_val:
                continue
            score = score_entry(entry, self.keyword_dict.get(key, []))
            scored.append((score, val, entry["sentence"]))

        if not scored:
            return None

        scored.sort(key=lambda x: (x[0], x[1] if prefer_max else -x[1]), reverse=True)
        best = scored[0]
        return {"value": best[1], "sentence": best[2]}



In [8]:
# PRISMA-style keyword dictionary
KEYWORDS = {
    "total_studies": [
        "identified", "retrieved", "found", "yielded", "obtained", "available",
        "initial", "returned", "sourced", "located", "recorded", "records identified",
        "search results", "hits", "articles found", "publications retrieved",
        "citations identified", "initial search", "database search", "from databases",
        "via search", "through search", "search yielded", "total number of records"
    ],
    "duplicates": [
        "duplicate", "duplicates", "removed", "excluded as duplicates",
        "eliminated", "filtered out", "after removing", "after deduplication",
        "deduplicated", "de-duplicated", "duplicates removed", "duplicate records",
        "removal of duplicates", "duplicate citations", "duplicate entries"
    ],
    "screened": [
        "screened", "reviewed", "screening", "underwent screening",
        "screened for eligibility", "abstract screening", "title screening",
        "records screened", "articles screened", "documents screened",
        "titles and abstracts reviewed", "initial screening", "screened by title",
        "screening process", "titles/abstracts screened"
    ],
    "eligible": [
        "eligible", "assessed for eligibility", "met eligibility criteria",
        "retrieved for full-text review", "reviewed", "full-text assessed",
        "fulltext", "included for eligibility", "potentially eligible",
        "considered eligible", "qualified for full-text review",
        "full texts reviewed", "eligibility assessment", "full-text retrieved",
        "screened full texts", "full articles reviewed"
    ],
    "excluded": [
        "excluded", "removed", "not eligible", "did not meet criteria",
        "excluded from review", "not included", "excluded based on criteria",
        "excluded after full-text review", "excluded articles", "excluded studies",
        "exclusion criteria", "full-text exclusions", "studies not included",
        "ineligible", "reasons for exclusion", "excluded records", "were not eligible"
    ],
    "included": [
        "included", "selected", "included in review", "included in analysis",
        "retained", "remained for inclusion", "underwent extraction",
        "were deemed relevant", "studies included", "final inclusion", "included in final synthesis",
        "considered in review", "synthesized", "studies retained", "eligible and included",
        "included for analysis", "included for synthesis", "final selection"
    ]
}

text  = open("tetetetete.txt").read()
text = """
===== Abstract =====
Influenza is an acute respiratory virus that results in significant worldwide morbidity and mortality each year. As emergency physicians, we are often the first to encounter patients with seasonal influenza . It is therefore critical that we draw on the most recent and relevant research when we make clinical decisions regarding the diagnosis, treatment, and prophylaxis of this disease.
A MEDLINE literature search from August 2009 to August 2015 was performed using the keywords influenza vaccination efficacy AND systematic , influenza AND rapid antigen testing , and Oseltamivir AND systematic , while limiting the search to human studies written in the English language. General review articles and case reports were omitted. Each of the selected articles then underwent a structured review.
We identified 163 articles through our literature search, of which 68 were found to be relevant to our clinical questions. These studies then underwent a rigorous review from which recommendations were given.
Influenza vaccine efficacy continues to range between 40% and 80%. Vaccination has the potential to decrease disease severity and is recommended for individuals older than 6 months of age. If resources permit, vaccination can be offered to patients presenting to the emergency department . Rapid antigen detection for influenza is a simple bedside test with high specificity, but generally low sensitivity. If a patient presents with a syndrome consistent with influenza and has negative rapid antigen detection, they should either receive a confirmatory reverse transcriptase polymerase chain reaction or be treated as if they have influenza. Treatment with neuraminidase inhibitors can decrease the duration of influenza and is recommended in hospitalized patients, or in those with high risk of complications.

===== Methods =====
Three structured literature reviews were performed using MEDLINE and were all limited to studies that were published in the English language between August 2009 and August 2015. Search terms included influenza vaccination efficacy AND systematic , influenza AND, and Oseltamivir AND systematic . Two emergency physicians analyzed the abstract of each identified article to determine which ones should be pulled for more detailed review, based on the suspected relevance to the topic of interest. If

===== Results =====
Through the influenza vaccination review, 44 abstracts were identified, of which 18 were thought to be relevant by the reviewers and were pulled for detailed formal review. The rapid antigen testing review identified 66 articles, with 29 articles being deemed relevant by reviewers. Finally, the oseltamivir review identified 53 total articles, of which 21 were deemed relevant.
The primary goal of this literature search was to determine the appropriate ED approach with regard to prevention,

"""

extractor = PRISMAEntityExtractor(model_name="en_core_web_sm", keyword_dict=KEYWORDS)
extracted = extractor.extract(text)

print(extractor.get_best_value("total_studies"))
print(extractor.get_best_value("included"))
print(extractor.get_best_value("excluded"))
print(extractor.get_best_value("duplicates"))

combined_text = extractor.get_combined_sentences()
combined_text

{'value': 163, 'sentence': 'We identified 163 articles through our literature search, of which 68 were found to be relevant to our clinical questions.'}
{'value': 53, 'sentence': 'Finally, the oseltamivir review identified 53 total articles, of which 21 were deemed relevant.'}
None
None


'Finally, the oseltamivir review identified 53 total articles, of which 21 were deemed relevant.\nIf\n\n===== Results =====\nThrough the influenza vaccination review, 44 abstracts were identified, of which 18 were thought to be relevant by the reviewers and were pulled for detailed formal review.\nThe rapid antigen testing review identified 66 articles, with 29 articles being deemed relevant by reviewers.\nTwo emergency physicians analyzed the abstract of each identified article to determine which ones should be pulled for more detailed review, based on the suspected relevance to the topic of interest.\nWe identified 163 articles through our literature search, of which 68 were found to be relevant to our clinical questions.'

In [None]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import re

def extract_prisma_info(context: str, model_path: str = "./models/tinyroberta") -> dict:
    # Load the local model + tokenizer
    qa_pipeline = pipeline(
        "question-answering",
        model=AutoModelForQuestionAnswering.from_pretrained(model_path),
        tokenizer=AutoTokenizer.from_pretrained(model_path)
    )

    # Expanded natural language questions for each PRISMA category
    QUESTION_TEMPLATES = {
        "total_studies": [
            "How many records were identified?",
            "What is the total number of articles retrieved?",
            "How many search results were found?",
            "How many publications were located from databases?",
            "What was the total yield from the initial search?"
        ],
        "duplicates": [
            "How many duplicates were removed?",
            "How many duplicate records were excluded?",
            "How many duplicate entries were found?",
            "What number of records were removed as duplicates?",
            "How many were eliminated due to duplication?"
        ],
        "screened": [
            "How many records were screened?",
            "How many articles underwent screening?",
            "How many titles and abstracts were reviewed?",
            "What number of documents were screened?",
            "How many records were reviewed for relevance?"
        ],
        "eligible": [
            "How many were assessed for eligibility?",
            "How many full-text articles were reviewed?",
            "How many met the eligibility criteria?",
            "How many were considered eligible?",
            "How many full texts were retrieved for assessment?"
        ],
        "excluded": [
            "How many studies were excluded?",
            "How many articles did not meet criteria?",
            "How many were not eligible?",
            "How many were removed after full-text review?",
            "What number of studies were excluded from review?"
        ],
        "included": [
            "How many studies were included in the final analysis?",
            "How many were included in the review?",
            "How many studies were selected for inclusion?",
            "How many studies were deemed relevant"
        ]
    }

    results = {}

    for label, questions in QUESTION_TEMPLATES.items():
        for question in questions:
            try:
                answer = qa_pipeline(question=question, context=context)
                if answer and any(char.isdigit() for char in answer['answer']):
                    # Clean to get just the number
                    number_match = re.search(r'\d+', answer['answer'])
                    if number_match:
                        results[label] = int(number_match.group(0))
                        break
            except Exception as e:
                continue

    return results


# ✅ Test it
context = open("tetetetete.txt").read()
context = """
===== Abstract =====
Influenza is an acute respiratory virus that results in significant worldwide morbidity and mortality each year. As emergency physicians, we are often the first to encounter patients with seasonal influenza . It is therefore critical that we draw on the most recent and relevant research when we make clinical decisions regarding the diagnosis, treatment, and prophylaxis of this disease.
A MEDLINE literature search from August 2009 to August 2015 was performed using the keywords influenza vaccination efficacy AND systematic , influenza AND rapid antigen testing , and Oseltamivir AND systematic , while limiting the search to human studies written in the English language. General review articles and case reports were omitted. Each of the selected articles then underwent a structured review.
We identified 163 articles through our literature search, of which 68 were found to be relevant to our clinical questions. These studies then underwent a rigorous review from which recommendations were given.
Influenza vaccine efficacy continues to range between 40% and 80%. Vaccination has the potential to decrease disease severity and is recommended for individuals older than 6 months of age. If resources permit, vaccination can be offered to patients presenting to the emergency department . Rapid antigen detection for influenza is a simple bedside test with high specificity, but generally low sensitivity. If a patient presents with a syndrome consistent with influenza and has negative rapid antigen detection, they should either receive a confirmatory reverse transcriptase polymerase chain reaction or be treated as if they have influenza. Treatment with neuraminidase inhibitors can decrease the duration of influenza and is recommended in hospitalized patients, or in those with high risk of complications.

===== Methods =====
Three structured literature reviews were performed using MEDLINE and were all limited to studies that were published in the English language between August 2009 and August 2015. Search terms included influenza vaccination efficacy AND systematic , influenza AND, and Oseltamivir AND systematic . Two emergency physicians analyzed the abstract of each identified article to determine which ones should be pulled for more detailed review, based on the suspected relevance to the topic of interest. If

===== Results =====
Through the influenza vaccination review, 44 abstracts were identified, of which 18 were thought to be relevant by the reviewers and were pulled for detailed formal review. The rapid antigen testing review identified 66 articles, with 29 articles being deemed relevant by reviewers. Finally, the oseltamivir review identified 53 total articles, of which 21 were deemed relevant.
The primary goal of this literature search was to determine the appropriate ED approach with regard to prevention,

"""

# with open("tetetetete.txt") as f:
#     context = f.read()

output = extract_prisma_info(context)
from pprint import pprint
pprint(output)


Device set to use mps:0


{'duplicates': '44',
 'eligible': '21',
 'excluded': '163',
 'included': '53',
 'screened': '66',
 'total_studies': '163'}


In [4]:
combined_text

'Finally, the oseltamivir review identified 53 total articles,\nof which 21 were deemed relevant.\nThe rapid antigen testing review identified 66 articles, with 29 articles being deemed relevant by reviewers.\nThrough the influenza vaccination review, 44 abstracts\nwere identified, of which 18 were thought to be relevant\nby the reviewers and were pulled for detailed formal review.'