### Fetch Data from Hosts Online

In [None]:
from src.api_utils import ilove_access, cochrane_access, medline_class_access, ovid_new_access


In [None]:
from src.api_utils import cochrane_access

# Call cochrane_access() and keep whatever it returns so it can be inspected.
cochrane_service = cochrane_access()
# Print the returned object to eyeball the outcome of the call.
print(cochrane_service)


In [None]:
from src.api_utils import ilove_access

# Invoke ilove_access() with its defaults; any return value is not captured.
ilove_access()

In [None]:
from src.api_utils import  medline_class_access

# Call medline_class_access with a one-element list holding a single search
# string written with PubMed-style field tags ([tiab], [Mesh], [Publication Type]).
# The string ANDs together:
#   1. a study-type filter: either reviews/tutorials that mention database
#      searching or pooling terms, or meta-analysis / systematic-review terms;
#   2. a vaccine / immunisation topic filter (MeSH headings plus title/abstract terms);
#   3. limits: humans, English, PMC free full text, entry date from 2011/01/01.
# The query lives inside a string literal, so it cannot carry inline comments.
medline_class_access(searchText=["""
(
  (
    ("Review"[Publication Type] OR "Tutorial"[Publication Type])
    AND
    (
      medline[tiab] OR medlars[tiab] OR embase[tiab] OR pubmed[tiab] OR cochrane[tiab]
      OR scisearch[tiab] OR psychinfo[tiab] OR psycinfo[tiab]
      OR psychlit[tiab] OR psyclit[tiab] OR cinahl[tiab]
      OR (hand[tiab] AND search*[tiab]) OR (manual*[tiab] AND search*[tiab])
      OR "electronic database*"[tiab] OR "bibliographic database*"[tiab]
      OR computerized[tiab] OR computerised[tiab]
      OR "computerized database*"[tiab] OR "computerised database*"[tiab]
      OR "online database*"[tiab]
      OR pooling[tiab] OR pooled[tiab] OR "mantel haenszel"[tiab]
      OR peto[tiab] OR dersimonian[tiab] OR "der simonian"[tiab] OR "fixed effect"[tiab]
      OR "Retraction of Publication"[Publication Type] OR "Retracted Publication"[Publication Type]
    )
  )
  OR
  (
    "Meta-Analysis"[Publication Type]
    OR "Meta-Analysis as Topic"[Mesh]
    OR (meta-analys*[tiab] OR "meta analys*"[tiab] OR metaanalys*[tiab])
    OR (systematic*[tiab] AND review*[tiab])
    OR (systematic*[tiab] AND overview*[tiab])
    OR (quantitativ*[tiab] AND review*[tiab])
    OR (quantitativ*[tiab] AND overview*[tiab])
    OR (quantitativ*[tiab] AND synthesis*[tiab])
    OR (methodologic*[tiab] AND review*[tiab])
    OR (methodologic*[tiab] AND overview*[tiab])
    OR ("integrative research review*"[tiab] OR "research integration"[tiab])
  )
)
AND
(
  "Vaccines"[Mesh]
  OR "Immunization"[Mesh]
  OR "Immunization Programs"[Mesh]
  OR (
    immunisation[tiab] OR immunization[tiab] OR immunise[tiab] OR immunize[tiab]
    OR immunising[tiab] OR immunizing[tiab] OR immunised[tiab] OR immunized[tiab]
    OR immunises[tiab] OR immunizes[tiab]
    OR (vaccine*[tiab] AND immun*[tiab])
  )
  OR (vaccine*[tiab] OR vaccination*[tiab] OR vaccinat*[tiab])
)
AND humans[Mesh]
AND english[lang]
AND (pmc free full text[sb])
AND ("2011/01/01"[EDAT] : "3000"[EDAT])
"""])

In [None]:
from src.api_utils import ovid_new_access

# Invoke ovid_new_access() with its defaults; any return value is not captured.
ovid_new_access()

### Database Import Stops Here

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Load all_db.csv (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv('all_db.csv')

# Connect to PostgreSQL.
# NOTE(review): user, password, host and port are hard-coded in this URL;
# consider reading the connection string from configuration instead.
engine = create_engine('postgresql://sense_database:sense_database@localhost:5433/sense_database')

# Write the DataFrame to the `all_db` table.
# if_exists='replace' drops an existing `all_db` table before inserting, so
# re-running this cell discards whatever the table held before.
# index=False keeps the DataFrame index out of the table.
df.to_sql('all_db', engine, if_exists='replace', index=False)


## The Real Tagging Process Starts Here

In [None]:
from src.Commands.PaperProcessorPipeline import PaperProcessorPipeline
from concurrent.futures import ThreadPoolExecutor

from src.Commands.TaggingSystem import Tagging
def main():
    """Run the tagging pipeline over each configured source.

    Creates one ``PaperProcessorPipeline`` bound to the ``all_db`` table and
    one ``Tagging`` instance, then submits every entry of ``sources`` to a
    thread pool that calls ``pipeline.process_source_in_batches`` with
    ``batch_size=1``.

    Every future is resolved explicitly. ``executor.map`` only re-raises a
    worker's exception when its result is consumed, so discarding the
    returned iterator hid every failure. Here a failing source is reported
    by name and the remaining sources still run.
    """
    pipeline = PaperProcessorPipeline(
        table_name='all_db',
        column_mapping={'id': 'primary_id'}
    )

    # Only the uncommented entry is processed; the commented entries are
    # alternative per-source selections kept as toggles.
    sources = [
        # {
        #     "query": "SELECT primary_id, \"doi\", doi_url, \"source\" FROM all_db WHERE \"source\"='Cochrane'",
        #     "csv_file_path": "Data/output/papers_data",
        #     "db_name": "Cochrane"
        # },
        # {
        #     "query": "SELECT primary_id, \"doi\", doi_url, \"source\" FROM all_db WHERE \"source\"='LOVE'",
        #     "csv_file_path": "Data/output/papers_data_love",
        #     "db_name": "LOVE"
        # },
        # {
        #     "query": "SELECT primary_id, \"doi\", doi_url, \"source\" FROM all_db WHERE \"source\"='OVID'",
        #     "csv_file_path": "Data/output/papers_data_OVID",
        #     "db_name": "OVID"
        # },
        # {
        #     "query": "SELECT primary_id, \"doi\", doi_url, \"source\" FROM all_db WHERE \"source\"='Medline'",
        #     "csv_file_path": "Data/output/papers_data_medline",
        #     "db_name": "Medline"
        # },
        # {
        #     "query": "SELECT primary_id, \"doi\", \"source\" FROM all_db WHERE \"doi\" IS NOT NULL AND \"doi\" != '' AND primary_id=1",
        #     "csv_file_path": "Data/output/papers_data_all",
        #     "db_name": "all"
        # },
        {
            "query": "SELECT primary_id, \"doi\", \"source\" FROM all_db WHERE \"doi\" IS NOT NULL AND \"doi\" != ''",
            "csv_file_path": "Data/output/papers_data_all",
            "db_name": "all_db_with_doi"
        }
    ]
    # Debug aid: append " AND primary_id=1" to the active query to process one row.

    # NOTE(review): the same tagger instance is shared by every worker thread;
    # confirm it is safe to use concurrently before enabling several sources.
    tagger = Tagging()

    def run_source(source):
        """Process one source entry with the shared pipeline and tagger."""
        return pipeline.process_source_in_batches(
            query=source["query"],
            csv_file_path=source["csv_file_path"],
            db_name=source["db_name"],
            tagger=tagger,
            batch_size=1
        )

    # Use parallel processing to process multiple sources simultaneously
    with ThreadPoolExecutor(max_workers=4) as executor:
        submitted = [
            (source["db_name"], executor.submit(run_source, source))
            for source in sources
        ]
        for db_name, future in submitted:
            try:
                # result() re-raises whatever the worker raised.
                future.result()
            except Exception as exc:
                # Report and continue so one bad source does not hide the others.
                print(f"Source '{db_name}' failed: {exc!r}")


if __name__ == "__main__":
    main()

In [None]:
from datetime import date
from src.Commands.Amstar2 import amstar2

# Today's local date, formatted as YYYY-MM-DD; passed to amstar2 below as review_date.
today = date.today()
date_str = today.strftime("%Y-%m-%d")
context = """
Introduction:Human papillomavirus (HPV) is one of the world’s most common sexually transmitted infections, and has been associated with a number of cervical and non-cervical diseases, including cancer. HPV vaccines have been licensed for use in females for some time, but the quadrivalent vaccine has only recently become licensed for use in males. Many countries have adopted a vaccination programme for adolescent females based on results of cost-effectiveness analyses. However, given the new indications for use of the vaccine in males, decision makers require information on the cost effectiveness of vaccinating males in order to make policy decisions on whether or not to fund such programmes.

Objective:Our objective was to conduct a qualitative systematic review to update a previously conducted review of HPV vaccine studies.

Methods:Articles were obtained from an extensive literature search to determine the cost effectiveness of implementing an HPV vaccination programme with routine cervical cancer screening. A total of 29 studies were included in this review. Seventeen of the included articles looked only at cervical disease outcomes, and 12 studies also included non-cervical disease outcomes. Four studies explored the economic impact of vaccinating both boys and girls. One study focused on a population of men who have sex with men (MSM).

Results:While different model structures, input parameters and baseline assumptions were used, the consistent message in studies that focused on female-only vaccination programmes was that routine vaccination of females is cost effective compared with cervical cancer screening alone.

Discussion:Based on the currently available literature, it appears that the addition of boys to a vaccination programme generally exceeds traditional cost-effectiveness thresholds. The MSM population represents a potential additional target for routine HPV vaccination; however, more cost-effectiveness studies are required before making such a policy change.

"""
# Evaluate the sample abstract held in `context` with the amstar2 checker.
checker = amstar2(review_date=date_str)
results = checker.evaluate_all(context)
# Derive a summary from the evaluation results.
summary = checker.amstar_label_and_flaws(results)
# NOTE(review): prepare_amstar_update_dict is called twice with the same
# arguments and its result printed twice; one call would presumably be enough.
# Confirm the method has no side effects before merging the calls.
print(checker.prepare_amstar_update_dict(results, summary))
update_dict = checker.prepare_amstar_update_dict(results, summary)
print(update_dict)
# Bare expression: shown as the cell's output value when run in a notebook.
update_dict

In [None]:
import pandas as pd

# Load the unified output CSV.
ddd = pd.read_csv("./Data/output/unified_output.csv")
# Keep rows whose `doi` is missing, group them by `source`, and count the
# non-null values in every remaining column for each source.
ddd[ddd["doi"].isna()].groupby(["source"]).count()

#### Tagging for Date of Last Literature Search and Study Count

In [None]:
from src.Commands.NERInference import NERTester
from src.Commands.PaperProcessorPipeline import PaperProcessorPipeline
from concurrent.futures import ThreadPoolExecutor

from src.Commands.TaggingSystem import Tagging
def main():
    """Run the NER-based tagging pipeline over each configured source.

    Creates one ``PaperProcessorPipeline`` bound to the ``all_db`` table and
    one ``NERTester`` instance (passed as the pipeline's ``tagger``), then
    submits every entry of ``sources`` to a thread pool that calls
    ``pipeline.process_source_in_batches`` with ``batch_size=10``.

    Every future is resolved explicitly. ``executor.map`` only re-raises a
    worker's exception when its result is consumed, so discarding the
    returned iterator hid every failure. Here a failing source is reported
    by name and the remaining sources still run.
    """
    pipeline = PaperProcessorPipeline(
        table_name='all_db',
        column_mapping={'id': 'primary_id'}
    )

    # Only the uncommented entry is processed; the commented entries are
    # alternative per-source selections kept as toggles.
    sources = [
        # {
        #     "query": "SELECT primary_id, \"doi\", doi_url, \"source\" FROM all_db WHERE \"source\"='Cochrane'",
        #     "csv_file_path": "Data/output/papers_data",
        #     "db_name": "Cochrane"
        # },
        # {
        #     "query": "SELECT primary_id, \"doi\", doi_url, \"source\" FROM all_db WHERE \"source\"='LOVE'",
        #     "csv_file_path": "Data/output/papers_data_love",
        #     "db_name": "LOVE"
        # },
        # {
        #     "query": "SELECT primary_id, \"doi\", doi_url, \"source\" FROM all_db WHERE \"source\"='OVID'",
        #     "csv_file_path": "Data/output/papers_data_OVID",
        #     "db_name": "OVID"
        # },
        # {
        #     "query": "SELECT primary_id, \"doi\", doi_url, \"source\" FROM all_db WHERE \"source\"='Medline'",
        #     "csv_file_path": "Data/output/papers_data_medline",
        #     "db_name": "Medline"
        # },
        # {
        #     "query": "SELECT primary_id, \"doi\", \"source\" FROM all_db WHERE \"doi\" IS NOT NULL AND \"doi\" != '' AND primary_id=1",
        #     "csv_file_path": "Data/output/papers_data_all",
        #     "db_name": "all"
        # },
        {
            "query": "SELECT primary_id, \"doi\", \"source\" FROM all_db WHERE \"doi\" IS NOT NULL AND \"doi\" != ''",
            "csv_file_path": "Data/output/papers_data_all",
            "db_name": "all_db_inference"
        }
    ]
    # Debug aid: append " AND primary_id=1" to the active query to process one row.

    # NOTE(review): the same tagger instance is shared by every worker thread;
    # confirm it is safe to use concurrently before enabling several sources.
    tagger = NERTester()

    def run_source(source):
        """Process one source entry with the shared pipeline and tagger."""
        return pipeline.process_source_in_batches(
            query=source["query"],
            csv_file_path=source["csv_file_path"],
            db_name=source["db_name"],
            tagger=tagger,
            batch_size=10
        )

    # Use parallel processing to process multiple sources simultaneously
    with ThreadPoolExecutor(max_workers=4) as executor:
        submitted = [
            (source["db_name"], executor.submit(run_source, source))
            for source in sources
        ]
        for db_name, future in submitted:
            try:
                # result() re-raises whatever the worker raised.
                future.result()
            except Exception as exc:
                # Report and continue so one bad source does not hide the others.
                print(f"Source '{db_name}' failed: {exc!r}")


if __name__ == "__main__":
    main()

In [None]:
from src.AIModels.Inference import SRPredictor
from src.Services.Factories.scrapers.OVIDPDFWebScraper import OVIDPDFWebScraper
from src.Services.Factories.scrapers.CochranePDFWebScraper import CochranePDFWebScraper
from src.Services.Factories.scrapers.LOVEPDFWebScraper import LOVEPDFWebScraper
from src.Commands.TaggingSystem import Tagging

from itertools import chain
import pandas as pd

from src.Services.Factories.Sections.ArticleExtractorFactory import ArticleExtractorFactory

def save_to_file(file_path, data):
    """
    Save the given data to a UTF-8 text file, replacing any existing content.

    The text is rendered before the file is opened, so unsupported data
    raises without truncating a file that already exists.

    :param file_path: The file path where the data will be saved.
    :param data: A string, written verbatim, or a list/tuple whose items are
        written one per line (each item is converted with ``str``).
    :raises TypeError: If ``data`` is neither a string nor a list/tuple.
    """
    if isinstance(data, (list, tuple)):
        # str() each item so numbers and other objects do not break the join.
        text = '\n'.join(str(item) for item in data)
    elif isinstance(data, str):
        text = data
    else:
        raise TypeError(
            f"data must be a str or a list of items, not {type(data).__name__}"
        )

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

def flatten_tags(tags):
    """Flatten list and string values of ``tags`` in place and return it.

    - A list value becomes a ", "-joined string of its items. When the
      list's last item is itself a list, only that last inner list is
      joined. An empty list becomes an empty string.
    - A string value is stripped of leading and trailing whitespace.
    - Any other value is left untouched.

    :param tags: Mapping of tag name to value; modified in place.
    :return: The same ``tags`` mapping.
    """
    for key, value in tags.items():
        if isinstance(value, list):
            # Check for emptiness first: value[-1] on [] raises IndexError.
            if value and isinstance(value[-1], list):
                value = value[-1]  # Take the last list for processing

            # Replacing the value of an existing key is safe while iterating.
            tags[key] = ", ".join(map(str, value))

        elif isinstance(value, str):
            tags[key] = value.strip()  # Remove unnecessary whitespace

    return tags

# Scratch run: push one URL through LOVEPDFWebScraper and keep the extracted text.
gen = LOVEPDFWebScraper(DB_name="LOVE")
# The commented URLs below are alternates that were tried earlier. Exactly one
# string must stay uncommented, because it is the single argument to set_doi_url().
document = gen.set_doi_url(
    # "https://dx.doi.org/10.1002/14651858.CD013479"
    # "https://journals.sagepub.com/doi/10.1177/2150131917742299"
    # "https://www.scielo.br/j/rpp/a/3MSXkXzft7QTJg3ZXg8RPbq/?lang=en"
    # "https://dx.doi.org/10.1002/14651858.CD013479"
    # "https://journal.waocp.org/article_88640.html"
    # "http://dx.doi.org/10.1016/j.pcd.2016.05.005"
    # "https://onlinelibrary.wiley.com/doi/10.1111/irv.12871"
    # "https://www.sciencedirect.com/science/article/pii/S0264410X22004406?via%3Dihub"
    # "https://www.sciencedirect.com/science/article/pii/S0264410X22004406"
    # "https://www.tandfonline.com/doi/full/10.1080/21645515.2016.1201623#d1e223"
    # "https://pmc.ncbi.nlm.nih.gov/articles/PMC8021610/"
    # "https://www.bumc.bu.edu/emergencymedicine/files/2017/02/Influenza-in-the-ED.pdf",
    # "https://www.sciencedirect.com/science/article/abs/pii/S0736467915011476"
    # "https://www.sciencedirect.com/science/article/abs/pii/S0264410X13011687?via%3Dihub"
    # "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0268625"
    # "https://doi.org/http://dx.doi.org/10.2165/11599470-000000000-00000"
    # "https://link.springer.com/article/10.2165/11599470-000000000-00000?error=cookies_not_supported&code=26db9efa-4d5a-469c-8ba9-8b2ee54a8cd8",
    "https://www.sciencedirect.com/science/article/abs/pii/S0091743514003260?via=ihub"
).fetch_and_extract_first_valid_pdf_text()
# Dump the extracted text to disk for manual inspection.
# NOTE(review): save_to_file expects a string or a list; confirm what the
# scraper returns when no text could be extracted.
save_to_file("new_extracted_text.html", document)
# document

# model_path = "sentence-transformers/all-MiniLM-L6-v2"  # or "allenai/specter2_base" or "sentence-transformers/all-MiniLM-L6-v2"
# print(f"Device set to use: auto")
# pred = SRPredictor(model_path=model_path, device=None, top_k=12)
# out = pred.predict_all(document)
# out 
    
# Tag the text extracted above: construct the tagger, then feed it the document.
tag = Tagging()
tag.process(document)
# searchRegEx is imported here but not referenced in the rest of this cell.
from src.Commands.regexp import searchRegEx
# NOTE(review): another cell calls create_columns_from_text(searchRegEx) with
# an argument; confirm which call form the current Tagging API expects.
data = tag.create_columns_from_text()
# Collapse list-valued tags into comma-separated strings (modifies `data` in place).
data = flatten_tags(data)

# data
# tag.sections.available_sections()

# tag.extract_last_literature_search_dates_ml()

In [None]:
# Display the flattened tag dictionary built in the previous cell.
data

In [None]:
from src.Commands.regexp import searchRegEx
# Build `result_columns` by walking the three-level searchRegEx mapping
# (category -> subcategory -> term key) and pulling matching values out of `out`.
# NOTE: `out` is not defined in this cell. In this notebook it is assigned in a
# later cell (`out = pred.predict_all(text)`), so that cell has to be run first.
result_columns = {}
for category, subcategories in searchRegEx.items():
    for subcategory, terms_dict in subcategories.items():
        # term_list is unpacked but never read; only the keys drive the logic.
        for term_key, term_list in terms_dict.items():
            column_name = f"{category}#{subcategory}#{term_key}"
            
            if category == "popu" and subcategory == "age__group":
                # One column per age-group term: the term itself when it is
                # present in out["age_groups"], otherwise an empty string.
                age_range = out.get("age_groups", {})
                result_columns[column_name] = "" if term_key not in age_range else term_key
                print(f"Category: {category}, Subcategory: {subcategory}, {column_name}")
            elif (subcategory == "studie__no" or subcategory == "rct"):
                print("Entering here")
                study_info = out.get("studies", {})
                # Fall back to out["articles"] when the studies total is zero.
                # study_info['total'] raises KeyError if a non-empty dict has no 'total' key.
                if study_info and isinstance(study_info, dict) and study_info['total'] == 0:
                    study_info = out.get("articles", {})
                print(study_info)
                # These keys do not depend on term_key, so every matching
                # iteration rewrites the same six entries.
                result_columns["total_study_count"] = 0 if "total" not in study_info else study_info["total"]
                result_columns["total_rct_count"] = 0 if "rct" not in study_info else study_info["rct"]
                result_columns["total_nrsi_count"] = 0 if "nrsi" not in study_info else study_info["nrsi"]
                result_columns["total_cross_sectional_count"] = 0 if "cross_sectional" not in study_info else study_info["cross_sectional"]
                result_columns["total_case_control_count"] = 0 if "case_control" not in study_info else study_info["case_control"]
                result_columns["total_cohort_count"] = 0 if "cohort" not in study_info else study_info["cohort"]
                
            elif category == "topic":
                # Serialise the whole mapping as "key:value" pairs joined by ", ".
                topics = out.get("topics_terms", {})
                topic_data = ", ".join([f"{k}:{v}" for k,v in topics.items()])
                result_columns["topics"] = topic_data if topic_data else ""
            elif category == "intervention":
                interventions = out.get("intervention_terms", {})
                intervention_data = ", ".join([f"{k}:{v}" for k,v in interventions.items()])
                result_columns["intervention"] = intervention_data if intervention_data else ""

            elif category == "outcome":
                outcomes = out.get("outcome_terms", {})
                outcome_data = ", ".join([f"{k}:{v}" for k,v in outcomes.items()])
                result_columns["outcome"] = outcome_data if outcome_data else ""
            else:
                # Every other category/subcategory combination is ignored.
                pass
                # print(category)
                
                
# Bare expression: shown as the cell's output value when run in a notebook.
result_columns

In [None]:
text = """
===== Main_Content =====
===== Title =====
HPV vaccine acceptability in Africa: A systematic review

===== Abstract =====
• We review factors associated with HPV vaccine acceptability in African countries. • The Health Belief Model was used to guide data abstraction and synthesis. • Acceptability of the HPV vaccine in this region is predicted to be high. • Broad knowledge gaps were highlighted regarding HPV and cervical cancer. • Education on effectiveness and reducing perceived barriers will be useful.
We review factors associated with HPV vaccine acceptability in African countries.
The Health Belief Model was used to guide data abstraction and synthesis.
Acceptability of the HPV vaccine in this region is predicted to be high.
Broad knowledge gaps were highlighted regarding HPV and cervical cancer.
Education on effectiveness and reducing perceived barriers will be useful.
The objective of this study was to provide a systematic review of peer-reviewed literature on the factors associated with HPV vaccine acceptability among adults in African countries.
A systematic search was conducted across five electronic databases: EMBASE, PsychINFO , CINAHL, Global Health and Ovid MEDLINE, to identify studies related to HPV vaccination acceptability in African countries (August 2013). The Health Belief Model was used to guide data abstraction and synthesis.
Fourteen unique studies representing ten sub-Saharan African countries were identified, with more than half published within the last two years. Acceptability of the HPV vaccine for daughters was high (range 59–100%); however, vaccine-related awareness and knowledge were low. Perceived barriers including accessibility and cost concerns were important for acceptance, as were cues to action from healthcare providers and governments.
This review suggests that acceptability of the HPV vaccine in countries in this region will be high. Broad knowledge gaps were highlighted regarding HPV and cervical cancer and these should be addressed. Education on the vaccine's effectiveness and reducing perceived barriers to vaccination would also be useful. Public endorsement by governments and healthcare providers will likely also increase acceptance.

===== Results =====
As shown in Fig. 1, 229 unique articles were identified after database searches, 29 were given full-text review with 14 meeting inclusion criteria. One article was identified by hand-searching (Ports et al., 2013). The final number of articles included in this review is 15 representing 14 unique studies (Table 1). These studies span ten countries from SSA: Botswana (1), South Africa (2), Nigeria (2), Kenya (3), Ghana (1), Uganda (1), Mali (1), Zambia (1), Tanzania (1) and Malawi (1). At the

===== Discussion =====
This systematic review found high acceptance of the HPV vaccine among young adults, adults, and parents in SSA countries, despite low awareness of HPV and the HPV vaccine. This review was structured using the Health Belief Model (HBM) and demonstrates the utility of the HBM for integrating qualitative and quantitative research findings for understanding the factors influencing vaccine acceptance.
The included studies were moderately-sized, quantitative cross-sectional studies and qualitative

===== Conclusion =====
Implementation of the HPV vaccine in African countries is an important step towards reducing the high burden of cervical cancer in this region. Recent announcements from the GAVI of HPV vaccine demonstration projects beginning in SSA offer an encouraging step toward this goal. Based on this review, the acceptability and uptake of the HPV vaccine among these countries are expected to be high. However, broad knowledge gaps have been highlighted and should be addressed. Efforts to educate about

===== Conflict of interest =====
The authors declare no conflict of interest.

===== References =====
A systematic review of measures used in studies of human papillomavirus (HPV) vaccine acceptability
J.D. Allen et al. A systematic review of measures used in studies of human papillomavirus (HPV) vaccine acceptability Vaccine (2010)

HPV vaccine acceptability among Kenyan women
S. Becker-Dreps et al. HPV vaccine acceptability among Kenyan women Vaccine (2010)

Predictors of HPV vaccine acceptability: a theory-informed, systematic review
N.T. Brewer et al. Predictors of HPV vaccine acceptability: a theory-informed, systematic review Prev. Med. (2007)

HPV vaccine acceptability in Ghana, West Africa
M.A. Coleman et al. HPV vaccine acceptability in Ghana, West Africa Vaccine (2011)

Global burden of human papillomavirus and related diseases
D. Forman et al. Global burden of human papillomavirus and related diseases Vaccine (2012)

Examining attitudes and knowledge about HPV and cervical cancer risk among female clinic attendees in Johannesburg, South Africa
S.A. Francis et al. Examining attitudes and knowledge about HPV and cervical cancer risk among female clinic attendees in Johannesburg, South Africa Vaccine (2010)

A qualitative analysis of South African women's knowledge, attitudes, and beliefs about HPV and cervical cancer prevention, vaccine awareness and acceptance, and maternal–child communication about sexual health
S.A. Francis et al. A qualitative analysis of South African women's knowledge, attitudes, and beliefs about HPV and cervical cancer prevention, vaccine awareness and acceptance, and maternal–child communication about sexual health Vaccine (2011)

Chapter 21: Modelling the impact of HPV vaccines on cervical cancer and screening programmes
G.P. Garnett et al. Chapter 21: Modelling the impact of HPV vaccines on cervical cancer and screening programmes Vaccine (2006)

Preparing for HPV vaccination in South Africa: key challenges and opinions
J. Harries et al. Preparing for HPV vaccination in South Africa: key challenges and opinions Vaccine (2009)

Chapter 15: HPV vaccine use in the developing world
M.A. Kane et al. Chapter 15: HPV vaccine use in the developing world Vaccine (2006)

Human papillomavirus (HPV) infection and vaccines: knowledge, attitude and perception among female students at the University of Lagos, Lagos, Nigeria
C.C. Makwe et al. Human papillomavirus (HPV) infection and vaccines: knowledge, attitude and perception among female students at the University of Lagos, Lagos, Nigeria J. Epidemiol. Glob. Health (2012)

Chapter 16: HPV vaccines in immunocompromised women and men
J.M. Palefsky et al. Chapter 16: HPV vaccines in immunocompromised women and men Vaccine (2006)

Factors influencing pandemic influenza vaccination of healthcare workers—a systematic review
C. Prematunge et al. Factors influencing pandemic influenza vaccination of healthcare workers—a systematic review Vaccine (2012)

A qualitative study of HPV vaccine acceptability among health workers, teachers, parents, female pupils, and religious leaders in northwest Tanzania
P. Remes et al. A qualitative study of HPV vaccine acceptability among health workers, teachers, parents, female pupils, and religious leaders in northwest Tanzania Vaccine (2012)

Determinants for HPV vaccine uptake in the Netherlands: a multilevel study
M. Rondy et al. Determinants for HPV vaccine uptake in the Netherlands: a multilevel study Vaccine (2010)

Predictors of HPV vaccine uptake among women aged 19–26: importance of a physician's recommendation
S.L. Rosenthal et al. Predictors of HPV vaccine uptake among women aged 19–26: importance of a physician's recommendation Vaccine (2011)

Worldwide burden of gynaecological cancer: the size of the problem
R. Sankaranarayanan et al. Worldwide burden of gynaecological cancer: the size of the problem Best Pract. Res. Clin. Obstet. Gynaecol. (2006)

Knowledge, attitudes, practices, and perceived risk of cervical cancer among Kenyan women: brief report
S.L. Sudenga et al. Knowledge, attitudes, practices, and perceived risk of cervical cancer among Kenyan women: brief report Int. J. Gynecol. Cancer (2013)

Awareness, acceptability and uptake of human papilloma virus vaccine among Cameroonian school-attending female adolescents
C.A. Ayissi et al. Awareness, acceptability and uptake of human papilloma virus vaccine among Cameroonian school-attending female adolescents J. Community Health (2012)

Achieving high coverage in Rwanda's national human papillomavirus vaccination programme
A. Binagwaho et al. Achieving high coverage in Rwanda's national human papillomavirus vaccination programme Bull. World Health Organ. (2012)

"""

# Model identifier handed to SRPredictor; the trailing comment lists alternatives.
model_path = "sentence-transformers/all-MiniLM-L6-v2"  # or "allenai/specter2_base" or "sentence-transformers/all-MiniLM-L6-v2"
# SRPredictor is not imported in this cell; it comes from an earlier cell's
# `from src.AIModels.Inference import SRPredictor`.
pred = SRPredictor(model_path=model_path, device=None, top_k=12)
# Run every prediction on the sample `text` above. The result is kept in the
# global `out`, which the result_columns cell reads.
out = pred.predict_all(text)
from pprint import pprint
pprint(out)

#### Tagging

In [None]:
from src.Commands.TaggingSystem import Tagging
from src.Commands.regexp import searchRegEx
# Minimal fixture: a single "SearchStrategy" section using the same
# "===== Name =====" header layout as the other sample texts in this notebook.
document = """
===== SearchStrategy =====
In this systematic review and meta-analysis, we searched PubMed and Embase from Jan 1, 2004, to March 31, 2015. Test-negative design studies of influenza VE were eligible if they enrolled outpatients on the basis of predefined illness criteria, reported subtype-level VE by season, used PCR to confirm influenza, and adjusted for age. We excluded studies restricted to hospitalised patients or special populations, duplicate reports, interim reports superseded by a final report, studies of live-attenuated vaccine, and studies of prepandemic seasonal vaccine against H1N1pdm09. Two reviewers independently assessed titles and abstracts to identify articles for full review. Discrepancies in inclusion and exclusion criteria and VE estimates were adjudicated by consensus. Outcomes were VE against H3N2, H1N1pdm09, H1N1 (pre-2009), and type B. We calculated pooled VE using a random-effects model.


"""
# The document is handed straight to the Tagging constructor here.
# NOTE(review): another cell uses Tagging() followed by .process(document);
# confirm both call forms are supported.
tagging = Tagging(document)
result = tagging.extract_last_literature_search_dates()
# result = tagging.create_columns_from_text(searchRegEx)
print(result)
# 18, 20, 23

## General Extractor

In [None]:
from src.Commands.TaggingSystem import Tagging
from src.Commands.regexp import searchRegEx
from src.Services.Factories.Sections.ArticleExtractorFactory import ArticleExtractorFactory

def extract_all_sections_from_text(text):
    """
    Split free text into named sections using a simple header heuristic.

    A line counts as a section header when, after stripping whitespace, it is
    entirely upper-case or ends with a colon. Trailing colons are removed
    from the header name. Lines before the first header are filed under
    "Title". Blank lines are dropped and the remaining lines are stripped.

    Args:
        text (str): The input text containing sections and their content.

    Returns:
        dict: Section header -> content lines joined with newlines. When a
        header occurs more than once, the last occurrence's content wins.
        Content under a header whose name is empty (for example a line that
        is just ":") is discarded.
    """
    # Ordered (header, lines) pairs; the first pair collects pre-header text.
    chunks = [("Title", [])]

    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if candidate.isupper() or candidate.endswith(":"):
            # A header opens a fresh chunk; later lines accumulate into it.
            chunks.append((candidate.rstrip(":"), []))
        elif candidate:
            chunks[-1][1].append(candidate)

    # Later duplicates overwrite earlier ones; empty header names are skipped.
    return {header: "\n".join(body) for header, body in chunks if header}

def save_to_file(file_path, data):
    """
    Save the given data to a UTF-8 text file, replacing any existing content.

    The text is rendered before the file is opened, so unsupported data
    raises without truncating a file that already exists.

    :param file_path: The file path where the data will be saved.
    :param data: A string, written verbatim, or a list/tuple whose items are
        written one per line (each item is converted with ``str``).
    :raises TypeError: If ``data`` is neither a string nor a list/tuple.
    """
    if isinstance(data, (list, tuple)):
        # str() each item so numbers and other objects do not break the join.
        text = '\n'.join(str(item) for item in data)
    elif isinstance(data, str):
        text = data
    else:
        raise TypeError(
            f"data must be a str or a list of items, not {type(data).__name__}"
        )

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)
            
# Assuming self.document is already set
# url_tandfonline = 'https://bmcmedicine.biomedcentral.com/articles/10.1186/s12916-018-1098-3'
# url_tandfonline = 'https://www.tandfonline.com/doi/full/10.1080/01443615.2022.2162867'
# url_tandfonline = 'https://pmc.ncbi.nlm.nih.gov/articles/PMC8021610/'
# NOTE(review): this first assignment is dead. url_tandfonline is reassigned
# further down before it is ever read, so only the last uncommented URL is used.
url_tandfonline = 'https://pmc.ncbi.nlm.nih.gov/articles/PMC8477621/'
# url_tandfonline = "https://www.mdpi.com/1660-4601/19/15/9425"
# url_tandfonline = "https://www.nature.com/articles/s41598-021-83727-7"
# url_tandfonline = "https://bmcpublichealth.biomedcentral.com/articles/10.1186/s12889-020-08753-y"
# url_tandfonline = "https://bmcmedicine.biomedcentral.com/articles/10.1186/s12916-018-1098-3"
# url_tandfonline = "https://www.jpmh.org/index.php/jpmh/article/view/998"
# url_tandfonline = "https://obgyn.onlinelibrary.wiley.com/doi/10.1111/aogs.14359"
# url_tandfonline = "https://www.aerzteblatt.de/int/archive/article/161392"
# url_tandfonline = "https://bmcpublichealth.biomedcentral.com/articles/10.1186/1471-2458-14-867"
# url_tandfonline = "https://www.scielo.br/j/rpp/a/3MSXkXzft7QTJg3ZXg8RPbq/?lang=en"
# url_tandfonline = "https://journal.waocp.org/article_88640.html"
# for PDF
# url_tandfonline = "https://ijpsr.com/wp-content/uploads/2015/03/57-Vol.-6-Issue-4-April-2015-IJPSR-RA-5231-Paper-57.pdf"
# url_tandfonline = "https://www.cochranelibrary.com/cdsr/doi/10.1002/14651858.CD013717.pub2/pdf/full"
# url_tandfonline = "https://journals.sagepub.com/doi/10.1177/2150131917742299"
# url_tandfonline = "https://bmjopen.bmj.com/content/8/4/e019206"
# url_tandfonline = "https://www.cochranelibrary.com/cdsr/doi/10.1002/14651858.CD013479/full"
# url_tandfonline = "http://dx.doi.org/10.1111/irv.12871"
# Active URL. Despite the variable name it is not a tandfonline link; the
# name is reused for whichever URL is being tried.
url_tandfonline = "https://www.bumc.bu.edu/emergencymedicine/files/2017/02/Influenza-in-the-ED.pdf"
# Ask the factory for an extractor for this URL.
tandfonline_extractor = ArticleExtractorFactory.get_extractor(url=url_tandfonline)
print("Available sections (tandfonline):", tandfonline_extractor.get_available_sections())
# Only the "Main Content" section is used from here on.
document = tandfonline_extractor.get_section("Main Content")

# Dump the section to disk for manual inspection.
file_path = 'extracted_dates.txt'
save_to_file(file_path, document)

# Tag the extracted section, passing the searchRegEx mapping to the tagger.
tagging = Tagging(document)
result = tagging.create_columns_from_text(searchRegEx)
# print(result)
from itertools import chain
def flatten_tags(tags):
    """Flatten list and string values of ``tags`` in place and return it.

    - A list value becomes a ", "-joined string of its items. When the
      list's last item is itself a list, only that last inner list is
      joined. An empty list becomes an empty string.
    - A string value is stripped of leading and trailing whitespace.
    - Any other value is left untouched.

    :param tags: Mapping of tag name to value; modified in place.
    :return: The same ``tags`` mapping.
    """
    for key, value in tags.items():
        if isinstance(value, list):
            # Check for emptiness first: value[-1] on [] raises IndexError.
            if value and isinstance(value[-1], list):
                value = value[-1]  # Take the last list for processing

            # Replacing the value of an existing key is safe while iterating.
            tags[key] = ", ".join(map(str, value))

        elif isinstance(value, str):
            tags[key] = value.strip()  # Remove unnecessary whitespace

    return tags
# Show the raw tag dictionary first ...
print(result)
# ... then the flattened form. flatten_tags rewrites the dictionary it is
# given and returns that same object, so `result` itself is flattened after this line.
print(flatten_tags(result))


In [None]:
from src.Services.Factories.Sections.ArticleExtractorFactory import ArticleExtractorFactory
from bs4 import BeautifulSoup
# Parse a locally saved HTML page instead of fetching it over the network.
with open("file_path.html", "r", encoding="utf-8") as file:
    html_soup = BeautifulSoup(file.read(), "html.parser")

# Pass the parsed soup and a Cochrane Library host name to the factory.
# Presumably the url value is what selects the extractor; confirm in ArticleExtractorFactory.
extractor = ArticleExtractorFactory.get_extractor(soup=html_soup, url="cochranelibrary.com")
# NOTE(review): the printed label says "tandfonline" although the url passed
# above is cochranelibrary.com; this looks like a copy-paste leftover.
print("Available sections (tandfonline):", extractor.get_available_sections())
# NOTE(review): _extract_sections is underscore-prefixed (private by
# convention) and its return value is neither stored nor printed here.
extractor._extract_sections()
# extractor.save_extracted_sections(output_path)
# print(f"Extracted sections saved to {output_path}")

In [None]:
import sys
import os
from src.Services.DatabaseHandler import DatabaseHandler
from app import app # Adjust this import to match your project structure

def find_duplicate_dois():
    """
    Look up every DOI that is stored more than once in the all_db table.

    NULL and empty-string DOIs are ignored; results are ordered by how often
    the DOI occurs, most frequent first.

    Returns:
        The value returned by ``DatabaseHandler.execute_query`` for the
        grouping query; the caller reads each row's 'doi' and
        'occurrence_count'. An empty list is returned when the query raises
        (the error is printed rather than propagated).
    """
    print("Checking for duplicate DOIs in the database...")

    handler = DatabaseHandler()

    # Group by DOI and keep only the groups that contain more than one row.
    duplicate_sql = """
        SELECT
            "doi",
            COUNT("doi") AS "occurrence_count"
        FROM
            all_db
        WHERE
            "doi" IS NOT NULL AND "doi" != ''
        GROUP BY
            "doi"
        HAVING
            COUNT("doi") > 1
        ORDER BY
            "occurrence_count" DESC;
    """

    try:
        rows = handler.execute_query(duplicate_sql)
    except Exception as err:
        # Best-effort lookup: report the failure and fall back to "no rows".
        print(f"An error occurred while checking for duplicates: {err}")
        return []
    else:
        return rows


# Entry point: list every duplicated DOI, or say that none were found.
if __name__ == "__main__":
    # The app context is required to use Flask-SQLAlchemy
    with app.app_context():
        duplicate_records = find_duplicate_dois()
        
        # find_duplicate_dois also returns an empty list when the query
        # fails, so the "No duplicate DOIs" message can follow a printed error.
        if duplicate_records:
            print(f"\nFound {len(duplicate_records)} DOIs with duplicate entries:")
            for record in duplicate_records:
                print(f"  - DOI: {record['doi']}, Count: {record['occurrence_count']}")
        else:
            print("\nNo duplicate DOIs found in the database.")