### Fetch Data from Hosts Online

In [None]:
from src.api_utils import ilove_access, cochrane_access, medline_class_access, ovid_new_access


In [None]:
cochrane_service = cochrane_access()
print(cochrane_service)


In [None]:
ilove_access()

In [None]:
# Systematic-review / meta-analysis filter combined with immunization terms,
# restricted to humans. The closing entry-date range
# ("2011"[edat] : "3000"[edat]) retrieves records from 2011 till date.
# This explanation is kept outside the triple-quoted literal on purpose:
# everything inside the literal is part of the search string handed to
# medline_class_access, so a '#' note written there would travel with the query.
medline_class_access(searchText=["""
(
  (review[pt] OR "review, tutorial"[pt] OR "review, academic"[pt])
  AND 
  (
    medline[tw] OR medlars[tw] OR embase[tw] OR pubmed[tw] OR cochrane[tw]
    OR scisearch[tw] OR psychinfo[tw] OR psycinfo[tw]
    OR psychlit[tw] OR psyclit[tw] 
    OR cinahl[tw] 
    OR ((hand[tw] AND search*[tw]) OR (manual*[tw] AND search*[tw]))
    OR ("electronic database*"[tw] OR "bibliographic database*"[tw] OR "computerized database*"[tw] OR "online database*"[tw])
    OR pooling[tw] OR pooled[tw] OR "mantel haenszel"[tw]
    OR peto[tw] OR dersimonian[tw] OR "der simonian"[tw] OR "fixed effect"[tw]
    OR "retraction of publication"[pt] OR "retracted publication"[pt]
  )
)
OR
(
  meta-analysis[pt] 
  OR meta-analysis[sh] 
  OR (meta-analys*[tw] OR meta analys*[tw] OR metaanalys*[tw])
  OR (systematic*[tw] AND review*[tw])
  OR (quantitative*[tw] AND review*[tw])
  OR (methodologic*[tw] AND review*[tw])
  OR ("integrative research review"[tw] OR "research integration"[tw])
)
AND
(
  immunization[mesh] 
  OR Immunization Programs[mesh] 
  OR vaccines[mesh]
  OR (immunisation[tiab] OR immunization[tiab] OR immunise[tiab] OR immunize[tiab] OR vaccine[tiab])
)
AND humans[filter]
AND 
("2011"[edat] : "3000"[edat])
"""])

In [None]:
ovid_new_access()

## Real Tagging Process is Here

In [2]:
from src.Commands.PaperProcessorPipeline import PaperProcessorPipeline
from concurrent.futures import ThreadPoolExecutor
def main():
    """Run the paper-processing pipeline over every configured source.

    Builds a PaperProcessorPipeline bound to the ``all_db`` table and calls
    ``process_source_in_batches`` once per entry in ``sources``, using a thread
    pool so several sources can be processed at the same time.

    Raises:
        Exception: whatever ``process_source_in_batches`` raised for a source;
            the first failure (in source order) is re-raised after the pool
            has shut down.
    """
    pipeline = PaperProcessorPipeline(
        table_name='all_db',
        column_mapping={'Id': 'primary_id'}
    )

    # Per-database entries are kept commented out so they can be re-enabled
    # individually; only the combined "all" source is currently active.
    sources = [
        # {
        #     "query": "SELECT primary_id, \"DOI\", doi_url, \"Source\" FROM all_db WHERE \"Source\"='Cochrane'",
        #     "csv_file_path": "Data/output/papers_data",
        #     "db_name": "Cochrane"
        # },
        # {
        #     "query": "SELECT primary_id, \"DOI\", doi_url, \"Source\" FROM all_db WHERE \"Source\"='LOVE'",
        #     "csv_file_path": "Data/output/papers_data_love",
        #     "db_name": "LOVE"
        # },
        # {
        #     "query": "SELECT primary_id, \"DOI\", doi_url, \"Source\" FROM all_db WHERE \"Source\"='OVID'",
        #     "csv_file_path": "Data/output/papers_data_OVID",
        #     "db_name": "OVID"
        # },
        # {
        #     "query": "SELECT primary_id, \"DOI\", doi_url, \"Source\" FROM all_db WHERE \"Source\"='Medline'",
        #     "csv_file_path": "Data/output/papers_data_medline",
        #     "db_name": "Medline"
        # },
        {
            "query": "SELECT primary_id, \"DOI\", \"Source\" FROM all_db WHERE \"DOI\" IS NOT NULL AND \"DOI\" != '' AND primary_id=1",
            "csv_file_path": "Data/output/papers_data_all",
            "db_name": "all"
        }
    ]
    #  AND primary_id=1

    def _process(source):
        # One worker call per source definition.
        return pipeline.process_source_in_batches(
            query=source["query"],
            csv_file_path=source["csv_file_path"],
            db_name=source["db_name"],
            batch_size=10
        )

    # Use parallel processing to process multiple sources simultaneously
    with ThreadPoolExecutor(max_workers=4) as executor:
        # executor.map only re-raises a worker's exception when its result is
        # retrieved from the returned iterator. Draining the iterator makes a
        # failed source visible instead of being dropped silently.
        list(executor.map(_process, sources))

if __name__ == "__main__":
    main()

In [None]:
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF for PDFs
import docx  # For Word documents
from urllib.parse import urlparse


def save_to_file(file_path, data):
    """
    Write ``data`` to ``file_path`` as UTF-8 text, replacing any existing file.

    :param file_path: Destination path of the output file.
    :param data: A string, written as-is, or a list of strings, written one
        item per line (newline-separated, no trailing newline).
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        # Lists are newline-joined; anything else goes to write() unchanged.
        handle.write('\n'.join(data) if isinstance(data, list) else data)


from src.Utils.Helpers import get_contents


# Example 1: Extract text from a URL (HTML)
url = "https://journals.sagepub.com/doi/10.1177/2150131917742299"
# url = "https://www.sciencedirect.com/science/article/abs/pii/S0091743514003260?via=ihub"
# url = "https://www.tandfonline.com/doi/full/10.1080/01443615.2022.2162867"
result = get_contents(url)
save_to_file("new_extracted_text.html", result)

## Trying to test some basic analysis

In [None]:
import pandas as pd
import json
import altair as alt
import ast
from src.Commands.DatabaseUpdater import DatabaseUpdater

# updater = DatabaseUpdater(table_name="all_db", column_mapping={'Id': 'Id'})

def convert_chart(chart):
    """Convert an Altair chart to its dictionary form.

    When the string form of the active Altair data transformer mentions
    'vegafusion', the chart is exported with ``format="vega"``; otherwise the
    default ``to_dict()`` export is used. If a ValueError is raised along the
    way, it is printed and the default export is attempted instead.

    :param chart: Object exposing ``to_dict`` (an Altair chart).
    :return: Whatever ``chart.to_dict`` returns.
    """
    try:
        active_transformer = alt.data_transformers.get()
        if 'vegafusion' not in str(active_transformer):
            return chart.to_dict()
        return chart.to_dict(format="vega")
    except ValueError as e:
        print(f"Handling ValueError: {e}")
        return chart.to_dict()
    
def view_trends_by_database_year():
    """Run ``SELECT * FROM all_db`` through PostgresService and return the result.

    If the result contains a "database" entry, the values under "Journal" are
    passed through ``.replace`` with a mapping of friendlier database names.

    NOTE(review): the guard tests for "database" but the rewrite targets
    "Journal" — confirm which column is meant to be relabelled.
    NOTE(review): the shape returned by ``execute_raw_query`` is not visible
    here; the relabelling step presumably expects a DataFrame-like object —
    verify against PostgresService.
    """
    from src.Services.PostgresService import PostgresService, QueryHelper

    rows = PostgresService().execute_raw_query('SELECT * FROM all_db')
    if "database" not in rows:
        return rows

    # Friendlier display names for the source databases.
    readable_names = {
        'OVID': 'OVID',
        'LOVE': 'LOVE DB',
        'Medline': 'Medline',
        'Cochrane': 'Cochrane'
    }
    rows['Journal'] = rows['Journal'].replace(readable_names)
    return rows

def plot_stacked_bar_chart_altair(data):
    """
    Build a stacked bar chart with Year on the x-axis and Journals as colors
    using Altair.

    Parameters:
        data (iterable of dict): One mapping per record. The keys read are
            "Publication_Year", "Year", "Journal" and "study_types".
            "study_types" may be a dict, or a JSON string encoding a dict, of
            ``{study type: count}``.

    Returns:
        alt.Chart: The stacked bar chart (not displayed here; the caller
        decides how to render it).

    Side effects:
        Each record's "Year" key is overwritten with "Publication_Year" when
        that is truthy, otherwise with the existing "Year" value.

    Records whose "study_types" is missing, null, malformed JSON, or not a
    mapping contribute no rows instead of raising.
    """
    records = []
    for entry in data:
        # Merge 'Publication_Year' and 'Year': prefer Publication_Year if available.
        year = entry.get("Publication_Year") or entry.get("Year")
        entry["Year"] = year
        journal = entry.get("Journal", "Unknown")
        study_types = entry.get("study_types", "{}")

        # Convert study_types from string to dict
        if isinstance(study_types, str):
            try:
                study_types = json.loads(study_types)
            except json.JSONDecodeError:
                continue
        # A NULL column value or JSON such as "null" / "[]" has no .items().
        if not isinstance(study_types, dict):
            continue

        for study_type, count in study_types.items():
            records.append({"Year": year, "Journal": journal, "Study Type": study_type, "Count": count})

    # Naming the columns keeps df["Year"] valid even when no record produced a row.
    df = pd.DataFrame(records, columns=["Year", "Journal", "Study Type", "Count"])

    # Convert Year to numeric, ensuring sorting works correctly
    df["Year"] = pd.to_numeric(df["Year"], errors="coerce")

    # Generate a stacked bar chart in Altair
    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X("Year:O", title="Year", sort="ascending"),
        y=alt.Y("sum(Count)", title="Number of Studies"),
        color=alt.Color("Journal:N", title="Journal"),
        tooltip=["Year", "Journal", "sum(Count)"]
    ).properties(
        title="Stacked Bar Chart of Study Types by Year (Colored by Journal)",
        width=800,
        height=400
    )

    return chart

# Example usage
data = view_trends_by_database_year()
plot_stacked_bar_chart_altair(data)

## Testing Tagging Process with Rough DB

In [None]:
from src.Commands.PaperProcessorPipeline import PaperProcessorPipeline
from concurrent.futures import ThreadPoolExecutor
def main():
    """Run the paper-processing pipeline against the ``rough_db`` test table.

    Builds a PaperProcessorPipeline bound to ``rough_db`` and calls
    ``process_source_in_batches`` once per entry in ``sources`` through a
    thread pool.

    Raises:
        Exception: whatever ``process_source_in_batches`` raised for a source;
            the first failure (in source order) is re-raised after the pool
            has shut down.
    """
    pipeline = PaperProcessorPipeline(
        table_name='rough_db',
        column_mapping={'Id': 'primary_id'}
    )

    sources = [
        {
            "query": "SELECT primary_id, \"Link to full text\" FROM rough_db WHERE primary_id > 0 and \"Link to full text\" != ''",
            "csv_file_path": "output/papers_data_rough",
            "db_name": "ROUGH"
        }
    ]

    def _process(source):
        # One worker call per source definition.
        return pipeline.process_source_in_batches(
            query=source["query"],
            csv_file_path=source["csv_file_path"],
            db_name=source["db_name"],
            batch_size=10
        )

    # Use parallel processing to process multiple sources simultaneously
    with ThreadPoolExecutor(max_workers=4) as executor:
        # executor.map only re-raises a worker's exception when its result is
        # retrieved from the returned iterator. Draining the iterator makes a
        # failed source visible instead of being dropped silently.
        list(executor.map(_process, sources))

if __name__ == "__main__":
    main()

## My Timeline

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
from datetime import datetime

# Define the Gantt chart data
data = {
    "Task": [
        # Year 1 - Foundation and Methodology Development
        "Initial Literature Review",
        "Data Exploration and Initial AMR Analysis",
        "Data Quality & Initial Preprocessing",
        "Prototype AI/ML and Visualization Techniques",
        "Publication 1 Draft (Methodology & AMR Trends Overview)",
        "Publication 1 Submission & Feedback",

        # Year 2 - Advanced Analysis of Imputation Methods
        "Advanced Data Preprocessing & Feature Engineering",
        "Imputation Method Development and Evaluation",
        "Publication 2 Draft (Comparative Imputation Methods)",
        "Publication 2 Submission & Feedback",


        # Year 3 - Final Analysis and Manuscript Preparation
        "Co-resistance and Pattern Analysis",
        "Visualization and Network Analysis",
        "Model Validation and Feedback Integration",
        "Final Analysis,Interpretation and Synthesis of Findings",
        "Publication 3 Draft (Final Model, Findings & Conclusion)",
         "Publication 3 Submission & Feedback",
        "Dissertation Preparation",
        "Dissertation Submission",
        "Final Defense Preparation",
        "Final Defense and Presentation"
    ],
    "Start": [
        # Year 1
        "2024-10-01", "2024-10-15", "2024-11-15", "2025-01-01", "2025-03-01", "2025-05-01",

        # Year 2
        "2025-07-01", "2025-08-15", "2025-10-01", "2026-01-01", "2026-03-01",

        # Year 3
         "2026-07-01", "2026-08-15","2026-10-01", "2026-11-01", "2027-01-01",
       "2027-05-01","2027-07-01","2027-08-01"

    ],
    "End": [
        # Year 1
        "2024-10-14","2024-12-31", "2024-12-31", "2025-02-28", "2025-04-30", "2025-06-30",

        # Year 2
        "2025-08-14", "2025-09-30",  "2025-12-31",  "2026-02-28", "2026-06-30",

        # Year 3
          "2026-08-14",  "2026-09-30", "2026-10-31", "2026-12-31", "2027-04-30",
         "2027-06-30", "2027-07-31", "2027-12-31"
    ]
}

# Convert to DataFrame
# NOTE(review): "Task" lists 20 entries while "Start" and "End" list 19 each.
# pandas rejects a dict of unequal-length lists with a ValueError, so one
# Start/End pair appears to be missing (or one task is surplus) — confirm which
# task lacks dates before relying on this chart.
df = pd.DataFrame(data)

# Convert Start and End to datetime
df["Start"] = pd.to_datetime(df["Start"])
df["End"] = pd.to_datetime(df["End"])

# Get the current date for the vertical line
current_date = datetime.now()

# Create the Gantt chart
fig, ax = plt.subplots(figsize=(16, 10))

for i, task in df.iterrows():
    color = "skyblue"  # All tasks are considered "In Progress" initially
    ax.barh(task["Task"], date2num(task["End"]) - date2num(task["Start"]),
            left=date2num(task["Start"]), color=color, edgecolor="black")

# Add the current date line
ax.axvline(date2num(current_date), color="red", linestyle="--", linewidth=2, label="Today")

# Add labels and formatting
ax.set_title("3-Year PhD Timeline (Quarterly Breakdown)", fontsize=16)
ax.set_xlabel("Timeline (Quarters)", fontsize=12)
ax.set_ylabel("Tasks", fontsize=12)
ax.xaxis_date()
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
ax.invert_yaxis()  # Flip tasks to start from the top

# Add custom legend
legend_elements = [
    plt.Line2D([0], [0], color="skyblue", lw=6, label="In Progress"),
    plt.Line2D([0], [0], color="red", lw=2, linestyle="--", label="Today")
]
ax.legend(handles=legend_elements, loc="upper right", fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
from src.Services.Factories.GeneralPDFScraper.OVIDPDFWebScraper import OVIDPDFWebScraper
from src.Services.Factories.GeneralPDFScraper.CochranePDFWebScraper import CochranePDFWebScraper
from src.Services.Factories.GeneralPDFScraper.LOVEPDFWebScraper import LOVEPDFWebScraper
from src.Commands.TaggingSystem import Tagging

from itertools import chain
import pandas as pd
def flatten_tags(tags):
    """Flatten list values of ``tags`` into comma-separated strings, in place.

    - A list value becomes ``", ".join`` of its items converted with ``str``.
      When the list's last item is itself a list, only that last inner list
      is used.
    - An empty list becomes the empty string.
    - A string value is stripped of surrounding whitespace.
    - Any other value is left untouched.

    :param tags: Mapping of tag name to extracted value; modified in place.
    :return: The same ``tags`` mapping, for convenience.
    """
    # Snapshot the items so reassigning values never interferes with iteration.
    for key, value in list(tags.items()):
        if isinstance(value, list):
            # Guard on emptiness first: [] has no last element to inspect.
            if value and isinstance(value[-1], list):
                value = value[-1]  # Take the last list for processing
            tags[key] = ", ".join(map(str, value))
        elif isinstance(value, str):
            tags[key] = value.strip()  # Remove unnecessary whitespace

    return tags

gen = LOVEPDFWebScraper(DB_name="LOVE")
document = gen.set_doi_url(
    # "https://dx.doi.org/10.1002/14651858.CD013479"
    # "https://journals.sagepub.com/doi/10.1177/2150131917742299"
    # "https://www.scielo.br/j/rpp/a/3MSXkXzft7QTJg3ZXg8RPbq/?lang=en"
    # "https://dx.doi.org/10.1002/14651858.CD013479"
    # "https://journal.waocp.org/article_88640.html"
    "https://bmjopen.bmj.com/content/11/12/e052682"
    # "https://www.sciencedirect.com/science/article/pii/S0264410X22004406?via%3Dihub"
    # "https://www.tandfonline.com/doi/full/10.1080/21645515.2016.1201623#d1e223"
).fetch_and_extract_first_valid_pdf_text()

tag = Tagging(document)
from src.Commands.regexp import searchRegEx
data = tag.create_columns_from_text(searchRegEx)
data = flatten_tags(data)
# ff = pd.DataFrame(data)
# ff.to_csv("testing.csv")
data
tag.sections.available_sections()

In [None]:
import requests

def fetch_sagepub_content(url, timeout=30):
    """Fetch a page using browser-like request headers.

    :param url: Address of the page to download.
    :param timeout: Seconds to wait for the server before giving up. Without
        a timeout ``requests.get`` may wait indefinitely on an unresponsive host.
    :return: The response body as text on success; on any
        ``requests.exceptions.RequestException`` (including a timeout or a
        bad HTTP status) a string of the form ``"Error: <details>"``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://google.com"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        return response.text
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"

# Example usage
url = "https://journals.sagepub.com/doi/10.1177/2150131917742299"
content = fetch_sagepub_content(url)
print(content[:1000])  # Print first 1000 characters for inspection


In [None]:
from src.Commands.TaggingSystem import Tagging
from src.Commands.regexp import searchRegEx
document = """
===== SearchStrategy =====
In this systematic review and meta-analysis, we searched PubMed and Embase from Jan 1, 2004, to March 31, 2015. Test-negative design studies of influenza VE were eligible if they enrolled outpatients on the basis of predefined illness criteria, reported subtype-level VE by season, used PCR to confirm influenza, and adjusted for age. We excluded studies restricted to hospitalised patients or special populations, duplicate reports, interim reports superseded by a final report, studies of live-attenuated vaccine, and studies of prepandemic seasonal vaccine against H1N1pdm09. Two reviewers independently assessed titles and abstracts to identify articles for full review. Discrepancies in inclusion and exclusion criteria and VE estimates were adjudicated by consensus. Outcomes were VE against H3N2, H1N1pdm09, H1N1 (pre-2009), and type B. We calculated pooled VE using a random-effects model.


"""
# Build the tagger from the sample `document` string defined above
tagging = Tagging(document)
result = tagging.extract_last_literature_search_dates()
# result = tagging.create_columns_from_text(searchRegEx)
print(result)
# 18, 20, 23

In [None]:
import re

def extract_month_year(text):
    """Return the first "<month> <year>" mention in ``text``, or None.

    The month may be a full or abbreviated English name (capitalised as in
    the pattern); an optional day number, ordinal suffix and comma between
    month and four-digit year are skipped. The month is returned exactly as
    it was written in the text.
    """
    found = re.search(
        r"\b(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s*(?:\d{1,2}(?:st|nd|rd|th)?)?,?\s*(\d{4})\b",
        text,
    )
    if found is None:
        return None
    # Group 1 is the month as written, group 2 the four-digit year.
    return " ".join(found.group(1, 2))

# Example usage
text = """We conducted a systematic search in three electronic databases from inception up to 13th of January, 2020, without language restrictions."""

result = extract_month_year(text)
print(result)  # Output: January 2020


In [None]:
import re

def extract_study_types(text, terms_list):
    """
    Extract study types and their counts from text.

    A hit is a run of digits, optional whitespace, then one of the terms
    (case-insensitive). Counts for the same term (lower-cased) are summed.

    :param text: Text to scan.
    :param terms_list: Iterable of ``(term, category)`` pairs; only the term
        is used for matching.
    :return: Dict mapping each matched term (lower-cased) to its summed count.
        Empty when ``terms_list`` provides no non-empty term.
    """
    # Longest terms first: regex alternation takes the first alternative that
    # matches, so a short term listed earlier would otherwise shadow a longer
    # term it is a prefix of (e.g. "cohort" vs "cohort study").
    study_terms = sorted(
        (term for term, _ in terms_list if term),
        key=len,
        reverse=True,
    )
    # With no terms the alternation would be empty and match every number.
    if not study_terms:
        return {}

    alternation = "|".join(re.escape(term) for term in study_terms)
    study_pattern = re.compile(rf"(\d+)\s*({alternation})", flags=re.IGNORECASE)

    study_types = {}
    for count, study_type in study_pattern.findall(text):
        key = study_type.lower()
        study_types[key] = study_types.get(key, 0) + int(count)

    return study_types

# Example usage
terms_list = [
    ("study", "sty"), ("studies", "sty"),
    ("RCT", "rct"),
    ('randomized controlled trial', "rct"),
    ('randomised controlled trial', "rct"),
    ('randomized trial', "rct"),
    ('randomised trial', "rct"),
    ('clinical trial', "rct"),
    ("double-blind study", "rct"), 
    ("placebo-controlled", "rct"),
    ("randomised comparative", "rct"),
    ("NRSI", "nrsi"), 
    ("non-randomized studies of interventions", "nrsi"),
    ("observational studies", "nrsi"), 
    ("quasi-experimental", "nrsi"), 
    ("non-randomized controlled study", "nrsi"), 
    ("natural experiment", "nrsi"),
    ("test-negative designs", "nrsi"),
    ("cross-sectional study", "nrsi"), 
    ("controlled clinical", "nrsi"), 
    ("cohort study", "nrsi"), 
    ("prospective study", "nrsi"), 
    ("retrospective study", "nrsi"), 
    ("longitudinal study", "nrsi"),
    ("case-control study", "nrsi"),
    ("pre-post studies", "nrsi"),
    ("interrupted time series", "nrsi"),
    ("case reports", "nrsi"),
    ("case series", "nrsi"),
    ("mixed methods", "mmtd"),
    ("convergent design", "mmtd"), 
    ("explanatory sequential design", "mmtd"),
    ("qualitative study", "quanti"),
]

text = """
20 in Germany, 10 in Nigeria, Ghana(5), five in spain. 2 quantitative study
10 cross sectional, 2 quanitative
"""

print(extract_study_types(text, terms_list))

In [None]:
import re

def extract_sex_distribution(text):
    """
    Collect percentage figures that are followed by a gender word.

    Recognised words are male, female, divers and other (case-insensitive).
    Returns a dict mapping the lower-cased word to the percentage string
    (e.g. "49%"); when a word occurs more than once the last occurrence wins.
    """
    hits = re.findall(
        r"(\d{1,3}%)\s*(male|female|divers|other)",
        text,
        flags=re.IGNORECASE,
    )
    # Later hits overwrite earlier ones for the same gender.
    return {label.lower(): share for share, label in hits}

def extract_study_types(text):
    """
    Find "<number> <study type>" mentions in text.

    Recognised types: cross sectional, mixed method, qualitative,
    quantitative, longitudinal and case study (case-insensitive, optional
    whitespace inside the two-word names). Returns a dict mapping the matched
    type, lower-cased as written, to its count; a repeated type keeps the
    count of its last mention.
    """
    found = {}
    for hit in re.finditer(
        r"(\d+)\s*(cross\s*sectional|mixed\s*method|qualitative|quantitative|longitudinal|case\s*study)",
        text,
        re.IGNORECASE,
    ):
        number, label = hit.groups()
        found[label.lower()] = int(number)
    return found

def extract_total_population(text):
    """
    Extract the total population size (N) from text.

    Looks for "N", an optional dash, the word "population", an optional colon
    and then a number (case-insensitive). The number may be written with
    thousands groups separated by whitespace or by a single comma
    ("411 300", "1,000,000").

    :param text: Text to scan.
    :return: The population size as an integer, or None when no such phrase
        is present.
    """
    # Regex to match population size (N). Each extra group is exactly three
    # digits, preceded either by one comma or by optional whitespace.
    population_pattern = re.compile(
        r"N\s*[-—]?\s*population\s*[:]?\s*(\d+(?:(?:,|\s*)\d{3})*)",
        re.IGNORECASE
    )

    match = population_pattern.search(text)
    if match is None:
        return None
    # Drop every separator the pattern may have captured (spaces, tabs,
    # newlines, commas) so int() always receives plain digits.
    return int(re.sub(r"\D", "", match.group(1)))

def extract_study_characteristics(text):
    """
    Gather the characteristics of included studies from a block of text.

    The returned dict always has:
    - "study_types": result of ``extract_study_types(text)``
    - "countries": ``{name: count}`` for every "Name (count)" mention

    and, only when the corresponding phrase is present:
    - "sample_size" and "sex_distribution": from "Sample size: N (...)"
    - "health_status": text following "Population health status:"
    """
    summary = {
        "study_types": extract_study_types(text),
        # "Name (3)" pairs; a name repeated in the text keeps its last count.
        "countries": {
            name: int(total)
            for name, total in re.findall(
                r"(\b[A-Z][a-z]+(?: [A-Z][a-z]+)*\b)\s*\((\d+)\)",
                text,
                re.IGNORECASE,
            )
        },
    }

    # Sample size, with the parenthesised remainder parsed for sex percentages.
    sample = re.search(r"Sample\s*size:\s*(\d+)\s*\((.*?)\)", text, re.IGNORECASE)
    if sample:
        size, breakdown = sample.groups()
        summary["sample_size"] = int(size)
        summary["sex_distribution"] = extract_sex_distribution(breakdown)

    # Everything after the label up to the end of that line.
    health = re.search(r"Population\s*health\s*status:\s*(.*)", text, re.IGNORECASE)
    if health:
        summary["health_status"] = health.group(1).strip()

    return summary

# Example usage
text = """
49% male, 48% female, 3% divers.
In another study, 99% female, 1% other.
"""
print(extract_sex_distribution(text))
text = """
Study type of included studies: 6 cross sectional, 2 mixed method, 2 qualitative.
Another study: 10 cross sectional, 2 qualitative.
"""
print(extract_study_types(text))
# Example usage
text = """
N - population 411 300.
Another study: N = 1,000,000.
"""
print(extract_total_population(text))

text = """
Characteristics of included studies in review:
• 10 cross sectional, 2 qualitative
• Germany (6), Spain (3), England (2), France (1)
• Sample size: 300 (99% female; 1% other)
• Population health status: Pregnant women
"""
print(extract_study_characteristics(text))

In [None]:
import re
import json

def extract_inclusion_exclusion_counts(text):
    """
    Extract study-selection counts from a methods/results passage.

    :param text: Text to scan (matching is case-insensitive).
    :return: A JSON string (4-space indent) encoding an object with exactly
        the keys ``yielded``, ``selected``, ``excluded``, ``inclusion``,
        ``included`` and ``identified``. Each value is the first matching
        count as an integer, or null when the phrase was not found.
        ``inclusion`` falls back to the ``included`` count when no
        "met the inclusion criteria" phrase is present.
    """
    patterns = {
        # Number after "yielded" (e.g., "yielded 42")
        "yielded": re.compile(r"yielded\s+(\d+)", re.IGNORECASE),

        # Number after "selected" (e.g., "selected 142 for full review")
        "selected": re.compile(r"selected\s+(\d+)\s+(?:for|as|relevant|full\s+review)", re.IGNORECASE),

        # Number before "were excluded" (e.g., "262 were excluded") or after
        # "excluded" (e.g., "excluded 12")
        "excluded": re.compile(r"(\d+)\s+were\s+excluded|excluded\s+(\d+)", re.IGNORECASE),

        # Number before "met the inclusion criteria"; this also covers the
        # "of which 42 met the inclusion criteria" phrasing.
        "inclusion": re.compile(r"(\d+)\s+met\s+the\s+inclusion\s+criteria", re.IGNORECASE),

        # Number after "included" (e.g., "included 56 in the meta-analysis")
        "included": re.compile(r"included\s+(\d+)\s+(?:in|for)", re.IGNORECASE),

        # Number after "identified" (e.g., "identified 305 studies")
        "identified": re.compile(r"identified\s+(\d+)\s+studies", re.IGNORECASE),
    }

    def _first_count(pattern):
        # Return the count captured by the first match, or None.
        match = pattern.search(text)
        if match is None:
            return None
        # Patterns with alternatives capture the number in whichever group fired.
        return int(next(group for group in match.groups() if group is not None))

    results = {key: _first_count(pattern) for key, pattern in patterns.items()}

    # If no "met the inclusion criteria" phrase was found, use "included N in/for".
    if results["inclusion"] is None:
        results["inclusion"] = results["included"]

    return json.dumps(results, indent=4)

# Example usage
text = """
Of the full-texts, 42 (11%) met inclusion criteria yielding data from 5 RCTs and 39 observational studies over 23 influenza seasons (from 1983/84 up to the mid-season of 2016/17). 
We identified 3368 unduplicated publications, selected 142 for full review, and included 56 in the meta-analysis. 
After removing duplicates, we identified 2592 potential studies. Following title and abstract screening, 305 studies were identified for full text review. Of these, 262 were excluded, leaving a total of 45 studies, of which 37 studies used the TND.
"""

print(extract_inclusion_exclusion_counts(text))

## General Extractor

In [None]:
from src.Commands.TaggingSystem import Tagging
from src.Commands.regexp import searchRegEx
from src.Services.Factories.Sections.ArticleExtractorFactory import ArticleExtractorFactory

def extract_all_sections_from_text(text):
    """
    Extracts all sections and their respective content from a string variable.

    Args:
        text (str): The input text containing sections and their content.

    Returns:
        dict: A dictionary where the keys are section headers and values are their respective content.
    """
    sections = {}
    current_section = "Title"  # Default section for content before any section header
    current_content = []

    # Split the text into lines for processing
    lines = text.splitlines()

    for line in lines:
        stripped_line = line.strip()

        # Check if the line is a potential section header
        if stripped_line.isupper() or stripped_line.endswith(":"):
            # Save the previous section and its content
            if current_section:
                sections[current_section] = "\n".join(current_content)

            # Start a new section
            current_section = stripped_line.rstrip(":")
            current_content = []
        else:
            # Add content to the current section
            if stripped_line:  # Skip empty lines
                current_content.append(stripped_line)

    # Add the last section's content
    if current_section:
        sections[current_section] = "\n".join(current_content)

    return sections

def save_to_file(file_path, data):
    """
    Store ``data`` in a UTF-8 text file, overwriting the file if it exists.

    :param file_path: Where to write.
    :param data: Either a string (written verbatim) or a list of strings
        (written newline-separated, without a trailing newline).
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        if not isinstance(data, list):
            out.write(data)  # Plain string: write as given
        else:
            out.write('\n'.join(data))  # One list item per line
            
# Candidate article URLs; the last uncommented assignment to url_tandfonline below is the one used
# url_tandfonline = 'https://bmcmedicine.biomedcentral.com/articles/10.1186/s12916-018-1098-3'
# url_tandfonline = 'https://www.tandfonline.com/doi/full/10.1080/01443615.2022.2162867'
# url_tandfonline = 'https://pmc.ncbi.nlm.nih.gov/articles/PMC8021610/'
url_tandfonline = 'https://pmc.ncbi.nlm.nih.gov/articles/PMC8477621/'
# url_tandfonline = "https://www.mdpi.com/1660-4601/19/15/9425"
# url_tandfonline = "https://www.nature.com/articles/s41598-021-83727-7"
# url_tandfonline = "https://bmcpublichealth.biomedcentral.com/articles/10.1186/s12889-020-08753-y"
# url_tandfonline = "https://bmcmedicine.biomedcentral.com/articles/10.1186/s12916-018-1098-3"
# url_tandfonline = "https://www.jpmh.org/index.php/jpmh/article/view/998"
# url_tandfonline = "https://obgyn.onlinelibrary.wiley.com/doi/10.1111/aogs.14359"
# url_tandfonline = "https://www.aerzteblatt.de/int/archive/article/161392"
# url_tandfonline = "https://bmcpublichealth.biomedcentral.com/articles/10.1186/1471-2458-14-867"
# url_tandfonline = "https://www.scielo.br/j/rpp/a/3MSXkXzft7QTJg3ZXg8RPbq/?lang=en"
# url_tandfonline = "https://journal.waocp.org/article_88640.html"
# for PDF
# url_tandfonline = "https://ijpsr.com/wp-content/uploads/2015/03/57-Vol.-6-Issue-4-April-2015-IJPSR-RA-5231-Paper-57.pdf"
# url_tandfonline = "https://www.cochranelibrary.com/cdsr/doi/10.1002/14651858.CD013717.pub2/pdf/full"
# url_tandfonline = "https://journals.sagepub.com/doi/10.1177/2150131917742299"
# url_tandfonline = "https://bmjopen.bmj.com/content/8/4/e019206"
url_tandfonline = "https://www.cochranelibrary.com/cdsr/doi/10.1002/14651858.CD013479/full"
tandfonline_extractor = ArticleExtractorFactory.get_extractor(url=url_tandfonline)
print("Available sections (tandfonline):", tandfonline_extractor.get_available_sections())
document = tandfonline_extractor.get_section("Main Content")

file_path = 'extracted_dates.txt'
save_to_file(file_path, document)

tagging = Tagging(document)
result = tagging.create_columns_from_text(searchRegEx)
# print(result)
from itertools import chain
def flatten_tags(tags):
    """Convert list values in ``tags`` to flat comma-separated strings, in place.

    A list value is replaced by its items joined with ", " (each converted
    with ``str``); if the list's last item is itself a list, only that inner
    list is joined. An empty list becomes "". String values are stripped of
    surrounding whitespace. Values of any other type are left as they are.

    :param tags: Mapping of tag name to extracted value; modified in place.
    :return: The same mapping that was passed in.
    """
    # Iterate over a snapshot so assignments cannot disturb the iteration.
    for key, value in list(tags.items()):
        if isinstance(value, list):
            # Check for emptiness before peeking at the last element;
            # indexing [] would raise IndexError.
            if value and isinstance(value[-1], list):
                value = value[-1]  # Take the last list for processing
            tags[key] = ", ".join(map(str, value))
        elif isinstance(value, str):
            tags[key] = value.strip()  # Remove unnecessary whitespace

    return tags
print(result)
print(flatten_tags(result))


In [None]:
tandfonline_extractor.get_section("SearchStrategy")

In [None]:
import re
from bs4 import BeautifulSoup

def extract_search_strategy(html_content):
    """
    Pull the text of the first <section> whose heading looks like a
    search-strategy title.

    For each <section>, its first h2–h6 element is compared
    (case-insensitively) against a list of phrases such as "search strategy"
    or "literature search".

    Args:
        html_content (str): The HTML content of the document.

    Returns:
        str: The section's text with fragments joined by single spaces, or
        the message "Search Strategy section not found." when no section
        qualifies.
    """
    # Heading phrases that identify a search-strategy section.
    heading_patterns = (
        r"search\s*strategy",
        r"literature\s*search",
        r"search\s*methods?",
        r"search\s*terms?",
        r"database\s*search",
        r"electronic\s*databases?",
        r"search\s*methodology",
        r"search\s*and\s*selection\s*process",
    )

    def _is_search_heading(title):
        # True when any of the phrases occurs somewhere in the heading text.
        return any(
            re.search(phrase, title, re.IGNORECASE) for phrase in heading_patterns
        )

    parsed = BeautifulSoup(html_content, "html.parser")
    for candidate in parsed.find_all("section"):
        title_tag = candidate.find(["h2", "h3", "h4", "h5", "h6"])
        if not title_tag:
            continue  # Section without a heading cannot be identified
        if _is_search_heading(title_tag.text):
            return candidate.get_text(separator=" ", strip=True)

    return "Search Strategy section not found."

# Example Usage:
html_data = """
<section id="sec-2-1"><h3>Search Strategy</h3><div role="paragraph">
A comprehensive literature search was conducted using five electronic databases related to health care, namely, 
the PubMed, CINAHL, Cochrane Library, Medline, and PsycInfo. Searches were limited to articles published between 
January 2006 and March 4, 2017, as the HPV vaccine has only been licensed since 2006. A search was conducted using 
the keywords: adolescen* OR girl* OR boy* OR male OR female OR parent*; AND human papillomavirus vaccine* OR HPV; 
AND uptake; AND knowledge* OR barrier* OR accept* OR intent*. Reference lists of review articles were retrieved 
to identify additional sources of literature.
</div></section>
"""

# Extract the search strategy section
result = extract_search_strategy(html_data)
print(result)


### NLP Text Extractor

In [None]:
from src.Services.Factories.Sections.LiteratureSearchQAML import LiteratureSearchQA
# Example usage
# Demo: run every LiteratureSearchQA extractor against one sample article and
# print the answers. Guarded by __main__, which is also true inside a notebook kernel.
if __name__ == "__main__":
    # Sample text from a journal article (you can update this with your own content).
    # The "===== ... =====" markers delimit the title, abstract and paper-type parts.
    text = """
    ===== Main Content =====
    ===== Title =====
    Determinants of influenza vaccine hesitancy among pregnant women in Europe: a systematic review - PMC

    ===== Abstract =====
    Background Pregnant women are at high risk for severe influenza. However, maternal influenza vaccination uptake in most World Health Organization (WHO) European Region countries remains low, despite the presence of widespread national recommendations. An influenza vaccination reduces influenza-associated morbidity and mortality in pregnancy, as well as providing newborns with protection in their first months. Potential determinants of vaccine hesitancy need to be identified to develop strategies that can increase vaccine acceptance and uptake among pregnant women. The primary objective of the systematic review is to identify the individual determinants of influenza vaccine hesitancy among pregnant women in Europe, and how to overcome the hesitancy.
    Methods Databases were searched for peer-reviewed qualitative and quantitative studies published between 2009 and 2019 inclusive. Databases included PubMed via MEDLINE, Cochrane Central Register for Controlled Trials, PsycINFO, SAGE Journals, Taylor and Francis and Springer nature. These covered themes including psychology, medicine, and public health. Following the Preferred Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA) approach, 11 studies were eligible and analyzed for significant determinants of influenza vaccine hesitancy among pregnant women in Europe.
    Results The most commonly reported factors were psychological aspects, for example concerns about safety and risks to mother and child, or general low risk perception of becoming ill from influenza. Doubts about the effectiveness of the vaccine and a lack of knowledge about this topic were further factors. There was also influence of contextual factors, such as healthcare workers not providing adequate knowledge about the influenza vaccine or the pregnant lady stating their antivaccine sentiment.
    Conclusion Health promotion that specifically increases knowledge among pregnant women about influenza and vaccination is important, supporting a valid risk judgment by the pregnant lady. The development of new information strategies for dialogue between healthcare providers and pregnant women should form part of this strategy.
    Keywords: Influenza, Vaccination, Infectious diseases, Pregnant women, Europe, Vaccine hesitancy, Vaccine refusal, Vaccine delay, Review, Maternal

    ===== Paper Type =====
    open access

"""

    # Instantiate the class with its default configuration (no arguments are passed).
    qa_model = LiteratureSearchQA()
    
    # Extract information: every extractor receives the same full sample text.
    # NOTE(review): return types are defined in LiteratureSearchQA and are not
    # visible here; the values are only interpolated into the prints below.
    last_search_date = qa_model.extract_last_search_date(text)
    total_studies = qa_model.extract_total_studies_included(text)
    total_population = qa_model.extract_total_population(text)
    total_sample_size = qa_model.extract_total_sample_size(text)
    sex_proportion = qa_model.extract_sex_proportion(text)
    rct_count = qa_model.extract_total_RCT_count(text)
    nsri_count = qa_model.extract_total_NSRI_count(text)
    mix=qa_model.extract_total_mix_method_count(text)
    country = qa_model.extract_country_proportion(text)
    popu = qa_model.extract_population_proportion(text)

    # Print the results, one labelled line per extractor.
    # NOTE(review): the last two labels say "count" although the values come
    # from the *_proportion extractors — confirm the intended wording.
    print(f"Last literature search date: {last_search_date}")
    print(f"Total studies included: {total_studies}")
    print(f"Total population: {total_population}")
    print(f"Total sample size: {total_sample_size}")
    print(f"Sex proportion distribution: {sex_proportion}")
    print(f"RCT count: {rct_count}")
    print(f"NSRI count: {nsri_count}")
    print(f"Mix count: {mix}")
    print(f"country count: {country}")
    print(f"popu count: {popu}")


In [None]:
from src.Utils.Helpers import html_to_plain_text_selenium
from src.Commands.TaggingSystem import Tagging
from src.Commands.regexp import searchRegEx
from src.Services.Factories.Sections.ArticleExtractorFactory import ArticleExtractorFactory

def extract_all_sections_from_text(text):
    """
    Split a block of text into named sections.

    A line counts as a section header when, after stripping whitespace, it is
    entirely upper-case or ends with a colon. All other non-empty lines are
    appended (stripped) to the section currently being collected. Text that
    precedes the first header is filed under "Title". A header that repeats
    replaces the content stored earlier under the same name.

    Args:
        text (str): The input text containing sections and their content.

    Returns:
        dict: Maps each header (trailing colons removed) to its content,
            with the content lines joined by newlines.
    """
    collected = {}
    heading = "Title"  # Bucket for anything appearing before the first header
    body_lines = []

    for raw_line in text.splitlines():
        entry = raw_line.strip()
        looks_like_header = entry.isupper() or entry.endswith(":")

        if not looks_like_header:
            # Ordinary content line; blank lines are dropped.
            if entry:
                body_lines.append(entry)
            continue

        # A new header closes the section collected so far. A header that
        # reduces to an empty name (e.g. a lone ":") is never stored.
        if heading:
            collected[heading] = "\n".join(body_lines)
        heading = entry.rstrip(":")
        body_lines = []

    # Flush whatever section was still open at the end of the text.
    if heading:
        collected[heading] = "\n".join(body_lines)

    return collected

def save_to_file(file_path, data):
    """
    Write ``data`` to ``file_path`` as UTF-8 text, replacing any existing file.

    :param file_path: The file path where the data will be saved.
    :param data: A string written verbatim, or a list of strings written
        one item per line (no trailing newline after the last item).
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        # Lists are flattened to newline-separated text; anything else is
        # handed to write() unchanged.
        payload = '\n'.join(data) if isinstance(data, list) else data
        handle.write(payload)
            
# End-to-end check: fetch one article, extract its main content, save it, then tag it.
# NOTE(review): the variable name says "tandfonline" but the URL is a journals.sagepub.com article.
url_tandfonline = "https://journals.sagepub.com/doi/10.1177/2150131917742299"
# Load the page through the Selenium helper with a visible (non-headless) browser window.
soup = html_to_plain_text_selenium(
    url_tandfonline, 
    headless=False
)

# The factory receives both the fetched page and its URL.
# NOTE(review): presumably the URL selects the site-specific extractor — confirm in ArticleExtractorFactory.
tandfonline_extractor = ArticleExtractorFactory.get_extractor(soup=soup, url=url_tandfonline)
print("Available sections (tandfonline):", tandfonline_extractor.get_available_sections())
document = tandfonline_extractor.get_section("Main Content")

# Persist the extracted text; save_to_file expects a str or a list of str.
file_path = 'extracted_dates.txt'
save_to_file(file_path, document)

# Run the tagging step over the extracted text using the imported searchRegEx definitions.
tagging = Tagging(document)
result = tagging.create_columns_from_text(searchRegEx)
print(result)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Fetch the raw HTML of one article page with headless Chrome.

# Configure Chrome options
options = Options()
options.add_argument("--headless")  # Run in headless mode (no GUI)
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid detection

# Set up WebDriver (webdriver_manager downloads a matching chromedriver if needed)
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Target URL
url = "https://journals.sagepub.com/doi/10.1177/2150131917742299"

# Open page and get content. The browser is always shut down, even when the
# page load raises, so no orphaned Chrome process is left behind.
try:
    driver.get(url)
    html_content = driver.page_source
finally:
    driver.quit()

# Preview only: the full page source stays available in html_content.
print(html_content[:1000])  # Print first 1000 characters


In [None]:
import re
from src.Services.Factories.Sections.SectionExtractor import SectionExtractor

# Example usage: load a previously saved article text and query it by section name.
with open("./running_away.txt", "r", encoding="utf-8") as file:
    document_text = file.read()

extract_sections = SectionExtractor(document_text)

# Retrieve specific sections
# NOTE(review): abstract_content actually holds the "Introduction" section — confirm the intended name.
abstract_content = extract_sections.get("Introduction")
methods_content = extract_sections.get("Methods")
# Bare expressions: in a notebook only the value of the last one is displayed.
extract_sections.get("main content")
extract_sections.available_sections()

### Cochrane Extractor

In [None]:
from src.Services.Factories.Sections.ArticleExtractorFactory import ArticleExtractorFactory
from bs4 import BeautifulSoup
# Parse a locally saved Cochrane page and list the sections its extractor finds.
with open("file_path.html", "r", encoding="utf-8") as file:
    html_soup = BeautifulSoup(file.read(), "html.parser")

# The URL is passed alongside the parsed page.
# NOTE(review): presumably it is what routes the factory to the Cochrane extractor — confirm.
extractor = ArticleExtractorFactory.get_extractor(soup=html_soup, url="cochranelibrary.com")
# Label fixed: this cell exercises the Cochrane extractor, not tandfonline.
print("Available sections (Cochrane):", extractor.get_available_sections())
# NOTE(review): calls a private method directly; its return value is what the notebook displays.
extractor._extract_sections()
# extractor.save_extracted_sections(output_path)
# print(f"Extracted sections saved to {output_path}")

### Text Extractor

In [None]:
from src.Services.Factories.Sections.ArticleExtractorFactory import ArticleExtractorFactory

# Usage example: run the factory on plain text read from disk and save the
# extracted sections to a second file.
file_path = "./testing.txt"  # Input file
output_path = "./extracted_sections.txt"  # Output file
with open(file_path, "r", encoding="utf-8") as file:
    text_content = file.read()

# Only text_content is supplied here (no soup or URL).
extractor = ArticleExtractorFactory.get_extractor(text_content=text_content)
# Label fixed: the input here is a plain-text file, not a tandfonline page.
print("Available sections (text):", extractor.get_available_sections())
# NOTE(review): calls a private method directly before saving — confirm it is required.
extractor._extract_sections()
extractor.save_extracted_sections(output_path)
print(f"Extracted sections saved to {output_path}")


In [None]:
def extract_all_sections(file_path):
    """
    Extracts all sections and their respective content from a text file.

    A line is treated as a section header when, after stripping whitespace,
    it is entirely upper-case or ends with a colon. Every other non-empty
    line is appended (stripped) to the current section. Content that appears
    before the first header is stored under "General". When a header occurs
    more than once, the later content replaces the earlier one.

    Args:
        file_path (str): Path to the text file. The file is read as UTF-8.

    Returns:
        dict: A dictionary where the keys are section headers (trailing
            colons removed) and values are their newline-joined content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    sections = {}
    current_section = "General"  # Default section for content before any section header
    current_content = []

    # Read explicitly as UTF-8: the notebook writes its text files with
    # encoding='utf-8', and the platform default (e.g. cp1252 on Windows)
    # would mis-decode or reject non-ASCII article text.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped_line = line.strip()

            # Check if the line is a potential section header
            if stripped_line.isupper() or stripped_line.endswith(":"):
                # Save the previous section and its content
                if current_section:
                    sections[current_section] = "\n".join(current_content)

                # Start a new section
                current_section = stripped_line.rstrip(":")
                current_content = []
            elif stripped_line:  # Skip empty lines
                # Add content to the current section
                current_content.append(stripped_line)

    # Add the last section's content
    if current_section:
        sections[current_section] = "\n".join(current_content)

    return sections

# Usage example: split the previously saved article text into sections.
file_path = "./extracted_dates.txt"
# Call the function defined in this cell; the former name `extract_sections`
# is not a function defined anywhere in this cell.
sections_with_content = extract_all_sections(file_path)

# Display all sections and their respective content (notebook cell output)
sections_with_content


In [None]:
import re

def extract_last_literature_search_dates(document):
    """
    Find dates that follow a literature-search trigger phrase in ``document``.

    Each match starts at one of the trigger phrases (e.g. "last search date",
    "literature search was conducted") and captures the nearest following
    date. Matching is case-insensitive and may cross line breaks. Two debug
    lines (raw regex groups and extracted dates) are printed on every call.

    :param document: str, the text to extract dates from.
    :return: list, unique date strings exactly as written in the text,
        sorted as plain strings (not chronologically).
    :raises ValueError: if ``document`` is empty or not a string.
    """
    usable = isinstance(document, str) and bool(document)
    if not usable:
        raise ValueError("The document content is empty or invalid. Please provide a valid string.")

    # Verbose-mode pattern: trigger phrase, lazy gap, then one captured date.
    pattern = r"""
        (?:(?:searched\s+from\s+inception\s+to|date\s+of\s+last\s+literature\s+search|last\s+search\s+date|
        the\s+search\s+was\s+conducted|all\s+searches\s+were\s+conducted|systematic\s+search(?:es)?|
        literature\s+search(?:es)?(?:\s+was|\s+were)?(?:\s+conducted|\s+performed)?|
        up\s+to\s+our\s+last\s+search\s+on|Cochrane\s+Database\s+of\s+Systematic\s+Reviews\s+up\s+to))  # Keywords
        [\s\S]*?  # Match across lines
        (\d{1,2}(?:-|\s)?(?:st|nd|rd|th)?(?:\s|-)?(?:January|February|March|April|May|June|July|August|September|October|November|December)[,]?\s\d{4}|  # Full textual dates
        (?:January|February|March|April|May|June|July|August|September|October|November|December)(?:\s|-)?\d{1,2}(?:st|nd|rd|th)?,?\s\d{4}|  # Month-first formats
        \d{1,2}[/-]\d{1,2}[/-]\d{2,4}|  # Numeric formats like 12/12/2018
        \b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b(?:\s|-)?\d{4})  # Month + Year
    """

    hits = [*re.finditer(pattern, document, re.IGNORECASE | re.VERBOSE)]

    # Debug output: every capture-group tuple found by the regex.
    print("Raw Matches:", [hit.groups() for hit in hits])

    # Keep only matches whose date group is populated, trimmed of whitespace.
    dates = [hit.group(1).strip() for hit in hits if hit.group(1)]

    # Debug output: dates before de-duplication.
    print("Extracted Dates (Raw):", dates)

    unique_dates = list(set(dates))
    unique_dates.sort()
    return unique_dates


# Test case: a methods paragraph that reports its search date as "April 2014".
# NOTE(review): the wording here ("A search for publications was carried out")
# does not appear to contain any of the trigger phrases in the pattern, so the
# expected result looks like an empty list — confirm whether that is intended.
document = """
A search for publications was carried out in April 2014, in the National Center for Biotechnology Information Advances Science and Health - US National Library of Medicine - National Institutes of Health - PubMed electronic databases, with no restrictions regarding date and language of publication. Additionally, a search was performed in the LILACS and SciELO databases using the descriptor "Papillomavirus Vaccines", followed by a manual search for randomized controlled trials (RCTs). In the first stage of article selection, the Decs/Mesh health descriptor "papillomavirus vaccines/adverse effects" was used. The study design filter "RCTs" was added to the obtained results. Subsequently, the identified articles were analyzed by reading the titles and abstracts.



"""

# Run the function (it also prints its own two debug lines)
extracted_dates = extract_last_literature_search_dates(document)
print("Final Extracted Dates:", extracted_dates)


In [None]:
from src.Services.Factories.Sections.ArticleExtractorFactory import ArticleExtractorFactory

# url_ncbi = 'https://pubmed.ncbi.nlm.nih.gov/35430833/'
# ncbi_extractor = ArticleExtractorFactory.get_extractor(url_ncbi)
# print("Available sections (NCBI):", ncbi_extractor.get_available_sections())
# print("Abstract (NCBI):\n", ncbi_extractor.get_abstract())


# url_ncbi = 'https://pmc.ncbi.nlm.nih.gov/articles/PMC9768633/'
# ncbi_extractor = ArticleExtractorFactory.get_extractor(url_ncbi)
# print("Available sections (NCBI):", ncbi_extractor.get_available_sections())
# print("Abstract (NCBI):\n", ncbi_extractor.get_abstract())

# url_bmj = 'https://bmjopen.bmj.com/content/8/4/e019206'
# bmj_extractor = ArticleExtractorFactory.get_extractor(url_bmj)
# print("Available sections (BMJ):", bmj_extractor.get_available_sections())
# print("Abstract (BMJ):\n", bmj_extractor.get_abstract())

# url_journal = 'https://journal.waocp.org/article_88640_d6458e9820c0023169822ff1d1fc3b2e.pdf'
# journal_extractor = ArticleExtractorFactory.get_extractor(url_journal)
# print("Available sections (journal):", journal_extractor.get_available_sections())
# print("Abstract (journal):\n", journal_extractor.get_abstract())

# url_cochrane = 'https://www.cochranelibrary.com/cdsr/doi/10.1002/14651858.CD013717.pub2/full'
# cochrane_extractor = ArticleExtractorFactory.get_extractor(url_cochrane)
# print("Available sections (Cochrane):", cochrane_extractor.get_available_sections())
# print("Abstract (Cochrane):\n", cochrane_extractor.get_abstract())


# Active case: a nature.com article.
# NOTE(review): the variable names and print labels say "tandfonline" although the URL is nature.com.
url_tandfonline = 'https://www.nature.com/articles/s41598-021-83727-7'
# NOTE(review): the URL is passed positionally here, while other cells call
# get_extractor with soup=/url=/text_content= keywords — confirm this matches the current signature.
tandfonline_extractor = ArticleExtractorFactory.get_extractor(url_tandfonline)
print("Available sections (tandfonline):", tandfonline_extractor.get_available_sections())
print("Abstract (tandfonline):\n", tandfonline_extractor.get_abstract())

In [None]:
# Display the "Main Content" section; relies on `tandfonline_extractor` created in an earlier cell.
tandfonline_extractor.get_section("Main Content")

In [None]:
from src.Utils.Helpers import process_prisma_images
from bs4 import BeautifulSoup
import requests
# Download one article page and hand its parsed HTML to process_prisma_images.
url = "https://www.mdpi.com/1660-4601/19/15/9425"
# url = 'https://pmc.ncbi.nlm.nih.gov/articles/PMC8021610/'
# Example URL
# url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC8477621/"

# Browser-like headers sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://google.com"
}
# requests has no default timeout; without one an unresponsive server would
# block this cell indefinitely.
response = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

# The page URL is passed along with the parsed HTML.
# NOTE(review): presumably used to resolve relative image links — confirm in Helpers.
extracted_text = process_prisma_images(soup, url)

In [None]:
from DOIEnricher import DOIEnricher
# Run the DOI enrichment over the sample CSV.
# NOTE(review): what run() reads and writes is defined in DOIEnricher — not visible here.
enricher = DOIEnricher("testing_data.csv")
enricher.run()

###### Compare package versions between two requirements files

In [None]:
import sys

def parse_requirements(file_path):
    """Read a requirements file and return a dictionary of package: version.

    Blank lines and lines starting with "#" are ignored, and a trailing
    inline comment (whitespace followed by "#") is dropped. A line is split
    at its first "==" only, so anything after the version (for example an
    environment marker that itself contains "==") stays part of the version
    string instead of raising an unpacking error.

    Args:
        file_path: Path to the requirements file, read as UTF-8.

    Returns:
        dict: Maps each package name to its pinned version, or to "Unknown"
            when the line has no "==" pin.

    Exits:
        Prints an error and calls sys.exit(1) when the file does not exist.
    """
    requirements = {}
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line or line.startswith("#"):  # Ignore empty lines and comments
                    continue

                # Drop inline comments; "#" only starts one when preceded by
                # whitespace, so URL fragments such as "#egg=" are untouched.
                for marker in (" #", "\t#"):
                    line = line.split(marker, 1)[0]
                line = line.strip()

                # Split on the first "==" only; later occurrences belong to
                # the remainder of the specifier.
                package, pin, version = line.partition("==")
                if pin:  # Handle versioned dependencies
                    requirements[package.strip()] = version.strip()
                else:
                    requirements[line] = "Unknown"  # No version specified
    except FileNotFoundError:
        print(f"❌ Error: File {file_path} not found.")
        sys.exit(1)
    return requirements

def compare_requirements(file1, file2):
    """Compare two requirements files and print the differences.

    Both files are parsed with parse_requirements. The report has three
    parts: packages only in ``file2`` (added), packages only in ``file1``
    (removed), and packages present in both whose version strings differ.
    Package names are listed alphabetically so repeated runs print the same
    report.

    Args:
        file1: Path to the baseline ("old") requirements file.
        file2: Path to the requirements file being compared against it.

    Returns:
        None. The report is written to standard output.
    """
    req1 = parse_requirements(file1)
    req2 = parse_requirements(file2)

    # Set differences have no defined order; sort for a reproducible report.
    added = sorted(req2.keys() - req1.keys())
    removed = sorted(req1.keys() - req2.keys())
    changed = {
        pkg: (req1[pkg], req2[pkg])
        for pkg in sorted(req1.keys() & req2.keys())
        if req1[pkg] != req2[pkg]
    }

    print("\n📌 Requirements Differences:\n")

    if added:
        print("✅ Added Packages:")
        for pkg in added:
            print(f"   + {pkg}=={req2[pkg]}")
    else:
        print("✅ No new packages added.")

    if removed:
        print("\n❌ Removed Packages:")
        for pkg in removed:
            print(f"   - {pkg}=={req1[pkg]}")
    else:
        print("\n❌ No packages removed.")

    if changed:
        print("\n🔄 Updated Packages:")
        for pkg, (old_version, new_version) in changed.items():
            print(f"   ~ {pkg}: {old_version} → {new_version}")
    else:
        print("\n🔄 No packages updated.")

# Entry point: compares the two hard-coded files "requirements.txt" (old) and
# "requirements2.txt" (new) in the current working directory.
# NOTE: command-line arguments are not read; edit the file names below to compare other files.
if __name__ == "__main__":
    
    compare_requirements("requirements.txt", "requirements2.txt")
