In [2]:
import requests
import xml.etree.ElementTree as ET
import time
import json
import csv
from datetime import datetime
import re
from tenacity import retry, stop_after_attempt, wait_fixed

# Canonical research-method labels. Free-text method descriptions returned by
# the LLM are mapped onto these names by normalize_research_methods().
STANDARD_METHODS = [
    "Survey",
    "Interview",
    "Questionnaire",
    "Observation",
    "Experiment",
    "Case Study",
    "Literature Review",
    "Meta-analysis",
    "Randomized Controlled Trial",
    "Cohort Study",
    "Cross-sectional Study",
    "Longitudinal Study",
    "Mixed Methods",
    "Focus Group",
    "Ethnography",
    "Grounded Theory",
    "Phenomenology",
    "Content Analysis",
    "Secondary Data Analysis",
    "Bioelectrical Impedance Analysis",
    "Observational Study",
    "Secondary Analysis",
    "Mixed-method Study",
]

def normalize_research_methods(methods_string, standard_methods=None):
    """Map a free-text list of research methods onto canonical method labels.

    A label is reported only when its *whole phrase* occurs in
    ``methods_string``. Matching on individual words (the previous behaviour)
    made generic words such as "Study" or "Analysis" select every label that
    contains them, e.g. "survey study" also yielded "Case Study" and
    "Cohort Study".

    Matching is case-insensitive, treats hyphens like spaces (so
    "meta analysis" matches "Meta-analysis"), respects word boundaries (so
    "observational" does not match "Observation") and tolerates a plural
    "s" suffix ("Interviews"). Other inflections ("case studies") are not
    recognised.

    Args:
        methods_string: Free text describing the methods; may be empty or None.
        standard_methods: Optional iterable of canonical labels. Defaults to
            the module-level ``STANDARD_METHODS``.

    Returns:
        The matching labels joined by ", " in the order of
        ``standard_methods``, or "Unable to determine" when nothing matches.
    """
    if not methods_string:
        return "Unable to determine"
    if standard_methods is None:
        standard_methods = STANDARD_METHODS

    def _canonical(text):
        # Lower-case, turn hyphens into spaces and collapse whitespace so that
        # spelling variants of the same phrase compare equal.
        return " ".join(text.lower().replace("-", " ").split())

    haystack = _canonical(methods_string)
    matched = []
    for method in standard_methods:
        # Look-arounds act as word boundaries; "s?" accepts a simple plural.
        pattern = r"(?<!\w)" + re.escape(_canonical(method)) + r"s?(?!\w)"
        if re.search(pattern, haystack):
            matched.append(method)
    return ", ".join(matched) if matched else "Unable to determine"

def fetch_pubmed_articles(query, max_results=5, timeout=30):
    """Search PubMed for ``query`` and return metadata for the first matches.

    Runs an ESearch with the history server enabled, then an EFetch against
    the stored result set, and extracts id, DOI, title and abstract from each
    ``PubmedArticle`` element of the returned XML.

    Args:
        query: Search term. It is interpolated into the URL verbatim, so it
            must already be URL-encoded (e.g. "Assisted+dying").
        max_results: Maximum number of records to fetch (EFetch ``retmax``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys 'id', 'doi', 'title' and 'abstract'.
        Missing values are replaced by the placeholders "ID not available",
        "DOI not available", "Title not available" and
        "Abstract not available".

    Raises:
        requests.exceptions.RequestException: On network failure, timeout or
            a non-2xx HTTP status.
        xml.etree.ElementTree.ParseError: If the EFetch body is not valid XML.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    def _joined_text(element):
        # Element.text stops at the first child element, so titles/abstracts
        # containing inline markup (<i>, <sub>, ...) come back truncated or as
        # None. itertext() walks the whole subtree instead.
        return "".join(element.itertext()).strip()

    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&usehistory=y&retmode=json"
    search_response = requests.get(search_url, timeout=timeout)
    search_response.raise_for_status()
    search_result = search_response.json()['esearchresult']

    total_count = int(search_result['count'])
    print(f"Total articles found: {total_count}")
    if total_count == 0:
        # Nothing to fetch; skip the EFetch round trip (and the history keys,
        # which may not be usable for an empty result set).
        return []

    webenv = search_result['webenv']
    query_key = search_result['querykey']
    print(f"Fetching up to {max_results} articles...")

    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}&retmode=xml&retmax={max_results}"
    fetch_response = requests.get(fetch_url, timeout=timeout)
    fetch_response.raise_for_status()

    root = ET.fromstring(fetch_response.content)

    articles = []
    for article in root.findall(".//PubmedArticle"):
        pmid_element = article.find(".//PMID")
        pmid = pmid_element.text if pmid_element is not None else "ID not available"

        title_element = article.find(".//ArticleTitle")
        title = _joined_text(title_element) if title_element is not None else ""
        title = title or "Title not available"

        # Structured abstracts are split into several labelled AbstractText
        # sections; keep all of them rather than only the first one.
        sections = []
        for section in article.findall(".//Abstract/AbstractText"):
            section_text = _joined_text(section)
            if not section_text:
                continue
            label = section.get("Label")
            sections.append(f"{label}: {section_text}" if label else section_text)
        abstract = "\n".join(sections) if sections else "Abstract not available"

        # Anchor the lookup to the article's own id list: a bare
        # ".//ArticleId" search can also hit ids of cited references.
        doi_element = article.find("PubmedData/ArticleIdList/ArticleId[@IdType='doi']")
        doi = doi_element.text if doi_element is not None else "DOI not available"

        articles.append({
            'id': pmid,
            'doi': doi,
            'title': title,
            'abstract': abstract
        })

    return articles

def preprocess_article(article):
    """Return ``article`` if it is usable for analysis, otherwise ``None``.

    An article is rejected when it is None, when its 'abstract' or 'title'
    key is missing or None, when the abstract is shorter than 50 characters,
    or when either field holds a "... not available" placeholder.

    The placeholders are compared without regard to a trailing period: the
    fetcher emits "Title not available" / "Abstract not available" (no
    period), so the previous comparison against the dotted spelling never
    matched.

    Args:
        article: Dict with at least 'title' and 'abstract' keys, or None.

    Returns:
        The same ``article`` object when it passes all checks, else None.
    """
    if article is None:
        return None

    abstract = article.get('abstract')
    title = article.get('title')
    if abstract is None or title is None:
        return None

    # Too short to analyse, or the "missing abstract" placeholder.
    if len(abstract) < 50 or abstract.strip().rstrip(".") == "Abstract not available":
        return None

    # The "missing title" placeholder.
    if title.strip().rstrip(".") == "Title not available":
        return None

    return article

@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def process_article_with_ollama(article, model_name="llama3", timeout=300):
    """Ask a local Ollama model to classify one article and parse its answer.

    Builds a fixed-format prompt from the article's title and abstract, posts
    it to the Ollama ``/api/generate`` endpoint on localhost (non-streaming)
    and parses the reply with ``parse_llm_response``. The ``retry`` decorator
    re-runs the whole call up to three times, two seconds apart, whenever it
    raises.

    Args:
        article: Dict with the keys 'id', 'doi', 'title' and 'abstract'.
        model_name: Name of the Ollama model to query.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled server would block forever and the retry policy could
            never take effect. The default is generous because the reply is
            only sent once generation has finished.

    Returns:
        The dict produced by ``parse_llm_response`` extended with
        'article_id', 'doi', 'title' and 'abstract'.

    Raises:
        requests.exceptions.RequestException: Re-raised after being logged;
            the decorator then decides whether to retry.
    """
    url = "http://localhost:11434/api/generate"

    # Fall back to the placeholder for a missing (None) or empty abstract.
    abstract = article.get('abstract') or "Abstract not available"

    prompt = f"""
Analyze the following research article on assisted dying:

Title: {article['title']}

Abstract: {abstract}

Please provide a detailed analysis addressing the following points. Format your response exactly as shown below:

Study Type: [Empirical/Theoretical]

Study Type Justification: [Your explanation here]

Research Methods: [List only the specific research methods used, separated by commas. Use standard terminology such as Survey, Interview, Questionnaire, Observation, Experiment, Case Study, Literature Review, Meta-analysis, etc.]

Research Methods Justification: [Your explanation here]

Ensure your response follows this exact format for easy parsing.

Response:
"""

    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()['response']
        parsed_result = parse_llm_response(result)

        # Add article metadata to the parsed result
        parsed_result['article_id'] = article['id']
        parsed_result['doi'] = article['doi']
        parsed_result['title'] = article['title']
        parsed_result['abstract'] = abstract

        return parsed_result
    except requests.exceptions.RequestException as e:
        # Logged on every failed attempt, then re-raised so the retry
        # decorator can schedule the next one.
        print(f"Error processing article {article['id']}: {str(e)}")
        raise

def parse_llm_response(response):
    """Extract the four labelled fields from the model's free-text answer.

    Looks for the headers "Study Type:", "Study Type Justification:",
    "Research Methods:" and "Research Methods Justification:"
    (case-insensitive) and captures the text that follows each one. Markdown
    emphasis that models often wrap around headers or values
    ("**Study Type:** Empirical") is stripped from the captured text so it
    does not leak into the results. A found 'research_methods' value is
    passed through ``normalize_research_methods``.

    Args:
        response: Raw text returned by the model; None is treated as empty.

    Returns:
        Dict with the keys 'study_type', 'study_type_justification',
        'research_methods' and 'research_methods_justification'. Fields that
        cannot be found (or are empty) are "Unable to determine".
    """
    parsed_result = {
        'study_type': "Unable to determine",
        'study_type_justification': "Unable to determine",
        'research_methods': "Unable to determine",
        'research_methods_justification': "Unable to determine"
    }

    patterns = {
        'study_type': r"Study Type:\s*(.+?)(?:\n|$)",
        'study_type_justification': r"Study Type Justification:\s*(.+?)(?:\n\n|\n[A-Z]|$)",
        'research_methods': r"Research Methods:\s*(.+?)(?:\n|$)",
        'research_methods_justification': r"Research Methods Justification:\s*(.+?)(?:\n\n|$)"
    }

    text = response or ""
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if not match:
            continue
        # Drop surrounding whitespace and markdown emphasis markers, e.g. the
        # "** " left over from a bold "**Header:**".
        value = match.group(1).strip().strip("*_`").strip()
        if value:
            parsed_result[key] = value

    # Normalize research methods
    if parsed_result['research_methods'] != "Unable to determine":
        parsed_result['research_methods'] = normalize_research_methods(parsed_result['research_methods'])

    return parsed_result

def validate_results(results):
    """Keep only the results in which every field was determined.

    A result is dropped, with a message naming its 'article_id', as soon as
    any of its values equals "Unable to determine".

    Args:
        results: Iterable of result dicts.

    Returns:
        A new list holding the complete results in their original order.
    """
    complete = []
    for entry in results:
        if "Unable to determine" in entry.values():
            print(f"Incomplete result for article {entry['article_id']}. Skipping.")
            continue
        complete.append(entry)
    return complete

def save_results(results, base_filename):
    """Write ``results`` to ``<base_filename>.json`` and ``<base_filename>.csv``.

    The JSON file holds the list as-is with two-space indentation. The CSV
    file has one row per result; its columns are the alphabetically sorted
    union of all keys found in the results.

    Args:
        results: List of result dicts.
        base_filename: Output path without extension.
    """
    # JSON export
    json_filename = f"{base_filename}.json"
    with open(json_filename, 'w') as json_file:
        json_file.write(json.dumps(results, indent=2))
    print(f"Results saved to {json_filename}")

    # CSV export: rows may carry different keys, so collect every key first
    # and sort them for a stable column order.
    csv_filename = f"{base_filename}.csv"
    fieldnames = sorted({key for row in results for key in row})

    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    print(f"Results saved to {csv_filename}")

def run_integration_test(query="Assisted+dying", max_articles=5, model_name="llama3"):
    """Fetch articles from PubMed, analyse each with Ollama and print a report.

    Articles are processed one at a time. A failure on one article is
    reported and skipped so that the remaining articles are still processed.

    Args:
        query: PubMed search term, passed to ``fetch_pubmed_articles``.
        max_articles: Maximum number of articles to fetch.
        model_name: Ollama model used for the analysis.

    Returns:
        List of result dicts for the articles that were processed
        successfully.
    """
    print(f"Running integration test with query: '{query}', max articles: {max_articles}, model: {model_name}")

    # Fetch articles
    articles = fetch_pubmed_articles(query, max_articles)
    print(f"Successfully fetched {len(articles)} articles.")

    print("Articles before processing:")
    for article in articles:
        # The title can be None (e.g. a title element without direct text);
        # slicing None would abort the whole run with a TypeError.
        title_preview = (article['title'] or "")[:50]
        print(f"ID: {article['id']}, Title: {title_preview}...")
        if article['abstract'] is None:
            print(f"Warning: Article {article['id']} has no abstract.")

    # Process articles with Ollama
    results = []
    for i, article in enumerate(articles, 1):
        print(f"\nProcessing article {i} of {len(articles)}...")
        try:
            result = process_article_with_ollama(article, model_name)
            results.append(result)
            print(f"Article ID: {result['article_id']}")
            print(f"DOI: {result['doi']}")
            print(f"Title: {result['title']}")
            print(f"Abstract: {result['abstract'][:100]}...")  # Truncated for display
            print(f"Study Type: {result['study_type']}")
            print(f"Study Type Justification: {result['study_type_justification']}")
            print(f"Research Methods: {result['research_methods']}")
            print(f"Research Methods Justification: {result['research_methods_justification']}")
        except Exception as e:
            # Deliberately broad: this is the per-article boundary, one bad
            # article must not stop the batch.
            print(f"Error processing article {article['id']}: {str(e)}")
        print("-" * 50)
        if i < len(articles):
            time.sleep(1)  # Be respectful to the Ollama API between requests

    print(f"\nIntegration test complete! Processed {len(results)} articles.")
    return results

# Script entry point: run the integration test with its default arguments and
# save the results to JSON and CSV files.
if __name__ == "__main__":
    # The timestamp is taken before the run starts, so the output file names
    # reflect the start time of the run.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_filename = f"pubmed_ollama_test_results_{timestamp}"
    
    results = run_integration_test()
    save_results(results, base_filename)

Running integration test with query: 'Assisted+dying', max articles: 5, model: llama3
Total articles found: 3878
Fetching up to 5 articles...
Successfully fetched 5 articles.
Articles before processing:
ID: 39167528, Title: The Impact of Legalizing Medical Aid in Dying on P...
ID: 39160544, Title: Non-invasive technology to assess hydration status...
ID: 39157533, Title: Readiness of nurses when faced with a patient's de...
ID: 39157418, Title: 'There is no such word as palliative care for us a...
ID: 39152645, Title: The double awareness of the wish to hasten death a...

Processing article 1 of 5...
Article ID: 39167528
DOI: 10.1089/jpm.2023.0706
Title: The Impact of Legalizing Medical Aid in Dying on Patient Trust: A Randomized Controlled Survey Study.
Abstract: Abstract not available...
Study Type: Empirical
Study Type Justification: The study is empirical because it aims to investigate and measure the impact of legalizing medical aid in dying on patient trust through a randomized c