In [3]:
import requests
import xml.etree.ElementTree as ET
import time
import json
import csv
from datetime import datetime
import re
from tenacity import retry, stop_after_attempt, wait_fixed

def fetch_pubmed_articles(query, max_results=5):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    
    search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&usehistory=y&retmode=json"
    search_response = requests.get(search_url).json()
    
    total_count = int(search_response['esearchresult']['count'])
    webenv = search_response['esearchresult']['webenv']
    query_key = search_response['esearchresult']['querykey']
    
    print(f"Total articles found: {total_count}")
    print(f"Fetching up to {max_results} articles...")

    fetch_url = f"{base_url}efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}&retmode=xml&retmax={max_results}"
    fetch_response = requests.get(fetch_url)
    
    root = ET.fromstring(fetch_response.content)
    
    articles = []
    for article in root.findall(".//PubmedArticle"):
        pmid = article.find(".//PMID").text if article.find(".//PMID") is not None else "ID not available"
        title_element = article.find(".//ArticleTitle")
        title = title_element.text if title_element is not None else "Title not available"
        abstract_element = article.find(".//Abstract/AbstractText")
        abstract = abstract_element.text if abstract_element is not None else "Abstract not available"
        doi_element = article.find(".//ArticleId[@IdType='doi']")
        doi = doi_element.text if doi_element is not None else "DOI not available"
        
        articles.append({
            'id': pmid,
            'doi': doi,
            'title': title,
            'abstract': abstract
        })
    
    return articles

def preprocess_article(article):
    # Check if the article is None
    if article is None:
        return None

    # Check if 'abstract' key exists and is not None
    if 'abstract' not in article or article['abstract'] is None:
        return None

    # Check if the abstract is too short or not available
    if len(article['abstract']) < 50 or article['abstract'] == "Abstract not available.":
        return None
    
    # Check if 'title' key exists and is not None
    if 'title' not in article or article['title'] is None:
        return None

    # Check if the title is missing
    if article['title'] == "Title not available.":
        return None
    
    return article

@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def process_article_with_ollama(article, model_name="llama3"):
    url = "http://localhost:11434/api/generate"
    
    prompt = f"""
    Analyze the following research article on assisted dying:

    Title: {article['title']}

    Abstract: {article['abstract']}

    Please provide a detailed analysis addressing the following points. Format your response exactly as shown below:

    Study Type: [Empirical/Theoretical]

    Study Type Justification: [Your explanation here]

    Research Methods: [List the specific methods used, separated by commas]

    Research Methods Justification: [Your explanation here]

    Ensure your response follows this exact format for easy parsing.

    Response:
    """
    
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False
    }
    
    try:
        response = requests.post(url, json=payload)
        response.raise_for_status()
        result = response.json()['response']
        return parse_llm_response(result)
    except requests.exceptions.RequestException as e:
        print(f"Error processing article {article['id']}: {str(e)}")
        raise

def parse_llm_response(response):
    parsed_result = {
        'study_type': "Unable to determine",
        'study_type_justification': "Unable to determine",
        'research_methods': "Unable to determine",
        'research_methods_justification': "Unable to determine"
    }
    
    # Use more flexible regex patterns
    patterns = {
        'study_type': r"Study Type:\s*(.+?)(?:\n|$)",
        'study_type_justification': r"Study Type Justification:\s*(.+?)(?:\n\n|\n[A-Z]|$)",
        'research_methods': r"Research Methods:\s*(.+?)(?:\n|$)",
        'research_methods_justification': r"Research Methods Justification:\s*(.+?)(?:\n\n|$)"
    }
    
    for key, pattern in patterns.items():
        match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
        if match:
            parsed_result[key] = match.group(1).strip()
    
    return parsed_result

def validate_results(results):
    validated_results = []
    for result in results:
        if all(value != "Unable to determine" for value in result.values()):
            validated_results.append(result)
        else:
            print(f"Incomplete result for article {result['article_id']}. Skipping.")
    return validated_results

def save_results(results, base_filename):
    # Save as JSON
    json_filename = f"{base_filename}.json"
    with open(json_filename, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {json_filename}")

    # Save as CSV
    csv_filename = f"{base_filename}.csv"
    
    # Get all unique keys from all result dictionaries
    fieldnames = set()
    for result in results:
        fieldnames.update(result.keys())
    fieldnames = sorted(list(fieldnames))  # Sort field names for consistency

    with open(csv_filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow(result)
    print(f"Results saved to {csv_filename}")

def run_integration_test(query="Assisted+dying", max_articles=5, model_name="llama3"):
    print(f"Running integration test with query: '{query}', max articles: {max_articles}, model: {model_name}")
    
    # Fetch articles
    articles = fetch_pubmed_articles(query, max_articles)
    print(f"Successfully fetched {len(articles)} articles.")
    
    # Preprocess and filter articles
    preprocessed_articles = [preprocess_article(article) for article in articles]
    preprocessed_articles = [article for article in preprocessed_articles if article is not None]
    print(f"After preprocessing, {len(preprocessed_articles)} articles remain.")
    
    # Process articles with Ollama
    results = []
    for i, article in enumerate(preprocessed_articles, 1):
        print(f"\nProcessing article {i} of {len(preprocessed_articles)}...")
        try:
            llm_result = process_article_with_ollama(article, model_name)
            result = {**article, **llm_result}
            results.append(result)
            print(f"Article ID: {result['id']}")
            print(f"DOI: {result['doi']}")
            print(f"Title: {result['title']}")
            print(f"Study Type: {result['study_type']}")
            print(f"Research Methods: {result['research_methods']}")
        except Exception as e:
            print(f"Failed to process article {article['id']}: {str(e)}")
        time.sleep(2)  # Increased delay to be more respectful to the Ollama API
    
    # Validate results
    validated_results = validate_results(results)
    print(f"\nValidation complete. {len(validated_results)} out of {len(results)} results passed validation.")
    
    print("\nIntegration test complete!")
    return validated_results

# Run the integration test
if __name__ == "__main__":
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_filename = f"pubmed_ollama_test_results_{timestamp}"
    
    results = run_integration_test()
    save_results(results, base_filename)

Running integration test with query: 'Assisted+dying', max articles: 5, model: llama3
Total articles found: 3878
Fetching up to 5 articles...
Successfully fetched 5 articles.
After preprocessing, 4 articles remain.

Processing article 1 of 4...
Article ID: 39160544
DOI: 10.1186/s12904-024-01542-z
Title: Non-invasive technology to assess hydration status in advanced cancer to explore relationships between fluid status and symptoms: an observational study using bioelectrical impedance analysis.
Study Type: ** Empirical
Research Methods: ** BIA, observational study

Processing article 2 of 4...
Article ID: 39157533
DOI: 10.3389/fpubh.2024.1399025
Title: Readiness of nurses when faced with a patient's death.
Study Type: ** Empirical
Research Methods: ** Surveys, interviews

Processing article 3 of 4...
Article ID: 39157418
DOI: 10.1177/26323524241272102
Title: 'There is no such word as palliative care for us at the moment': A mixed-method study exploring the perceptions of healthcare profe

ValueError: dict contains fields not in fieldnames: 'id'