In [None]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import time

def fetch_abstract_from_pubmed(pmid, timeout=30):
    """
    Fetch the abstract of a PubMed article given its PMID using NCBI E-utilities.

    Requests the full XML record from the efetch endpoint, takes the first
    <Abstract> element found and concatenates the text of each <AbstractText>
    element under it. Nested inline tags are flattened with ``.itertext()``.
    Non-empty sections are joined with newlines.

    Args:
        pmid: PubMed identifier, sent as the efetch ``id`` parameter.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The abstract text, or None when the record has no abstract content or
        when the request / XML parsing fails. Failures are printed, not raised.

    Note: Abstracts may not be available in XML for all PubMed article types (e.g., Letters).
    """
    print(f"DEBUG: Fetching abstract (FULL XML) for PMID: {pmid}")
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "retmode": "xml",
        "rettype": "xml",  # Get full XML to maximize abstract retrieval
        "id": pmid
    }
    abstract_text = None
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.text)

        abstract_element = root.find('.//Abstract')  # First <Abstract> in the record
        if abstract_element is not None:
            # itertext() also yields text that sits inside nested inline tags.
            abstract_parts = [
                "".join(abstract_text_element.itertext())
                for abstract_text_element in abstract_element.findall('.//AbstractText')
            ]
            # Collapse an all-empty abstract to None so callers get one "missing" value.
            abstract_text = "\n".join(part for part in abstract_parts if part) or None
            if abstract_text:
                print(f"DEBUG: Abstract extracted for PMID: {pmid}")
            else:
                print(f"DEBUG: <Abstract> tag found for PMID: {pmid}, but no content in <AbstractText>.")
        else:
            print(f"DEBUG: <Abstract> tag NOT found for PMID: {pmid}")

    except requests.exceptions.RequestException as e:
        print(f"ERROR: Network error fetching PMID {pmid}: {e}")
        return None
    except ET.ParseError as e:
        print(f"ERROR: XML Parse error for PMID {pmid}: {e}")
        return None
    except Exception as e:
        # Best-effort per article: log and report "no abstract" instead of aborting a batch.
        print(f"ERROR: Unexpected error processing PMID {pmid}: {e}")
        return None
    finally:
        # Only log here. A `return` inside `finally` would discard in-flight
        # exceptions such as KeyboardInterrupt, making the script impossible
        # to stop with Ctrl-C.
        print(f"DEBUG: Finished fetching abstract for PMID: {pmid}")

    return abstract_text

def _pmid_to_str(raw_pmid):
    """Return a CSV PMID cell as a clean string, or None when the cell is empty."""
    if pd.isna(raw_pmid):
        return None
    # A PMID column containing blanks is read as float; avoid sending "12345.0".
    if isinstance(raw_pmid, float) and raw_pmid.is_integer():
        return str(int(raw_pmid))
    return str(raw_pmid).strip() or None


def main(input_csv_file="/home/aricept094/mydata/csv-DigitalHea-set.csv",
         output_csv_file="pubmed_articles_with_abstracts_final.csv",
         save_interval=200):
    """
    Read article data from a CSV, fetch each article's abstract from PubMed, and
    save the result to a new CSV with an added 'Abstract' column.

    Prints the cumulative count of collected abstracts after each article and
    writes an intermediate CSV every ``save_interval`` processed rows.

    Args:
        input_csv_file: Path of the input CSV; it must contain a 'PMID' column.
        output_csv_file: Path of the final output CSV.
        save_interval: Number of processed rows between intermediate saves.
            A falsy value (0 or None) disables intermediate saves.

    Returns:
        None. Problems reading the input or writing output are printed.
    """
    start_time = time.time()
    print(f"INFO: Starting script at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}")
    print(f"INFO: Reading article data from {input_csv_file}...")

    try:
        df = pd.read_csv(input_csv_file)
    except FileNotFoundError:
        print(f"ERROR: Input CSV file not found at {input_csv_file}")
        return

    if 'PMID' not in df.columns:
        print(f"ERROR: Input CSV file {input_csv_file} has no 'PMID' column")
        return

    total_articles = len(df)
    print(f"INFO: Loaded data for {total_articles} articles.")

    abstracts = [None] * total_articles  # One slot per row, filled by row position
    successful_abstract_count = 0  # Rows for which a non-empty abstract was returned

    print("INFO: Fetching abstracts from PubMed...")
    # Iterate the PMID column directly: iterrows() builds a per-row Series that
    # does not preserve column dtypes, and its index label is not guaranteed to
    # be a valid list position.
    for position, raw_pmid in enumerate(df['PMID']):
        processed_count = position + 1
        pmid = _pmid_to_str(raw_pmid)

        if pmid is None:
            print(f"WARNING: Skipping article {processed_count}/{total_articles}: missing PMID.")
        else:
            print(f"INFO: Processing article {processed_count}/{total_articles}, PMID: {pmid}...")
            abstract = fetch_abstract_from_pubmed(pmid)
            abstracts[position] = abstract
            if abstract:
                successful_abstract_count += 1

            print(f"INFO: Abstract processed for PMID: {pmid}. {total_articles - processed_count} articles remaining. Total abstracts collected: {successful_abstract_count}")

            time.sleep(0.1)  # Be respectful to NCBI servers

        if save_interval and processed_count % save_interval == 0:
            print(f"INFO: Saving intermediate progress after {processed_count} articles...")
            df_intermediate = df.copy()  # Snapshot; leaves df untouched until the final assignment
            df_intermediate['Abstract'] = abstracts
            intermediate_output_file = f"pubmed_articles_with_abstracts_intermediate_{processed_count}.csv"
            try:
                df_intermediate.to_csv(intermediate_output_file, index=False)
                print(f"INFO: Saved intermediate data to {intermediate_output_file}")
            except Exception as e:
                # A failed checkpoint must not abort the remaining fetches.
                print(f"ERROR: Error saving intermediate CSV file: {e}")

    df['Abstract'] = abstracts  # Final assignment of abstracts to DataFrame

    print("INFO: Abstract fetching complete.")
    print(f"INFO: Saving final results to {output_csv_file}...")
    try:
        df.to_csv(output_csv_file, index=False)
        end_time = time.time()
        duration = end_time - start_time
        print(f"SUCCESS: Saved articles with abstracts to {output_csv_file}")
        print(f"INFO: Script finished at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}")
        print(f"INFO: Total processing time: {duration:.2f} seconds")
        print(f"INFO: Successfully retrieved abstracts for {successful_abstract_count}/{total_articles} articles.")
    except Exception as e:
        print(f"ERROR: Error saving final output CSV file: {e}")

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()