# In [None]:
import os
import csv
import string
import logging
from Bio import Entrez
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Log records are rendered as "LEVEL: message" at INFO and above.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# Make sure the NLTK resources needed for stop-word filtering and
# lemmatization are available before they are used below.
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus, quiet=True)

# Shared, module-level helpers consumed by preprocess_text().
LEMMATIZER = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))

def fetch_pubmed_articles(query, max_results, email, api_key=None):
    """Search PubMed for *query* and fetch the matching records.

    Args:
        query: Search term passed to ``Entrez.esearch``.
        max_results: Maximum number of IDs to request (``retmax``).
        email: Contact address assigned to ``Entrez.email``.
        api_key: Optional API key; assigned to ``Entrez.api_key`` when truthy.

    Returns:
        The parsed result of ``Entrez.read`` on the efetch response, or
        ``{"PubmedArticle": []}`` when the search matched no IDs.
    """
    Entrez.email = email
    if api_key:
        Entrez.api_key = api_key

    with Entrez.esearch(db="pubmed", term=query, retmax=max_results) as handle:
        search_results = Entrez.read(handle)
    id_list = search_results.get("IdList", [])

    # With no IDs there is nothing to fetch; skip the efetch round trip
    # (which would otherwise be issued with an empty ``id`` parameter) and
    # hand back an empty container that supports the same ``.get`` access.
    if not id_list:
        return {"PubmedArticle": []}

    with Entrez.efetch(db="pubmed", id=id_list, retmode="xml") as handle:
        return Entrez.read(handle)

def preprocess_text(text):
    """Normalize *text* aggressively and return the result as one string.

    Steps, in order: lowercase, delete ``string.punctuation`` characters,
    delete digit characters, split on whitespace, drop tokens found in
    ``STOP_WORDS``, lemmatize the rest, and join with single spaces.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    lowered = text.lower().translate(punctuation_stripper)
    without_digits = ''.join(filter(lambda ch: not ch.isdigit(), lowered))

    lemmas = []
    for token in without_digits.split():
        if token in STOP_WORDS:
            continue
        lemmas.append(LEMMATIZER.lemmatize(token))
    return ' '.join(lemmas)

def clean_text_for_llm(text):
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is dropped; punctuation, digits, and
    letter case are left untouched.
    """
    words = text.split()
    return ' '.join(words)

def save_text(filename, text):
    """Write *text* to *filename* as UTF-8, creating parent directories.

    Args:
        filename: Destination path. May be a bare filename, in which case
            the file is written relative to the current directory.
        text: String content to write; any existing file is overwritten.
    """
    parent_dir = os.path.dirname(filename)
    # dirname() is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)

def save_processed_article(pmid, processed_text, output_dir="processed_articles"):
    """Write *processed_text* to ``<output_dir>/<pmid>.txt`` via save_text()."""
    target_path = os.path.join(output_dir, f"{pmid}.txt")
    save_text(target_path, processed_text)

def save_llm_cleaned_article(pmid, cleaned_text, output_dir="llm_cleaned_articles"):
    """Write *cleaned_text* to ``<output_dir>/<pmid>.txt`` via save_text()."""
    target_path = os.path.join(output_dir, f"{pmid}.txt")
    save_text(target_path, cleaned_text)

def extract_entities(text, nlp_bc5cdr, nlp_jnlpba):
    """Run both NER pipelines over *text* and return ``(entity_text, type)`` pairs.

    From the first pipeline, ``CHEMICAL`` spans are reported as ``'Drug'`` and
    ``DISEASE`` spans as ``'Disease'``; from the second, ``PROTEIN`` spans are
    reported as ``'Gene'``. Spans with any other label are ignored. Results
    from the first pipeline precede those from the second.
    """
    bc5cdr_types = {'CHEMICAL': 'Drug', 'DISEASE': 'Disease'}

    found = [
        (span.text, bc5cdr_types[span.label_])
        for span in nlp_bc5cdr(text).ents
        if span.label_ in bc5cdr_types
    ]
    found.extend(
        (span.text, 'Gene')
        for span in nlp_jnlpba(text).ents
        if span.label_ == 'PROTEIN'
    )
    return found

def save_ner_results(ner_results, output_file="ner_results.csv"):
    """Write NER results to *output_file* as CSV with a PMID/Entity/Type header.

    Each item of *ner_results* is a mapping with optional ``'pmid'``
    (default ``'Unknown'``) and ``'entities'`` (default empty) keys. A given
    (PMID, entity, type) combination is written only the first time it occurs.
    """
    header = ['PMID', 'Entity', 'Type']
    already_written = set()
    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        for record in ner_results:
            article_id = record.get('pmid', 'Unknown')
            for name, kind in record.get('entities', []):
                row = (article_id, name, kind)
                if row in already_written:
                    continue
                already_written.add(row)
                out.writerow(list(row))

# Verb lemmas that are treated as signalling a relation between two entities.
_RELATION_VERBS = frozenset({
    "inhibit", "block", "suppress", "activate", "treat", "reduce",
    "upregulate", "downregulate", "bind", "target", "modulate",
    "enhance", "promote", "interact", "associate", "regulate",
    "potentiate", "antagonize", "induce", "overexpress", "underexpress",
})


def extract_relations_from_text(text, extracted_entities, nlp_re):
    """Extract simple verb-mediated relations between entities, per sentence.

    For every sentence produced by ``nlp_re(text).sents`` that mentions at
    least two *distinct* entities (case-insensitive substring match) and
    contains a token whose lemma is in ``_RELATION_VERBS``, one relation is
    emitted linking the first two matching entities, in the order they
    appear in *extracted_entities*.

    Args:
        text: Raw text to analyse.
        extracted_entities: Iterable of ``(entity_text, entity_type)`` pairs.
            Repeated mentions of the same entity text are collapsed so a
            relation never links an entity to itself.
        nlp_re: Callable returning a document whose ``sents`` yield sentences
            exposing ``.text`` and iterating over tokens with ``.lemma_``.

    Returns:
        A list of ``(entity1, verb_lemma, entity2, sentence_text)`` tuples,
        at most one per sentence.
    """
    relations = []
    doc = nlp_re(text)

    for sent in doc.sents:
        sent_text = sent.text
        sent_lower = sent_text.lower()  # lowercase once, not once per entity

        # Collect distinct entities mentioned in this sentence, keeping the
        # spelling of the first occurrence.
        present = []
        seen = set()
        for ent_text, _ent_type in extracted_entities:
            key = ent_text.lower()
            # An empty/blank entity would "match" every sentence; skip it.
            if not key.strip() or key in seen or key not in sent_lower:
                continue
            seen.add(key)
            present.append(ent_text)

        if len(present) < 2:
            continue

        for token in sent:
            if token.lemma_.lower() in _RELATION_VERBS:
                # For simplicity, pair the first two distinct entities.
                relations.append((present[0], token.lemma_, present[1], sent_text))
                break  # One relation per sentence.
    return relations


def save_relation_results(relations, output_file="relation_results.csv"):
    """Write relation tuples to *output_file* as CSV.

    *relations* yields 5-item records ``(pmid, entity1, relation, entity2,
    sentence)``; they are written in order beneath a header row.
    """
    columns = ['PMID', 'Entity1', 'Relation', 'Entity2', 'Sentence']
    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(columns)
        out.writerows(
            [pmid, entity1, relation, entity2, sentence]
            for pmid, entity1, relation, entity2, sentence in relations
        )

def _load_spacy_model(model_name, failure_message):
    """Return the spaCy pipeline *model_name*, or None after logging a failure."""
    try:
        return spacy.load(model_name)
    except Exception as e:  # top-level boundary: any load failure aborts the run
        # Include the underlying cause so the user can tell "not installed"
        # apart from "incompatible version".
        logging.error("%s (%s)", failure_message, e)
        return None


def _abstract_text(article_info):
    """Return the abstract of an article record as one string ('' if absent)."""
    abs_content = article_info.get('Abstract')
    if isinstance(abs_content, dict) and 'AbstractText' in abs_content:
        abs_text = abs_content['AbstractText']
        # A multi-part abstract arrives as a list of sections; join them.
        return ' '.join(abs_text) if isinstance(abs_text, list) else abs_text
    return ""


def main():
    """Interactive entry point for the PubMed processing pipeline.

    Prompts for a query, result count, email, and optional API key; fetches
    the articles; then, for each article, saves an LLM-cleaned and an
    aggressively preprocessed copy of "title + abstract", extracts entities,
    and extracts relations. Entities and relations are written to CSV files
    at the end. Any fatal setup problem is logged and ends the run early;
    a failure on a single article is logged and that article is skipped.
    """
    query = input("Enter your PubMed query: ")
    try:
        max_results = int(input("Enter number of articles to fetch: "))
    except ValueError:
        logging.error("Invalid number for max results. Exiting.")
        return
    if max_results <= 0:
        logging.error("Number of articles to fetch must be positive. Exiting.")
        return

    email = input("Enter your email address: ")
    # Treat a blank or whitespace-only answer as "no API key".
    api_key = input("Enter your PubMed API key (optional): ").strip() or None

    logging.info("Fetching articles from PubMed...")
    try:
        articles = fetch_pubmed_articles(query, max_results, email, api_key)
    except Exception as e:
        logging.error("Error fetching articles: %s", e)
        return

    # Two NER models plus a general English model used for sentence
    # splitting and lemmas during relation extraction.
    model_specs = (
        ("en_ner_bc5cdr_md",
         "Error loading en_ner_bc5cdr_md model. Ensure it is installed and compatible."),
        ("en_ner_jnlpba_md",
         "Error loading en_ner_jnlpba_md model. Ensure it is installed and compatible."),
        ("en_core_web_sm",
         "Error loading en_core_web_sm model for relation extraction. "
         "Install it with: python -m spacy download en_core_web_sm"),
    )
    loaded_models = []
    for model_name, failure_message in model_specs:
        model = _load_spacy_model(model_name, failure_message)
        if model is None:
            return
        loaded_models.append(model)
    nlp_bc5cdr, nlp_jnlpba, nlp_re = loaded_models

    ner_results = []
    relation_results = []

    for article in articles.get('PubmedArticle', []):
        # Assigned before the try so the warning below can always name it.
        pmid = "Unknown"
        try:
            medline = article.get('MedlineCitation', {})
            pmid = medline.get('PMID', 'Unknown')
            article_info = medline.get('Article', {})
            title = article_info.get('ArticleTitle', '')
            abstract = _abstract_text(article_info)
            raw_text = f"{title} {abstract}"

            llm_cleaned_text = clean_text_for_llm(raw_text)
            save_llm_cleaned_article(pmid, llm_cleaned_text)
            processed_text = preprocess_text(raw_text)
            save_processed_article(pmid, processed_text)

            # Entities are extracted from the raw (uncleaned) text.
            entities = extract_entities(raw_text, nlp_bc5cdr, nlp_jnlpba)
            ner_results.append({'pmid': pmid, 'entities': entities})

            # Relations reuse the entities found above, on the same raw text.
            relations = extract_relations_from_text(raw_text, entities, nlp_re)
            for rel in relations:
                relation_results.append((pmid, rel[0], rel[1], rel[2], rel[3]))
        except Exception as e:
            logging.warning("Skipping article %s due to error: %s", pmid, e)
            continue

    save_ner_results(ner_results)
    save_relation_results(relation_results)
    logging.info("Processing completed. Check the output directories and CSV files.")

if __name__ == "__main__":
    main()
