In [5]:
from pathlib import Path
from collections import defaultdict
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_mistralai import MistralAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from langchain_qdrant import QdrantVectorStore
import os, json
from dotenv import load_dotenv
import re
import pickle

In [6]:
# Load variables from a .env file (python-dotenv) into the process environment,
# then read the Qdrant connection settings. Each value is None when the
# corresponding variable is not set.
load_dotenv()
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

In [7]:
# Chunking options. None of these three names is referenced by the code visible
# in this notebook -- presumably intended for a secondary size-based split;
# confirm before relying on them.
USE_SECONDARY_SPLIT = False
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
# Earlier variant, kept for reference: split only on specific level-2 section
# headings instead of on every heading level.
# HEADERS_TO_SPLIT = [
#     ("## Abstract", "H1"),
#     ("## Introduction", "H1"),
#     ("## Results", "H1"),
#     ("## Discussion", "H1"),
#     ("## Results and Discussion", "H1"),
#     ("## Conclusion", "H1"),
#     ("## Conclusions", "H1"),
# ]
# (markdown heading prefix, metadata key) pairs handed to
# MarkdownHeaderTextSplitter as `headers_to_split_on`.
HEADERS_TO_SPLIT = [("#", "H1"), ("##", "H2"), ("###", "H3")]

# Maps a section heading to the name of the Qdrant collection its chunks are
# stored in. split_markdown_by_section_dynamic looks headings up in a
# lower-cased copy of this map after normalize_header has been applied.
SECTION_COLLECTION_MAP = {
    "Abstract": "abstracts",
    "Introduction": "introductions",
    "Results": "results_discussion",
    "Result": "results_discussion",
    "Discussion": "results_discussion",
    "Results and Discussion": "results_discussion",
    "Conclusion": "conclusions",
    "Conclusions": "conclusions",
    "Background": "introductions",   # "Background" sections are filed under introductions
    "Summary": "conclusions",        # "Summary" sections are filed under conclusions
}

In [8]:
def normalize_header(header: str) -> str:
    """Normalize a section header so it can be looked up in the section map.

    The header is stripped, lower-cased and any leading section numbering is
    removed. Supported numbering styles include ``"1. Introduction"``,
    ``"1 Introduction"``, ``"3) Discussion"``, ``"2.1 Results"`` and
    ``"2.1. Results"``.

    Args:
        header: Raw header text taken from the Markdown document.

    Returns:
        The normalized header text (possibly empty).
    """
    header = header.strip().lower()
    # Numbering = digits with optional dotted sub-levels, followed either by a
    # "." / ")" delimiter (trailing whitespace optional) or by whitespace alone.
    # Requiring a delimiter or whitespace keeps headers such as "3d structure"
    # or "16s rrna" intact.
    header = re.sub(r'^\d+(?:\.\d+)*(?:[.)]\s*|\s+)', '', header)
    return header

def extract_metadata_from_md(text: str) -> dict:
    """Extract document-level metadata from the Markdown text of an article.

    Every field is optional; a key is only present when its pattern matched.

    Keys that may be returned:
        title: Text of the first level-1 heading (``# ...``).
        authors: Comma-separated text of all ``###`` headings that appear
            before the first ``"* Author information"`` marker. When the
            marker is absent the whole document is scanned.
        author_information: Text between the first ``###`` heading line and
            the next line starting with ``"Find articles"``.
        article_notes: Single-line span from ``"Received "`` up to the first
            period after ``"Issue date"``.
        copyright_license: Single-line span from ``"Copyright ©"`` up to the
            first following ``"Microbiology"``.
        pmcid: Identifier following ``"PMCID:"`` (e.g. ``"PMC1234567"``).
        pmid: Digits inside the brackets following ``"PMID:"``.
        publication_date: First ``"<year> <month> [<day>]"`` occurrence, where
            the year starts with 19 or 20 and the month is an English month
            name or its usual abbreviation (e.g. ``"2021 Mar 5"``).

    Args:
        text: Full Markdown content of one article.

    Returns:
        A dict with the keys listed above that could be extracted.
    """
    metadata = {}

    title_match = re.search(r'^# (.+)', text, flags=re.MULTILINE)
    if title_match:
        metadata["title"] = title_match.group(1)

    # Author names are expected as "###" headings ahead of the author-info marker.
    pre_authors_section = text.split("* Author information")[0]
    authors = re.findall(r'^###\s+(.+)', pre_authors_section, flags=re.MULTILINE)
    if authors:
        metadata["authors"] = ", ".join(authors)

    author_info_match = re.search(r'### .+?\n([\s\S]*?)(?=\nFind articles)', text)
    if author_info_match:
        metadata["author_information"] = author_info_match.group(1).strip()

    notes_match = re.search(r'(Received .*?Issue date.*?\.)', text)
    if notes_match:
        metadata["article_notes"] = notes_match.group(1).strip()

    # NOTE(review): this only matches copyright lines that contain the word
    # "Microbiology"; other publishers' notices are not captured.
    copyright_match = re.search(r'(Copyright ©.*?Microbiology)', text)
    if copyright_match:
        metadata["copyright_license"] = copyright_match.group(1).strip()

    pmcid_match = re.search(r'PMCID:\s*(PMC\d+)', text)
    if pmcid_match:
        metadata["pmcid"] = pmcid_match.group(1).strip()

    pmid_match = re.search(r'PMID:\s*\[(\d+)\]', text)
    if pmid_match:
        metadata["pmid"] = pmid_match.group(1).strip()

    # Publication date: the year must be followed by an actual month name.
    # Accepting any three letters would turn ordinary prose such as
    # "In 2019 the ..." into the bogus date "2019 the".
    month = (
        r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?'
        r'|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)'
    )
    date_match = re.search(
        r'\b(?:19|20)\d{2}\s+' + month + r'\b(?:\s+\d{1,2}\b)?',
        text,
    )
    if date_match:
        metadata["publication_date"] = date_match.group(0)

    return metadata

def process_markdown_directory_2(in_dir: str):
    """Chunk every Markdown file under ``in_dir`` by section and load it into Qdrant.

    Steps:
      1. Recursively read ``*.md`` files, split each one with
         ``split_markdown_by_section_dynamic`` and copy the document-level
         metadata from ``extract_metadata_from_md`` onto every chunk.
      2. Write the per-document metadata to ``metadata.json`` in the current
         working directory, keyed by PMCID (or by file stem when no PMCID
         was found).
      3. Make sure a Qdrant collection exists for every section collection
         that received at least one chunk.
      4. Upload the "abstracts", "introductions" and "conclusions" chunks in
         batches of 50.
      5. For "results_discussion", only probe each chunk by embedding it on
         its own. Chunks whose embedding call raises are pickled to
         ``problem_chunks_results_discussion.pkl``; nothing from that
         collection is uploaded by this function.

    Args:
        in_dir: Directory searched recursively for Markdown files.

    Returns:
        None. Progress, a sample of chunk headers and per-file statistics are
        printed.
    """

    in_dir = Path(in_dir)
    all_docs = []
    counts = defaultdict(int)
    all_metadata = {}

    # --- 1) Read files ---
    for md_path in in_dir.rglob("*.md"):
        text = md_path.read_text(encoding="utf-8", errors="ignore")
        docs = split_markdown_by_section_dynamic(text)
        metadata = extract_metadata_from_md(text)
        # Fall back to the file name when the article carries no PMCID.
        pmcid = metadata.get("pmcid", md_path.stem)
        all_metadata[pmcid] = metadata

        for d in docs:
            d.metadata.update(metadata)

        all_docs.extend(docs)
        counts[md_path.as_posix()] += len(docs)

    # Persist the complete per-document metadata as JSON.
    with open("metadata.json", "w", encoding="utf-8") as f:
        json.dump(all_metadata, f, ensure_ascii=False, indent=4)

    print(f"Collected {len(all_docs)} chunks from {len(counts)} files.")

    # --- 2) Embeddings ---
    embeddings = MistralAIEmbeddings(model="mistral-embed")
    # Embed a throwaway string to discover the vector dimension.
    vector_size = len(embeddings.embed_query("sample text"))

    # --- 3) Qdrant client ---
    client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

    # --- 4) Group chunks by target collection ---
    sections_docs = defaultdict(list)
    for d in all_docs:
        sections_docs[d.metadata["section_collection"]].append(d)

    # --- 5) Create the collections that do not exist yet ---
    for col_name in sections_docs:
        try:
            client.get_collection(collection_name=col_name)
        except Exception:
            # NOTE(review): any failure of get_collection is treated as
            # "collection missing"; a connectivity or auth problem will
            # surface from create_collection instead.
            client.create_collection(
                collection_name=col_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )

    # --- Validation: print the header metadata of the first 10 chunks per collection ---
    for col_name, docs in sections_docs.items():
        print(f"\n=== Collection: {col_name} | {len(docs)} chunks ===")
        for d in docs[:10]:
            print(f"section_collection: {d.metadata.get('section_collection')}")
            print(f"H1: {d.metadata.get('H1')}")
            print(f"H2: {d.metadata.get('H2')}")
            print(f"H3: {d.metadata.get('H3')}")
            print(f"Header: {d.metadata.get('Header')}")

    BATCH_SIZE = 50
    SAFE_COLLECTIONS = ["abstracts", "introductions", "conclusions"]
    PROBLEM_COLLECTIONS = ["results_discussion"]

    # --- 6a) Upload the collections that embed without trouble ---
    for col_name in SAFE_COLLECTIONS:
        docs = sections_docs.get(col_name, [])
        if not docs:
            continue
        store = QdrantVectorStore(client=client, collection_name=col_name, embedding=embeddings)
        total = len(docs)
        print(f"Uploading {total} chunks to collection '{col_name}' in batches of {BATCH_SIZE}...")
        for i in range(0, total, BATCH_SIZE):
            batch = docs[i:i+BATCH_SIZE]
            store.add_documents(batch)
            print(f"  Uploaded batch {i//BATCH_SIZE + 1} ({len(batch)} chunks)")
        print(f"Collection '{col_name}' uploaded successfully!")

    # --- 6b) Probe the problematic collections and save failing chunks ---
    # Each chunk is embedded on its own so that a single bad chunk can be
    # identified; the embedding result itself is discarded. The embeddings
    # client created above is reused.
    for col_name in PROBLEM_COLLECTIONS:
        docs = sections_docs.get(col_name, [])
        if not docs:
            continue

        failed_chunks = []
        for i, d in enumerate(docs):
            try:
                embeddings.embed_documents([d.page_content])
            except Exception as e:
                print(f"[ERROR] Chunk {i} in '{col_name}' failed: {e}")
                failed_chunks.append(d)

        # Only the chunks that failed are saved for a later retry.
        if failed_chunks:
            with open(f"problem_chunks_{col_name}.pkl", "wb") as f:
                pickle.dump(failed_chunks, f)
            print(f"{len(failed_chunks)} chunks from '{col_name}' saved for reattempt later")

    # --- Final stats ---
    print("\nChunks per file (desc):")
    for src, c in sorted(counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{c:5d}  {src}")
    total = sum(counts.values())
    avg = total / len(counts) if counts else 0
    print(f"\nFiles: {len(counts)} | Total chunks: {total} | Avg/file: {avg:.2f}")
    
def has_meaningful_content_for_collection(text: str, collection: str) -> bool:
    """Decide whether a chunk is worth storing in the given collection.

    A chunk is rejected when it is blank, when it has fewer than five
    whitespace-separated words, or when its first line (lower-cased) starts
    with a section keyword that belongs to a different collection.

    Args:
        text: Chunk content.
        collection: Target collection name. Names without a keyword list
            are only subject to the blank/length checks.

    Returns:
        True when the chunk passes all checks, False otherwise.
    """
    body = text.strip()
    if not body or len(body.split()) < 5:
        return False

    # Opening keywords that signal the chunk belongs to some other section.
    blocked_openers = {
        "abstracts": (
            "background", "introduction", "methods", "study design", "results", "discussion",
        ),
        "results_discussion": (
            "abstract", "background", "introduction", "conclusion", "summary",
        ),
        "conclusions": (
            "abstract", "background", "introduction", "results", "discussion",
        ),
        "introductions": (
            "abstract", "results", "discussion", "conclusion", "summary",
        ),
    }.get(collection, ())

    opening_line = body.partition("\n")[0].lower()
    # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
    return not opening_line.startswith(blocked_openers)

def split_markdown_by_section_dynamic(text: str):
    """Split Markdown into header-delimited chunks and tag each with a collection.

    The text is split on the heading levels listed in ``HEADERS_TO_SPLIT``
    (headings are stripped from the chunk content). For every chunk the
    headings it sits under are checked from the most specific to the least
    specific (H3, then H2, then H1); the first heading that, once normalized,
    appears in ``SECTION_COLLECTION_MAP`` and whose collection accepts the
    chunk content (see ``has_meaningful_content_for_collection``) decides the
    target collection. Chunks that are blank, shorter than five words, or
    that match no section are dropped.

    Each returned chunk has these metadata keys set:
        Header: the chunk's non-empty headings joined with " | " in
            H3, H2, H1 order.
        H1, H2, H3: the heading at each level, or "" when absent.
        section_collection: name of the target collection.

    Args:
        text: Full Markdown content of one document.

    Returns:
        List of the retained chunk documents, in document order.
    """

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=HEADERS_TO_SPLIT,
        strip_headers=True,  # remove the heading lines from the chunk content
    )
    # Case-insensitive lookup table: lower-cased heading -> collection name.
    collection_by_header = {k.lower(): v for k, v in SECTION_COLLECTION_MAP.items()}

    section_docs = []

    for d in splitter.split_text(text):
        # Cheap early exit for blank or very short chunks (fewer than 5 words).
        if len(d.page_content.split()) < 5:
            continue

        # Headings this chunk sits under, most specific first: H3 > H2 > H1.
        headers = [(h, d.metadata[h]) for h in ("H3", "H2", "H1") if d.metadata.get(h)]

        collection = None
        for _, h_value in headers:
            candidate_collection = collection_by_header.get(normalize_header(h_value))
            if candidate_collection is None:
                continue
            # The content must not open like a different section; otherwise
            # try the next, less specific heading.
            if not has_meaningful_content_for_collection(d.page_content, candidate_collection):
                continue
            collection = candidate_collection
            break

        if not collection:
            continue  # no heading mapped to a collection that accepts this chunk

        # Store the heading trail and the target collection on the chunk.
        d.metadata["Header"] = " | ".join([h_value for _, h_value in headers])
        d.metadata["H1"] = d.metadata.get("H1", "")
        d.metadata["H2"] = d.metadata.get("H2", "")
        d.metadata["H3"] = d.metadata.get("H3", "")
        d.metadata["section_collection"] = collection

        section_docs.append(d)

    return section_docs

In [None]:
# Run the section-wise ingestion over every Markdown file found under "md_out".
process_markdown_directory_2(in_dir="md_out")

In [9]:
def process_results_discussion_only(in_dir: str):
    """Upload only the 'results_discussion' chunks to Qdrant, one chunk at a time.

    All Markdown files under ``in_dir`` are read and split with
    ``split_markdown_by_section_dynamic``; document-level metadata from
    ``extract_metadata_from_md`` is copied onto every chunk and also written
    to ``metadata.json`` in the current working directory. The
    'results_discussion' collection is created when it cannot be fetched, then
    each matching chunk is uploaded individually so that one failing chunk
    does not abort the rest. Chunks whose upload raises are pickled to
    ``failed_chunks_results_discussion.pkl``.

    Args:
        in_dir: Directory searched recursively for Markdown files.

    Returns:
        None. Progress and a final success/failure count are printed.
    """

    in_dir = Path(in_dir)
    all_docs = []
    all_metadata = {}
    # Counted during the single read pass so the directory tree does not have
    # to be walked a second time just for the log message.
    file_count = 0

    # --- 1) Read files ---
    for md_path in in_dir.rglob("*.md"):
        file_count += 1
        text = md_path.read_text(encoding="utf-8", errors="ignore")
        docs = split_markdown_by_section_dynamic(text)
        metadata = extract_metadata_from_md(text)
        # Fall back to the file name when the article carries no PMCID.
        pmcid = metadata.get("pmcid", md_path.stem)
        all_metadata[pmcid] = metadata

        for d in docs:
            d.metadata.update(metadata)

        all_docs.extend(docs)

    # Persist the complete per-document metadata as JSON.
    with open("metadata.json", "w", encoding="utf-8") as f:
        json.dump(all_metadata, f, ensure_ascii=False, indent=4)

    print(f"Collected {len(all_docs)} chunks from {file_count} files.")

    # --- 2) Embeddings ---
    embeddings = MistralAIEmbeddings(model="mistral-embed")
    # Embed a throwaway string to discover the vector dimension.
    vector_size = len(embeddings.embed_query("sample text"))

    # --- 3) Qdrant client ---
    client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

    # --- 4) Create the 'results_discussion' collection if it does not exist ---
    COLLECTION_NAME = "results_discussion"
    try:
        client.get_collection(collection_name=COLLECTION_NAME)
    except Exception:
        # NOTE(review): any failure of get_collection is treated as
        # "collection missing".
        print(f"Creando colección '{COLLECTION_NAME}'...")
        client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )

    # --- 5) Keep only the results_discussion chunks ---
    docs = [d for d in all_docs if d.metadata.get("section_collection") == COLLECTION_NAME]

    if not docs:
        print(f"No hay chunks de '{COLLECTION_NAME}' para subir.")
        return

    print(f"Subiendo {len(docs)} chunks de '{COLLECTION_NAME}' uno por uno...")

    store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME, embedding=embeddings)
    failed_chunks = []

    for i, d in enumerate(docs):
        try:
            store.add_documents([d])  # one chunk per request isolates failures
        except Exception as e:
            print(f"[ERROR] Chunk {i} falló: {e}")
            failed_chunks.append(d)

    # --- Save the chunks that failed ---
    if failed_chunks:
        with open(f"failed_chunks_{COLLECTION_NAME}.pkl", "wb") as f:
            pickle.dump(failed_chunks, f)
        print(f"{len(failed_chunks)} chunks fallidos guardados en 'failed_chunks_{COLLECTION_NAME}.pkl'.")

    print(f"Subida de '{COLLECTION_NAME}' finalizada. Total exitosos: {len(docs) - len(failed_chunks)} | Fallidos: {len(failed_chunks)}")

In [None]:
# Upload the 'results_discussion' chunks from "md_out" one chunk at a time.
process_results_discussion_only(in_dir="md_out")