In [None]:
import os

import polars as pl
from tqdm.notebook import tqdm
from txtai import Embeddings
from txtai.pipeline import Segmentation

from src.lfe.impl.utils import iter_by_line_parquet

In [None]:
# Segmentation pipeline configured with the "recursive" chunker; stream()
# calls it to split each record's rendered text into chunks.
textractor = Segmentation(chunker="recursive")


def stream(
    path="raw_pubmed_meta_df.parquet",
    batch_size=500000,
    skip_pmids=("18265450",),
):
    """Yield ``(PMID, data)`` pairs for every record in the PubMed metadata file.

    For each record the title, keyword list and abstract paragraphs are
    rendered into one text block, which is split by the module-level
    ``textractor`` pipeline. Every chunk is yielded as ``(PMID, chunk)``,
    followed by one ``(PMID, record)`` item carrying the record itself, so
    all items belonging to a record share the same id.

    Args:
        path: Parquet file handed to ``iter_by_line_parquet``.
        batch_size: Second argument handed to ``iter_by_line_parquet``
            (presumably the number of rows read per batch -- confirm against
            the helper).
        skip_pmids: PMIDs whose records are skipped entirely.
            NOTE(review): the reason "18265450" is excluded is not recorded
            in this notebook.

    Yields:
        tuple: ``(PMID, chunk)`` for each text chunk, then ``(PMID, record)``.
    """
    skipped = set(skip_pmids)

    for el in iter_by_line_parquet(path, batch_size):
        if el["PMID"] in skipped:
            continue

        # Joined outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        paragraphs = el["AbstractText"] if el["AbstractText"] is not None else []
        abstract = "\n\n".join(paragraphs)

        # The template's leading indentation is part of the emitted text and is
        # kept as-is so the indexed content does not change.
        text = f"""
        # {el["Title"]}
        Keywords: {el["KeywordList"]}
        {abstract}
        """

        # Replace the date object with its ISO "YYYY-MM-DD" string before the
        # record is yielded; a missing date is left as None instead of raising.
        if el["DateAvail"] is not None:
            el["DateAvail"] = el["DateAvail"].strftime("%Y-%m-%d")

        for chunk in textractor(text):
            yield el["PMID"], chunk
        yield el["PMID"], el

In [None]:
# Connection URL for the PostgreSQL instance used for content storage, the
# pgvector index and pgtext scoring. Set PUBMED_DB_URL to point at another
# database so real credentials never have to be written into the notebook;
# the fallback is the local development database.
url = os.environ.get(
    "PUBMED_DB_URL",
    "postgresql+psycopg2://postgres:postgress@localhost/postgres",
)

embeddings = Embeddings(
    path="neuml/pubmedbert-base-embeddings",
    content=url,
    backend="pgvector",
    pgvector={"url": url},
    # Graph storage is currently disabled; configuration kept for reference.
    # graph={
    #     "backend": "rdbms",
    #     "url": url,
    #     "approximate": False,
    # },
    gpu=True,
    hybrid=True,
    # NOTE(review): txtai documents this option as `encodebatch`; confirm that
    # `encode_batch` is actually honoured before relying on the 4096 value.
    encode_batch=4096,
    scoring={"method": "pgtext", "url": url},
    batch=50000,
)


def _track_records(items, total):
    """Pass ``items`` through unchanged, advancing a progress bar once per record.

    ``stream()`` yields several ``(id, chunk)`` string items per record and
    then exactly one ``(id, record)`` item, so only the non-string payloads
    advance the bar. Counting every yielded item against a per-row total
    would overshoot it.
    """
    with tqdm(total=total, unit="record") as progress:
        for item in items:
            if not isinstance(item[1], str):
                progress.update(1)
            yield item


# Number of rows in the source file, used as the progress bar total.
record_count = (
    pl.scan_parquet("raw_pubmed_meta_df.parquet").select(pl.len()).collect()[0, 0]
)

embeddings.index(
    _track_records(stream(), record_count),
    checkpoint="./pubmed-index-cp/",
)
embeddings.save("./pubmed-index")