In [1]:
from typing import List

def split_into_chunks(doc_file: str) -> List[str]:
    with open(doc_file, 'r',encoding='utf-8') as file:
        content = file.read()
    return [chunk for chunk in content.split("##")]

# 主程序部分
chunks = split_into_chunks("README.md")

for i, chunk in enumerate(chunks[:5]):
    print(f"[{i}] {chunk}\n")

[0] <h1>
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="docs/images/nf-core-pairgenomealign_logo_dark.png">
    <img alt="nf-core/pairgenomealign" src="docs/images/nf-core-pairgenomealign_logo_light.png">
  </picture>
</h1>

[![GitHub Actions CI Status](https://github.com/nf-core/pairgenomealign/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/pairgenomealign/actions/workflows/ci.yml)
[![GitHub Actions Linting Status](https://github.com/nf-core/pairgenomealign/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/pairgenomealign/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/pairgenomealign/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.13910535-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.13910535)
[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-

In [2]:
from sentence_transformers import SentenceTransformer
from typing import List

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunk(chunk: str) -> List[float]:
    embedding = embedding_model.encode(chunk)
    return embedding.tolist()

test_embedding = embed_chunk("test")
print(len(test_embedding))
print(test_embedding)


384
[0.011573436670005322, 0.025136202573776245, -0.03670184686779976, 0.05932488292455673, -0.0071490490809082985, -0.04119422286748886, 0.0770873948931694, 0.03744256868958473, 0.01244899071753025, -0.006117628887295723, 0.017034275457262993, -0.07701538503170013, -0.00039416426443494856, 0.027909062802791595, -0.015989158302545547, -0.06827527284622192, 0.008884645998477936, -0.020280703902244568, -0.08035995811223984, -0.013074046932160854, -0.04110001400113106, -0.025898080319166183, -0.0265386700630188, 0.03305228799581528, -0.022079195827245712, 0.021046103909611702, -0.05792200192809105, 0.03294876217842102, 0.02970738522708416, -0.06224840506911278, 0.038788024336099625, 0.03199068829417229, 0.015330815687775612, 0.0453069731593132, 0.05314944311976433, 0.013360676355659962, 0.041224926710128784, 0.028142910450696945, 0.019398434087634087, -0.0032523232512176037, -0.0036123408935964108, -0.14286024868488312, 0.0380711704492569, -0.010916205123066902, 0.02609400637447834, 0.041

In [3]:
embeddings = [embed_chunk(chunk) for chunk in chunks]
print(len(embeddings))
print(embeddings[0])

72
[-0.09221261739730835, 0.018330078572034836, -0.014622059650719166, 0.02387947589159012, 0.04229506105184555, 0.03056703321635723, -0.013751836493611336, -0.03502431884407997, 0.029271598905324936, -0.015627561137080193, 0.0398603230714798, 0.06536902487277985, 0.02458924986422062, -0.042360153049230576, -0.0004565980634652078, 0.0691816583275795, -0.09958221763372421, 0.03261108323931694, -0.0450553372502327, -0.04892238602042198, -0.011267300695180893, -0.06656229496002197, -0.013047306798398495, -0.022809557616710663, -0.01983414962887764, 0.02300063706934452, -0.031199226155877113, -0.0029408191330730915, 0.01665005460381508, -0.07034137099981308, -0.036975931376218796, 0.06463643163442612, 0.035039402544498444, 0.03786580264568329, -0.02161167562007904, 0.05056149885058403, 0.060499317944049835, -0.030701272189617157, -0.01798071898519993, -0.029251625761389732, 0.005497481673955917, -0.060153476893901825, 0.056236788630485535, -0.04040248319506645, 0.02569899894297123, 0.10685

In [4]:
import chromadb

chromadb_client = chromadb.EphemeralClient()
chromadb_collection = chromadb_client.get_or_create_collection(name="default")

def save_embeddings(chunks:List[str], embeddings:List[List[float]]) -> None:
    ids = [str(i) for i in range(len(chunks))]
    chromadb_collection.add(
        documents=chunks,
        embeddings=embeddings,
        ids=ids
    )

save_embeddings(chunks,embeddings)

In [5]:
def retrieve(query: str, top_k: int) -> List[str]:
    query_embedding = embed_chunk(query)
    results = chromadb_collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "distances"]  # 建议带上距离，方便观察
    )
    return results['documents'][0]

# 测试
query = "What is metatdenovo?How to use it?"
retrieved_chunks = retrieve(query, 5)

for i, chunk in enumerate(retrieved_chunks):
    print(f"[{i}] {chunk}\n")


[0]  Contributions and Support

If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).

For further information or help, don't hesitate to get in touch on the [Slack `#metatdenovo` channel](https://nfcore.slack.com/channels/metatdenovo) (you can join with [this invite](https://nf-co.re/join/slack)).



[1]  Credits

nf-core/metatdenovo was originally written by Danilo Di Leo (@danilodileo), Emelie Nilsson (@emnilsson) & Daniel Lundin (@erikrikarddaniel).



[2]  Introduction

**nf-core/metatdenovo** is a bioinformatics best-practice analysis pipeline for assembly and annotation of metatranscriptomic and metagenomic data from prokaryotes, eukaryotes or viruses.

On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persis

In [6]:
from sentence_transformers import CrossEncoder

def rerank(query:str, retrieved_chunks:List[str],top_k:int) -> List[str]:
    cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')
    pairs=[(query,chunk) for chunk in retrieved_chunks]
    scores=cross_encoder.predict(pairs)

    chunk_with_score_list=[(chunk,score) 
                           for chunk,score in zip(retrieved_chunks, scores)]
    chunk_with_score_list.sort(key=lambda pair:pair[1],reverse=True)
    return [chunk for chunk,_ in chunk_with_score_list][:top_k]

reranked_chunks = rerank(query, retrieved_chunks,3)

for i,chunk in enumerate(reranked_chunks):
    print(f"[{i}]{chunk}\n")

[0] Introduction

**nf-core/metatdenovo** is a bioinformatics best-practice analysis pipeline for assembly and annotation of metatranscriptomic and metagenomic data from prokaryotes, eukaryotes or viruses.

On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/metatdenovo/results).



[1] Usage

![nf-core/metatdenovo metro map](docs/images/metat-metromap.png)

1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
3. Quality trimming and adapter removal for raw reads ([`Trim Galore!`](https://ww

In [7]:
from dotenv import load_dotenv
from google import genai


load_dotenv()
google_client = genai.Client()

def generate(query:str, chunks:List[str]) ->str:
    prompt = f"""You're a knowledge assistant,please answer questions according to the user's request and the following information. 
    the user's question:{query}
    relevant information:
    {"\n\n".join(chunks)}
    Please answer according to the information mentioned above.Do not make up information."""

    print(f"{prompt}\n\n---\n")

    response = google_client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt
    )
    return response.text

answer = generate(query,reranked_chunks)
print(answer)


You're a knowledge assistant,please answer questions according to the user's request and the following information. 
    the user's question:What is metatdenovo?How to use it?
    relevant information:
     Introduction

**nf-core/metatdenovo** is a bioinformatics best-practice analysis pipeline for assembly and annotation of metatranscriptomic and metagenomic data from prokaryotes, eukaryotes or viruses.

On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/metatdenovo/results).



 Usage

![nf-core/metatdenovo metro map](docs/images/metat-metromap.png)

1. Read QC ([`FastQC`](https:/