In [1]:
from zlai.embedding import *
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime

In [2]:
from zlai.retrievers import *
from zlai.schema import *
from zlai.elasticsearch import *
from zlai.elasticsearch.document import *

# 准备数据

In [3]:
# Directory handed to the document loader further down.
path = "./data/"

# Settings for the embedding model, gathered in one place and then unpacked
# into the constructor.
embedding_settings = {
    "model_path": "/home/models/BAAI/bge-m3",
    "max_len": 5000,
    "batch_size": 4,
    "verbose": True,
}
embedding = Embedding(**embedding_settings)

In [4]:
# Record structure for the data that is written to Elasticsearch.
# `Field` is imported under an alias so it cannot collide with a `Field` name
# that one of the star imports above may already have brought into scope.
from pydantic import Field as PydanticField


class DataAssetDocument(BaseModel):
    """Validated record for one loaded document before it is saved to Elasticsearch.

    Attributes:
        url: Optional source URL; defaults to an empty string.
        title: Optional title; defaults to an empty string.
        content: Optional text content; defaults to an empty string.
        vector: Required list of floats (the document's vector).
        date: Timestamp of the record; defaults to the moment the instance
            is created.
    """
    url: Optional[str] = ""
    title: Optional[str] = ""
    content: Optional[str] = ""
    vector: List[float]
    # A plain `= datetime.now()` default would be evaluated once, when the class
    # is defined, and every record would share that stale timestamp.
    # `default_factory` calls `datetime.now` for each new instance instead.
    date: datetime = PydanticField(default_factory=datetime.now)

In [5]:
# Build the loader: file selection first, then the splitting options,
# then the embedding model and logging.
load = LoadingDocuments(
    # which files to pick up
    glob='txt',
    # how the text is split
    separator=r"。|\.",
    keep_separator="。",
    chunk_size=3000,
    chunk_overlap=500,
    # embedding model and progress output
    embedding=embedding,
    verbose=True,
)

# Run the loader on the data directory.
documents = load(path=path)

100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [00:01<00:00, 23.70it/s]


[32mLoading model ...[0m
[32mSuccess load model ...[0m


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [01:39<00:00, 11.06s/it]


# 将数据保存至ES

## 创建ES index

In [6]:
# Connection object passed to the index-creation call below.
con = get_es_con(hosts="http://localhost:9200/")

# Name of the Elasticsearch index used throughout the rest of the notebook.
index_name = "5g"

In [7]:
from elasticsearch_dsl import Document
from elasticsearch_dsl.field import Date, DenseVector, Text


class DataAssetDocSchema(Document):
    """Elasticsearch mapping for the document index.

    The field names mirror those of the `DataAssetDocument` record model so
    that validated records can be indexed as-is.
    """
    url = Text()
    # Title and content use the same analyzer at index time and at search time.
    title = Text(analyzer=analyzer_ik, search_analyzer=analyzer_ik)
    content = Text(analyzer=analyzer_ik, search_analyzer=analyzer_ik)
    # NOTE(review): dims must equal the output size of the embedding model
    # configured earlier in the notebook - confirm it is 1024.
    vector = DenseVector(dims=1024)
    # Mapped as a date (previously Text) so that range queries and sorting by
    # time work; datetime values serialised as ISO-8601 strings are accepted
    # by the default date format.
    date = Date()

# Create the index from the mapping declared in DataAssetDocSchema.
# NOTE(review): reset=True presumably drops an existing index of the same
# name first - confirm before running against an index that holds real data.
create_index(
    con=con,
    index_name=index_name,
    field_schema=DataAssetDocSchema,
    reset=True,
    disp=True,
)

[1m[32mIndex 5g created![0m


## 保存数据

In [9]:
# NOTE(review): the execution counter jumps from In [7] to In [9]; make sure
# this cell does not rely on state from a cell that has since been deleted.


def _as_es_record(doc):
    """Validate a loaded document against DataAssetDocument and return it as a plain dict."""
    validated = DataAssetDocument.model_validate(doc.model_dump())
    return validated.model_dump()


# Writer that stores the records in the index.
save = DocumentSaveToElasticsearch(
    host="http://localhost:9200/",
    index_name=index_name,
    embedding=embedding,
    batch_size=16,
    thresh=1.95,
    verbose=True,
)

# Convert every loaded document to a validated dict, then write them all.
data = list(map(_as_es_record, documents))
save(data=data)

[34mStart saving data to ElasticSearch, current document: 0 ...[0m


Progress: 100%|██████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 14.70it/s]


------