In [2]:
import os
import csv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from huggingface_hub import hf_hub_download
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np
import faiss
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Folder scanned for input PDF files (relative path).
path_to_pdf = "data/pdf"
# Folder scanned for input CSV files (relative path).
path_to_csv = "data/csv"

def read_pdf(pdf_folder):
    """Load every PDF in ``pdf_folder`` as one ``Document`` per non-empty page.

    Files are visited in sorted name order so that repeated runs yield the
    documents in the same order; ``os.listdir`` alone gives no ordering
    guarantee. The extension check is case-insensitive, so ``REPORT.PDF`` is
    picked up as well as ``report.pdf``.

    Args:
        pdf_folder: Directory that is scanned (non-recursively) for PDF files.

    Returns:
        list: one ``Document`` per page whose extracted text is non-empty.
        ``metadata`` holds the file name under ``"source"`` and the 1-based
        page number under ``"page"``.
    """
    docs = []
    for file in sorted(os.listdir(pdf_folder)):
        if not file.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(pdf_folder, file))
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            # Pages for which extraction yields nothing (empty string or None)
            # are skipped rather than stored as empty documents.
            if text:
                docs.append(
                    Document(
                        page_content=text,
                        metadata={"source": file, "page": page_number},
                    )
                )
    return docs


def read_csv(csv_folder):
    """Load every CSV in ``csv_folder`` as one ``Document`` per data row.

    The first row of each file is treated as the header. Each following row is
    rendered as ``"col1: val1, col2: val2, ..."``; if the header row is missing
    or empty the raw values are joined with ``", "`` instead.

    Args:
        csv_folder: Directory that is scanned (non-recursively) for CSV files.

    Returns:
        list: one ``Document`` per non-blank data row. ``metadata`` holds the
        file name under ``"source"`` and the 1-based position of the row among
        the rows that follow the header under ``"row"`` (blank rows are counted
        but not emitted, so numbers stay tied to the position in the file).
    """
    docs = []
    # Sorted so the document order is reproducible between runs.
    for file in sorted(os.listdir(csv_folder)):
        if not file.lower().endswith(".csv"):
            continue
        path = os.path.join(csv_folder, file)
        # newline="" is what the csv module requires so that quoted fields with
        # embedded line breaks are parsed correctly. "utf-8-sig" strips a
        # leading BOM that would otherwise become part of the first column name.
        # Undecodable bytes are still dropped (errors="ignore"), as before.
        with open(path, "r", encoding="utf-8-sig", errors="ignore", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)  # first row holds the column names
            for i, row in enumerate(reader):
                # csv.reader yields [] for blank lines; they carry no content.
                if not row:
                    continue
                if header:
                    # Build "col1: val1, col2: val2, ...".
                    parts = [f"{col}: {val}" for col, val in zip(header, row)]
                    # Keep cells beyond the header width instead of silently
                    # dropping them; they have no column name to pair with.
                    parts.extend(row[len(header):])
                else:
                    parts = row
                docs.append(
                    Document(
                        page_content=", ".join(parts),
                        metadata={"source": file, "row": i + 1},
                    )
                )
    return docs


# Gather the page-level PDF documents followed by the row-level CSV documents.
all_docs = [*read_pdf(path_to_pdf), *read_csv(path_to_csv)]
print(f"Загружено документов: {len(all_docs)}")

# Break each document into overlapping chunks; the metadata is kept on the chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
chunked_docs = splitter.split_documents(all_docs)

Загружено документов: 335919


In [1]:
# Download the ONNX model file from the Hugging Face Hub, open an inference
# session on it, and load the tokenizer from the same repository id.
model_id = "onnx-models/all-MiniLM-L6-v2-onnx"
model_path = hf_hub_download(repo_id=model_id, filename="model.onnx")
session = ort.InferenceSession(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_id)

def _onnx_feed(encoded, input_names):
    """Build the ONNX feed dict, restricted to the inputs the model declares.

    A declared ``token_type_ids`` input that the tokenizer did not produce is
    filled with zeros shaped like ``input_ids`` instead of being passed as
    ``None``. Other missing inputs are left out of the feed.
    """
    feed = {}
    for name in input_names:
        if name in encoded:
            feed[name] = encoded[name]
        elif name == "token_type_ids":
            feed[name] = np.zeros_like(encoded["input_ids"])
    return feed


def _sentence_vector(model_output, attention_mask):
    """Reduce the first model output for a single text to one float32 vector.

    NOTE(review): assumes the first output is either already pooled,
    ``(batch, dim)``, or token-level, ``(batch, seq, dim)`` -- confirm against
    the exported graph. Token-level output is mean-pooled over the positions
    where the attention mask is set; pooled output is returned unchanged.
    """
    vector = np.asarray(model_output)[0]
    if vector.ndim == 2:
        mask = np.asarray(attention_mask)[0].astype("float32")[:, None]
        # Guard the divisor so an all-zero mask cannot divide by zero.
        vector = (vector * mask).sum(axis=0) / max(float(mask.sum()), 1e-9)
    return vector.astype("float32")


def get_embeddings(documents, tokenizer, session):
    """Embed each document with the ONNX model, one document per inference call.

    Args:
        documents: Iterable of objects exposing ``page_content`` (text) and
            ``metadata``.
        tokenizer: Callable tokenizer; invoked with ``return_tensors="np"`` and
            truncation to 512 tokens.
        session: ONNX Runtime inference session (``get_inputs`` and ``run``
            are used).

    Returns:
        tuple: ``(embeddings, texts, metas)``. ``embeddings`` is a float32
        array with one row per document, and ``texts`` / ``metas`` are lists
        aligned with those rows. For empty input the array has shape
        ``(0, 0)`` so it is still two-dimensional.
    """
    # The declared input names do not change between calls; read them once.
    input_names = [node.name for node in session.get_inputs()]

    embeddings, texts, metas = [], [], []
    for doc in documents:
        inputs = tokenizer(doc.page_content, return_tensors="np",
                           padding=True, truncation=True, max_length=512)
        outputs = session.run(None, _onnx_feed(inputs, input_names))
        embeddings.append(_sentence_vector(outputs[0], inputs["attention_mask"]))
        texts.append(doc.page_content)
        metas.append(doc.metadata)

    if not embeddings:
        return np.empty((0, 0), dtype="float32"), texts, metas
    # np.stack raises if the vectors differ in length, rather than silently
    # building a ragged object array.
    return np.stack(embeddings), texts, metas

# Embed every chunk; texts and metas are returned in the same order as the matrix rows.
emb_matrix, texts, metas = get_embeddings(
    documents=chunked_docs,
    tokenizer=tokenizer,
    session=session,
)
print(f"Создано эмбеддингов: {len(emb_matrix)}")

NameError: name 'hf_hub_download' is not defined

In [None]:
# Flat L2 index whose dimensionality is the width of the embedding matrix.
index = faiss.IndexFlatL2(emb_matrix.shape[1])
# Vectors are added in matrix order; search() below uses the returned ids
# to look up texts[i] / metas[i].
index.add(emb_matrix)
print("Размер индекса:", index.ntotal)

def search(query, tokenizer, session, index, texts, metas, top_k=5):
    """Return the stored chunks closest to ``query`` according to ``index``.

    The query is tokenized and embedded the same way as the indexed documents
    (truncation to 512 tokens, same model inputs) so the vectors are comparable.

    Args:
        query: Query text.
        tokenizer: Callable tokenizer; invoked with ``return_tensors="np"``.
        session: ONNX Runtime inference session (``get_inputs`` and ``run``
            are used).
        index: Object with ``search(vectors, k)`` returning
            ``(distances, indices)``, e.g. a FAISS index.
        texts: Chunk texts, positionally aligned with the index rows.
        metas: Chunk metadata, positionally aligned with the index rows.
        top_k: Maximum number of hits to return.

    Returns:
        list[dict]: hits in the order given by the index, each with ``"text"``,
        ``"metadata"`` and ``"distance"`` keys. Fewer than ``top_k`` hits are
        returned when the index holds fewer vectors.
    """
    inputs = tokenizer(query, return_tensors="np", padding=True,
                       truncation=True, max_length=512)

    # Feed exactly the inputs the model declares. A declared token_type_ids
    # input that the tokenizer did not produce is filled with zeros.
    ort_inputs = {}
    for node in session.get_inputs():
        if node.name in inputs:
            ort_inputs[node.name] = inputs[node.name]
        elif node.name == "token_type_ids":
            ort_inputs[node.name] = np.zeros_like(inputs["input_ids"])

    query_emb = np.asarray(session.run(None, ort_inputs)[0])[0]
    # NOTE(review): assumes the first output is (batch, dim) or
    # (batch, seq, dim) -- confirm against the exported graph. Token-level
    # output is mean-pooled over the attended positions.
    if query_emb.ndim == 2:
        mask = np.asarray(inputs["attention_mask"])[0].astype("float32")[:, None]
        query_emb = (query_emb * mask).sum(axis=0) / max(float(mask.sum()), 1e-9)
    query_vectors = np.ascontiguousarray(query_emb[None, :], dtype="float32")

    distances, indices = index.search(query_vectors, top_k)
    results = []
    for dist, i in zip(distances[0], indices[0]):
        # FAISS pads the result with -1 when fewer than top_k vectors exist;
        # without this guard -1 would silently select the last chunk.
        if i < 0:
            continue
        results.append({
            "text": texts[int(i)],
            "metadata": metas[int(i)],
            "distance": float(dist),
        })
    return results