In [9]:
import os
import csv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from huggingface_hub import hf_hub_download
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

In [7]:
path_to_pdf = "data/pdf"
path_to_csv = "data/csv"

def read_pdf(pdf_folder):
    docs = []
    for file in os.listdir(pdf_folder):
        if file.endswith(".pdf"):
            path = os.path.join(pdf_folder, file)
            reader = PdfReader(path)
            for i, page in enumerate(reader.pages):
                text = page.extract_text()
                if text:
                    docs.append(
                        Document(
                            page_content=text,
                            metadata={"source": file, "page": i + 1}
                        )
                    )
    return docs


def read_csv(csv_folder):
    docs = []
    for file in os.listdir(csv_folder):
        if file.endswith(".csv"):
            path = os.path.join(csv_folder, file)
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                reader = csv.reader(f)
                header = next(reader, None)  # первая строка (заголовки)
                for i, row in enumerate(reader):
                    # собираем строку вида "col1: val1, col2: val2, ..."
                    if header:
                        text = ", ".join([f"{col}: {val}" for col, val in zip(header, row)])
                    else:
                        text = ", ".join(row)
                    docs.append(
                        Document(
                            page_content=text,
                            metadata={"source": file, "row": i + 1}
                        )
                    )
    return docs


# Load all documents
all_docs = read_pdf(path_to_pdf) + read_csv(path_to_csv)
print(f"Загружено документов: {len(all_docs)}")

# Split on chunks with saving the metadata
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunked_docs = splitter.split_documents(all_docs)

Загружено документов: 335919


In [None]:
# model_id = "onnx-models/all-MiniLM-L6-v2-onnx"
# model_path = hf_hub_download(model_id, filename="model.onnx")
# session = ort.InferenceSession(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_id)


# def get_embeddings(documents, tokenizer, session, batch_size=128, max_length=512):
#     embeddings, texts, metas = [], [], []

#     required_inputs = [inp.name for inp in session.get_inputs()]
#     print("Модель ожидает входы:", required_inputs)

#     for start in range(0, len(documents), batch_size):
#         batch = documents[start:start+batch_size]

#         texts_batch = [doc.page_content for doc in batch]

#         inputs = tokenizer(
#             texts_batch,
#             return_tensors="np",
#             padding=True,
#             truncation=True,
#             max_length=max_length
#         )

#         # Подбираем только нужные ключи
#         ort_inputs = {k: v for k, v in inputs.items() if k in required_inputs}

#         # Запуск модели
#         outputs = session.run(None, ort_inputs)
#         batch_embeddings = outputs[0].astype("float32")

#         # Сохраняем результаты
#         for i, doc in enumerate(batch):
#             embeddings.append(batch_embeddings[i])
#             texts.append(doc.page_content)
#             metas.append(doc.metadata)

#         print(f"Обработано {start+len(batch)}/{len(documents)} документов")

#     return np.array(embeddings), texts, metas



# emb_matrix, texts, metas = get_embeddings(chunked_docs, tokenizer, session)
# print(f"Создано эмбеддингов: {len(emb_matrix)}")

Модель ожидает входы: ['input_ids', 'attention_mask', 'token_type_ids']
Обработано 128/524604 документов
Обработано 256/524604 документов
Обработано 384/524604 документов
Обработано 512/524604 документов
Обработано 640/524604 документов
Обработано 768/524604 документов
Обработано 896/524604 документов
Обработано 1024/524604 документов
Обработано 1152/524604 документов
Обработано 1280/524604 документов
Обработано 1408/524604 документов
Обработано 1536/524604 документов
Обработано 1664/524604 документов
Обработано 1792/524604 документов
Обработано 1920/524604 документов
Обработано 2048/524604 документов
Обработано 2176/524604 документов
Обработано 2304/524604 документов
Обработано 2432/524604 документов
Обработано 2560/524604 документов
Обработано 2688/524604 документов
Обработано 2816/524604 документов
Обработано 2944/524604 документов
Обработано 3072/524604 документов
Обработано 3200/524604 документов
Обработано 3328/524604 документов
Обработано 3456/524604 документов
Обработано 3584/5

KeyboardInterrupt: 

In [None]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

texts = [doc.page_content for doc in chunked_docs]
metas = [doc.metadata for doc in chunked_docs]

embeddings = model.encode(
    texts,
    batch_size=512,
    show_progress_bar=True,
    convert_to_numpy=True
)

results = [
    {
        "embedding": embeddings[i],
        "content": texts[i],
        "metadata": metas[i]
    }
    for i in range(len(texts))
]

print(f"Создано {len(results)} эмбеддингов")
print("Пример:", results[0])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Batches:  29%|██▉       | 298/1025 [06:51<17:47,  1.47s/it]

In [None]:
emb_matrix = np.array([r["embedding"] for r in results]).astype("float32")
texts = [r["content"] for r in results]
metas = [r["metadata"] for r in results]

index = faiss.IndexFlatL2(emb_matrix.shape[1])
index.add(emb_matrix)
print("Размер индекса:", index.ntotal)


# === 2. Поисковая функция ===
def search(query, model, index, texts, metas, top_k=5):
    # Получаем эмбеддинг запроса
    query_emb = model.encode([query], convert_to_numpy=True).astype("float32")

    # Ищем ближайшие соседи
    distances, indices = index.search(query_emb, top_k)

    # Собираем результаты
    results = [
        {
            "text": texts[i],
            "metadata": metas[i],
            "distance": float(distances[0][j])
        }
        for j, i in enumerate(indices[0])
    ]
    return results
