In [1]:
import os
import csv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from huggingface_hub import hf_hub_download
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path_to_pdf = "data/pdf"
path_to_csv = "data/csv"

def read_pdf(pdf_folder):
    docs = []
    for file in os.listdir(pdf_folder):
        if file.endswith(".pdf"):
            path = os.path.join(pdf_folder, file)
            reader = PdfReader(path)
            for i, page in enumerate(reader.pages):
                text = page.extract_text()
                if text:
                    docs.append(
                        Document(
                            page_content=text,
                            metadata={"source": file, "page": i + 1}
                        )
                    )
    return docs


def read_csv(csv_folder):
    docs = []
    for file in os.listdir(csv_folder):
        if file.endswith(".csv"):
            path = os.path.join(csv_folder, file)
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                reader = csv.reader(f)
                header = next(reader, None)  # первая строка (заголовки)
                for i, row in enumerate(reader):
                    # собираем строку вида "col1: val1, col2: val2, ..."
                    if header:
                        text = ", ".join([f"{col}: {val}" for col, val in zip(header, row)])
                    else:
                        text = ", ".join(row)
                    docs.append(
                        Document(
                            page_content=text,
                            metadata={"source": file, "row": i + 1}
                        )
                    )
    return docs


# Load all documents
all_docs = read_pdf(path_to_pdf) + read_csv(path_to_csv)
print(f"Загружено документов: {len(all_docs)}")

# Split on chunks with saving the metadata
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunked_docs = splitter.split_documents(all_docs)

Загружено документов: 335919


In [16]:
def generate_embeddings(docs, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=256):
    model = SentenceTransformer(model_name)

    texts = [doc.page_content for doc in docs]
    metas = [doc.metadata for doc in docs]

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start+batch_size]
        batch_metas = metas[start:start+batch_size]

        batch_embeddings = model.encode(
            batch_texts,
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=False
        )

        # Возвращаем пачку как генератор
        for i, emb in enumerate(batch_embeddings):
            yield {
                "embedding": emb,
                "content": batch_texts[i],
                "metadata": batch_metas[i]
            }

In [18]:
# Создаём FAISS-индекс
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
d = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(d)

texts, metas = [], []

# Генеративная обработка
for i, item in enumerate(generate_embeddings(chunked_docs, batch_size=256)):
    index.add(np.array([item["embedding"]], dtype="float32"))
    texts.append(item["content"])
    metas.append(item["metadata"])

else:
    print(f"Добавлено {i} эмбеддингов в индекс")

print("Размер индекса:", index.ntotal)

Добавлено 524603 эмбеддингов в индекс
Размер индекса: 524604


In [19]:
# Поиск релевантых документов
def search(query, model, index, texts, metas, top_k=5):
    query_emb = model.encode([query], convert_to_numpy=True).astype("float32")
    distances, indices = index.search(query_emb, top_k)
    return [
        {"text": texts[i], "metadata": metas[i], "distance": float(distances[0][j])}
        for j, i in enumerate(indices[0])
    ]


In [20]:
def build_prompt(query, retrieved_docs):
    context = "\n\n".join([doc["text"] for doc in retrieved_docs])
    
    prompt = f"""
You are an expert in SQL query optimization. 
Always use the retrieved context to improve SQL queries. 

Context may include: 
- Database schemas, table structures 
- Existing indexes and constraints 
- Query execution plans or performance notes 

Your responsibilities:
1. Analyze the original SQL query provided by the user.
2. Use the retrieved context to suggest the best possible optimization.
3. Output the optimized SQL query in a code block.
4. Explain why this query is more efficient, referencing the context when relevant.
5. If multiple optimization strategies are possible, pick the best one and briefly mention alternatives.
6. Ensure the optimized query always produces the same results as the original.

Format your answer strictly as follows:

**Optimized Query:**
```sql
-- optimized SQL here

Question: {query} \n
Context: {context}

"""
    return prompt


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot")
model = AutoModelForCausalLM.from_pretrained("codeparrot/codeparrot")

  [2m2025-09-21T20:45:39.096853Z[0m [31mERROR[0m  [31merror invoking get_reconstruction api, error: ReqwestMiddlewareError(Middleware(couldn't get token: TokenRefreshFailure("Error refreshing token: PyErr { type: <class 'requests.exceptions.ConnectionError'>, value: ConnectionError(ProtocolError('Connection aborted.', TimeoutError(60, 'Operation timed out')), '(Request ID: 2833a55e-94c4-4efa-9513-37b03b9c0292)'), traceback: Some(<traceback object at 0x3143dc280>) }"))), [1;31mcaller[0m[31m: "/Users/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:329"[0m
    [2;3mat[0m /Users/runner/work/xet-core/xet-core/error_printer/src/lib.rs:28

  [2m2025-09-21T20:45:49.974798Z[0m [31mERROR[0m  [31mFatal Error: "s3::get_range" api call failed (request id , retry 2): HTTP status client error (403 Forbidden) for url (https://transfer.xethub.hf.co/xorbs/default/259ed03c459a334855ff03650ea9d5c64a83d4519ec9f190b82524a67f76fdb7?X-Xet-Signed-Range=bytes%3D0-67097194&X-Xet-Sessi

In [None]:
def generate_answer(query, retrieved_docs):
    prompt = build_prompt(query, retrieved_docs)

    # Токенизируем входные данные
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    outputs = model.generate(**inputs, max_new_tokens=300)
    
    # Декодируем и возвращаем результат
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
query = "Что такое машинное обучение?"
retrieved = search(query, model, index, texts, metas, top_k=5)
answer = generate_answer(query, retrieved)

print("Ответ:", answer)
