In [1]:
from qdrant_client import QdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = QdrantClient(url="http://localhost:6333")

In [3]:
collection_name = "bank_compliance_v1"

client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "dense": models.VectorParams(
            size=384,
            distance=models.Distance.COSINE,
            hnsw_config=models.HnswConfigDiff(
                m=8,
                ef_construct=100
            )
        )
    },
    sparse_vectors_config={
        "sparse": models.SparseVectorParams(
            modifier=models.Modifier.IDF
        )
    },
    optimizers_config=models.OptimizersConfigDiff(
        indexing_threshold=1000
    )
)

client.create_payload_index(collection_name, "file_name", models.PayloadSchemaType.KEYWORD)
client.create_payload_index(collection_name, "page_label", models.PayloadSchemaType.INTEGER)
client.create_payload_index(collection_name, "article_no", models.PayloadSchemaType.KEYWORD)

UpdateResult(operation_id=6, status=<UpdateStatus.COMPLETED: 'completed'>)

In [4]:
import os
from llama_index.core import SimpleDirectoryReader

def banking_metadata_extractor(file_path):
    """
    Her dosya okunduƒüunda dosya bazlƒ± sabit metadata'larƒ± ekler.
    LlamaIndex sayfa numaralarƒ±nƒ± (page_label) otomatik olarak d√∂k√ºmana ekleyecektir.
    """
    file_name = os.path.basename(file_path)
    
    # Dosya ismine g√∂re d√∂k√ºman ba≈ülƒ±ƒüƒ± atama (Enterprise ETL mantƒ±ƒüƒ±)
    doc_titles = {
        "BBSEBH.pdf": "Bankalarƒ±n Bilgi Sistemleri ve Elektronik Bankacƒ±lƒ±k Hizmetleri Hakkƒ±nda Y√∂netmelik",
        "sir_saklama_yukumlulugu.pdf": "Sƒ±r Niteliƒüindeki Bilgilerin Payla≈üƒ±lmasƒ± Hakkƒ±nda Y√∂netmelik"
    }
    
    return {
        "file_name": file_name,
        "document_title": doc_titles.get(file_name, "Bilinmeyen Mevzuat"),
        "file_path": file_path,
        "category": "Banking Regulation",
        "ingestion_date": "2024-05-22"
    }

# Veriyi Y√ºkleme
reader = SimpleDirectoryReader(
    input_dir="./data", 
    file_metadata=banking_metadata_extractor
)
documents = reader.load_data()

print(documents[0].metadata)

{'page_label': '1', 'file_name': 'BBSEBH.pdf', 'document_title': 'Bankalarƒ±n Bilgi Sistemleri ve Elektronik Bankacƒ±lƒ±k Hizmetleri Hakkƒ±nda Y√∂netmelik', 'file_path': '/home/berk/finreg-navigator/data/BBSEBH.pdf', 'category': 'Banking Regulation', 'ingestion_date': '2024-05-22'}


In [5]:
from llama_index.core.node_parser import HierarchicalNodeParser

node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048,512,128],
    chunk_overlap=20
)

nodes = node_parser.get_nodes_from_documents(documents)

In [6]:
import re

def enrich_metadata_with_article(nodes):
    current_article = "Bilinmiyor"
    for node in nodes:
        # Sadece hiyerar≈üinin en √ºst√ºndeki (b√ºy√ºk) node'larda madde ara
        # veya her node'da ara ama bulamazsan bir √∂nceki maddeyi kullan
        match = re.search(r"MADDE\s+(\d+)", node.text)
        if match:
            current_article = f"Madde {match.group(1)}"
            node.metadata["article_no"] = current_article
        else:
            # Eƒüer bu k√º√ß√ºk par√ßa madde no i√ßermiyorsa, 
            # ait olduƒüu d√∂k√ºmandaki son bulunan maddeyi ata
            node.metadata["article_no"] = current_article
    return nodes

enriched_nodes = enrich_metadata_with_article(nodes)

In [7]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Banking_RAG_Ingestion")

with mlflow.start_run(run_name="Initial_Ingestion_Hybrid"):
    # Parametreleri logla
    mlflow.log_param("chunk_sizes", "[2048, 512, 128]")
    mlflow.log_param("vector_store", "Qdrant")
    mlflow.log_param("collection_name", "banking_compliance_v1")
    
    # Node sayƒ±sƒ±nƒ± logla
    mlflow.log_metric("total_nodes", len(enriched_nodes))
    
    print(f"Toplam {len(enriched_nodes)} node olu≈üturuldu ve MLflow'a loglandƒ±.")

Toplam 1751 node olu≈üturuldu ve MLflow'a loglandƒ±.
üèÉ View run Initial_Ingestion_Hybrid at: http://localhost:5000/#/experiments/1/runs/bfe589ea61ff44628cfd7bd874cfcdb5
üß™ View experiment at: http://localhost:5000/#/experiments/1


In [8]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# 1. Embedding Modelini Tanƒ±mla (Dense)
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Qdrant Store Yapƒ±landƒ±rmasƒ±
vector_store = QdrantVectorStore(
    collection_name="bank_compliance_v1",
    client=client,
    enable_hybrid=True,
    batch_size=64,
    dense_vector_name="dense",
    sparse_vector_name="sparse"
)

# 3. Indexleme ƒ∞≈ülemi
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    enriched_nodes, 
    storage_context=storage_context,
    embed_model=embed_model
)

print("ƒ∞ndeksleme ba≈üarƒ±yla tamamlandƒ±!")

2026-02-11 19:11:33,293 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2026-02-11 19:11:33,577 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
2026-02-11 19:11:33,623 - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json "HTTP/1.1 200 OK"
2026-02-11 19:11:33,772 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
2026-02-11 19:11:33,820 - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json "HTTP/1.1 200 OK"
2026-02-11 19:11:33,967 - INFO - HTTP Request: HEAD https://huggingface.co/sent

ƒ∞ndeksleme ba≈üarƒ±yla tamamlandƒ±!


In [9]:
# Geri getiriciyi (Retriever) yapƒ±landƒ±r
# similarity_top_k=3: En alakalƒ± 3 par√ßayƒ± getir
retriever = index.as_retriever(similarity_top_k=3)

# Test Sorgusu
query = "A√ßƒ±k bankacƒ±lƒ±k servisleri nelerdir ve kimler eri≈üebilir?"
retrieved_nodes = retriever.retrieve(query)

# Sonu√ßlarƒ± ƒ∞nceleme
for i, node in enumerate(retrieved_nodes):
    print(f"\n--- Sonu√ß {i+1} (Skor: {node.score:.4f}) ---")
    print(f"Kaynak: {node.metadata.get('file_name')} | Sayfa: {node.metadata.get('page_label')}")
    print(f"Madde: {node.metadata.get('article_no', 'Belirtilmemi≈ü')}")
    print(f"ƒ∞√ßerik √ñzeti: {node.text[:200]}...")

2026-02-11 19:15:53,709 - INFO - HTTP Request: POST http://localhost:6333/collections/bank_compliance_v1/points/query/batch "HTTP/1.1 200 OK"



--- Sonu√ß 1 (Skor: 0.7274) ---
Kaynak: BBSEBH.pdf | Sayfa: 1
Madde: Madde 1
ƒ∞√ßerik √ñzeti: n) G√ºvenlik duvarƒ±: Farklƒ± g√ºvenlik seviyelerine sahip aƒülar veya aƒüa baƒülƒ± cihazlar arasƒ±ndaki trafik akƒ±≈ü kontrol√ºn√º saƒülayan donanƒ±m ya da
yazƒ±lƒ±mlarƒ±,
o) Hassas veri: Kimlik doƒürulamada kullanƒ±lan...

--- Sonu√ß 2 (Skor: 0.7257) ---
Kaynak: BBSEBH.pdf | Sayfa: 2
Madde: Madde 4
ƒ∞√ßerik √ñzeti: bilgi sistemlerinden sorumlu √ºst d√ºzey y√∂netici ile bankanƒ±n ilgili i≈ü
birimlerinden √ºst d√ºzey y√∂neticilerin bu komiteye √ºye olmasƒ± esastƒ±r....

--- Sonu√ß 3 (Skor: 0.7224) ---
Kaynak: sir_saklama_yukumlulugu.pdf | Sayfa: 1
Madde: Madde 1
ƒ∞√ßerik √ñzeti: c) Ana sermaye: 5/9/2013 tarihli ve 28756 sayƒ±lƒ± Resm√Æ Gazete‚Äôde yayƒ±mlanan Bankalarƒ±n √ñzkaynaklarƒ±na
ƒ∞li≈ükin Y√∂netmelikte belirlenen usul ve esaslar √ßer√ßevesinde hesaplanacak ana sermayeyi,
√ß) Anonim...
