<a href="https://www.kaggle.com/code/shedai/llm-langchain-rag-soru-cevap?scriptVersionId=249989985" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only Kaggle input directory,
# then print one full path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pdf-files/TUBA_Responsible_AI_Climate.pdf
/kaggle/input/pdf-files/TUBA GTT Conference  Program.pdf (1).pdf
/kaggle/input/pdf-files/Product Recommendation System with Machine Learning Algorithms for SME Banking.pdf
/kaggle/input/pdf-files/Editorial.pdf


In [2]:
!pip install langchain langchain-community pypdf sentence-transformers faiss-cpu transformers torch accelerate


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9

In [3]:
# Adım 1: Gerekli kütüphaneleri yükleyin
# Bu hücreyi çalıştırmak için not defteri ayarlarından internetin açık olduğundan emin olun.
#!pip install langchain langchain-community pypdf sentence-transformers faiss-cpu transformers torch accelerate

import os
import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Configure logging: INFO level, timestamped "time - level - message" records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used throughout this cell.
logger = logging.getLogger(__name__)

class KaggleRAGSystem:
    """Retrieval-augmented question answering over a directory of PDF files.

    The pipeline is: discover PDFs -> load pages -> split pages into chunks ->
    embed the chunks -> index them in FAISS -> combine the index with a local
    Hugging Face seq2seq model in a LangChain ``RetrievalQA`` chain.

    Every attribute starts out empty/``None`` and is filled in by the matching
    setup step; ``run_setup`` executes all steps in order.
    """

    def __init__(self):
        self.documents = []        # pages loaded by load_documents()
        self.document_chunks = []  # chunks produced by split_documents()
        self.vector_store = None   # FAISS index built by create_vector_store()
        self.embeddings = None     # embedding model set by setup_embeddings()
        self.llm = None            # LangChain LLM wrapper set by setup_local_llm()
        self.qa_chain = None       # RetrievalQA chain set by setup_qa_chain()

    def load_pdfs_from_directory(self, directory_path):
        """Return the paths of all PDF files directly inside ``directory_path``.

        This replaces the ``files.upload()`` step used on Google Colab.

        Args:
            directory_path: Directory to scan (not recursive).

        Returns:
            A sorted list of full paths whose names end in ``.pdf``
            (case-insensitive); an empty list when none are found.

        Raises:
            FileNotFoundError: If ``directory_path`` does not exist.
            Exception: Any other error while listing the directory is logged
                and re-raised.
        """
        try:
            logger.info(f"'{directory_path}' dizininde PDF'ler aranıyor...")
            if not os.path.exists(directory_path):
                raise FileNotFoundError(f"Dizin bulunamadı: '{directory_path}'. Lütfen veri setini eklediğinizden ve yolun doğru olduğundan emin olun.")

            # os.listdir() returns entries in arbitrary order; sort so that the
            # documents (and therefore the index) are built reproducibly.
            pdf_paths = sorted(
                os.path.join(directory_path, filename)
                for filename in os.listdir(directory_path)
                if filename.lower().endswith(".pdf")
            )

            if not pdf_paths:
                logger.warning(f"'{directory_path}' dizininde hiç PDF dosyası bulunamadı.")
                return []

            logger.info(f"{len(pdf_paths)} adet PDF bulundu: {pdf_paths}")
            return pdf_paths
        except Exception as e:
            logger.error(f"'{directory_path}' dizininden okuma sırasında hata: {e}")
            raise

    def load_documents(self, pdf_paths):
        """Parse each PDF and append its pages to ``self.documents``.

        Loading is best-effort: a file that fails to load is logged and
        skipped so that the remaining files are still processed.

        Args:
            pdf_paths: Iterable of PDF file paths.
        """
        for pdf_path in pdf_paths:
            try:
                loader = PyPDFLoader(pdf_path)
                documents = loader.load()
                self.documents.extend(documents)
                logger.info(f"'{pdf_path}' dosyasından {len(documents)} sayfa yüklendi.")
            except Exception as e:
                logger.error(f"'{pdf_path}' yüklenirken hata: {e}")
        if self.documents:
            logger.info(f"Toplamda {len(self.documents)} sayfa doküman yüklendi.")

    def split_documents(self, chunk_size=1000, chunk_overlap=200):
        """Split the loaded documents into smaller chunks.

        The result is stored in ``self.document_chunks``. When no documents
        have been loaded, a warning is logged and nothing is changed.

        Args:
            chunk_size: Passed to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.

        Raises:
            Exception: Any splitter error is logged and re-raised.
        """
        if not self.documents:
            logger.warning("Bölünecek doküman bulunmuyor. Bu adım atlanıyor.")
            return
        try:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
            )
            self.document_chunks = text_splitter.split_documents(self.documents)
            logger.info(f"Dokümanlar {len(self.document_chunks)} parçaya bölündü.")
        except Exception as e:
            logger.error(f"Dokümanları bölerken hata: {e}")
            raise

    def setup_embeddings(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Create the embedding model and store it in ``self.embeddings``.

        Args:
            model_name: Model identifier passed to ``HuggingFaceEmbeddings``.

        Raises:
            Exception: Any construction error is logged and re-raised.
        """
        try:
            self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
            logger.info(f"'{model_name}' gömme modeli kuruldu.")
        except Exception as e:
            logger.error(f"Gömme modeli kurulurken hata: {e}")
            raise

    def create_vector_store(self):
        """Build a FAISS vector store from ``self.document_chunks``.

        When there are no chunks (for example because no PDF could be
        loaded), a warning is logged and ``self.vector_store`` stays ``None``.

        Raises:
            Exception: Any indexing error is logged and re-raised.
        """
        if not self.document_chunks:
            logger.warning("Vektör deposu oluşturmak için belge parçası bulunmuyor. Bu adım atlanıyor.")
            return
        try:
            self.vector_store = FAISS.from_documents(self.document_chunks, self.embeddings)
            logger.info("FAISS vektör deposu oluşturuldu.")
        except Exception as e:
            logger.error(f"Vektör deposu oluşturulurken hata: {e}")
            raise

    def setup_local_llm(self, model_id="google/flan-t5-base", device="auto"):
        """Load a local Hugging Face seq2seq model and wrap it for LangChain.

        Args:
            model_id: Model identifier for the tokenizer and the model.
            device: Forwarded as ``device_map`` to ``from_pretrained``; per the
                original author's note, "auto" uses the GPU on Kaggle when one
                is available.

        Raises:
            Exception: Any loading error is logged and re-raised.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=device)

            # Sampling is enabled, so answers can differ between runs.
            pipe = pipeline(
                "text2text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True
            )

            self.llm = HuggingFacePipeline(pipeline=pipe)
            logger.info(f"'{model_id}' yerel LLM'i kuruldu.")
        except Exception as e:
            logger.error(f"Yerel LLM kurulurken hata: {e}")
            raise

    def setup_qa_chain(self, k=3):
        """Create the question-answering chain from the vector store and LLM.

        When either the vector store or the LLM is missing, a warning is
        logged and ``self.qa_chain`` stays ``None``.

        Args:
            k: Number of chunks the retriever is asked to return per query.

        Raises:
            Exception: Any chain construction error is logged and re-raised.
        """
        # Explicit None checks: do not rely on the truthiness of third-party objects.
        if self.vector_store is None or self.llm is None:
            logger.warning("Vektör deposu veya LLM hazır değil. QA zinciri oluşturulamıyor.")
            return
        try:
            self.qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.vector_store.as_retriever(search_kwargs={"k": k})
            )
            logger.info(f"Soru-Cevap zinciri k={k} ile kuruldu.")
        except Exception as e:
            logger.error(f"Soru-Cevap zinciri kurulurken hata: {e}")
            raise

    def answer_question(self, question):
        """Answer ``question`` with the RAG chain.

        Args:
            question: The question text.

        Returns:
            The chain's answer, or a fixed "not ready" message when the chain
            has not been set up.

        Raises:
            Exception: Any error raised by the chain is logged and re-raised.
        """
        if self.qa_chain is None:
            return "Soru-Cevap sistemi henüz hazır değil. Lütfen kurulumu tamamlayın."
        try:
            logger.info(f"Soru soruluyor: {question}")
            # Chain.run() is deprecated; invoke() takes the chain's input key
            # ("query") and returns a dict whose "result" entry is the answer.
            response = self.qa_chain.invoke({"query": question})
            answer = response["result"]
            logger.info(f"Cevap alındı: {answer}")
            return answer
        except Exception as e:
            logger.error(f"Soru yanıtlarken hata: {e}")
            raise

    def run_setup(self, pdf_directory, chunk_size=1000, chunk_overlap=200, model_id="google/flan-t5-base", k=3):
        """Run the whole setup pipeline.

        Args:
            pdf_directory: Directory containing the PDF files.
            chunk_size: Chunk size used when splitting documents.
            chunk_overlap: Overlap used when splitting documents.
            model_id: Hugging Face model identifier for the local LLM.
            k: Number of chunks retrieved per question.

        Returns:
            ``True`` only when the question-answering chain was actually
            built; ``False`` when no PDF was found, no page could be loaded,
            the chain could not be created, or any step raised.
        """
        try:
            pdf_paths = self.load_pdfs_from_directory(pdf_directory)
            if not pdf_paths:
                logger.error("Kurulum durduruldu çünkü hiç PDF dosyası bulunamadı.")
                return False

            self.load_documents(pdf_paths)
            if not self.documents:
                # Nothing to index: stop before downloading the models.
                logger.error("Kurulum durduruldu çünkü hiçbir PDF dosyasından içerik yüklenemedi.")
                return False

            self.split_documents(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            self.setup_embeddings()
            self.create_vector_store()
            self.setup_local_llm(model_id=model_id)
            self.setup_qa_chain(k=k)

            # setup_qa_chain() only warns when a prerequisite is missing, so
            # verify the chain exists before reporting success.
            if self.qa_chain is None:
                logger.error("Kurulum tamamlanamadı: Soru-Cevap zinciri oluşturulamadı.")
                return False

            logger.info("RAG sistemi kullanıma hazır!")
            return True
        except Exception as e:
            logger.error(f"Kurulum sırasında genel bir hata oluştu: {e}")
            return False

# --- USER SETTINGS ---
# 1. Add your PDF files as a Kaggle Dataset: click "+ Add data" in the right-hand pane.
# 2. Point the `pdf_directory` variable below at your dataset's path.
#    The path has the form "/kaggle/input/your-dataset-name/".
pdf_directory = "/kaggle/input/pdf-files"  # <-- CHANGE THIS PATH

# --- RUN THE SYSTEM ---
rag = KaggleRAGSystem()
# Build the whole pipeline from the PDFs in the configured directory.
setup_successful = rag.run_setup(
    pdf_directory,
    chunk_size=1000,
    chunk_overlap=200,
    model_id="google/flan-t5-base",
    k=3,
)

# --- ASK QUESTIONS ---
if not setup_successful:
    # Setup failed: point the user at the log output above.
    print("\n--- Kurulum Başarısız Oldu ---")
    print("Lütfen yukarıdaki log mesajlarını kontrol edin. 'pdf_directory' yolunun doğru olduğundan ve veri setinizin PDF dosyaları içerdiğinden emin olun.")
else:
    print("\n--- Sistem Hazır, Sorularınızı Sorabilirsiniz ---")

    # First sample question.
    question1 = "What is the main topic of these documents?"
    answer1 = rag.answer_question(question1)
    print(f"\nSoru: {question1}\nCevap: {answer1}")

    # Second sample question.
    question2 = "Summarize the key points from the documents."
    answer2 = rag.answer_question(question2)
    print(f"\nSoru: {question2}\nCevap: {answer2}")


2025-07-11 15:50:03.255512: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752249003.642271      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752249003.755087      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  self.embeddings = HuggingFaceEmbeddings(model_name=model_name)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
  self.llm = HuggingFacePipeline(pipeline=pipe)
  answer = self.qa_chain.run(question)
Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors



--- Sistem Hazır, Sorularınızı Sorabilirsiniz ---

Soru: What is the main topic of these documents?
Cevap: Science-20 conferences, “Science for Global Transformation”, scheduled to be held in Brazil in early 2024

Soru: Summarize the key points from the documents.
Cevap: A case study for presenting Bank recommender systems based on bon card transaction data.
