In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
!pip install langchain chromadb pypdf torch sentence-transformers transformers

Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.83-py3-none-any.whl.metadata (3.2 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.9 (from langchain)
  Downloading langchain_text_splitters-0.3.11-py3-none-any.whl.metadata (1.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.1-py3-none

In [17]:
# --------------------------
# 2. Imports
# --------------------------
import torch
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.base import Embeddings
from transformers import AutoTokenizer, AutoModel, pipeline

In [18]:
# --------------------------
# 3. Load PDF
# --------------------------
pdf_path = "/kaggle/input/attention-is-all-you-need/Attention Is All You Need.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [19]:
# --------------------------
# 4. Split text into chunks
# --------------------------
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)
docs = text_splitter.split_documents(documents)

In [20]:
# --------------------------
# 5. Load BGE-M3 model for embeddings
# --------------------------
print("Loading BGE-M3 model for embeddings...")
model_name = "BAAI/bge-m3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

device = 0 if torch.cuda.is_available() else -1
if device == 0:
    model = model.cuda()

def embed_text(text: str):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    if device == 0:
        inputs = {k:v.cuda() for k,v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over last hidden state
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

Loading BGE-M3 model for embeddings...


In [21]:
# --------------------------
# 6. Create an embedding wrapper for LangChain
# --------------------------
class BGEEmbeddings(Embeddings):
    def embed_documents(self, texts):
        return [embed_text(t) for t in texts]

    def embed_query(self, text):
        return embed_text(text)

embedding_model = BGEEmbeddings()

In [22]:
# --------------------------
# 7. Create Chroma vector store
# --------------------------
print("Creating ChromaDB vector store...")
vectorstore = Chroma.from_texts(
    texts=[doc.page_content for doc in docs],
    embedding=embedding_model,
    persist_directory="/kaggle/working/chroma_pdf_db"
)
vectorstore.persist()

Creating ChromaDB vector store...


  vectorstore.persist()


In [36]:
# --------------------------
# 8. Setup HuggingFace LLM for QA
# --------------------------
print("Loading HuggingFace LLM for QA...")
device_hf = 0 if torch.cuda.is_available() else -1
hf_pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=device_hf,
   
)
llm = HuggingFacePipeline(pipeline=hf_pipe)

Loading HuggingFace LLM for QA...


Device set to use cuda:0


In [43]:
# --------------------------
# 9. Setup RetrievalQA
# --------------------------
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever()
)

In [44]:
# --------------------------
# 10. Ask questions
# --------------------------
queries = ["What is the main idea of this paper?"]

for q in queries:
    print("Q:", q)
    print("A:", qa.run(q))
    print("="*10)

Q: What is the main idea of this paper?
A: The Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output.


In [45]:
# --------------------------
# 10. Ask questions
# --------------------------
queries = ["explain transformers?"]

for q in queries:
    print("Q:", q)
    print("A:", qa.run(q))
    print("="*10)

Q: explain transformers?
A: The Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention
