In [3]:
import pdfplumber
import re
import os
import uuid
import json
import numpy as np
from sentence_transformers import SentenceTransformer

# ----------------------------------
# 1. CONFIG
# ----------------------------------
pdf_files = [
    "/content/10-Q4-2024-As-Filed.pdf",
    "/content/tsla-20231231-gen.pdf"
]

chunk_size = 500
overlap = 50

model = SentenceTransformer("all-MiniLM-L6-v2")

# ----------------------------------
# 2. CLEAN TEXT
# ----------------------------------
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# ----------------------------------
# 3. SECTION DETECTION
# ----------------------------------
def detect_section(text):
    lines = text.split("\n")
    for line in lines:
        if line.isupper() and len(line.split()) < 10:
            return line.strip()
        if re.match(r'^\d+(\.\d+)*\s+', line):
            return line.strip()
    return "Unknown"

# ----------------------------------
# 4. CHUNKING
# ----------------------------------
def chunk_text(text):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

# ----------------------------------
# 5. BUILD RAG DOCUMENTS
# ----------------------------------
rag_documents = []

for pdf_path in pdf_files:
    document_name = os.path.basename(pdf_path)

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            section = detect_section(text)
            text = clean_text(text)
            chunks = chunk_text(text)

            for i, chunk in enumerate(chunks):

                embedding = model.encode(chunk).tolist()

                record = {
                    "id": f"{document_name}_page{page_number}_chunk{i}",
                    "content": chunk,
                    "embedding": embedding,
                    "metadata": {
                        "document": document_name,
                        "page": page_number,
                        "section": section
                    }
                }

                rag_documents.append(record)

print(f"Total RAG records created: {len(rag_documents)}")

# ----------------------------------
# 6. SAVE AS JSON (RAG-ready)
# ----------------------------------
with open("rag_documents.json", "w", encoding="utf-8") as f:
    json.dump(rag_documents, f, indent=2)

print("RAG-ready JSON file created.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Total RAG records created: 2006
RAG-ready JSON file created.


In [4]:
import json

with open("rag_documents.json", "r", encoding="utf-8") as f:
    rag_data = json.load(f)

# Display the content. For very large files, consider displaying only a sample.
print(json.dumps(rag_data, indent=2))

In [5]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# 1. LOAD RAG DOCUMENTS
# -----------------------------
with open("rag_documents.json", "r", encoding="utf-8") as f:
    rag_documents = json.load(f)

model = SentenceTransformer("all-MiniLM-L6-v2")

# Extract embeddings + metadata
embeddings = np.array([doc["embedding"] for doc in rag_documents])
contents = [doc["content"] for doc in rag_documents]
metadata = [doc["metadata"] for doc in rag_documents]


# -----------------------------
# 2. RETRIEVAL FUNCTION
# -----------------------------
def retrieve(query, top_k=5):
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    top_indices = similarities.argsort()[-top_k:][::-1]

    results = []
    for idx in top_indices:
        results.append({
            "content": contents[idx],
            "metadata": metadata[idx],
            "score": float(similarities[idx])
        })

    return results


# -----------------------------
# 3. SIMPLE ANSWER GENERATOR
# (Replace with LLM if needed)
# -----------------------------
SIMILARITY_THRESHOLD = 0.35

def generate_answer(question):
    retrieved = retrieve(question, top_k=5)

    # If top match score too low → no answer
    if not retrieved or retrieved[0]["score"] < SIMILARITY_THRESHOLD:
        return {
            "answer": "This question cannot be answered based on the provided documents.",
            "sources": []
        }

    # Combine retrieved context
    context = "\n".join([r["content"] for r in retrieved])

    # For now: returning top context as answer
    # (Replace with LLM call in real RAG system)
    answer_text = context[:1000]  # truncate if needed

    # Build source list
    sources = []
    seen = set()

    for r in retrieved:
        source_key = (r["metadata"]["document"], r["metadata"]["page"])
        if source_key not in seen:
            seen.add(source_key)
            sources.append({
                "document": r["metadata"]["document"],
                "page": r["metadata"]["page"]
            })

    return {
        "answer": answer_text,
        "sources": sources
    }


# -----------------------------
# 4. PROCESS MULTIPLE QUESTIONS
# -----------------------------
questions = [
    {"question_id": 11, "question": "What is the total revenue mentioned?"},
    {"question_id": 12, "question": "Explain the risk mitigation policy."}
]

output = []

for q in questions:
    result = generate_answer(q["question"])

    output.append({
        "question_id": q["question_id"],
        "answer": result["answer"],
        "sources": result["sources"]
    })

# -----------------------------
# 5. FINAL OUTPUT FORMAT
# -----------------------------
print(json.dumps(output, indent=2))


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


[
  {
    "question_id": 11,
    "answer": "Revenue Recognition Revenue by source The following table disaggregates our revenue by major source (in millions): Year Ended December 31, 2023 2022 2021 Automotive sales $ 78,509 $ 67,210 $ 44,125 Automotive regulatory credits 1,790 1,776 1,465 Energy generation and storage sales 5,515 3,376 2,279 Services and other 8,319 6,091 3,802 Total revenues from sales and services 94,133 78,453 51,671 Automotive leasing 2,120 2,476 1,642 Energy generation and storage leasing 520 533 510 Total revenues $\nfollowing table presents revenues and gross profit by reportable segment (in millions): Year Ended December 31, 2023 2022 2021 Automotive segment Revenues $ 90,738 $ 77,553 $ 51,034 Gross profit $ 16,519 $ 20,565 $ 13,735 Energy generation and storage segment Revenues $ 6,035 $ 3,909 $ 2,789 Gross profit $ 1,141 $ 288 $ (129) The following table presents revenues by geographic area based on the sales location of our products (in millions): Year Ended

In [1]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.4.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
pip install sentence_transformers

