In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install sentence-transformers PyPDF2 faiss-cpu -U bitsandbytes accelerate

In [None]:
import os
import re
import faiss
import torch
import numpy as np
import unicodedata
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
def extract_text_from_pdf(pdf_path):
    """Read a PDF and return its text after light normalisation.

    The text of every page that yields any is concatenated, one trailing
    newline per page, and then cleaned in a fixed order:

    1. Unicode NFKC normalisation.
    2. Bullet glyphs replaced by a space.
    3. Control characters other than ``\\n`` replaced by a space.
    4. Words hyphenated across a line break re-joined.
    5. Runs of three or more newlines collapsed to a blank line.

    Args:
        pdf_path: Path of the PDF, handed to ``PdfReader``.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    # Pages with no extractable text (None or "") contribute nothing.
    raw = "".join(body + "\n" for body in page_texts if body)

    cleaned = unicodedata.normalize("NFKC", raw)
    # Order matters: control characters must be gone before de-hyphenation
    # and newline collapsing look at the line structure.
    substitutions = (
        (r"[•◦▪▫‣⁃∙◆◇■□●○]", " "),
        (r"[\x00-\x09\x0B-\x1F\x7F]", " "),
        (r"(\w)-\n(\w)", r"\1\2"),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
def chunk_text(text, chunk_size=5, overlap=1, min_chars=50):
    """Split text into overlapping chunks of consecutive non-empty lines.

    Lines are stripped and blank lines dropped; each chunk joins up to
    ``chunk_size`` lines with single spaces, and consecutive chunks share
    ``overlap`` lines.

    Args:
        text: Source text; lines are separated by ``"\\n"``.
        chunk_size: Maximum number of lines per chunk (must be > 0).
        overlap: Number of lines shared by neighbouring chunks
            (must satisfy ``0 <= overlap < chunk_size``).
        min_chars: Chunks shorter than this many characters are discarded.

    Returns:
        List of chunk strings. Chunks without any alphabetic character are
        discarded as well.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would make the window stall
            or move backwards (this previously looped forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    lines = [stripped for stripped in (line.strip() for line in text.split("\n")) if stripped]
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(lines), step):
        end = start + chunk_size
        chunk = " ".join(lines[start:end])
        if len(chunk) >= min_chars and any(c.isalpha() for c in chunk):
            chunks.append(chunk)
        # Once a window reaches the last line, a further window would contain
        # only overlap lines that this chunk already covers.
        if end >= len(lines):
            break
    return chunks

In [None]:
def embed_chunks(chunks, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Embed text chunks with a freshly loaded SentenceTransformer.

    Args:
        chunks: Texts to embed.
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        ``(encoder, embeddings)``: the loaded encoder, so callers can embed
        queries with the same model, and the result of encoding ``chunks``
        with ``convert_to_numpy=True``.
    """
    encoder = SentenceTransformer(model_name)
    return encoder, encoder.encode(chunks, convert_to_numpy=True)

In [None]:
def create_faiss_index(embeddings):
    """Build a flat L2 Faiss index containing ``embeddings``.

    Args:
        embeddings: 2-D array of vectors; its second dimension is used as
            the index dimensionality.

    Returns:
        A ``faiss.IndexFlatL2`` with all rows of ``embeddings`` added.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

In [None]:
def search_index(query, model, index, chunks, k=5):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Question text to embed.
        model: Encoder exposing ``encode(texts, convert_to_numpy=True)``.
        index: Index exposing ``search(vectors, k)`` and returning
            ``(distances, indices)``.
        chunks: Texts in the same order as the vectors stored in ``index``.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunks, in the order the index reported them. An empty
        list when there are no chunks or ``k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks.
    k = min(k, len(chunks))
    if k <= 0:
        return []
    query_embedding = model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, k)
    # Faiss pads missing neighbours with -1; indexing chunks[-1] would
    # silently return the last chunk as a bogus hit, so drop invalid ids.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

In [None]:
def generate_text(prompt, max_new_tokens=80, num_return_sequences=1):
    """Sample chat completions for ``prompt`` from the notebook-level LLM.

    Uses the module-level ``model`` and ``tokenizer``, which must be defined
    before this function is called.

    Args:
        prompt: Text sent as a single user message through the chat template.
        max_new_tokens: Upper bound on generated tokens per completion.
        num_return_sequences: Number of sampled completions to return
            (previously accepted but ignored).

    Returns:
        List of ``num_return_sequences`` decoded completions, each with the
        prompt tokens and special tokens removed.
    """
    device = model.device
    messages = [{"role": "user", "content": prompt}]
    # return_dict=True makes the return type explicit (input_ids plus
    # attention_mask) instead of depending on the library default.
    enc = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)
    input_ids = enc["input_ids"]
    attention_mask = enc["attention_mask"]
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            # NOTE(review): disabling the KV cache slows decoding; presumably
            # done to save GPU memory - confirm it is still required.
            use_cache=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Every returned row starts with the prompt; keep only the new tokens.
    prompt_len = input_ids.shape[-1]
    return [
        tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True)
        for sequence in outputs
    ]

In [None]:
def build_prompt(question, chunks):
    """Assemble the context-grounded prompt sent to the LLM.

    Args:
        question: The user's question, inserted verbatim.
        chunks: Retrieved passages; each is stripped, blank ones are dropped,
            and the rest are joined one per line as the context.

    Returns:
        A prompt with instruction, ``Context:`` and ``Question:`` sections,
        ending with a request for a 2-3 sentence answer.
    """
    usable = [piece for piece in (c.strip() for c in chunks) if piece]
    context_text = "\n".join(usable)

    sections = [
        "Answer the question using only the provided context.\n\n",
        f"Context:\n{context_text}\n\n",
        f"Question:\n{question}\n\n",
        "Answer in 2–3 sentences.",
    ]
    return "".join(sections)

In [None]:
def find_pdf(directory="/kaggle/input"):
    """Return the path of the first PDF found under ``directory``.

    The tree is walked top-down with directories and file names visited in
    sorted order, so the same file is chosen on every run when several PDFs
    are present (plain ``os.walk`` order depends on the filesystem).

    Args:
        directory: Root directory to search.

    Returns:
        Path of the first file whose name ends with ``.pdf``
        (case-insensitive), or ``None`` if there is none.
    """
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into subdirectories in a
        # stable order.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith(".pdf"):
                return os.path.join(root, name)
    return None

In [None]:
# Load the chat LLM and its tokenizer. `model` and `tokenizer` are read as
# globals by generate_text, so these names must not change.
model_name = "mistralai/Mistral-Nemo-Instruct-2407"
# 8-bit quantization has no compute-dtype option (that setting exists only for
# the 4-bit path), so `load_in_8bit=True` is the complete configuration here.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

In [None]:
# Interactive question-answering loop.
# The PDF is located, chunked, embedded and indexed lazily on the first
# question; later questions reuse that state.
pdf_path = None
chunks = []
model_embeddings = None  # the sentence-embedding model returned by embed_chunks
embeddings = None        # chunk vectors produced by that model
index = None
loaded = False

EXIT_COMMANDS = {"exit", "quit"}

print("Please upload your PDF. I'll be here to answer your questions!\n")
print("Type 'exit' or 'quit' to stop.\n")

while True:
    try:
        question = input().strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly.
        break
    if question.lower() in EXIT_COMMANDS:
        break
    if not question:
        # Nothing to retrieve or generate for a blank line.
        continue
    if not loaded:
        pdf_path = find_pdf()
        if pdf_path is None:
            print("No PDF detected. Please upload a PDF so I can help you.")
            continue
        text = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(text, chunk_size=5, overlap=1)
        if not chunks:
            # Without chunks there is nothing to embed, and building the
            # index would fail on an empty embedding array.
            print("No readable text found in the PDF. Please upload a text-based PDF so I can help you.")
            continue
        model_embeddings, embeddings = embed_chunks(chunks)
        index = create_faiss_index(embeddings)
        loaded = True
    top_chunks = search_index(question, model_embeddings, index, chunks, k=3)
    prompt = build_prompt(question, top_chunks)
    llm_outputs = generate_text(prompt, max_new_tokens=50, num_return_sequences=1)
    answer = llm_outputs[0]
    print(f"{answer}\n")