In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install faiss-cpu langchain langchain-community langchain-core -qU langchain-huggingface

In [None]:
import warnings
# Suppress warnings
warnings.filterwarnings('ignore')
import re
import torch
import unicodedata
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
def extract_text_from_pdf(pdf_path):
    """Load a PDF and return its documents with bullet glyphs stripped out.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        A list of new ``Document`` objects, one for each document the loader
        produced, holding the de-bulleted text and the loader's metadata.
    """
    # Translation table whose third argument lists the characters to delete.
    strip_bullets = str.maketrans('', '', '•◦▪▫‣⁃∙◆◇■□●○')
    loaded = PyPDFLoader(pdf_path).load()
    return [
        Document(
            page_content=item.page_content.translate(strip_bullets),
            metadata=item.metadata,
        )
        for item in loaded
    ]

In [None]:
def chunk(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping pieces and drop blank lines from each piece.

    Args:
        documents: Documents to split.
        chunk_size: Maximum piece length, measured with ``len``.
        chunk_overlap: Amount of overlap requested between neighbouring pieces.

    Returns:
        The list of pieces produced by the splitter, each with its
        whitespace-only lines removed from ``page_content``.
    """
    # Separator candidates handed to the splitter, from paragraph break
    # down to the empty string.
    boundaries = ["\n\n", "\n", ". ", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        separators=boundaries,
        length_function=len,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = splitter.split_documents(documents)
    for piece in pieces:
        # str.strip returns '' (falsy) for whitespace-only lines, so filter drops them.
        kept_lines = filter(str.strip, piece.page_content.split('\n'))
        piece.page_content = '\n'.join(kept_lines)
    return pieces

In [None]:
def create_faiss_index(chunks, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed the given chunks and store them in a FAISS vector index.

    Args:
        chunks: Non-empty list of documents to embed and index.
        embedding_model_name: Name of the sentence-embedding model passed to
            ``HuggingFaceEmbeddings``. Defaults to the model this notebook
            has always used.

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty, for example when the PDF has no
            extractable text layer.
    """
    # Fail early with a readable message; an empty list would otherwise only
    # surface as an obscure error from inside the vector-store construction.
    if not chunks:
        raise ValueError(
            "Cannot build a FAISS index from zero chunks; "
            "the document produced no extractable text."
        )
    embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)
    return FAISS.from_documents(chunks, embedding)

In [None]:
def search_index(query, index, k=3):
    """Return the text of the ``k`` best matches for ``query`` as one string.

    Args:
        query: Text to search for.
        index: Object exposing ``similarity_search(query, k)``.
        k: Number of matches to request.

    Returns:
        The ``page_content`` of every match, in the order returned by the
        index, separated by blank lines.
    """
    hits = index.similarity_search(query, k)
    passages = []
    for hit in hits:
        passages.append(hit.page_content)
    return "\n\n".join(passages)

In [None]:
def generate_text(prompt, max_new_tokens=100, num_return_sequences=1):
    """Sample one or more chat completions for ``prompt`` from the global model.

    Relies on the module-level ``model`` and ``tokenizer`` objects.

    Args:
        prompt: User message; it is wrapped as a single ``user`` turn.
        max_new_tokens: Upper bound on the number of tokens generated.
        num_return_sequences: How many independent samples to draw.

    Returns:
        A list with ``num_return_sequences`` decoded completions. Only the
        newly generated tokens are decoded, not the prompt.
    """
    device = model.device
    messages = [{"role": "user", "content": prompt}]
    # return_dict=True asks for a mapping with input_ids (and the tokenizer's own
    # attention mask) instead of relying on the default return type, which is a
    # bare tensor in some transformers releases and a mapping in others.
    enc = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)
    input_ids = enc["input_ids"]
    attention_mask = enc.get("attention_mask")
    if attention_mask is None:
        # Single unpadded prompt: every position is a real token.
        attention_mask = torch.ones_like(input_ids)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            # Previously accepted by this function but never forwarded.
            num_return_sequences=num_return_sequences,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Each output row starts with the prompt tokens; slice them off before decoding.
    prompt_length = input_ids.shape[-1]
    return [
        tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        for sequence in outputs
    ]

In [None]:
def build_prompt(question, chunks):
    """Assemble the instruction, retrieved context and question into one prompt.

    Args:
        question: The user's question, inserted verbatim.
        chunks: Iterable of context strings; each is stripped and entries
            that are empty after stripping are skipped.

    Returns:
        The prompt string: instruction, context block, question block and a
        closing length hint.
    """
    stripped = (piece.strip() for piece in chunks)
    context_text = "\n".join(piece for piece in stripped if piece)

    sections = (
        "Answer the question using only the provided context.\n\n",
        f"Context:\n{context_text}\n\n",
        f"Question:\n{question}\n\n",
        "Answer in 2–3 sentences.",
    )
    return "".join(sections)

In [None]:
def find_pdf(directory="/kaggle/input"):
    """Return the path of the first PDF found under ``directory``, or ``None``.

    The tree is traversed with ``os.walk``; a file counts as a PDF when its
    name ends in ``.pdf``, compared case-insensitively.

    Args:
        directory: Root folder to search.

    Returns:
        The joined path of the first matching file, or ``None`` when the
        walk finds no match.
    """
    # Lazy generator: the walk stops as soon as the first match is pulled.
    pdf_paths = (
        os.path.join(folder, name)
        for folder, _, names in os.walk(directory)
        for name in names
        if name.lower().endswith(".pdf")
    )
    return next(pdf_paths, None)

In [None]:
# Hugging Face checkpoint used for answer generation.
model_name = "mistralai/Mistral-Nemo-Instruct-2407"

# Module-level handles: generate_text() reads `tokenizer` and `model` as globals.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Weights are requested in float16, with placement delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", dtype=torch.float16)

In [None]:
# Chat-style console front end. `last_speaker` records who produced the most
# recent line ("ai" or "user") so that a blank separator line is printed only
# when the speaker changes.
print("Hi, I'll be here to answer your questions!")
last_speaker = "ai"

# Look for a PDF under find_pdf()'s default search directory.
pdf_path = find_pdf()

# Keep asking until find_pdf() returns a path.
while pdf_path is None:
    if last_speaker != "ai":
        print()
    print("Please upload the pdf and notify me with 'done'.")
    last_speaker = "ai"
    # Block until the user types "done" (case-insensitive, surrounding
    # whitespace ignored); anything else triggers a reminder.
    while True:
        if last_speaker != "user":
            print()
        answer = input(">").strip().lower()
        last_speaker = "user"
        if answer == "done":
            break
        # last_speaker was set to "user" just above, so this check always passes here.
        if last_speaker != "ai":
            print()
        print("Please ensure on me with 'done'.")
        last_speaker = "ai"
    # Re-scan now that the user reports the upload as finished.
    pdf_path = find_pdf()
    if pdf_path is None:
        if last_speaker != "ai":
            print()
        # end='' leaves the cursor on this line, so the "Please upload..." prompt
        # printed by the next loop iteration continues the same line.
        print("Didn't find any pdf. ", end='')
        last_speaker = "ai"

# Build the retrieval index for the PDF located by the wait loop above.
if last_speaker != "ai":
    print()
print("Processing your document...")
last_speaker = "ai"
# pdf_path is already a valid path here (the loop above only exits once
# find_pdf() returned one), so the directory is not scanned a second time;
# a re-scan could yield a different file, or None, than the one just found.
documents = extract_text_from_pdf(pdf_path)
chunks = chunk(documents)
index = create_faiss_index(chunks)
# The assistant was also the last speaker, so no separator line is needed.
print("Document processed, ask me anything :)")
last_speaker = "ai"

# Question/answer loop: read a question, retrieve matching context, generate
# an answer and print it. Typing "exit" or "quit", closing stdin, or
# interrupting the prompt ends the session instead of looping forever.
while True:
    if last_speaker != "user":
        print()
    try:
        question = input(">").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input is coming (or the user interrupted the prompt):
        # leave the loop without a traceback.
        print()
        break
    last_speaker = "user"
    if not question:
        # Nothing was typed; prompt again rather than running retrieval and
        # generation on an empty string.
        continue
    if question.lower() in ("exit", "quit"):
        print()
        print("Goodbye!")
        last_speaker = "ai"
        break
    context = search_index(question, index, k=2)
    # search_index returns one joined string; build_prompt expects a list of strings.
    prompt = build_prompt(question, [context])
    llm_outputs = generate_text(prompt, max_new_tokens=300, num_return_sequences=1)
    answer = llm_outputs[0]
    if last_speaker != "ai":
        print()
    print(f"{answer}")
    last_speaker = "ai"