In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below /kaggle/input and print its full path,
# one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install sentence-transformers PyPDF2 faiss-cpu nltk

In [None]:
import faiss
import torch
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    The text of each page is followed by one newline, in page order.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        str: Concatenated page texts. A page for which ``extract_text()``
        yields ``None`` or an empty string contributes only its trailing
        newline instead of raising ``TypeError``.
    """
    reader = PdfReader(pdf_path)
    # `or ""` guards against pages with no extractable text; joining once
    # avoids rebuilding the accumulated string on every page.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

In [None]:
def chunk_text(text, chunk_size=3, overlap=1):
    """Split text into overlapping chunks of consecutive non-blank lines.

    The text is split on newlines, each line is stripped, and blank lines are
    dropped. Chunks are then taken every ``chunk_size - overlap`` lines; each
    chunk joins up to ``chunk_size`` lines with single spaces. The last
    chunk(s) may be shorter than ``chunk_size``.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of lines per chunk (must be >= 1).
        overlap: Number of lines shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        list[str]: The chunks in document order; empty if ``text`` has no
        non-blank lines.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` are out of range. An
            overlap that is not smaller than the chunk size would otherwise
            make the window never advance (an endless loop).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    lines = [stripped for stripped in (raw.strip() for raw in text.split("\n")) if stripped]
    step = chunk_size - overlap  # guaranteed positive by the checks above
    return [
        " ".join(lines[start:start + chunk_size])
        for start in range(0, len(lines), step)
    ]

In [None]:
def embed_chunks(chunks, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Load a sentence-embedding model and encode all chunks with it.

    Args:
        chunks: Text chunks to embed; passed as-is to ``encode``.
        model_name: Identifier given to ``SentenceTransformer``.

    Returns:
        tuple: ``(encoder, vectors)`` where ``encoder`` is the loaded model
        and ``vectors`` is what ``encoder.encode(chunks,
        convert_to_numpy=True)`` returns.
    """
    encoder = SentenceTransformer(model_name)
    return encoder, encoder.encode(chunks, convert_to_numpy=True)

In [None]:
def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index holding the given embeddings.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` with all rows added.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example the 1-D empty
            array produced when there is nothing to embed).
    """
    # FAISS expects C-contiguous float32 matrices; coerce so float64 or
    # non-contiguous input does not fail inside the native call.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim), got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

In [None]:
def search_index(query, model, index, chunks, k=5):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: Question text to embed.
        model: Embedding model exposing ``encode(list, convert_to_numpy=True)``.
        index: Index exposing ``search(query_embedding, k)`` and returning
            ``(distances, indices)``.
        chunks: Chunk texts; position ``i`` corresponds to index id ``i``.
        k: Number of neighbours requested from the index.

    Returns:
        list[str]: Matching chunks in the order the index returned them.
        Fewer than ``k`` items are returned when the index holds fewer
        vectors than requested.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)
    # FAISS fills missing neighbours with -1; indexing chunks[-1] would
    # silently return the last chunk, so keep only ids that are valid positions.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

In [None]:
def generate_text(prompt, max_new_tokens=100, num_return_sequences=1):
    """Sample completions for ``prompt`` from the module-level language model.

    Uses the globals ``model`` and ``tokenizer``. Sampling is enabled with
    ``top_k=50``, ``top_p=0.95`` and ``temperature=0.7``; the EOS token id is
    used as the padding id.

    Args:
        prompt: Full prompt text to feed the model.
        max_new_tokens: Upper bound on generated tokens per sequence.
        num_return_sequences: How many completions to sample.

    Returns:
        list[str]: One decoded completion per returned sequence, with the
        prompt tokens and special tokens removed.
    """
    target_device = next(model.parameters()).device
    encoded = tokenizer(prompt, return_tensors="pt").to(target_device)
    # Each returned sequence starts with the prompt tokens; remember how many
    # there are so they can be sliced off before decoding.
    prompt_length = encoded["input_ids"].shape[-1]

    sampling_options = dict(
        max_new_tokens=max_new_tokens,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    sequences = model.generate(**encoded, **sampling_options)

    return [
        tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        for sequence in sequences
    ]

In [None]:
def build_prompt(question, chunks):
    """Assemble the prompt that combines retrieved context with the question.

    Args:
        question: The user's question; inserted after ``Question: ``.
        chunks: Retrieved text chunks; joined with newlines under ``Context:``.

    Returns:
        str: Instruction preamble, context section, question line and a
        closing instruction, in that order.
    """
    joined_context = "\n".join(chunks)
    preamble = (
        "You are a helpful assistant providing information based on the document provided.\n"
        "Use the context below to answer the question.\n\n"
    )
    context_section = f"Context:\n{joined_context}\n\n"
    question_section = f"Question: {question}\n"
    closing = (
        "Answer the question, and at the end, you may invite the user to ask "
        "further questions if appropriate."
    )
    return preamble + context_section + question_section + closing

In [None]:
# Path of the source PDF inside the Kaggle input directory.
pdf_path = "/kaggle/input/tips-hindawi-pdf/Tips Hindawi University Info.pdf"
# Retrieval side: extract the text, split it into 3-line chunks that overlap by
# one line, embed the chunks and index the embeddings.
text = extract_text_from_pdf(pdf_path)
chunks = chunk_text(text, chunk_size=3, overlap=1)
# model_embeddings is the sentence-embedding model; it is kept so that queries
# can be embedded with the same model as the chunks.
model_embeddings, embeddings = embed_chunks(chunks)
index = create_faiss_index(embeddings)
# Generation side: generate_text() reads the globals `tokenizer` and `model`,
# so these two names must stay as they are.
model_name = "mistralai/Mistral-Nemo-Instruct-2407"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Half-precision weights; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

In [None]:
print("Hello! I can help you explore your document. Ask me anything about it!\n")
print("(Type 'exit' or 'quit' to end the session.)\n")

# Interactive question/answer loop: retrieve the 3 most relevant chunks for
# each question, build a prompt from them and print the model's answer.
while True:
    try:
        question = input("> ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the cell was interrupted: stop without a traceback.
        break
    if question.lower() in {"exit", "quit"}:
        # Explicit way out, so the cell (and "Run All") can finish.
        break
    if not question:
        # Nothing to search for; ask again instead of querying the LLM.
        continue
    top_chunks = search_index(question, model_embeddings, index, chunks, k=3)
    prompt = build_prompt(question, top_chunks)
    llm_outputs = generate_text(prompt, max_new_tokens=200, num_return_sequences=1)
    answer = llm_outputs[0]
    print(f"{answer}\n")