In [1]:
pip install sentence-transformers faiss-cpu langchain pymupdf

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting langchain
  Downloading langchain-0.3.10-py3-none-any.whl.metadata (7.1 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting langchain-core<0.4.0,>=0.3.22 (from langchain)
  Downloading langchain_core-0.3.22-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.2-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting packaging (from faiss-cpu)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting requests-toolbelt<2.0.0

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import fitz
import re
from transformers import AutoTokenizer, AutoModel

In [3]:
def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The ``get_text()`` result of each page joined with no separator.
        An empty string is returned when the document yields no pages.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raise; the document is
        closed before the exception propagates.
    """
    pdf_doc = fitz.open(path)
    try:
        # Join once instead of growing a string with += for every page.
        return "".join(page.get_text() for page in pdf_doc)
    finally:
        # Release the document even when extraction fails part-way through.
        pdf_doc.close()
# Location of the source PDF inside the Kaggle input directory.
path=r'/kaggle/input/food-safety-manual/fpc-manual.pdf'
# Raw text of the whole document, as returned by extract_text_from_pdf.
pdf_text=extract_text_from_pdf(path)

In [4]:
def clean_text(text):
    """Reduce text to ASCII letters, digits, commas, periods and single spaces.

    Steps, in order:
      1. Replace anything shaped like a markup tag (``<...>``) with a space.
      2. Delete every character that is not an ASCII letter or digit,
         whitespace, a comma or a period.
      3. Collapse each run of whitespace into one space and trim both ends.

    Whitespace is collapsed last on purpose: deleting a symbol that sits
    between two spaces (``"foo & bar"``) would otherwise leave a double space
    behind in the result.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned string; empty when nothing survives the filtering.
    """
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s,.]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces/newlines
    return text.strip()

In [5]:
# Normalise the extracted PDF text before it is split.
cleaned_text = clean_text(pdf_text)

# Splitter configured with chunk_size=1000 and chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_text(cleaned_text)

# Persist the chunks one per line so they can be read back later.
with open("chunks.txt", "w") as f:
    for chunk in chunks:
        print(chunk, file=f)

In [6]:

def compute_and_save_chunk_embeddings(
    chunks,
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
    embeddings_path="chunk_embeddings.npy",
    index_path="faiss_index.bin",
):
    """Embed every chunk, then save the embedding matrix and a FAISS L2 index.

    Each chunk is tokenised on its own (truncation enabled), passed through
    the model, and reduced to one vector by averaging ``last_hidden_state``
    over ``dim=1``. The vectors are stacked row-wise in chunk order, so row
    ``i`` of the saved matrix and entry ``i`` of the index both belong to
    ``chunks[i]``.

    Args:
        chunks: Iterable of text chunks to embed.
        embedding_model_name: Name handed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.
        embeddings_path: Destination passed to ``np.save`` for the matrix.
        index_path: Destination passed to ``faiss.write_index`` for the index.

    Returns:
        None. The results are the two files written to disk.

    Raises:
        ValueError: If ``chunks`` is empty. Raised before any model is loaded.
    """
    import torch  # local import: only needed for the no-grad context below

    chunks = list(chunks)
    if not chunks:
        raise ValueError("chunks is empty; there is nothing to embed or index")

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
    model = AutoModel.from_pretrained(embedding_model_name)

    embeddings = []
    # Inference only: without autograd no graph is recorded for each chunk.
    with torch.no_grad():
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
            outputs = model(**inputs)
            embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())

    # Save embeddings and the index built from them
    embedding_matrix = np.vstack(embeddings)
    np.save(embeddings_path, embedding_matrix)
    index = faiss.IndexFlatL2(embedding_matrix.shape[1])
    index.add(embedding_matrix)
    faiss.write_index(index, index_path)
    
# Embed the chunks and write the embedding matrix and FAISS index to disk.
compute_and_save_chunk_embeddings(chunks)

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [7]:
def load_chunk_embeddings(file_path):
    """Load a saved embedding array with ``np.load`` and return it.

    A confirmation line naming ``file_path`` is printed after a successful load.
    """
    loaded = np.load(file_path)
    confirmation = f"Chunk embeddings loaded from {file_path}."
    print(confirmation)
    return loaded


In [8]:
def load_faiss_index(file_path):
    """Read a FAISS index from ``file_path`` via ``faiss.read_index`` and return it.

    A confirmation line naming ``file_path`` is printed after a successful read.
    """
    loaded_index = faiss.read_index(file_path)
    confirmation = f"FAISS index loaded from {file_path}."
    print(confirmation)
    return loaded_index


In [9]:
def load_chunks_from_file(file_path):
    """Read a text file and return its lines as a list of stripped strings.

    Every line of the file yields exactly one list entry, with leading and
    trailing whitespace removed; blank lines therefore become empty strings.
    A confirmation line naming ``file_path`` is printed before returning.
    """
    loaded = []
    with open(file_path, "r") as handle:
        for raw_line in handle:
            loaded.append(raw_line.strip())
    print(f"Chunks loaded from {file_path}.")
    return loaded

In [10]:
# Reload the three saved artifacts from disk: the embedding matrix, the FAISS
# index and the chunk texts (one list entry per line of chunks.txt).
embedding_matrix = load_chunk_embeddings("chunk_embeddings.npy")
faiss_index = load_faiss_index("faiss_index.bin")
chunk_texts = load_chunks_from_file("chunks.txt")


Chunk embeddings loaded from chunk_embeddings.npy.
FAISS index loaded from faiss_index.bin.
Chunks loaded from chunks.txt.


In [11]:
from transformers import AutoTokenizer, AutoModel

# (tokenizer, model) pairs keyed by model name, so repeated queries reuse the
# already-loaded objects instead of calling from_pretrained every time.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(embedding_model_name):
    """Return the (tokenizer, model) pair for a model name, loading it only once."""
    if embedding_model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[embedding_model_name] = (
            AutoTokenizer.from_pretrained(embedding_model_name),
            AutoModel.from_pretrained(embedding_model_name),
        )
    return _EMBEDDING_MODEL_CACHE[embedding_model_name]


def retrieve_relevant_chunks(query, faiss_index, chunk_texts, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2", k=5):
    """Return the chunk texts whose index entries are nearest to the query.

    The query is embedded the same way the chunks are: tokenise with
    truncation, run the model, and average ``last_hidden_state`` over
    ``dim=1``. The resulting vector is searched against ``faiss_index``.

    Args:
        query: The question text to embed.
        faiss_index: Object exposing ``search(vectors, k=...)`` that returns
            ``(distances, indices)``.
        chunk_texts: Sequence of chunk strings; position ``i`` must correspond
            to entry ``i`` of ``faiss_index``.
        embedding_model_name: Name handed to ``from_pretrained``; should match
            the model used to build the index.
        k: Number of neighbours to request (default 5, the previous fixed value).

    Returns:
        list: Up to ``k`` chunk texts in the order the index returned them.
        Fewer are returned when the index holds fewer than ``k`` vectors.

    Raises:
        IndexError: If the index returns a position beyond ``chunk_texts``,
            which means the index and the chunk list are out of sync.
    """
    import torch  # local import: only needed for the no-grad context below

    tokenizer, model = _get_embedding_model(embedding_model_name)

    # Compute embedding for the query; inference only, so autograd is off.
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        query_embedding = model(**inputs).last_hidden_state.mean(dim=1).detach().numpy()

    # Search in FAISS index
    _, indices = faiss_index.search(query_embedding, k=k)
    # FAISS fills missing results with -1. Without this filter chunk_texts[-1]
    # would silently return the last chunk for every missing neighbour.
    return [chunk_texts[i] for i in indices[0] if i >= 0]


In [12]:
import requests

In [13]:

import os

# Endpoint that query_huggingface_api posts to.
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-7b-instruct"
# Take the access token from the HF_TOKEN environment variable so the real
# secret never has to be typed into (or committed with) the notebook. The
# placeholder stays as the fallback, so nothing changes when HF_TOKEN is unset.
API_TOKEN = os.environ.get("HF_TOKEN", "Not so fast")

def query_huggingface_api(prompt, timeout=60):
    """POST a prompt to ``API_URL`` and return the generated text.

    The request carries ``API_TOKEN`` as a bearer token and fixed generation
    parameters (``max_new_tokens``, ``temperature``, ``top_p``,
    ``repetition_penalty``).

    Args:
        prompt: Text sent as the ``inputs`` field of the JSON payload.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook cell on an unresponsive endpoint.

    Returns:
        str: ``generated_text`` of the first item in the JSON response on
        HTTP 200; otherwise a string of the form
        ``"Error: <status_code>, <response body>"``.

    Raises:
        requests exceptions (connection failure, timeout) propagate unchanged.
    """
    headers = {"Authorization": f"Bearer {API_TOKEN}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 500,
            "temperature": 0.5,
            "top_p":0.9,
            "repetition_penalty":1.2
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if response.status_code == 200:
        return response.json()[0]["generated_text"]
    else:
        # Non-200 replies are reported as text rather than raised.
        return f"Error: {response.status_code}, {response.text}"


In [14]:
# End-to-end run: retrieve the chunks nearest to the question, join them into
# a context block, and send context plus question to the hosted model.
query = "give me full process to make margarita cheese pizza?"
retrieved_chunks = retrieve_relevant_chunks(query, faiss_index, chunk_texts)
context = "\n".join(retrieved_chunks)
prompt = f"Context: {context}\n\nQuery: {query}\nAnswer:"
# query_huggingface_api returns an "Error: ..." string on a non-200 reply, so
# the printed value may be an error message rather than an answer.
response = query_huggingface_api(prompt)
print(response)


Error: 400, {"error":"Authorization header is correct, but the token seems invalid"}
