# LLM with RAG learning

### Loading dataset

In [None]:
import os
from loguru import logger
import pdfplumber

# Read the PDF page by page; the text of each page becomes one entry of
# `dataset`.
dataset = []
file_path = 'pa_04_model_selection_via_gap_statistics_and_sampling.pdf'
with pdfplumber.open(file_path) as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        # A page may yield no text at all; skip it so that no empty (or None)
        # chunk is handed to the embedding step later on.
        if page_text and page_text.strip():
            dataset.append(page_text)
logger.info(f"Loaded {len(dataset)} texts")

[32m2025-08-15 12:53:43.060[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mLoaded 22 texts[0m


### Implementing Vector Database

In [None]:
import ollama
from typing import List

In [10]:
# Model identifier passed to `ollama.embed` to turn text into embedding vectors.
EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
# Model identifier for the language model; presumably meant for answer
# generation — it is not referenced in the visible part of this file.
LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

In [13]:
# In-memory vector store: a list holding one (chunk, embedding) tuple per chunk.
VECTOR_DB: List[tuple] = []

def add_chunk_to_database(chunk):
    """Embed ``chunk`` with EMBEDDING_MODEL and append it to VECTOR_DB.

    The stored entry is the tuple ``(chunk, embedding)``, where the embedding
    is the first item of the ``'embeddings'`` field returned by
    ``ollama.embed``.
    """
    response = ollama.embed(model=EMBEDDING_MODEL, input=chunk)
    vector: List[float] = response['embeddings'][0]
    VECTOR_DB.append((chunk, vector))

# Embed every entry of `dataset` and store it in the vector database.
# Progress is logged as "<done>/<total>" instead of a bare fraction, which
# read like a chunk identifier in the log output.
for i, chunk in enumerate(dataset, start=1):
    add_chunk_to_database(chunk=chunk)
    logger.info(f"Added chunk {i}/{len(dataset)} to the database")

[32m2025-08-15 13:00:43.901[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mAdded chunk 0.045454545454545456 to the database[0m
[32m2025-08-15 13:00:44.038[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mAdded chunk 0.09090909090909091 to the database[0m
[32m2025-08-15 13:00:44.125[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mAdded chunk 0.13636363636363635 to the database[0m
[32m2025-08-15 13:00:44.222[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mAdded chunk 0.18181818181818182 to the database[0m
[32m2025-08-15 13:00:44.312[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mAdded chunk 0.22727272727272727 to the database[0m
[32m2025-08-15 13:00:44.398[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mAdded chunk 0.2727272727272727 to the database[0m
[32m2025-08-15 13:00:44.479[0m | [1mINFO   

### Implementing the retrieval function

In [14]:
# helper function to calculate the cosine similarity

def consine_similarity(a, b):
    """Return the cosine similarity of two numeric vectors.

    Args:
        a: First vector, a re-iterable sequence of numbers.
        b: Second vector, a re-iterable sequence of numbers.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms, or ``0.0`` when either vector has zero norm
        (including empty input), where the cosine is undefined.

    Note:
        The dot product pairs elements with ``zip`` and therefore stops at the
        shorter vector, while each norm covers its whole vector; pass vectors
        of equal length.
    """
    dot_product = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x ** 2 for x in a) ** 0.5
    norm_b = sum(x ** 2 for x in b) ** 0.5
    denominator = norm_a * norm_b
    # A zero-norm vector has no direction; report "no similarity" rather than
    # raising ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

In [None]:
def retrieve(query, top_n=3):
    """Return the ``top_n`` stored chunks most similar to ``query``.

    The query is embedded with EMBEDDING_MODEL and scored against every entry
    of VECTOR_DB using cosine similarity. The result is a list of
    ``(chunk, similarity)`` tuples ordered from most to least similar.
    """
    query_vector = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]

    # Score each stored chunk against the query embedding.
    scored = [
        (text, consine_similarity(query_vector, vector))
        for text, vector in VECTOR_DB
    ]

    # A higher similarity means a more relevant chunk, so rank best-first
    # and keep only the leading top_n entries.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

### Response Generation Phase

In [20]:
# Ask the user for a question and fetch the most relevant chunks for it.
input_query = input("Ask me a question: ")
retrieved_knowledge = retrieve(input_query)

logger.info("Retrieved knowledge: ")
for chunk, similarity in retrieved_knowledge:
    logger.info(f"- (similarity: {similarity:.2f}) {chunk}")

# Interpolate each chunk's text into the context; without the braces every
# entry was the literal string "- chunk".
context_text = "\n".join(f"- {chunk}" for chunk, _ in retrieved_knowledge)

# Prompt text telling the model to answer only from the retrieved context.
# Written as explicit string pieces so the text no longer starts with a stray
# double quote (the original opened with four quote characters).
instruction_prompt = (
    "You are a helpful chatbot. Use only the following pieces of context to answer the question. \n"
    "Do not make up any new information. \n"
    f"Context: {context_text}\n"
)

TypeError: 'float' object is not subscriptable