In [567]:
import os
import numpy as np
import fitz
import time
import polars as pl

from dotenv import load_dotenv
from google import genai
from google.genai import types


load_dotenv()

client = genai.Client(api_key=os.getenv("API_KEY"))

### Pre-processing and general functions

In [None]:
# Defining a function for transforming the pdf's to a readable state. 
def remove_headers_and_footers(
    pdf_path, header_height_pt=70, footer_height_pt=70
):
    """
    Extracts text from a PDF, excluding the header and footer areas.

    Args:
        pdf_path (str): Path to the PDF file.
        header_height_pt (int, optional): Height of the header area in points.
            Defaults to 50 (a reasonable starting value).
        footer_height_pt (int, optional): Height of the footer area in points.
            Defaults to 50 (a reasonable starting value).

    Returns:
        str: Extracted text without the header and footer.
    """
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        page_rect = page.rect  # Get the page rectangle
        page_height = page_rect.height
        # Define the clipping rectangle, excluding header and footer
        clip_rect = fitz.Rect(
            page_rect.x0,
            header_height_pt,
            page_rect.x1,
            page_height - footer_height_pt,
        )
        text += page.get_text(clip=clip_rect) + "\n"  # Add newline between pages
    doc.close()
    return text


In [604]:
# Construct the path to the PDF file and merging all the pdfs
notebook_dir = os.getcwd()

pdf_folder = os.path.join(notebook_dir, 'materials')

pdf_file_path_1 = os.path.join(pdf_folder, 'Summer_that_never_was.pdf')
pdf_file_path_2 = os.path.join(pdf_folder, 'Let_the_leaves_fall.pdf')
pdf_file_path_3 = os.path.join(pdf_folder, 'No_breath_to_cry.pdf')
pdf_file_path_4 = os.path.join(pdf_folder, 'To_bloom_below_the_web.pdf')

text_1 = remove_headers_and_footers(pdf_file_path_1)
text_2 = remove_headers_and_footers(pdf_file_path_2)
text_3 = remove_headers_and_footers(pdf_file_path_3)
text_4 = remove_headers_and_footers(pdf_file_path_4)

extracted_text = text_1 + text_2 + text_3 + text_4

In [605]:
print(len(text_1))

383082


In [None]:
# Defining the function to create a embedding for an input.
def get_embedding(text, model="text-embedding-004", delay = 0.6):
    """
    Creates an embedding for the given text using Genai.

    Args:
    text (str): Input text.
    model (str): Embedding model name.

    Returns:
    np.ndarray: The embedding vector.
    """
    time.sleep(delay)
    response = client.models.embed_content(
              model=model, 
              contents=text)
    
    return response.embeddings[0].values


In [None]:
# Defining the function for calculating the cosine similarity
def cosine_similarity(vec1, vec2):
    """
    Computes cosine similarity between two vectors.

    Args:
    vec1 (np.ndarray): First vector.
    vec2 (np.ndarray): Second vector.

    Returns:
    float: Cosine similarity.
    """
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Compute similarity between consecutive sentences
similarities = [cosine_similarity(embeddings[i], embeddings[i + 1]) for i in range(len(embeddings) - 1)]


### Semantic chunking

In [570]:
# Splitting text into sentences (basic split)
sentences = extracted_text.split(". ")

In [572]:
# Generate embeddings for each sentence
embeddings = [get_embedding(sentence) for sentence in sentences]

print(f"Generated {len(embeddings)} sentence embeddings.")
print(f"From {len(sentences)} sentences.")

Generated 6140 sentence embeddings.
From 6140 sentences.


In [574]:
def compute_breakpoints(similarities, method="percentile", threshold=90):
    """
    Computes chunking breakpoints based on similarity drops.

    Args:
    similarities (List[float]): List of similarity scores between sentences.
    method (str): 'percentile', 'standard_deviation', or 'interquartile'.
    threshold (float): Threshold value (percentile for 'percentile', std devs for 'standard_deviation').

    Returns:
    List[int]: Indices where chunk splits should occur.
    """
    # Determine the threshold value based on the selected method
    if method == "percentile":
        # Calculate the Xth percentile of the similarity scores
        threshold_value = np.percentile(similarities, threshold)
    elif method == "standard_deviation":
        # Calculate the mean and standard deviation of the similarity scores
        mean = np.mean(similarities)
        std_dev = np.std(similarities)
        # Set the threshold value to mean minus X standard deviations
        threshold_value = mean - (threshold * std_dev)
    elif method == "interquartile":
        # Calculate the first and third quartiles (Q1 and Q3)
        q1, q3 = np.percentile(similarities, [25, 75])
        # Set the threshold value using the IQR rule for outliers
        threshold_value = q1 - 1.5 * (q3 - q1)
    else:
        # Raise an error if an invalid method is provided
        raise ValueError("Invalid method. Choose 'percentile', 'standard_deviation', or 'interquartile'.")

    # Identify indices where similarity drops below the threshold value
    return [i for i, sim in enumerate(similarities) if sim < threshold_value]

# Compute breakpoints using the percentile method with a threshold of 90
breakpoints = compute_breakpoints(similarities, method="percentile", threshold=90)

In [575]:
def split_into_chunks(sentences, breakpoints):
    """
    Splits sentences into semantic chunks.

    Args:
    sentences (List[str]): List of sentences.
    breakpoints (List[int]): Indices where chunking should occur.

    Returns:
    List[str]: List of text chunks.
    """
    chunks = []  # Initialize an empty list to store the chunks
    start = 0  # Initialize the start index

    # Iterate through each breakpoint to create chunks
    for bp in breakpoints:
        # Append the chunk of sentences from start to the current breakpoint
        chunks.append(". ".join(sentences[start:bp + 1]) + ".")
        start = bp + 1  # Update the start index to the next sentence after the breakpoint

    # Append the remaining sentences as the last chunk
    chunks.append(". ".join(sentences[start:]))
    return chunks  # Return the list of chunks

# Create chunks using the split_into_chunks function
text_chunks = split_into_chunks(sentences, breakpoints)

# Print the number of chunks created
print(f"Number of semantic chunks: {len(text_chunks)}")

# Print the first chunk to verify the result
print("\nFirst text chunk:")
print(text_chunks[0])

Number of semantic chunks: 5526

First text chunk:
28623811
4649373
The Summer 
That 
Never Was
28623811
4649373

28623812
4649374
WILLOWSHORE
WILLOWSHORE
D9a
D9a
W17
W17
W11
W11
W9
W9
W26
W26
W15
W15
W5
W5
W10
W10
W13
W13
W14
W14
W12
W12
W8
W8
W7
W7
W5
W5
W6
W6
W4
W4
W3
W3
W2
W2
W1
W1
W20
W20
W21
W21
W24
W24
W22
W22
W23
W23
W30
W30
W19
W19
W16
W16
W27
W27
W28
W28
W25
W25
W18
W18
D11
D11
D9b
D9b
D8
D8
D7
D7
D6
D6
D4
D4
D5
D5
P
D3
D3
Ceiba River
Ceiba River
D1
D1
Duyue River
Duyue River
Duyue River
Duyue River
P
Dragonfly Creek
Dragonfly Creek
Gourd Lake
Gourd Lake
D2
D2
Ceiba Creek
Ceiba Creek
Ceiba River
Ceiba River
 Duyue River
 Duyue River
WILLOWSHORE
WILLOWSHORE
1200 FEET
1200 FEET
• Trail
• Trail
• Seasonal Creek
• Seasonal Creek
D12
D12
D13
D13
WILLOWSHORE HINTERLANDS
WILLOWSHORE HINTERLANDS
1 HEX = 2 MILES
1 HEX = 2 MILES
28623812
4649374

28623813
4649375
Paizo Inc.
7120 185th Ave NE, Ste 120
Redmond WA 98052 0577
AUTHOR
Sen H.H.S.
ADDITIONAL WRITING
James Jacobs
DEVELOPER
Jame

In [576]:
def create_embeddings(text_chunks):
    """
    Creates embeddings for each text chunk.

    Args:
    text_chunks (List[str]): List of text chunks.

    Returns:
    List[np.ndarray]: List of embedding vectors.
    """
    # Generate embeddings for each text chunk using the get_embedding function
    return [get_embedding(chunk) for chunk in text_chunks]

# Create chunk embeddings using the create_embeddings function
chunk_embeddings = create_embeddings(text_chunks)

### Fixed length chunking

In [606]:
chunks = []
n = 2000
overlap = 400

for i in range (0, len(extracted_text), n -overlap): 
    chunks.append(extracted_text[i:i+n])

print(f"Antal chunks: {len(chunks)}.")

Antal chunks: 967.


In [611]:
fixed_length_embeddings = create_embeddings(chunks)

### Vector store

In [None]:
class VectorStore:
    def __init__(self):
        self.vectors = []
        self.texts = []
        self.metadata = []
    #tagit bort np.array framför embedding i append
    
    def add_item(self, text, embedding, metadata=None):
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def semantic_search(self, query_embedding, k=5):
        if not self.vectors:
            return []
        query_vector = np.array(query_embedding)

        similarities = []
        for i, vector in enumerate(self.vectors):
            similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
            similarities.append((i, similarity))
        
        similarities.sort(key=lambda x: x[1], reverse=True)

        results = []

        for i in range(min(k, len(similarities))):
            idx, score = similarities[i]
            results.append({"text": self.texts[idx],
                            "metadata": self.metadata[idx],
                            "similarity": score
                            })
        
        return results
    
    def save(self, filename):
        df = pl.DataFrame(
            dict(
                vectors=self.vectors,
                texts=self.texts,
                metadata=self.metadata))
        df.write_parquet(filename)

    def load(self, file):
        df = pl.read_parquet(file, columns=["vectors", "texts", "metadata"])
        self.vectors = df["vectors"].to_list()
        self.texts = df["texts"].to_list()
        self.metadata = df["metadata"].to_list()
        

In [578]:
vector_store = VectorStore()
for i, chunk in enumerate(text_chunks):
    vector_store.add_item(text=text_chunks[i], embedding=chunk_embeddings[i], metadata={"type": "chunk", "index": i})

In [579]:
vector_store.save("embeddings.parquet")

In [580]:
vector_store_2 = VectorStore()
vector_store_2.load("embeddings.parquet")

In [612]:
vector_store_fixed_length = VectorStore()
for i, chunk in enumerate(chunks):
    vector_store_fixed_length.add_item(text=chunks[i], embedding=fixed_length_embeddings[i], metadata={"type": "chunk", "index": i})


In [613]:
vector_store_fixed_length.save("embeddings_fixed_length.parquet")

In [614]:
vector_store_fixed_length_2 = VectorStore()
vector_store_fixed_length_2.load("embeddings_fixed_length.parquet")

### Semantic search

In [581]:
def semantic_search(query, text_chunks, chunk_embeddings, k=5):
    """
    Finds the most relevant text chunks for a query.

    Args:
    query (str): Search query.
    text_chunks (List[str]): List of text chunks.
    chunk_embeddings (List[np.ndarray]): List of chunk embeddings.
    k (int): Number of top results to return.

    Returns:
    List[str]: Top-k relevant chunks.
    """
    # Generate an embedding for the query
    query_embedding = get_embedding(query)
    
    # Calculate cosine similarity between the query embedding and each chunk embedding
    similarities = [cosine_similarity(query_embedding, emb) for emb in chunk_embeddings]
    
    # Get the indices of the top-k most similar chunks
    top_indices = np.argsort(similarities)[-k:][::-1]
    
    # Return the top-k most relevant text chunks
    return [text_chunks[i] for i in top_indices]

In [615]:
def generate_user_prompt(query):
    context = "\n".join(semantic_search(query, vector_store_2.texts, vector_store_2.vectors))
    user_prompt = f"The question is {query}. This is the context: {context}."
    return user_prompt


In [None]:
# Setting the system prompt for the AI assistant

system_prompt = "You are an AI assistant that gives guidence based on the given context. The context is an adventure for a TTRPG called 'Pathfinder 2e'. Give fleshed out answers point by point, but make sure to indicate what part's you are less certain about."

In [None]:
# Defining the generate respone function.
def generate_response(system_prompt, user_message, model="gemini-2.0-flash"):
    """
    Generates a response from the AI model based on the system prompt and user message.

    Args:
    system_prompt (str): The system prompt to guide the AI's behavior.
    user_message (str): The user's message or query.
    model (str): The model to be used for generating the response. Default is "meta-llama/Llama-2-7B-chat-hf".

    Returns:
    dict: The response from the AI model.
    """
    response = client.models.generate_content(
        model=model,
        config=types.GenerateContentConfig(
        system_instruction=system_prompt),
        contents=generate_user_prompt(user_message)
        )
    return response

In [584]:
top_chunks = semantic_search(query, text_chunks, chunk_embeddings, k=3)

# Create the user prompt based on the top chunks
user_prompt = "\n".join([f"Context {i + 1}:\n{chunk}\n=====================================\n" for i, chunk in enumerate(top_chunks)])
user_prompt = f"{user_prompt}\nQuestion: {query}"

In [626]:
print(generate_response(system_prompt, "How long does it take to walk the Pilgrims path?").text)

Okay, based on the context you provided, here's how long it takes to walk the Pilgrim's Path, broken down for clarity:

*   **Mandatory Duration:** The journey along the Pilgrim's Path to the monastery *must* take **four days and three nights**. This is non-negotiable due to the traditions associated with the path.

*   **Reason for the Duration:** The text states that those seeking knowledge traditionally stopped to spend the night three times at shrines along the way. This implies that there are specific shrines or locations where travelers are expected to pause.

*   **Distance:** The actual hiking distance of the last leg of the path from Willowshore to the monastery is 13 miles. This is relatively short but the time taken is longer.

*   **Speed Restriction:** Travel along the Pilgrim’s Path can’t be rushed. This reinforces that the journey is about the experience and reflection, not speed.

**In summary,** regardless of the actual distance, the Pilgrim's Path requires a commitmen

### Search with fixed length

In [617]:
def generate_user_prompt_fixed_length(query):
    context = "\n".join(semantic_search(query, vector_store_fixed_length_2.texts, vector_store_fixed_length_2.vectors))
    user_prompt = f"The question is {query}. This is the context: {context}."
    return user_prompt

In [618]:
def generate_response_fixed_length(system_prompt, user_message, model="gemini-2.0-flash"):
    """
    Generates a response from the AI model based on the system prompt and user message.

    Args:
    system_prompt (str): The system prompt to guide the AI's behavior.
    user_message (str): The user's message or query.
    model (str): The model to be used for generating the response. Default is "meta-llama/Llama-2-7B-chat-hf".

    Returns:
    dict: The response from the AI model.
    """
    response = client.models.generate_content(
        model=model,
        config=types.GenerateContentConfig(
        system_instruction=system_prompt),
        contents=generate_user_prompt(user_message)
        )
    return response

In [627]:
print(generate_response_fixed_length(system_prompt, "How long does it take to walk the Pilgrims path?").text)

Okay, based on the context provided, here's the breakdown of how long it takes to walk the Pilgrim's Path:

*   **Minimum Time:** 4 days and 3 nights. The text explicitly states that to reach the ruined monastery, travelers *must* spend this amount of time following the Pilgrim's Path traditions.
*   **Reasoning:** The tradition involves pausing to spend the night at three shrines along the 13-mile hike from Willowshore to the monastery. This breaks the journey into four segments (the stretches between Willowshore and the first shrine, between the shrines, and from the last shrine to the monastery), each taking approximately a day.
*   **Unimpeded Travel:** While travel along the path is unimpeded in terms of obstacles or dangers (as long as you stay on the path), it *cannot* be rushed. This reinforces the 4-day/3-night minimum.

**In summary, it takes a minimum of 4 days and 3 nights to walk the Pilgrim's Path to reach the ruined monastery, adhering to the traditional practice of stop

### Validation

In [586]:
validation_data = [
    {
        "question": "What faction does Old Matsuki represent?",
        "ideal_answer": "Southbank"
    },
    {
        "question": "Who is the doctor in Willowshore?",
        "ideal_answer": "Dr Dami"
    },
    {
        "question": "What happens when the Eternal Lantern is lit?",
        "ideal_answer": "The monsters and perils abate, citizens come out of hiding but remain frienghtened. Gray Butcher and Mo Douqiu realize that their grip on Willowshore has grown tenuous. Gray Butcher takes to patrolling downtown's streets."
    }
]

In [587]:
evaluation_system_prompt = """You are an intelligent evaluationsystem with the purpose of evaluating the answer of an AI-assistent. If the answer if close to the ideal answer, score it 1.0. If it's wrong or not good enough score it 0. If it's partly correct score it 0.5. Motivate the score you give it."""

In [588]:
query = validation_data[0]["question"]
response = generate_response(system_prompt, query)

evaluation_prompt = f"""Question: {query}
Response: {response.text}
Ideal answer: {validation_data[0]["ideal_answer"]}"""

evaluation_response = generate_response(evaluation_system_prompt, evaluation_prompt)

print(evaluation_response.text)

The response correctly identifies several clues about Old Matsuki's potential allegiances, including his opposition to Lung Wa and Northridge, his leadership position in South Willowshore, and his fondness for tradition. However, it doesn't definitively state that he represents the Southbank faction.
Therefore, I rate this answer 0.5.
