In [20]:
import os
import numpy as np
import fitz
import time
import polars as pl

from dotenv import load_dotenv
from google import genai
from google.genai import types


# Load variables from a local .env file into the process environment.
load_dotenv()

# Shared Gemini client used by the embedding and generation helpers below.
# The key is read from the API_KEY environment variable; os.getenv returns
# None when it is not set.
client = genai.Client(api_key=os.getenv("API_KEY"))

### Pre-processing and general functions

In [21]:
# Define a function that extracts the readable body text from a PDF.
def remove_headers_and_footers(
    pdf_path, header_height_pt=70, footer_height_pt=70
):
    """
    Extracts text from a PDF, excluding the header and footer areas.

    Every page is clipped to the horizontal band between ``header_height_pt``
    (measured from the top) and ``page height - footer_height_pt``; only text
    inside that band is kept.

    Args:
        pdf_path (str): Path to the PDF file.
        header_height_pt (int, optional): Height of the header area in points
            that is skipped at the top of each page. Defaults to 70.
        footer_height_pt (int, optional): Height of the footer area in points
            that is skipped at the bottom of each page. Defaults to 70.

    Returns:
        str: Extracted text without the header and footer. The text of each
        page is followed by a newline.
    """
    doc = fitz.open(pdf_path)
    page_texts = []
    try:
        for page in doc:
            page_rect = page.rect  # Full page rectangle
            # Clipping rectangle that leaves out the header and footer bands
            clip_rect = fitz.Rect(
                page_rect.x0,
                header_height_pt,
                page_rect.x1,
                page_rect.height - footer_height_pt,
            )
            # Trailing newline keeps consecutive pages separated
            page_texts.append(page.get_text(clip=clip_rect) + "\n")
    finally:
        # Release the file handle even if text extraction fails on a page
        doc.close()
    # Join once instead of growing a string page by page
    return "".join(page_texts)


In [22]:
# Define the function that creates an embedding for an input text.
def get_embedding(text, model="text-embedding-004", delay = 0.6):
    """
    Creates an embedding for the given text using Genai.

    Args:
    text (str): Input text.
    model (str): Embedding model name.
    delay (float): Seconds to wait before each request, used as a simple
        throttle between consecutive API calls. Values that are zero,
        negative or None disable the wait.

    Returns:
    The ``values`` of the first embedding in the API response (a sequence of
    floats as returned by the client, not a NumPy array).
    """
    # Only sleep for a positive delay; time.sleep rejects negative values
    if delay is not None and delay > 0:
        time.sleep(delay)

    response = client.models.embed_content(
        model=model,
        contents=text,
    )

    # One input text was sent, so the first embedding is the result
    return response.embeddings[0].values


In [24]:
def create_embeddings(text_chunks):
    """
    Embeds every chunk in order, one get_embedding call per chunk.

    Args:
    text_chunks (List[str]): Text chunks to embed.

    Returns:
    list: One embedding (as returned by get_embedding) per input chunk,
    in the same order as the input.
    """
    vectors = []
    for chunk in text_chunks:
        # One request per chunk; get_embedding handles its own pacing
        vectors.append(get_embedding(chunk))
    return vectors

# Create chunk embeddings using the create_embeddings function

In [25]:
# Defining the function for calculating the cosine similarity
def cosine_similarity(vec1, vec2):
    """
    Computes cosine similarity between two vectors.

    Args:
    vec1 (np.ndarray): First vector (any sequence of numbers works).
    vec2 (np.ndarray): Second vector, same length as vec1.

    Returns:
    float: Cosine similarity in [-1, 1]. If either vector has zero length
    (norm 0) the similarity is undefined and 0.0 is returned instead of NaN.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against 0/0: NaN would otherwise sort to the top when the
    # similarities are ranked with np.argsort
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product



In [26]:
def compute_breakpoints(similarities, method="percentile", threshold=90):
    """
    Computes chunking breakpoints based on similarity drops.

    A breakpoint is an index ``i`` whose similarity is unusually low, i.e.
    strictly below a cut-off value derived from the chosen method.

    Args:
    similarities (List[float]): List of similarity scores between sentences.
    method (str): 'percentile', 'standard_deviation', or 'interquartile'.
    threshold (float): Threshold value.
        - 'percentile': share of gaps (in percent) that should NOT become
          breakpoints; with 90 only the lowest 10% of similarities split.
        - 'standard_deviation': number of standard deviations below the mean.
        - 'interquartile': ignored (the 1.5 * IQR rule is used).

    Returns:
    List[int]: Indices where chunk splits should occur (empty for empty input).

    Raises:
    ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ("percentile", "standard_deviation", "interquartile"):
        # Raise an error if an invalid method is provided
        raise ValueError("Invalid method. Choose 'percentile', 'standard_deviation', or 'interquartile'.")

    # Nothing to split; also avoids calling the numpy statistics on empty data
    if len(similarities) == 0:
        return []

    # Determine the cut-off value based on the selected method
    if method == "percentile":
        # Breakpoints are similarity *drops*, so the cut-off sits at the low
        # end of the distribution: the (100 - threshold)th percentile. Using
        # the threshold-th percentile directly would mark ~threshold% of all
        # gaps as breakpoints.
        threshold_value = np.percentile(similarities, 100 - threshold)
    elif method == "standard_deviation":
        # Mean minus X standard deviations
        mean = np.mean(similarities)
        std_dev = np.std(similarities)
        threshold_value = mean - (threshold * std_dev)
    else:
        # Interquartile rule for low outliers: Q1 - 1.5 * IQR
        q1, q3 = np.percentile(similarities, [25, 75])
        threshold_value = q1 - 1.5 * (q3 - q1)

    # Identify indices where similarity drops below the cut-off value
    return [i for i, sim in enumerate(similarities) if sim < threshold_value]





In [33]:
def split_into_chunks(sentences, breakpoints):
    """
    Splits sentences into semantic chunks.

    Each breakpoint ``bp`` closes a chunk after sentence ``bp``; the sentences
    after the last breakpoint form the final chunk. Sentences inside a chunk
    are joined with ". "; chunks closed by a breakpoint get a trailing ".",
    the final chunk is left as-is.

    Args:
    sentences (List[str]): List of sentences.
    breakpoints (List[int]): Indices where chunking should occur.

    Returns:
    List[str]: List of text chunks. Empty chunks are never emitted, so a
    breakpoint on the last sentence, a repeated breakpoint or an empty
    sentence list does not produce "" or "." entries.
    """
    chunks = []  # Collected chunks
    start = 0  # Index of the first sentence of the chunk being built

    for bp in breakpoints:
        end = bp + 1  # Slice end: the breakpoint sentence is included
        # Skip breakpoints that do not advance (duplicates / out of order)
        if end <= start:
            continue
        group = sentences[start:end]
        if group:
            chunks.append(". ".join(group) + ".")
        start = end

    # Remaining sentences form the last chunk, if there are any
    remainder = sentences[start:]
    if remainder:
        chunks.append(". ".join(remainder))
    return chunks


In [27]:
def semantic_search(query, text_chunks, chunk_embeddings, k=5):
    """
    Finds the most relevant text chunks for a query.

    Args:
    query (str): Search query.
    text_chunks (List[str]): List of text chunks.
    chunk_embeddings (List[np.ndarray]): List of chunk embeddings, aligned
        index-by-index with text_chunks.
    k (int): Number of top results to return.

    Returns:
    List[str]: Top-k relevant chunks, most similar first. Fewer than k chunks
    are returned when fewer exist; an empty list is returned when k <= 0 or
    there are no embeddings.
    """
    # Nothing to rank: return early and skip the embedding API call.
    # (k == 0 must be handled explicitly because a [-0:] slice is the
    # whole array, which would return every chunk.)
    if k <= 0 or len(chunk_embeddings) == 0:
        return []

    # Generate an embedding for the query
    query_embedding = get_embedding(query)

    # Calculate cosine similarity between the query embedding and each chunk embedding
    similarities = [cosine_similarity(query_embedding, emb) for emb in chunk_embeddings]

    # argsort is ascending: take the last k indices and reverse for best-first
    top_indices = np.argsort(similarities)[-k:][::-1]

    # Return the top-k most relevant text chunks
    return [text_chunks[i] for i in top_indices]

In [56]:
def generate_user_prompt(query, texts=None, embeddings=None):
    """
    Builds the user prompt: the question followed by the retrieved context.

    NOTE: a later cell in this notebook redefines generate_user_prompt with
    explicit ``texts``/``embeddings`` parameters; this version accepts the
    same arguments so either definition can serve the same callers.

    Args:
    query (str): The user's question.
    texts (List[str], optional): Chunks to search. When omitted, the chunks
        of the global ``vector_store_fixed_length`` are used.
    embeddings (list, optional): Embeddings aligned with ``texts``. When
        omitted, the vectors of the global ``vector_store_fixed_length`` are used.

    Returns:
    str: Prompt containing the question and the newline-joined context chunks.
    """
    if texts is None or embeddings is None:
        # Fallback keeps the original one-argument call working; texts and
        # vectors are taken together so they stay aligned
        texts = vector_store_fixed_length.texts
        embeddings = vector_store_fixed_length.vectors
    context = "\n".join(semantic_search(query, texts, embeddings))
    user_prompt = f"The question is {query}. This is the context: {context}."
    return user_prompt


In [62]:
def generate_user_prompt(query, texts, embeddings):
    """
    Builds the user prompt: the question followed by the retrieved context.

    Args:
    query (str): The user's question.
    texts (List[str]): Chunks to search.
    embeddings (list): Embeddings aligned with ``texts``.

    Returns:
    str: Prompt containing the question and the newline-joined context chunks.
    """
    # Retrieve the chunks most similar to the question
    retrieved_chunks = semantic_search(query, texts, embeddings)
    context = "\n".join(retrieved_chunks)
    return f"The question is {query}. This is the context: {context}."

In [63]:
# Define the generate_response function.
def generate_response(system_prompt, user_message, texts, embeddings, model="gemini-2.0-flash", ):
    """
    Answers a user message with the generative model, grounded in retrieved context.

    The user message is first expanded by generate_user_prompt (question plus
    the most similar chunks from ``texts``) and then sent to the model together
    with the system prompt.

    Args:
    system_prompt (str): System instruction that guides the model's behavior.
    user_message (str): The user's message or query.
    texts (List[str]): Chunks searched for context.
    embeddings (list): Embeddings aligned with ``texts``.
    model (str): Generation model name. Default is "gemini-2.0-flash".

    Returns:
    The response object returned by ``client.models.generate_content``
    (callers in this notebook read its ``.text`` attribute).
    """
    # System prompt travels in the request config, not in the contents
    generation_config = types.GenerateContentConfig(system_instruction=system_prompt)
    # Question + retrieved context
    prompt = generate_user_prompt(user_message, texts, embeddings)
    return client.models.generate_content(
        model=model,
        config=generation_config,
        contents=prompt,
    )

### Converting pdfs and merging to text

In [30]:
# Locate the adventure PDFs in ./materials (relative to the working directory),
# strip their headers/footers and merge everything into one text.
notebook_dir = os.getcwd()

pdf_folder = os.path.join(notebook_dir, 'materials')

pdf_file_path_1, pdf_file_path_2, pdf_file_path_3, pdf_file_path_4 = [
    os.path.join(pdf_folder, pdf_name)
    for pdf_name in (
        'Summer_that_never_was.pdf',
        'Let_the_leaves_fall.pdf',
        'No_breath_to_cry.pdf',
        'To_bloom_below_the_web.pdf',
    )
]

text_1, text_2, text_3, text_4 = [
    remove_headers_and_footers(pdf_path)
    for pdf_path in (
        pdf_file_path_1,
        pdf_file_path_2,
        pdf_file_path_3,
        pdf_file_path_4,
    )
]

# Concatenate in reading order
extracted_text = "".join([text_1, text_2, text_3, text_4])

In [None]:
# Split the text into sentences (basic split on ". ").
# Blank or whitespace-only fragments are dropped so that no empty input is
# sent to the embedding API; the kept fragments are left untouched.
sentences = [fragment for fragment in extracted_text.split(". ") if fragment.strip()]


### Making the embeddings for semantic search

In [None]:
# Embed every sentence (one API call per sentence) via the shared helper
embeddings = create_embeddings(sentences)
print(f"Generated {len(embeddings)} sentence embeddings.")
print(f"From {len(sentences)} sentences.")

In [None]:
# Similarity between each sentence and the one that follows it
similarities = [
    cosine_similarity(current, following)
    for current, following in zip(embeddings, embeddings[1:])
]

# Derive the split positions with the percentile method (threshold 90)
breakpoints = compute_breakpoints(similarities, method="percentile", threshold=90)

In [None]:
# Group the sentences into semantic chunks at the computed breakpoints
text_chunks = split_into_chunks(sentences=sentences, breakpoints=breakpoints)

# Embed each semantic chunk
chunk_embeddings = create_embeddings(text_chunks=text_chunks)

### Making the embeddings for Fixed length chunking

In [None]:
n = 2000        # chunk length in characters
overlap = 400   # characters shared by two consecutive chunks

# Slide a window of n characters over the text; window starts are
# (n - overlap) characters apart so neighbouring chunks overlap.
chunks = [
    extracted_text[start:start + n]
    for start in range(0, len(extracted_text), n - overlap)
]

# "Antal chunks" is Swedish for "Number of chunks"
print(f"Antal chunks: {len(chunks)}.")

In [None]:
# Embed every fixed-length chunk (one get_embedding call per chunk).
fixed_length_embeddings = create_embeddings(chunks)


### Vector store

In [34]:
class VectorStore:
    """
    Minimal in-memory vector store.

    Keeps three parallel lists (same index = same item): the embedding
    vectors, the texts they were computed from, and a metadata dict per item.
    The store can be written to and restored from a Parquet file.
    """

    def __init__(self):
        self.vectors = []   # embeddings as NumPy arrays
        self.texts = []     # text of each item
        self.metadata = []  # metadata dict of each item

    def add_item(self, text, embedding, metadata=None):
        """
        Appends one item to the store.

        Args:
        text (str): The item's text.
        embedding: The item's embedding; stored as a NumPy array.
        metadata (dict, optional): Extra information; an empty dict is
            stored when omitted.
        """
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def semantic_search(self, query_embedding, k=5):
        """
        Returns the k stored items most similar to a query embedding.

        Args:
        query_embedding: Embedding of the query.
        k (int): Maximum number of results.

        Returns:
        List[dict]: Results ordered by descending cosine similarity, each with
        the keys "text", "metadata" and "similarity". Empty when the store is
        empty or k <= 0. Vectors with zero norm score 0.0 instead of NaN.
        """
        if not self.vectors:
            return []
        query_vector = np.array(query_embedding)
        # Loop-invariant: the query norm is the same for every comparison
        query_norm = np.linalg.norm(query_vector)

        similarities = []
        for i, vector in enumerate(self.vectors):
            norm_product = query_norm * np.linalg.norm(vector)
            if norm_product == 0:
                # Cosine similarity is undefined for a zero vector; 0.0
                # keeps the ranking free of NaN values
                similarity = 0.0
            else:
                similarity = np.dot(query_vector, vector) / norm_product
            similarities.append((i, similarity))

        # Best match first; the sort is stable, so ties keep insertion order
        similarities.sort(key=lambda x: x[1], reverse=True)

        return [
            {
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": score,
            }
            for idx, score in similarities[:max(k, 0)]
        ]

    def save(self, filename):
        """
        Writes the store to a Parquet file with the columns
        "vectors", "texts" and "metadata".

        Args:
        filename (str): Destination path.
        """
        df = pl.DataFrame(
            dict(
                vectors=self.vectors,
                texts=self.texts,
                metadata=self.metadata))
        df.write_parquet(filename)

    def load(self, file):
        """
        Replaces the store's content with the data from a Parquet file
        written by save().

        Args:
        file (str): Path of the Parquet file.
        """
        df = pl.read_parquet(file, columns=["vectors", "texts", "metadata"])
        # Convert back to NumPy arrays so loaded items have the same type
        # as items inserted through add_item
        self.vectors = [np.array(vector) for vector in df["vectors"].to_list()]
        self.texts = df["texts"].to_list()
        self.metadata = df["metadata"].to_list()
        

#### Saving the semantic search model

In [None]:
# Fill a fresh store with the semantic chunks and their embeddings
vector_store_s_search = VectorStore()
for i, chunk in enumerate(text_chunks):
    vector_store_s_search.add_item(
        text=chunk,
        embedding=chunk_embeddings[i],
        metadata={"type": "chunk", "index": i},
    )

In [None]:
# Persist the semantic-chunk store as Parquet so it can be reloaded later
# without recomputing the embeddings.
vector_store_s_search.save("embeddings_s_search.parquet")

#### Saving the fixed length model

In [None]:
# Fill a fresh store with the fixed-length chunks and their embeddings
vector_store_fixed_length = VectorStore()
for i, chunk in enumerate(chunks):
    vector_store_fixed_length.add_item(
        text=chunk,
        embedding=fixed_length_embeddings[i],
        metadata={"type": "chunk", "index": i},
    )


In [None]:
# Persist the fixed-length store as Parquet so it can be reloaded later
# without recomputing the embeddings.
vector_store_fixed_length.save("embeddings_fixed_length.parquet")

#### Loading both models

In [37]:
# Rebuild the semantic-chunk store from its Parquet file; this replaces any
# store of the same name created earlier in the notebook.
vector_store_s_search = VectorStore()
vector_store_s_search.load("embeddings_s_search.parquet")

In [38]:
# Rebuild the fixed-length store from its Parquet file; this replaces any
# store of the same name created earlier in the notebook.
vector_store_fixed_length = VectorStore()
vector_store_fixed_length.load("embeddings_fixed_length.parquet")

### Using the models

In [41]:
# System prompt for the answering assistant: it is passed as the system
# instruction of every generate_response call that answers a user question.
system_prompt = "You are an AI assistant that gives guidence based on the given context. The context is an adventure for a TTRPG called 'Pathfinder 2e'. Give fleshed out answers point by point, but make sure to indicate what part's you are less certain about."

In [None]:
# Test with semantic chunking: the context is retrieved from the
# texts/vectors of vector_store_s_search.
query = "Describe the first day of walking the pilgrims path, what dangers avaits the adventurers?"
print(generate_response(system_prompt, query, vector_store_s_search.texts, vector_store_s_search.vectors).text)


Okay, here's a breakdown of the first day walking the Pilgrim's Path, focusing on potential dangers and things to consider:

**The First Day on the Pilgrim's Path:**

*   **Setting the Scene:** The first day should emphasize the sense of leaving the familiar behind. The party departs the Mountain Shrine and enters the Pilgrim's Path. Describe the immediate change in the environment (if any). Is the path well-worn or overgrown? Are there immediate visual cues indicating they are now on a special, perhaps sacred, route?

*   **Initial Pace and Ritual:** Remind the players that they cannot rush this journey. Describe the pace as deliberate and meditative. They need to adhere to any known Pilgrim's Path traditions (which you should have conveyed beforehand). Are there prayers to be said? Specific ways they should step or orient themselves on the path? A specific order to walk in? Failure to adhere to these traditions might have consequences (see "Dangers" below).

*   **Environmental Descr

In [None]:
# Test with fixed-length chunking: same question, but the context is
# retrieved from the texts/vectors of vector_store_fixed_length.
query = "Describe the first day of walking the pilgrims path, what dangers avaits the adventurers?"
print(generate_response(system_prompt, query, vector_store_fixed_length.texts, vector_store_fixed_length.vectors).text)

Okay, here's a breakdown of the first day on the Pilgrim's Path, focusing on what the adventurers might encounter:

*   **Setting and Weather:**

    *   The day begins with a "crisp and cloudy" atmosphere.
    *   The path winds through dense forest for the first half of the day. This means potentially limited visibility and navigation challenges.
    *   The afternoon brings the adventurers to the eastern shore of Mirror Lake, offering expansive views. Be sure to describe the beauty of the lake and the distant mountains to the players.
    *   The text mentions that the party can see all the way to the opposite shore, but they are unable to travel too far out into the lake, or they will enter the Willowshore mindscape

*   **Strange Path Effects:**

    *   The path is strangely preserved, appearing as it did decades ago when the monastery was active. This can create an eerie or unsettling atmosphere.
    *   The passage of time is distorted. No matter how fast or slow the party move

### Validation

In [79]:
# Hand-written validation set: each entry pairs a question about the adventure
# with the answer the evaluator should treat as ideal.
validation_data = [
    {
        "question": "What faction does Old Matsuki represent?",
        "ideal_answer": "Southbank"
    },
    {
        "question": "What NPC's help the PC's when researching the willowshore curse?",
        "ideal_answer": "You So-Jin, Igawa Jubei, Great Willow, "
    },
    {
        "question": "What happens when the Eternal Lantern is lit?",
        "ideal_answer": "The monsters and perils abate, citizens come out of hiding but remain frienghtened. Gray Butcher and Mo Douqiu realize that their grip on Willowshore has grown tenuous. Gray Butcher takes to patrolling downtown's streets."
    }
]

In [74]:
# System prompt for the evaluator model: it scores an assistant answer against
# the ideal answer as 1.0 (close), 0.5 (partly correct) or 0 (wrong) and
# is asked to motivate the score.
evaluation_system_prompt = """You are an intelligent evaluationsystem with the purpose of evaluating the answer of an AI-assistent. If the answer if close to the ideal answer, score it 1.0 (as long as the other information remains relevant). If it's wrong or not good enough score it 0. If it's partly correct score it 0.5. Motivate the score you give it."""

In [80]:
# Evaluate the semantic-chunking pipeline on one validation example.
# Question and ideal answer are read from the SAME entry; previously the
# question came from entry 1 while the ideal answer came from entry 0, so the
# evaluator compared the response against the answer to a different question.
validation_example = validation_data[1]
query = validation_example["question"]
print(query)

# Answer the question with context retrieved from the semantic-chunk store
response_s_search = generate_response(system_prompt, query, vector_store_s_search.texts, vector_store_s_search.vectors)
print(response_s_search.text)

# Give the evaluator the question, the produced answer and the ideal answer
evaluation_prompt = f"""Question: {query}
Response with semantic chunking: {response_s_search.text}
Ideal answer: {validation_example["ideal_answer"]}"""

evaluation_response = generate_response(evaluation_system_prompt, evaluation_prompt, vector_store_s_search.texts, vector_store_s_search.vectors)

print(evaluation_response.text)

What NPC's help the PC's when researching the willowshore curse?
Okay, based on the provided text, here's a breakdown of the NPCs who can assist the PCs in researching the Willowshore curse, and how they can help:

*   **You So-Jin (of Graveside Manners):**
    *   **Expertise:** Willowshore's history and forgotten local knowledge.
    *   **How she helps:** PCs spend the week talking with her, uncovering historical insights related to the curse.
    *   **Research Checks:** DC 17 Willowshore Lore or DC 19 Diplomacy.
    *   **Roleplaying:** Maximum RP 2 (opportunity for roleplaying interactions).
*   **Igawa Jubei (of Mother's Coil):**
    *   **Expertise:** Arcane knowledge, potentially possessing relevant books or insights within her collection.
    *   **How she helps:** PCs investigate at Mother's Coil, working with Igawa Jubei, speaking with her, and/or sifting through her books.
    *   **Research Checks:** DC 17 Library Lore or DC 19 Arcana.
    *   **Roleplaying:** Maximum RP 

In [81]:
# Evaluate the fixed-length-chunking pipeline on one validation example.
# Question and ideal answer are read from the SAME entry; previously the
# question came from entry 1 while the ideal answer came from entry 0, so the
# evaluator compared the response against the answer to a different question.
validation_example = validation_data[1]
query = validation_example["question"]
print(query)

# Answer the question with context retrieved from the fixed-length store
response_fixed_length = generate_response(system_prompt, query, vector_store_fixed_length.texts, vector_store_fixed_length.vectors)
print(response_fixed_length.text)

# Give the evaluator the question, the produced answer and the ideal answer
evaluation_prompt = f"""Question: {query}
Response with fixed-length chunking: {response_fixed_length.text}
Ideal answer: {validation_example["ideal_answer"]}"""

evaluation_response = generate_response(evaluation_system_prompt, evaluation_prompt, vector_store_fixed_length.texts, vector_store_fixed_length.vectors)

print(evaluation_response.text)

What NPC's help the PC's when researching the willowshore curse?
Okay, based on the provided text, here's a breakdown of the NPCs who can assist the PCs in researching the Willowshore curse:

*   **You So-Jin (of Graveside Manners):**

    *   **Expertise:** Willowshore's history and forgotten lore.
    *   **How she helps:** By speaking with her, the PCs can uncover historical information about Willowshore relevant to the curse.
    *   **Research Point (RP) Maximum:** 2
    *   **Research Checks:** DC 17 Willowshore Lore or DC 19 Diplomacy.

*   **Igawa Jubei (of Mother's Coil):**

    *   **Expertise:** Books and knowledge at Mother's Coil.
    *   **How she helps:** Allows access to her collection of books and provides insights through conversation.
    *   **RP Maximum:** 2
    *   **Research Checks:** DC 17 Library Lore or DC 19 Arcana.

*   **Great Willow (kodama):**

    *   **Expertise:** The extent/nature of the curse (having helped determine it previously).
    *   The text 