In [4]:
import nltk # natural language toolkit
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
import os
import openai
import streamlit as st

## Data Storage and Retrieval
### Loading the PDF

In [5]:
# File path for the document
file_path = r"C:\Users\PC\OneDrive\Documentos\Bootcamp\Mini project gen ai\Piranesi - Susanna Clark.pdf"


### Documents into pages

In [6]:
# Load and split the document
loader = PyPDFLoader(file_path)
pages = loader.load_and_split()
len(pages)

169

In [7]:
# Print text from the first pages
print(pages[0].page_content[:1000])  # Show first 1000 characters
print(pages[1].page_content[:1000])
print(pages[3].page_content[:1000])
print(pages[4].page_content[:1000]) 


PIRANESI
For Colin
‘I am the great scholar, the magician, the adept, who is doing the
experiment. Of course I need subjects to do it on.’
The Magician’s Nephew, C. S. Lewis
‘People call me a philosopher or a scientist or an anthropologist. I am
none of those things. I am an anamnesiologist. I study what has been
forgotten. I divine what has disappeared utterly. I work with absences, with
silences, with curious gaps between things. I am really more of a magician
than anything else.’
Laurence Arne-Sayles, interview in The Secret Garden, May 1976
CONTENTS
 
1. PART 1: PIRANESI
2. PART 2: THE OTHER
3. PART 3: THE PROPHET
4. PART 4: 16
5. PART 5: VALENTINE KETTERLEY
6. PART 6: WAVE
7. PART 7: MATTHEW ROSE SORENSEN
8. NOTE ON THE AUTHOR


### Documents into chunks

In [8]:
# Split pages into chunks
text_splitter = CharacterTextSplitter(chunk_size=20000, chunk_overlap=300)
chunks = text_splitter.split_documents(pages)
len(chunks)

169

In [9]:
# Print the length of the text content in the first chunk
print(len(chunks[0].page_content))


8


### Embeddings

In [10]:
from dotenv import load_dotenv
load_dotenv()

True

In [11]:
api_key = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
#embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [12]:
load_dotenv()  # This will load the .env file
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    print("API key not found. Please check your .env file.")
else:
    print("API key loaded successfully.")

API key loaded successfully.


### ChromaDB

In [13]:
db = Chroma.from_documents(chunks, embeddings, persist_directory="./chroma_db")
print("ChromaDB created with document embeddings.")

ChromaDB created with document embeddings.


### Retrieving Documents

In [14]:
user_question = "How did Matthew become Piranesi?" # User question
retrieved_docs = db.similarity_search(user_question, k=10) # k is the number of documents to retrieve

In [15]:
# Display top results
for i, doc in enumerate(retrieved_docs[:1]): # Display top 3 results
    print(f"Document {i+1}:\n{doc.page_content[36:1000]}") # Display content

Document 1:



### Preparing content for GenAI

In [16]:
def _get_document_prompt(docs):
    prompt = "\n"
    for doc in docs:
        prompt += "\nContent:\n"
        prompt += doc.page_content + "\n\n"
    return prompt

In [17]:
# Generate a formatted context from the retrieved documents
formatted_context = _get_document_prompt(retrieved_docs)
print("Context formatted for GPT model.")

Context formatted for GPT model.


### ChatBot Architecture

In [18]:
prompt = f"""
## SYSTEM ROLE
You are an expert on the book *Piranesi* by Sussanna Clarke. Your job is to: 
-Recognize characters, places, objects and events in the book.
-Provide detailed explanations of characters, locations and themes.
-Analyze and corelate: notice relationships between characters (e.g., when one character has multiple identities or names) and explain them.
-Answer questions about the book, including character motivations and plot points.
-Provide summaries of chapters or sections of the book.
-Explain the significance of specific quotes or passages.
-Clarify and interpret: if a user asks about symbolic meanings or hidden themes, provide analyses and interpretations.
-Provide context: if a user asks about a specific event or character, provide background information to help them understand the significance of that event or character.
If unsure, say 'I don't know' and adjuct what you think the answer is.

## USER QUESTION
The user has asked: 
"{user_question}"

## CONTEXT
Here is the relevant content from the book:  
'''
{formatted_context}
'''

## GUIDELINES
1. **Accuracy**:  
   - Character recognition: provide full identity of the characters (including alternate names), their role in the story, and any major developments.
   - Place recognition: provide full names of places, their significance in the story, and any major events that occur there.
   - If the answer cannot be found, explicitly state: "I couldn't find the answer in the book", and ask the user to provide more context or details.
   - Provide page numbers for any specific quotes or passages you reference in your answer.
   - Warn the user about speculative content: if you are unsure about a specific detail, make it clear that you are speculating and provide your reasoning.
   - Spoiler sensitivity: if the user asks about a major plot twist, warn them before reavealing the information.

2. **Transparency**:   
   - Warn the user about speculative content: if you are unsure about a specific detail, make it clear that you are speculating and provide your reasoning. 

3. **Clarity**:  
   - Your response should be clear and easy to understand.
   - Format your response in Markdown for readability.  

## TASK
1. Answer the user's question **directly** if possible.  
2. Point the user to relevant parts of the documentation.  
3. Provide the response in the following format:

## RESPONSE FORMAT
'''
# [Brief Title of the Answer]
[Answer in simple, clear text.]

**Source**:  
• [Book Title], Page(s): [...]
'''
"""
print("Prompt constructed.")

Prompt constructed.


In [19]:
# Set up GPT client and parameters
client = openai.OpenAI()
model_params = {
    'model': 'gpt-4o',
    'temperature': 0.7,  # Increase creativity
    'max_tokens': 4000,  # Allow for longer responses
    'top_p': 0.9,        # Use nucleus sampling
    'frequency_penalty': 0.5,  # Reduce repetition
    'presence_penalty': 0.6    # Encourage new topics
}

### Response

In [20]:
messages = [{'role': 'user', 'content': prompt}]
completion = client.chat.completions.create(messages=messages, **model_params, timeout=120)

In [21]:
answer = completion.choices[0].message.content
print(answer)

'''
# How Matthew Became Piranesi

Matthew Rose Sorensen becomes Piranesi as a result of his experiences in the mysterious world he finds himself in, which is referred to as the House. This transformation is both physical and psychological. Initially, Matthew is an academic who enters this strange world through the manipulations of Laurence Arne-Sayles (known as the Other). Over time, due to isolation and the unique nature of the House, Matthew loses his former identity and adopts the name Piranesi. The environment of endless halls filled with statues and oceans affects his memory and perception, leading him to forget much of his past life.

This transformation highlights themes of identity, memory, and reality within Susanna Clarke's novel. The change from Matthew to Piranesi symbolizes a loss of self but also a new way of being that is deeply connected to the environment he inhabits.

**Source**:  
• *Piranesi* by Susanna Clarke, specific pages not provided.
'''


### Run the app

In [None]:
!streamlit run app.py