### Set up GPT-4o model access

In [None]:
from os import environ
from openai import AzureOpenAI

# Gather the Azure OpenAI connection settings from the environment, then
# build the client that later cells use through the name `model`.
_azure_client_options = {
    "api_key": environ['AZURE_OPENAI_API_KEY'],
    "api_version": "2024-08-01-preview",
    "azure_endpoint": environ['AZURE_OPENAI_ENDPOINT'],
    "azure_deployment": environ['AZURE_OPENAI_MODEL_DEPLOYMENT'],
}
model = AzureOpenAI(**_azure_client_options)
del _azure_client_options  # temporary only; do not leave the key in a spare global

In [None]:
# Send a short system + user exchange to the model and keep the reply
# object in `response`.
response = model.chat.completions.create(
    model='gpt-4o',
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hi, my name is Adam, how are you?"},
    ],
)

In [None]:
# Show the message text of the first choice in `response`.
response.choices[0].message.content

In [None]:
from langchain.document_loaders import PyPDFLoader
from PyPDF2 import PdfReader
import os

In [None]:
def load_pdfs_from_folder(folder_path):
    """Extract the text of every PDF found directly inside *folder_path*.

    Files are selected by a case-insensitive ``.pdf`` suffix and visited in
    sorted name order, so repeated runs return the documents in the same
    order (``os.listdir`` makes no ordering guarantee).

    Args:
        folder_path: Directory to scan; subdirectories are not searched.

    Returns:
        A ``(documents, file_names)`` pair of equally long lists, where
        ``documents[i]`` is the concatenated page text of ``file_names[i]``.

    Raises:
        OSError: If *folder_path* cannot be listed.
    """
    documents = []
    file_names = []
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Open and read the PDF using PyPDF2
        reader = PdfReader(file_path)
        # `or ""` guards against a page yielding no text object at all
        # (NOTE(review): presumably possible for image-only pages on some
        # PyPDF2 versions — confirm); join avoids repeated string copies.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        documents.append("".join(page_texts))  # Full text of this PDF
        file_names.append(file_name)           # Matching file name
    return documents, file_names

# Example usage:
folder_path = "/workspace/Design and Construction Standards/Standards - PDF's"
pdf_documents, pdf_filenames = load_pdfs_from_folder(folder_path)

# Print the content of the first document. Indexing an empty list would
# raise IndexError, so report the empty case instead.
if pdf_documents:
    print(pdf_documents[0])
else:
    print(f"No PDF files found in {folder_path!r}")

In [None]:
# Testing by printing the second document. This needs at least two loaded
# PDFs; with fewer, indexing [1] would raise IndexError.
if len(pdf_documents) > 1:
    print(pdf_documents[1])
else:
    print(f"Only {len(pdf_documents)} document(s) loaded; no second document to print.")

In [None]:
def num_tokens_from_string(string: str, model: str = "gpt-4") -> int:
    """Return the number of tokens *string* encodes to for *model*.

    Args:
        string: Text to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        Length of the token sequence produced by the selected encoding.
    """
    # Imported locally because no visible cell imports tiktoken; without
    # this the first call fails with NameError.
    import tiktoken

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # encoding_for_model raises KeyError for model names it does not
        # recognise; fall back to a general-purpose encoding instead.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(string))

# Load the PDF texts and their file names.
folder_path = "/workspace/Design and Construction Standards/Standards - PDF's"
pdf_documents, pdf_filenames = load_pdfs_from_folder(folder_path)

# Pair every document name with its token count and order the pairs from
# the largest count to the smallest.
token_counts = sorted(
    (
        (doc_name, num_tokens_from_string(doc_content, model="gpt-4"))
        for doc_name, doc_content in zip(pdf_filenames, pdf_documents)
    ),
    key=lambda entry: entry[1],
    reverse=True,
)

# Report one line per document.
for doc_name, count in token_counts:
    print(f"Document '{doc_name}' Token Count: {count}")

In [None]:
# Summarize the per-document token counts held in `token_counts`.
counts = [count for _, count in token_counts]

# Calculate the total number of tokens across all documents
total_tokens = sum(counts)

if counts:
    # Compute the average token count
    average_tokens = total_tokens / len(counts)

    # Calculate the population variance (divides by N, not N - 1)
    variance = sum((count - average_tokens) ** 2 for count in counts) / len(counts)

    # Calculate the standard deviation
    std_dev_tokens = variance ** 0.5

    # Print the average and standard deviation
    print(f"Average Token Count: {average_tokens:.2f}, Standard Deviation: {std_dev_tokens:.2f}")
else:
    # With no documents the divisions above would raise ZeroDivisionError;
    # define neutral values and report the empty case instead.
    average_tokens = variance = std_dev_tokens = 0.0
    print("No documents to summarize: token_counts is empty.")


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Constants
# NOTE(review): none of the functions visible in this file read these two
# module-level values — confirm whether they are still needed.
BATCH_SIZE = 10  # Number of documents per batch
MAX_WORKERS = 10  # Number of threads for concurrent processing

# Function to find relevant content in a document
def find_relevant_content(question, doc_name, text_content):
    """Ask the chat model whether one document answers *question*.

    Args:
        question: The user's question, inserted into the prompt.
        doc_name: Document name, used in the printed output and the result.
        text_content: Document text, inserted into the prompt.

    Returns:
        ``(doc_name, answer)`` when the reply does not contain the phrase
        "no relevant information found" (compared case-insensitively);
        ``None`` when it does, or when any exception is raised while
        querying the model or reading its reply.
    """
    prompt = f"""
    You are given a user's question and the content of a PDF document.

    Question: "{question}"

    PDF Content:
    {text_content}

    Your task:

    - If the PDF content contains information relevant to the user's question, provide a detailed answer, including quotes from the document where appropriate.

    - If the PDF content does not contain information relevant to the question, simply reply: "No relevant information found."

    Provide your answer below.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        response = model.chat.completions.create(
            model=environ['AZURE_OPENAI_MODEL_DEPLOYMENT'],
            messages=chat_messages,
        )
        answer = response.choices[0].message.content.strip()
        print(f"Response for {doc_name}:\n{answer}\n")  # For debugging
        is_irrelevant = "no relevant information found" in answer.lower()
        return None if is_irrelevant else (doc_name, answer)
    except Exception as e:
        # Best effort: report the failure and treat the document as not relevant.
        print(f"Error with LLM API for {doc_name}: {e}")
        return None

# Function to process a batch of PDFs
def process_pdf_batch(question, pdf_batch, pdf_batch_filenames):
    """Run ``find_relevant_content`` over one batch of documents.

    Args:
        question: Question forwarded to ``find_relevant_content``.
        pdf_batch: Document texts for this batch.
        pdf_batch_filenames: File names paired positionally with *pdf_batch*.

    Returns:
        List of the truthy results returned by ``find_relevant_content``,
        in batch order.
    """
    matches = []
    for body, doc_name in zip(pdf_batch, pdf_batch_filenames):
        print(f"Processing {doc_name}")
        outcome = find_relevant_content(question, doc_name, body)
        if not outcome:
            continue  # nothing relevant (or the lookup failed) for this document
        matches.append(outcome)
    return matches

# Main function
def main(question, pdf_documents, pdf_filenames, batch_size=10):
    """Query every document with *question* and print the relevant answers.

    Args:
        question: Question passed on to ``process_pdf_batch``.
        pdf_documents: Document texts, in the same order as *pdf_filenames*.
        pdf_filenames: File names matching *pdf_documents*.
        batch_size: Number of documents handed to ``process_pdf_batch`` per
            call. Defaults to 10, the value that used to be hard-coded
            inside this function.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If *batch_size* is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    all_relevant_info = []

    # Process documents in batches
    batch_starts = range(0, len(pdf_documents), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        stop = start + batch_size
        print(f"\nProcessing batch {batch_number}")

        relevant_info = process_pdf_batch(
            question, pdf_documents[start:stop], pdf_filenames[start:stop]
        )
        all_relevant_info.extend(relevant_info)

    # Output detailed answers from relevant documents
    print("\nDetailed answers from relevant documents:")
    for doc_name, answer in all_relevant_info:
        print(f"Document: {doc_name}\nAnswer: {answer}\n")

In [None]:
# Run the lactation-room question against all loaded documents.
question = "Can a restroom be used as a lactation room?"
main(question, pdf_documents, pdf_filenames)

In [None]:
# Run the glass frit-pattern question against all loaded documents.
question = "Is there a required frit pattern on glass?"
main(question, pdf_documents, pdf_filenames)

In [None]:
# Run the gender-neutral restroom question against all loaded documents.
question = "Does my project have to provide a gender-neutral restroom?"
main(question, pdf_documents, pdf_filenames)

In [None]:
# Run the toilet-room question against all loaded documents.
question = "Does my project have to provide a toilet room?"
main(question, pdf_documents, pdf_filenames)

In [None]:
# Run the VFD-manufacturer question against all loaded documents.
# ("manufactures" in the question text was a typo for "manufacturers".)
question = "What VFD manufacturers does Cornell University allow on projects?"
main(question, pdf_documents, pdf_filenames)

In [None]:
# Run the sidewalk-lighting question against all loaded documents.
# (A stray curly closing quote at the end of the question text was removed.)
question = "What is the minimum required lighting level for sidewalks on campus?"
main(question, pdf_documents, pdf_filenames)

In [None]:
# Run the direct-burial conduit question against all loaded documents.
question = "What electrical conduit types are required for direct burial on campus?"
main(question, pdf_documents, pdf_filenames)