In [1]:
import databutton as db

import re
import time
from io import BytesIO
from typing import Any, Dict, List
import pickle

from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain_openai import OpenAI
from pypdf import PdfReader
from typing import Tuple
import faiss

1. Parse the PDF and split it into chunks

In [2]:
def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    """Extract the text of every page of a PDF and normalise its whitespace.

    Args:
        file: Binary file-like object handed to ``PdfReader``.
        filename: Passed through unchanged as the second element of the result.

    Returns:
        A ``(pages, filename)`` tuple where ``pages`` holds one cleaned string
        per page, in page order.
    """

    def _tidy(raw: str) -> str:
        # Re-join words that were hyphenated across a line break.
        joined = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw)
        # Turn a newline into a space unless it is preceded by newline+whitespace
        # or followed by whitespace+newline (i.e. it belongs to a blank-line gap).
        flattened = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", joined.strip())
        # Collapse any blank-line gap to exactly two newlines.
        return re.sub(r"\n\s*\n", "\n\n", flattened)

    reader = PdfReader(file)
    pages_text = [_tidy(page.extract_text()) for page in reader.pages]
    return pages_text, filename

def text_to_docs(text: List[str], filename: str) -> List[Document]:
    """Split per-page text into chunk-level ``Document`` objects.

    Args:
        text: One string per page, in page order. A bare string is accepted
            and treated as a single page.
        filename: Stored under the ``"filename"`` metadata key of every chunk.

    Returns:
        A flat list of documents ordered by page, then by chunk. Each document's
        metadata holds ``page`` (1-based position in ``text``), ``chunk``
        (0-based index within its page), ``source`` (``"<page>-<chunk>"``) and
        ``filename``.
    """
    # Accept a bare string by treating it as a one-page document.
    if isinstance(text, str):
        text = [text]

    # The splitter configuration is the same for every page, so build it once
    # instead of once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_number, page_text in enumerate(text, start=1):
        chunks = text_splitter.split_text(page_text)
        for chunk_index, chunk in enumerate(chunks):
            # Distinct names for the page and the chunk avoid rebinding the
            # outer loop variable from inside the inner loop.
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                        "filename": filename,
                    },
                )
            )

    return doc_chunks

2. OpenAI Info

In [3]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

In [4]:
# Load whichever .env file find_dotenv() locates, then read the OpenAI key from
# the process environment so it is never hard-coded in the notebook.
# openai_api_key is None when the variable is not set.
load_dotenv(find_dotenv())
openai_api_key = os.environ.get('OPENAI_API_KEY')

3. Indexing

In [5]:
def docs_to_index(docs, openai_api_key):
    """Build a FAISS index over ``docs`` using OpenAI embeddings.

    Args:
        docs: Documents forwarded to ``FAISS.from_documents``.
        openai_api_key: API key handed to ``OpenAIEmbeddings``.

    Returns:
        Whatever ``FAISS.from_documents`` returns for the given documents.
    """
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_documents(docs, embedder)

4. Implementation

In [6]:
# Source PDF to index.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs elsewhere.
pdf = (
    '/teamspace/studios/this_studio/app/Formularios/2024_CIE10ES_Tomo_I_Diagnosticos_recortado (1).pdf'
)

In [7]:
import PyPDF2

# Read the whole PDF into memory so it can be passed on as a file-like object.
with open(pdf, 'rb') as pdf_file:
    pdf_bytes = BytesIO(pdf_file.read())

# Extract cleaned per-page text, then split it into chunk-level documents.
# The full path doubles as the filename recorded in each chunk's metadata.
cleaned_texts, filename = parse_pdf(pdf_bytes, pdf)
documents = text_to_docs(cleaned_texts, filename)

# Show every chunk with its page number for a visual sanity check.
print(f"\nCreated Documents from '{filename}':")
for i, doc in enumerate(documents, start=1):
    print(
        f"\nDocument {i}:",
        f"  Page Number: {doc.metadata['page']}",
        f"  Content:\n{doc.page_content}",
        sep="\n",
    )


Created Documents from '/teamspace/studios/this_studio/app/Formularios/2024_CIE10ES_Tomo_I_Diagnosticos_recortado (1).pdf':

Document 1:
  Page Number: 1
  Content:
- conglobata L70.1 - decalvante L66.2 - especificado NCOC L70.8 - excoriado (de las niñas) L70.5 - frontalis L70.2 - indurada L70.0 - infantil L70.4 - lupoide L70.2 - necrótico, necrótica (miliar) L70.2 - neonatal L70.4 - nodular L70.0 - ocupacional L70.8 - pustuloso L70.0 - queloide L73.0 - quístico L70.0 - rodens L70.2 - rosácea L71.9 - trópica L70.3 - varioliforme L70.2 - vulgaris L70.0 Acnitis (primaria) A18.4 Acodadura, acodamiento - arteria I77.1 - íleon o intestino - véase Obstrucción, intestino - Lane, de - véase Obstrucción, intestino - órgano o localización, congénita NCOC - véase Anomalía, por localización - pelo (adquirida) L67.8 - uréter (unión pélvica) N13.5 - - con - - - hidronefrosis N13.1 - - - - con infección N13.6 - - - pielonefritis (crónica) N11.1 - - congénita Q62.39 - vena (-s) I87.8 - - cava I87.1 -

In [8]:
# Embed every chunk and build the FAISS index from the result.
index = docs_to_index(docs=documents, openai_api_key=openai_api_key)
print("Search index created!")

Search index created!


In [9]:
# Persist the index to disk under the given folder and index name.
# NOTE(review): absolute, machine-specific folder path.
index.save_local(
    folder_path="/teamspace/studios/this_studio/Curso_IA_Gen/Formularios",
    index_name="myFaissIndex",
)

In [10]:
# Embeddings client handed to FAISS.load_local in the next cell.
# NOTE(review): presumably this must be the same embedding model that was used
# when the index was built — confirm.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [11]:
# Reload the index that was saved under the same folder and index name.
# NOTE(review): rebinding `db` hides the `databutton` module imported as `db`
# at the top of the notebook; a distinct name would avoid the clash.
# NOTE(review): allow_dangerous_deserialization=True opts in to a load path the
# library flags as unsafe for untrusted files — confirm this folder only ever
# contains indexes written by this notebook.
db = FAISS.load_local(
    folder_path="/teamspace/studios/this_studio/Curso_IA_Gen/Formularios",
    embeddings=embeddings,
    index_name="myFaissIndex",
    allow_dangerous_deserialization=True,
)