In [2]:
# Cargar librerías necesarias
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Limpieza de texto
file_path = r"D:\Documentos\Libros\The Project Gutenberg eBook of The Strange Case of Dr. Jekyll and Mr. Hyde.txt"

with open(file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

In [4]:
# Filtrar solo contenido
start = raw_text.find("STORY OF THE DOOR")
end = raw_text.find("*** END OF THE PROJECT GUTENBERG EBOOK")
clean_text = raw_text[start:end]

In [5]:
# Dividir el texto (chunks de 500)
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.create_documents([clean_text])

Created a chunk of size 1254, which is longer than the specified 500
Created a chunk of size 1128, which is longer than the specified 500
Created a chunk of size 802, which is longer than the specified 500
Created a chunk of size 767, which is longer than the specified 500
Created a chunk of size 4319, which is longer than the specified 500
Created a chunk of size 598, which is longer than the specified 500
Created a chunk of size 530, which is longer than the specified 500
Created a chunk of size 566, which is longer than the specified 500
Created a chunk of size 512, which is longer than the specified 500
Created a chunk of size 1946, which is longer than the specified 500
Created a chunk of size 719, which is longer than the specified 500
Created a chunk of size 2232, which is longer than the specified 500
Created a chunk of size 1068, which is longer than the specified 500
Created a chunk of size 1238, which is longer than the specified 500
Created a chunk of size 573, which is lon

In [6]:
# Crear los embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(docs, embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [7]:
# Cargar modelo FLAN-T5 optimizado para QA
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

In [8]:
# Crear pipeline de Hugging Face
hf_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)

Device set to use cpu


In [9]:
# Crear LLM para LangChain usando el pipeline
llm = HuggingFacePipeline(pipeline=hf_pipeline)

  llm = HuggingFacePipeline(pipeline=hf_pipeline)


In [10]:
# Construir la cadena RAG
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    return_source_documents=True
)

In [None]:
# Realizar una consulta al sistema RAG
query = "What happens to Dr. Jekyll?"
result = qa_chain.invoke({"query": query})

print("Respuesta generada:", result["result"])

print("\nDocumentos fuente utilizados:")
for doc in result["source_documents"]:
    print(doc.page_content[:500])

Respuesta generada: He is ill

Documentos fuente utilizados:
DR. JEKYLL WAS QUITE AT EASE
“I have had a shock,” he said, “and I shall never recover. It is a
question of weeks. Well, life has been pleasant; I liked it; yes, sir,
I used to like it. I sometimes think if we knew all, we should be more
glad to get away.”

“Jekyll is ill, too,” observed Utterson. “Have you seen him?”
“You know I never approved of it,” pursued Utterson, ruthlessly
disregarding the fresh topic.

“My will? Yes, certainly, I know that,” said the doctor, a trifle
sharply. “You have told me so.”

“Well, I tell you so again,” continued the lawyer. “I have been
learning something of young Hyde.”

The large handsome face of Dr. Jekyll grew pale to the very lips, and
there came a blackness about his eyes. “I do not care to hear more,”
said he. “This is a matter I thought we had agreed to drop.”
“So you found it out, did you?” said Utterson. “But if that be so, we
may step into the court and take a look at the windows.

In [11]:
# Preguntas
questions = [
    "Who is the main character of the book?",
    "What is the book about?",
    "What is the main theme?",
    "What happens to Dr. Jekyll?",
    "Who is Mr. Hyde?"
    ]

In [12]:
# Respuestas
for i, question in enumerate(questions):
    print(f"Pregunta {i+1}: {question}")
    result = qa_chain.invoke({"query": question})
    print("Respuesta:")
    print(result["result"])

Token indices sequence length is longer than the specified maximum sequence length for this model (797 > 512). Running this sequence through the model will result in indexing errors


Pregunta 1: Who is the main character of the book?
Respuesta:
Edward Hyde
Pregunta 2: What is the book about?
Respuesta:
a lawyer
Pregunta 3: What is the main theme?
Respuesta:
The doctor's case
Pregunta 4: What happens to Dr. Jekyll?
Respuesta:
He is ill
Pregunta 5: Who is Mr. Hyde?
Respuesta:
a person of small stature
