In [1]:
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline
from langchain import HuggingFacePipeline, PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm





Load Dataset

In [2]:
# Read the whole corpus into memory as a single string.
# errors="ignore" drops any bytes that are not valid UTF-8 instead of raising.
data_path = Path("alt.atheism.txt")
raw_text = data_path.read_text(encoding="utf-8", errors="ignore")

print(f"Dataset length: {len(raw_text)} characters")

Dataset length: 5068446 characters


Split into chunks

In [3]:
# Cut the corpus into overlapping windows so that text spanning a chunk
# boundary still appears intact in at least one chunk.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # upper bound on chunk length (characters, assuming the default length function)
    chunk_overlap=200,  # amount of text shared between consecutive chunks
)
chunks = splitter.create_documents(texts=[raw_text])

print(f"Total chunks: {len(chunks)}")



Total chunks: 6879


Embed the chunks with a free sentence-transformers model and index them in FAISS

In [4]:
# Sentence-transformers model used to turn each chunk into a vector.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed every chunk and build the FAISS index over the resulting vectors.
vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Retriever: fetch the chunks most similar to a query

In [5]:
# Expose the FAISS index as a retriever returning the 4 most similar chunks.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

# Smoke-test the retriever with a sample query.
query = "What does this dataset say about Books ?"
retrieved_docs = retriever.invoke(query)

print("Retrieved docs:", len(retrieved_docs))
# Preview only the first 500 characters of the first hit.
first_hit = retrieved_docs[0]
print(first_hit.page_content[:500])

Retrieved docs: 4
KS> When he asked for this book, the well educated American book store
KS> assistants in most placed asked him to check out the thriller section,
KS> or then they said that his book has not been published yet, but they
KS> should receive the book soon. In some places the assistants bluntly
KS> said that they don't know of such an author, or that he is not 
KS> a well known living author, so they don't keep copies of his books.

KS> Such is the life and times of America, 200+ years after the revo


LLM & Prompt

Run Retrieval + Generation

In [None]:
# Local FLAN-T5 text2text pipeline used as the generator.
qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",  # or flan-t5-base
    # Explicit generation budget: without it the pipeline falls back to the
    # library's default generation length, which cuts answers short.
    max_new_tokens=256,
)

# Wrap the transformers pipeline so it can be called like any LangChain LLM.
llm = HuggingFacePipeline(pipeline=qa_pipeline)

# Prompt that restricts the model to the retrieved context.
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided dataset context.
      If the context is insufficient, just say you don't know.

      Context:
      {context}

      Question: {question}
    """,
    input_variables=['context', 'question']
)

# Test
question = "What is the main theme discussed in these documents?"
# question = "tell about United Kingdom from the document?"

# Retrieve context for THIS question. The `retrieved_docs` variable from the
# retriever cell was fetched for a different query, so reusing it would feed
# the model passages unrelated to the question being asked.
question_docs = retriever.invoke(question)
context_text = "\n\n".join(doc.page_content for doc in question_docs)
# NOTE(review): several ~1000-character chunks may exceed the input length
# FLAN-T5 was trained on — confirm, and lower `k` or the chunk size if the
# answers degrade.

final_prompt = prompt.format(context=context_text, question=question)
answer = llm.invoke(final_prompt)

print("Answer:\n",answer)



