In [3]:
# First cell - Load environment variables from specific path
from dotenv import load_dotenv
import os

# Path to the .env file, relative to the notebook's working directory.
ENV_PATH = r"./ragtest/.env"

# Fail fast when the file is missing. isfile() (rather than exists()) also
# rejects a directory that happens to sit at this path, since a directory
# can never be a usable .env file.
if not os.path.isfile(ENV_PATH):
    raise FileNotFoundError(f".env file not found at {ENV_PATH}")

load_dotenv(ENV_PATH)

# Verify the API key is now present in the process environment.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in .env file")

print(f"Environment variables loaded successfully from {ENV_PATH}!")


Environment variables loaded successfully from ./ragtest/.env!


In [7]:


# Second cell - Import required libraries
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from tqdm import tqdm

# Filesystem layout used by this notebook (relative to the working directory)
INPUT_DIR = "./ragtest/input"        # scanned recursively for *.txt files when building the index
INDEX_DIR = "./ragtest/faiss_index"  # where the FAISS index is saved to / loaded from
CACHE_DIR = "./cache"

# Make sure the cache directory and the parent directory of the index exist
for _required_dir in (CACHE_DIR, os.path.dirname(INDEX_DIR)):
    os.makedirs(_required_dir, exist_ok=True)

# Third cell - Create or load index
def create_or_load_index():
    """Return the FAISS vector store, loading it from disk or building it.

    If a saved index file is found under ``INDEX_DIR`` it is loaded.
    Otherwise every ``.txt`` file under ``INPUT_DIR`` is read, split into
    overlapping chunks, embedded with ``text-embedding-3-small``, and the
    resulting index is saved to ``INDEX_DIR``.

    Returns:
        The FAISS vector store.

    Raises:
        ValueError: If no documents are found under ``INPUT_DIR``, so there
            is nothing to index.
    """
    # One embeddings object for both paths: a loaded index has to be queried
    # with the same embedding model that was used to build it.
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # Look for the index file itself rather than just the directory, so an
    # empty or half-written directory triggers a rebuild instead of a failed
    # load. "index.faiss" is the file name save_local() writes by default.
    index_file = os.path.join(INDEX_DIR, "index.faiss")
    if os.path.exists(index_file):
        print("Loading existing index...")
        # SECURITY: load_local() unpickles the stored docstore and refuses to
        # run unless this flag is set. That is acceptable here only because
        # the index is written by this notebook; never point INDEX_DIR at
        # files from an untrusted source.
        return FAISS.load_local(
            INDEX_DIR,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    print("Creating new index...")
    loader = DirectoryLoader(
        INPUT_DIR,
        glob="**/*.txt",
        loader_cls=TextLoader,
        loader_kwargs={'autodetect_encoding': True}
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} documents")

    # Building a FAISS index from zero chunks fails with an unhelpful error;
    # report the real problem instead.
    if not documents:
        raise ValueError(f"No .txt documents found in {INPUT_DIR}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    splits = text_splitter.split_documents(documents)
    print(f"Split into {len(splits)} chunks")

    print("Creating vector store...")
    vector_store = FAISS.from_documents(splits, embeddings)
    vector_store.save_local(INDEX_DIR)
    print(f"Index saved to {INDEX_DIR}")

    return vector_store

# Build the vector store, or reuse the one already saved on disk
vector_store = create_or_load_index()

# Fourth cell - Setup QA chain
# Prompt with two placeholders: {context} and {question}.
prompt_template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

Context: {context}

Question: {question}

Answer: Let me answer based on the specific context provided."""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Chat model that writes the final answer (temperature 0)
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Retriever over the index, configured with k=3 and fetch_k=5
retriever = vector_store.as_retriever(search_kwargs={"k": 3, "fetch_k": 5})

# "stuff" chain with the custom prompt. verbose=True appears to be what makes
# the chain echo the fully formatted prompt into the cell output on each call.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT, "verbose": True},
)

# Fifth cell - Helper function to get document stats
def get_document_stats(store=None, sample_size=5):
    """Print the number of chunks in an index and a sample of chunk IDs.

    Args:
        store: Vector store to inspect. It must expose ``docstore._dict``,
            a mapping keyed by chunk ID. Defaults to the module-level
            ``vector_store``.
        sample_size: Maximum number of chunk IDs to list (default 5).
            Values below zero are treated as zero.

    Returns:
        None. The statistics are written to standard output.
    """
    if store is None:
        store = vector_store

    # NOTE: _dict is a private attribute of the docstore; it is read here
    # because it is the mapping this function has always inspected.
    chunks_by_id = store.docstore._dict

    print("\nIndex Statistics:")
    print(f"Total number of chunks in index: {len(chunks_by_id)}")

    print("\nSample of first few document chunks (IDs):")
    # `chunk_id` rather than `id`, so the builtin id() is not shadowed.
    for chunk_id in list(chunks_by_id)[:max(sample_size, 0)]:
        print(f"- {chunk_id}")

# Show initial stats: print the chunk count and the first few chunk IDs
# of the index held in `vector_store`.
get_document_stats()

# Sixth cell - Function to ask a single question
def ask_question(question: str):
    """Run one question through the QA chain and return the answer text.

    Args:
        question: The question to pass to ``qa_chain``.

    Returns:
        The ``'result'`` entry of the chain's response. If anything raises
        while the chain runs, the exception is not propagated; a string of
        the form ``"Error processing question: <message>"`` is returned
        instead.
    """
    try:
        response = qa_chain.invoke(question)
        answer = response['result']
    except Exception as exc:
        # Best-effort: report the failure as text rather than raising.
        return "Error processing question: " + str(exc)
    return answer



Creating new index...
Loaded 10 documents
Split into 307 chunks
Creating vector store...
Index saved to ./ragtest/faiss_index

Index Statistics:
Total number of chunks in index: 307

Sample of first few document chunks (IDs):
- 41af3c29-02ac-4d97-89b2-6c53b6c72fc4
- e0abd438-468f-4258-8081-785a14210599
- 50515279-8b79-4568-b83a-0bf4a5f8b640
- ee357d3e-45ac-46c9-ac84-4349e0d4bb1f
- e66d7bab-4a37-4bbe-801b-2b47b5729be0


In [8]:
# Example usage: run one question through the QA chain and print what comes
# back (the answer text, or the error string ask_question returns on failure).
answer = ask_question("What topics are covered in these podcasts?")
print(answer)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

Context: I think there are also a bunch of folks like me, where it's going to be all kinds of nostalgic.  You know just, "Oh my God, I remember that."  Like the walk down memory lane. 
What I want people to get out of the show is inspiration.  I want them to be a little more informed about the history of computing, and what's happening now behind the scenes they might not otherwise see.  And I think all of that's important because technology is such a huge part of all of our daily lives, that even those of us who aren't engineers should understand a little bit more about what's going on behind the tech that is suffusing our lives. 
I'm Kevin Scott, Chief Technology Officer at Microsoft.  In 

In [9]:
# The name is written as two words, "Bill Gates", which is how the indexed
# transcripts spell it.
answer = ask_question("What are the episodes that talk about Bill Gates?")
print(answer)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

Context: Behind the Tech with Kevin Scott Podcast
Episode 52 – Bill Gates

BILL GATES: It was stunning, it was mind blowing. After the biology question I had them type in, “What do you say to a father with a sick child?” And it gave this very careful, excellent answer that was perhaps better than any of us in the room might have given. And so, it was like, wow, what is it – what is the scope of this thing? Because this is way better.
[MUSIC]

KEVIN SCOTT: Hi, everyone. Welcome to Behind the Tech. I’m your host, Kevin Scott, Chief Technology Officer for Microsoft. 

In this podcast, we’re going to get behind the tech. We’ll talk with some of the people who have made our modern tech world poss