<a href="https://colab.research.google.com/github/AmirJlr/LLMs/blob/master/05_Simple_RAG_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages with a specific version for huggingface_hub to ensure compatibility
!pip install -q langchain faiss-cpu transformers sentence-transformers huggingface-hub langchain-community

In [None]:
import os
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline

In [None]:
### Read the Hugging Face READ access token from Colab secrets
### (it must be stored there beforehand under the name HF_TOKEN)
from google.colab import userdata

hf_token = userdata.get("HF_TOKEN")

# Export the token through the environment for the rest of the notebook
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
print("Token set successfully!")

In [None]:
# Create a directory for our data.
# Pure Python instead of a shell escape, so the cell also works outside
# IPython and on Windows; exist_ok=True makes re-running the cell harmless.
os.makedirs("documents", exist_ok=True)

# Sample knowledge-base text for the demo: a news article about the Forbes
# ranking of the most valuable football clubs, followed by a top-10 table.
# The newline right after the opening quotes and the one before the closing
# quotes are part of the string and end up in the written file.
file_content = """
Real Madrid named Forbes' most valuable football club in the world for the fourth successive year
Our club tops the list with a value of 6.75 billion dollars, and is the first football team to eclipse 1 billion dollars in revenues.


Real Madrid named Forbes' most valuable football club in the world for the fourth successive year
NEWS.31/05/2025
Real Madrid has been named the most valuable football club in the world for the fourth consecutive year, and the ninth time in the last twelve editions of the list drawn up by Forbes, which values the club at 6.75 billion dollars.
They're followed by Manchester United (6.6 billion dollars). The prestigious publication has released its annual report, which reveals our club's value has risen 2% compared to last year.

Forbes highlights the fact that Real Madrid enjoyed revenues of 1.13 billion dollars in the 2023/24 season, making them the first football team ever to break the 1 billion dollar mark.
The magazine also underlines Real Madrid's Champions League win in 2023/24, as well as the club's international appeal and commercial partnerships.
Furthermore, they highlight how, following the completion of the Santiago Bernabéu, the club expects to increase its matchday revenues, including in ticket sales.


POSITION	                  CLUB	VALUE  IN BILLIONS OF DOLLARS
1	Real Madrid	                         6.75
2	Manchester United	                 6.60
3	F. C. Barcelona	                     5.65
4	Liverpool	                         5.40
5	Manchester City	                     5.30
6	Bayern Munich	                     5.10
7	PSG	                                 4.60
8	Arsenal	                             3.40
9	Tottenham	                         3.30
10	Chelsea	                             3.25
"""

# Persist the sample text inside the documents directory so the loader
# defined later in the notebook can pick it up.
with open("documents/ReadMadrid.txt", "w") as sample_file:
    sample_file.write(file_content)

print("Sample file 'ReadMadrid.txt' created.")

## Step 2: Load and Chunk Documents

In [None]:
# 1. Load documents from directory (you can add your text files here)
def load_documents(directory_path="./documents"):
    """Load every ``.txt`` file found under ``directory_path``.

    Args:
        directory_path: Folder to scan (recursively, via the ``**/*.txt`` glob).

    Returns:
        The documents produced by ``DirectoryLoader.load()``. If the folder
        does not exist yet, it is created, a hint is printed and an empty
        list is returned.
    """
    if os.path.exists(directory_path):
        txt_loader = DirectoryLoader(directory_path, glob="**/*.txt", loader_cls=TextLoader)
        return txt_loader.load()

    # First run: prepare the folder so the user has a place to drop files into.
    os.makedirs(directory_path)
    print(f"Created {directory_path} directory. Add your text files here!")
    return []

In [None]:
# 2. Split documents into chunks
def create_chunks(documents, chunk_size=500, chunk_overlap=50):
    """Split documents into smaller, overlapping chunks for better retrieval.

    Args:
        documents: Loaded documents, e.g. the list returned by ``load_documents``.
        chunk_size: Upper bound on the size of each chunk as measured by the
            splitter. Defaults to 500 (the value previously hard-coded).
        chunk_overlap: Amount of text repeated between consecutive chunks, so
            content cut at a chunk boundary still appears whole in one of
            them. Defaults to 50 (the value previously hard-coded).

    Returns:
        The chunked documents returned by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

## Step 3: Create Vector Embeddings and Store in FAISS

In [None]:
# 3. Create vector store with embeddings
def create_vector_store(chunks):
    """Embed the document chunks and index them in a FAISS vector store.

    Args:
        chunks: Chunked documents, e.g. the output of ``create_chunks``.

    Returns:
        The store built by ``FAISS.from_documents`` using the
        ``sentence-transformers/all-MiniLM-L6-v2`` embedding model.
    """
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    store = FAISS.from_documents(chunks, embedding_model)
    return store

## Step 4: Create the Q&A Chain

In [None]:
# 4. Setup LLM and QA chain
def setup_qa_chain(vector_store):
    """Build the retrieval question-answering chain on top of a vector store.

    Args:
        vector_store: Store exposing ``as_retriever`` (e.g. the FAISS store
            returned by ``create_vector_store``).

    Returns:
        The chain created by ``RetrievalQA.from_chain_type`` with the
        ``"stuff"`` chain type, a local ``allenai/unifiedqa-t5-base``
        text2text pipeline as LLM and a retriever limited to one result.
    """
    # Use a lightweight model for quick testing.
    # `temperature` is deliberately not passed: sampling is not enabled
    # (no do_sample=True), so generation is greedy and transformers would
    # ignore the value and only emit a warning. To actually sample, pass
    # do_sample=True together with a temperature.
    llm_pipeline = pipeline(
        "text2text-generation",
        model="allenai/unifiedqa-t5-base",
        max_length=512,
        device_map="auto",
    )
    llm = HuggingFacePipeline(pipeline=llm_pipeline)

    # Create retrieval QA chain
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        # k=1: only the single best-matching chunk is handed to the model.
        retriever=vector_store.as_retriever(search_kwargs={"k": 1}),
    )

## Step 5: Main chatbot function

In [None]:
def main():
    """Build the RAG pipeline and run an interactive question/answer loop.

    Loads the documents, chunks them, builds the vector store and the QA
    chain, then prompts for questions until the user enters 'quit'
    (case-insensitive, surrounding whitespace ignored). Blank input is
    skipped instead of being sent to the model. Returns early with a hint
    when no documents are found.
    """
    print("🧠 Loading your documents...")
    documents = load_documents()

    if not documents:
        print("No documents found! Add .txt files to ./documents directory")
        return

    print("📄 Creating document chunks...")
    chunks = create_chunks(documents)

    print("🔍 Building vector store...")
    vector_store = create_vector_store(chunks)

    print("🤖 Setting up Q&A chain...")
    qa_chain = setup_qa_chain(vector_store)

    print("\n✅ Chatbot ready! Type 'quit' to exit")

    while True:
        # Strip the input so "quit " or " Quit" also end the session.
        question = input("\n❓ Ask a question: ").strip()
        if question.lower() == "quit":
            break
        if not question:
            # Nothing was asked; prompt again rather than querying the model.
            continue

        # `Chain.run` is deprecated; `invoke` takes the chain's input mapping
        # ("query") and returns a mapping whose "result" entry is the answer.
        response = qa_chain.invoke({"query": question})
        print(f"🤖 Answer: {response['result']}")

In [None]:
# Run the chatbot: builds the pipeline, then prompts for questions until
# 'quit' is entered (returns right away if no documents are found).
main()