<a href="https://colab.research.google.com/github/Esther-Wagatwe/EASY-ML/blob/master/Retrieval_Augmented_Generation_(RAG).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Install required libraries
!pip install -q transformers sentence-transformers faiss-cpu openai
!pip install -q pdfplumber python-docx pandas

In [None]:
# STEP 2: Mount Google Drive
# Exposes the Drive contents under /content/drive so that later cells can
# read the source documents stored there.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# STEP 3: Import libraries
import faiss
import numpy as np
import pdfplumber
import pandas as pd
from docx import Document
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import os


In [None]:
# STEP 4: Set folder path in Google Drive
# Folder that the file-reading step scans for source documents (PDF, DOCX and
# CSV files). The path lives under the Drive mount point /content/drive, so
# Drive has to be mounted before this folder is listed.
folder_path = "/content/drive/MyDrive/RAG"

In [None]:
# NOTE(review): this cell repeats the Drive mount already performed in STEP 2;
# it is kept so the notebook behaves as before, but it looks redundant.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# STEP 5: Define file extractor
def extract_text(file_path, ext):
    """Return the plain text contained in a PDF, DOCX or CSV file.

    Args:
        file_path: Path of the file to read.
        ext: File extension that selects the parser: "pdf", "docx" or "csv".
            Matching ignores case, surrounding whitespace and a leading dot,
            so "PDF" and ".pdf" are accepted as well.

    Returns:
        The extracted text. An empty string is returned for unsupported
        extensions and when reading the file fails; in the failure case the
        error is printed instead of raised so that one unreadable file does
        not abort a whole folder scan.
    """
    # Normalise once so callers do not have to pre-process the extension.
    ext = str(ext or "").strip().lower().lstrip(".")
    try:
        if ext == "pdf":
            with pdfplumber.open(file_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Skip pages for which no text was extracted; every kept page is
            # terminated with a newline.
            return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        if ext == "docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)
        if ext == "csv":
            df = pd.read_csv(file_path)
            # Render the table as text without the index column.
            return df.to_string(index=False)
        return ""
    except Exception as e:
        # Deliberate best effort: report the problem and carry on.
        print(f"Error reading {file_path}: {e}")
        return ""

In [None]:
# STEP 6: Read all supported files in folder
supported_exts = ["pdf", "docx", "csv"]
extracted_texts = []

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# order of the combined text determines the chunks that are built from it.
for file in sorted(os.listdir(folder_path)):
    full_path = os.path.join(folder_path, file)
    # splitext() yields "" for names without an extension, so an entry that is
    # literally named "pdf" is not mistaken for a PDF file.
    ext = os.path.splitext(file)[1].lstrip(".").lower()
    # Skip directories whose name happens to end in a supported extension.
    if ext in supported_exts and os.path.isfile(full_path):
        print(f"Reading: {file}")
        text = extract_text(full_path, ext)
        extracted_texts.append(text + "\n")

# Join once instead of growing a string inside the loop.
raw_text = "".join(extracted_texts)

# Now raw_text contains the combined text from all files
print(f"\n Total characters extracted: {len(raw_text)}")

In [None]:
# STEP 7: Split into chunks
def chunk_text(text, max_chars=500, min_chars=50):
    """Split text into chunks of at most ``max_chars`` characters.

    The text is split on newlines and the resulting lines are packed greedily
    into chunks, joined by single spaces. Blank lines are skipped so they do
    not use up chunk budget, and a line longer than ``max_chars`` is cut into
    ``max_chars``-sized pieces so that no chunk exceeds the limit.

    Args:
        text: Text to split.
        max_chars: Maximum length of a chunk; must be positive.
        min_chars: Chunks whose length is not greater than this value are
            dropped from the result.

    Returns:
        List of chunk strings, in document order.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks, current_chunk = [], ""
    for line in text.split("\n"):
        para = line.strip()
        if not para:
            continue
        # Cut over-long lines so that a single line can never produce a chunk
        # that is larger than max_chars.
        for start in range(0, len(para), max_chars):
            piece = para[start:start + max_chars]
            if len(current_chunk) + len(piece) <= max_chars:
                current_chunk += piece + " "
            else:
                chunks.append(current_chunk.strip())
                current_chunk = piece + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    # Drop fragments that are too short to be useful as retrieval passages.
    return [chunk for chunk in chunks if len(chunk) > min_chars]

doc_chunks = chunk_text(raw_text)

# Fail early with a clear message: the embedding and index cells presumably
# cannot do anything useful with zero chunks.
if not doc_chunks:
    raise ValueError(
        "No text chunks were produced; check that folder_path contains "
        "readable PDF, DOCX or CSV files."
    )

In [None]:
# STEP 7: Embed chunks using SentenceTransformers
# Encode every chunk with the sentence encoder and stack the returned vectors
# row-wise into a single NumPy array.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
doc_embeddings = np.vstack(embedder.encode(doc_chunks, convert_to_tensor=False))

In [None]:
# STEP 8: Build FAISS (Facebook AI Similarity Search) index
# The flat L2 index is created for the embedding width (second axis of
# doc_embeddings) and then filled with all chunk vectors.
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeddings)

In [None]:
# STEP 9: Set your OpenAI API key (not needed here: the next cell generates answers with a local T5 model instead of OpenAI)


In [None]:
# Updated: RAG using local T5 model (no OpenAI)
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the tokenizer and the generator from the same FLAN-T5 checkpoint
T5_CHECKPOINT = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)
generator = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)

def rag_local_t5_answer(question, top_k=3):
    """Answer a question from the indexed chunks with the local T5 model.

    The question is embedded with ``embedder``, the ``top_k`` nearest chunks
    are looked up in ``index``, and the retrieved chunks are placed in a
    prompt from which ``generator`` produces the answer.

    Args:
        question: Question text.
        top_k: Number of nearest chunks to request from the index.

    Returns:
        The decoded model output as a string.
    """
    # Embed query and retrieve (FAISS expects float32 query vectors)
    query_embedding = np.asarray(embedder.encode([question]), dtype=np.float32)
    _, neighbor_ids = index.search(query_embedding, top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors are
    # available; indexing doc_chunks with -1 would silently pull in the last
    # chunk, so only valid positions are kept.
    passages = [doc_chunks[i] for i in neighbor_ids[0] if 0 <= i < len(doc_chunks)]

    # Combine top passages
    context = "\n".join(passages)
    input_prompt = f"""
    Based on the following context, provide a comprehensive and detailed answer to the question.
    Elaborate on the key points found in the context and explain them thoroughly in two well-written paragraphs.

    Context:
    {context}

    Question: {question}
    """

    # Tokenize and generate
    # NOTE(review): the prompt is truncated to 512 tokens and the question
    # comes last, so a very long context could cut the question off - confirm
    # that top_k and the chunk size keep the prompt within the limit.
    input_ids = tokenizer.encode(input_prompt, return_tensors="pt", truncation=True, max_length=512)
    output_ids = generator.generate(input_ids, max_length=400, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# STEP 11: Ask a Question using local T5 model
# Run one retrieval-augmented query and show the question with its answer.
question = "What is covid 19?"
answer = rag_local_t5_answer(question=question)

print("Question:", question)
print("\n Answer:\n", answer)
