# Environment Setup

In [2]:
# Install required packages
!pip install -q unsloth transformers accelerate torch sentencepiece xformers
!pip install -q chromadb pypdf sentence-transformers langchain

print("✅ Environment setup complete")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.2/378.2 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/122.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m808.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m849.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m140.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

# Load 4-bit Quantized Model

In [3]:
from unsloth import FastLanguageModel
import torch

# Clear GPU memory first
torch.cuda.empty_cache()

# Load dynamic 4-bit model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",  # DYNAMIC 4-bit
    max_seq_length=2048,  # Longer for RAG context
    dtype=None,  # Auto-detect optimal dtype
    load_in_4bit=True,  # ENABLES 4-bit quantization
    token=None,
    device_map="auto",  # Automatically manages GPU memory
)

print("✅ 4-bit quantized model loaded successfully")

# Memory check
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated() / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"📊 Memory: {allocated:.2f}GB / {total:.2f}GB ({allocated/total*100:.1f}%)")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.10: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

✅ 4-bit quantized model loaded successfully
📊 Memory: 5.72GB / 15.83GB (36.1%)


# Load Domain-Specific Documents

In [8]:
# Upload PDF from your computer
from google.colab import files
uploaded = files.upload()  # Select medical PDF

Saving cornona.pdf to cornona.pdf


#  Create Vector Database

In [10]:
# ========== STEP 4: VECTOR DATABASE SETUP (FIXED) ==========

import chromadb
from sentence_transformers import SentenceTransformer
import numpy as np

print("🚀 Step 4: Creating Vector Database...")

# 4.1 Initialize ChromaDB with CORRECT syntax
chroma_client = chromadb.PersistentClient(path="./rag_database")

# 4.2 Create collection (fixed syntax)
try:
    collection = chroma_client.get_collection("medical_documents")
    print("⚠️ Collection already exists, using existing")
except:
    collection = chroma_client.create_collection(
        name="medical_documents",
        metadata={"description": "Medical RAG database"}
    )
    print("✅ New collection created")

# 4.3 Load embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
print(f"✅ Embedding model loaded: {embed_model.get_sentence_embedding_dimension()} dimensions")

# 4.4 Create document chunks (use your medical_docs from Step 3)
def create_chunks(documents, chunk_size=300, overlap=50):
    """Split documents into overlapping chunks"""
    chunks = []
    chunk_ids = []

    for doc_idx, doc in enumerate(documents):
        words = doc.split()
        for i in range(0, len(words), chunk_size - overlap):
            chunk = ' '.join(words[i:i + chunk_size])
            chunks.append(chunk)
            chunk_ids.append(f"doc_{doc_idx}_chunk_{i//chunk_size}")

    return chunks, chunk_ids

# Assuming you have 'medical_docs' from Step 3
if 'medical_docs' not in locals():
    # Create sample docs if not loaded yet
    medical_docs = [
        "Diabetes: Metformin 500mg twice daily.",
        "Hypertension: Target BP <130/80 mmHg."
    ]

chunks, chunk_ids = create_chunks(medical_docs)
print(f"✅ Created {len(chunks)} chunks from {len(medical_docs)} documents")

# 4.5 Generate embeddings
print("Generating embeddings...")
embeddings = embed_model.encode(chunks)
print(f"✅ Generated {len(embeddings)} embeddings (shape: {embeddings.shape})")

# 4.6 Add to database (fixed syntax)
collection.add(
    embeddings=embeddings.tolist(),
    documents=chunks,
    ids=chunk_ids
)

print(f"✅ Database populated: {collection.count()} chunks")
print("🎯 Step 4 completed successfully!")
print(f"📊 Collection info: {collection.name} with {collection.count()} items")

🚀 Step 4: Creating Vector Database...
✅ New collection created


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embedding model loaded: 384 dimensions
✅ Created 3 chunks from 3 documents
Generating embeddings...
✅ Generated 3 embeddings (shape: (3, 384))
✅ Database populated: 3 chunks
🎯 Step 4 completed successfully!
📊 Collection info: medical_documents with 3 items


# Build Retrieval System

In [11]:
def retrieve_relevant_documents(query, top_k=3):
    """
    Retrieve most relevant document chunks for a query
    Fulfills: "retrieval logic to fetch relevant chunks"
    """
    print(f"\n🔍 Retrieving documents for query: '{query}'")

    # Step 1: Convert query to embedding
    query_embedding = embed_model.encode([query])

    # Step 2: Query vector database
    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

    # Step 3: Return formatted results
    retrieved_chunks = results['documents'][0]
    similarities = results['distances'][0]
    metadatas = results['metadatas'][0]

    print(f"✅ Retrieved {len(retrieved_chunks)} relevant chunks")

    # Show what was retrieved
    for i, (chunk, similarity) in enumerate(zip(retrieved_chunks, similarities)):
        print(f"\n  Chunk {i+1} (similarity: {1-similarity:.3f}):")
        print(f"  {chunk[:150]}...")

    return retrieved_chunks, similarities, metadatas

# Test retrieval
test_query = "What is the treatment for diabetes?"
chunks, similarities, metadata = retrieve_relevant_documents(test_query)


🔍 Retrieving documents for query: 'What is the treatment for diabetes?'
✅ Retrieved 3 relevant chunks

  Chunk 1 (similarity: -0.024):
  DIABETES MANAGEMENT GUIDELINES Diagnosis: Fasting glucose ≥126 mg/dL or HbA1c ≥6.5% Treatment: Metformin first-line, 500mg twice daily Monitoring: HbA...

  Chunk 2 (similarity: -0.427):
  HYPERTENSION PROTOCOL Classification: Normal <120/80, Stage 1: 130-139/80-89 Medications: ACE inhibitors for patients under 55 Lifestyle: Reduce sodiu...

  Chunk 3 (similarity: -0.668):
  ASTHMA EMERGENCY PLAN Quick-relief: Albuterol inhaler, 2 puffs every 4 hours Preventive: Inhaled corticosteroids daily Triggers: Avoid pollen, dust, c...


# Create RAG Pipeline

In [12]:
def format_rag_prompt(query, retrieved_chunks):
    """
    Create prompt with retrieved context
    Fulfills: "uses the quantized LLM to generate grounded responses"
    """
    # Combine retrieved chunks into context
    context = "\n\n".join([f"[Document {i+1}]: {chunk}"
                          for i, chunk in enumerate(retrieved_chunks)])

    # RAG prompt template
    prompt = f"""You are a medical assistant. Answer the question using ONLY the provided context.

CONTEXT INFORMATION:
{context}

QUESTION: {query}

INSTRUCTIONS:
1. Answer based ONLY on the context above
2. If the answer is not in the context, say "I cannot answer based on provided information"
3. Reference which document you're using (e.g., "According to Document 1...")
4. Be precise and medically accurate

ANSWER:"""

    return prompt

def generate_rag_response(query, max_new_tokens=300):
    """
    Complete RAG pipeline: Retrieve → Format → Generate
    Fulfills: "RAG framework that retrieves information based on user queries"
    """
    print(f"\n" + "="*60)
    print(f"🚀 PROCESSING QUERY: {query}")
    print("="*60)

    # 1. RETRIEVE
    chunks, similarities, _ = retrieve_relevant_documents(query)

    # 2. FORMAT
    prompt = format_rag_prompt(query, chunks)
    print(f"\n📝 Prompt length: {len(prompt)} characters")

    # 3. GENERATE (using 4-bit model)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1800)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    print("🧠 Generating response with 4-bit model...")

    with torch.no_grad():  # Saves memory during inference
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # 4. DECODE
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the answer part
    if "ANSWER:" in full_response:
        answer = full_response.split("ANSWER:")[-1].strip()
    else:
        answer = full_response

    return answer, chunks, prompt

# Test the complete RAG pipeline
test_queries = [
    "What medication is used for diabetes?",
    "How to manage asthma emergency?",
    "What is normal blood pressure?"
]

for query in test_queries:
    answer, chunks, _ = generate_rag_response(query)
    print(f"\n💡 ANSWER: {answer}")
    print(f"📊 Retrieved {len(chunks)} context chunks")
    print("-"*60)


🚀 PROCESSING QUERY: What medication is used for diabetes?

🔍 Retrieving documents for query: 'What medication is used for diabetes?'
✅ Retrieved 3 relevant chunks

  Chunk 1 (similarity: -0.078):
  DIABETES MANAGEMENT GUIDELINES Diagnosis: Fasting glucose ≥126 mg/dL or HbA1c ≥6.5% Treatment: Metformin first-line, 500mg twice daily Monitoring: HbA...

  Chunk 2 (similarity: -0.347):
  HYPERTENSION PROTOCOL Classification: Normal <120/80, Stage 1: 130-139/80-89 Medications: ACE inhibitors for patients under 55 Lifestyle: Reduce sodiu...

  Chunk 3 (similarity: -0.680):
  ASTHMA EMERGENCY PLAN Quick-relief: Albuterol inhaler, 2 puffs every 4 hours Preventive: Inhaled corticosteroids daily Triggers: Avoid pollen, dust, c...

📝 Prompt length: 1166 characters
🧠 Generating response with 4-bit model...

💡 ANSWER: Metformin is used as a first-line medication for diabetes.
📊 Retrieved 3 context chunks
------------------------------------------------------------

🚀 PROCESSING QUERY: How to manag

# Memory Optimization & Monitoring

In [13]:
def monitor_memory_usage(step_name=""):
    """Monitor and report GPU memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        max_allocated = torch.cuda.max_memory_allocated() / 1e9

        print(f"\n📊 MEMORY USAGE {step_name}:")
        print(f"  Current allocated: {allocated:.2f} GB")
        print(f"  Current reserved:  {reserved:.2f} GB")
        print(f"  Peak allocated:    {max_allocated:.2f} GB")

        # Calculate efficiency
        total = torch.cuda.get_device_properties(0).total_memory / 1e9
        efficiency = (allocated / total) * 100

        print(f"  Total GPU memory:  {total:.1f} GB")
        print(f"  Memory efficiency: {efficiency:.1f}% used")

        return allocated
    return 0

def optimize_memory():
    """Apply memory optimization techniques"""
    print("\n🔧 Applying memory optimizations...")

    # 1. Clear cache
    torch.cuda.empty_cache()
    print("  ✓ Cleared GPU cache")

    # 2. Set precision
    torch.set_float32_matmul_precision('medium')
    print("  ✓ Set optimized precision")

    # 3. Monitor after optimizations
    monitor_memory_usage("after optimization")

    return True

# Test memory usage at different stages
print("="*60)
print("MEMORY OPTIMIZATION DEMONSTRATION")
print("="*60)

# Initial state
monitor_memory_usage("initial")

# After model loading
print("\n📈 After loading 4-bit model:")
monitor_memory_usage("model loaded")

# During RAG query
print("\n📈 During RAG query processing:")
answer, chunks, _ = generate_rag_response("Explain diabetes treatment")
monitor_memory_usage("after RAG query")

# After optimization
optimize_memory()

MEMORY OPTIMIZATION DEMONSTRATION

📊 MEMORY USAGE initial:
  Current allocated: 5.93 GB
  Current reserved:  6.20 GB
  Peak allocated:    7.25 GB
  Total GPU memory:  15.8 GB
  Memory efficiency: 37.5% used

📈 After loading 4-bit model:

📊 MEMORY USAGE model loaded:
  Current allocated: 5.93 GB
  Current reserved:  6.20 GB
  Peak allocated:    7.25 GB
  Total GPU memory:  15.8 GB
  Memory efficiency: 37.5% used

📈 During RAG query processing:

🚀 PROCESSING QUERY: Explain diabetes treatment

🔍 Retrieving documents for query: 'Explain diabetes treatment'
✅ Retrieved 3 relevant chunks

  Chunk 1 (similarity: -0.100):
  DIABETES MANAGEMENT GUIDELINES Diagnosis: Fasting glucose ≥126 mg/dL or HbA1c ≥6.5% Treatment: Metformin first-line, 500mg twice daily Monitoring: HbA...

  Chunk 2 (similarity: -0.514):
  HYPERTENSION PROTOCOL Classification: Normal <120/80, Stage 1: 130-139/80-89 Medications: ACE inhibitors for patients under 55 Lifestyle: Reduce sodiu...

  Chunk 3 (similarity: -0.872):


True

# Use TinyLlama (1.1B parameters)

In [18]:
import torch
import gc
from unsloth import FastLanguageModel
import chromadb
from sentence_transformers import SentenceTransformer

# ========== CLEAR EVERYTHING FIRST ==========
gc.collect()
torch.cuda.empty_cache()

print("🧹 Cleared GPU memory")
if torch.cuda.is_available():
    free = (torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated()) / 1e9
    print(f"📊 Available GPU memory: {free:.2f} GB")

# ========== SMALLER RAGSystem CLASS ==========
class RAGSystem:
    """RAG system optimized for T4 GPU"""

    def __init__(self):
        print("🚀 Initializing RAG System for T4 GPU...")

        # USE TINYLLAMA - 1.1B parameters (MUCH smaller)
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name="unsloth/tinyllama-bnb-4bit",  # 1.1B model
            max_seq_length=512,  # Very small
            dtype=None,
            load_in_4bit=True,
            device_map="auto",
        )

        # Initialize vector database
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.create_collection("rag_docs")

        # Load embedding model
        self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')  # 80MB model

        print("✅ RAG System initialized")
        self.print_memory()

    def print_memory(self):
        if torch.cuda.is_available():
            used = torch.cuda.memory_allocated() / 1e9
            total = torch.cuda.get_device_properties(0).total_memory / 1e9
            print(f"📊 GPU Memory: {used:.2f}GB / {total:.2f}GB ({used/total*100:.1f}%)")

    def add_documents(self, documents):
        """Add documents"""
        print(f"📚 Adding {len(documents)} documents...")

        # Simple chunking
        chunks = []
        for doc in documents:
            # Split by sentences for simplicity
            sentences = doc.replace('. ', '.\n').split('\n')
            chunks.extend([s.strip() for s in sentences if s.strip()])

        # Generate embeddings
        embeddings = self.embed_model.encode(chunks)

        # Add to database
        self.collection.add(
            embeddings=embeddings.tolist(),
            documents=chunks,
            ids=[f"chunk_{i}" for i in range(len(chunks))]
        )

        print(f"✅ Added {len(chunks)} chunks")
        return chunks

    def query(self, user_query):
        """Simple RAG query"""
        # Clear cache
        torch.cuda.empty_cache()

        # Retrieve
        query_embed = self.embed_model.encode([user_query])
        results = self.collection.query(
            query_embeddings=query_embed.tolist(),
            n_results=2  # Only 2 chunks
        )

        # Simple prompt
        context = " ".join(results['documents'][0])
        prompt = f"Based on: {context}\n\nQuestion: {user_query}\nAnswer:"

        # Generate
        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=400, truncation=True)
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=100,  # Very short
            temperature=0.7,
        )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Clean up response
        if "Answer:" in response:
            return response.split("Answer:")[-1].strip()
        return response

# ========== TEST ==========
print("\n" + "="*60)
print("TESTING RAG WITH TINYLLAMA")
print("="*60)

# Minimal test documents
test_docs = [
    "Diabetes treatment is Metformin 500mg twice daily.",
    "Hypertension target is blood pressure under 130/80 mmHg.",
    "Asthma emergency uses Albuterol inhaler."
]

# Create system
try:
    rag = RAGSystem()

    # Add documents
    rag.add_documents(test_docs)

    # Test query
    result = rag.query("What is diabetes medication?")
    print(f"\n🤖 Response: {result}")

    # Test another
    result2 = rag.query("What is hypertension target?")
    print(f"🤖 Response: {result2}")

except Exception as e:
    print(f"❌ Error: {e}")
    print("\n🔄 Trying alternative approach...")

    # Fallback: Use even smaller model
    !pip install -q --upgrade unsloth
    from unsloth import FastLanguageModel

    # Try with minimal settings
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/tinyllama",
        max_seq_length=256,
        load_in_4bit=True,
    )
    print("✅ Loaded tiny model as fallback")

🧹 Cleared GPU memory
📊 Available GPU memory: 4.07 GB

TESTING RAG WITH TINYLLAMA
🚀 Initializing RAG System for T4 GPU...
==((====))==  Unsloth 2025.12.10: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

❌ Error: Collection [rag_docs] already exists

🔄 Trying alternative approach...
==((====))==  Unsloth 2025.12.10: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Loaded tiny model as fallback
