<a href="https://colab.research.google.com/github/DilkiSandunika/VGTU_Thesis_Project/blob/main/notebooks/%2003_end_to_end_pipeline_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===================================================================
# CELL 1: Install All Necessary Libraries for Gemma
# ===================================================================
# Notebook-only cell: the `!` prefix below is IPython/Colab shell syntax,
# so this cell is not valid in a plain Python script.
print("Installing required libraries for the full RAG pipeline with Gemma...")
# Packages used later in this notebook: pandas (requirements CSV), faiss-cpu
# (vector index), sentence-transformers (query encoder), torch + transformers
# (Gemma model), accelerate (presumably needed for device_map="auto" -- confirm)
# and bitsandbytes (4-bit quantization). `-q` keeps pip's output short.
!pip install pandas faiss-cpu sentence-transformers torch transformers accelerate bitsandbytes -q
# NOTE(review): a `!` shell command does not raise on failure, so the message
# below is printed even when pip exits with an error.
print("Libraries installed successfully.")


# ===================================================================
# CELL 2: Import Libraries and Log in to Hugging Face
# ===================================================================
import os
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import pickle
from google.colab import userdata
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Securely load the Hugging Face token from Colab secrets.
# Fail fast with an actionable message: if the secret cannot be read there is
# nothing to log in with, and continuing would only surface later as an
# unrelated NameError on HF_TOKEN.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except Exception as e:
    raise RuntimeError(
        "ERROR: Could not load Hugging Face token. Please add it to Colab's "
        "secrets (key icon on the left) with the name HF_TOKEN."
    ) from e

# A secret that exists but is blank is just as unusable as a missing one.
if not HF_TOKEN:
    raise RuntimeError(
        "ERROR: The HF_TOKEN secret is empty. Please set it in Colab's "
        "secrets (key icon on the left) with the name HF_TOKEN."
    )
print("Hugging Face token loaded successfully.")

# Log in to Hugging Face Hub so the later model download is authenticated.
from huggingface_hub import login
login(token=HF_TOKEN)
print("Successfully logged in to Hugging Face.")


# ===================================================================
# CELL 3: Load All Pre-processed Data and Models
# ===================================================================
print("\nLoading all necessary components...")

# Input artifacts read by this cell (expected to already exist under /content).
requirements_csv_path = '/content/parsed_requirements.csv'
faiss_index_path = '/content/knowledge_base.index'
knowledge_docs_path = '/content/knowledge_base_docs.pkl'

# --- Step 1: parsed requirements table ---
df_requirements = pd.read_csv(requirements_csv_path)
requirement_count = len(df_requirements)
print(f"Loaded {requirement_count} requirements from the CSV file.")

# --- Step 2: knowledge base (FAISS index + the documents it points at) ---
index = faiss.read_index(faiss_index_path)
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
# Only load a .pkl that was produced by a trusted run of this project.
with open(knowledge_docs_path, 'rb') as f:
    knowledge_base_docs = pickle.load(f)
print("Loaded FAISS index and knowledge base documents.")

# --- Step 3: sentence encoder used to embed queries for retrieval ---
retrieval_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Sentence Transformer model loaded.")

# --- Step 4: Gemma generator (the slow, memory-heavy step) ---
print("\nLoading Google Gemma model... This will take a few minutes and use significant RAM.")
model_id = "google/gemma-2b-it"

# 4-bit quantization so the model fits into Colab's free GPU memory.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
gemma_load_options = {
    "quantization_config": quantization_config,
    "device_map": "auto",  # let the loader place the weights (GPU if available)
}
gemma_model = AutoModelForCausalLM.from_pretrained(model_id, **gemma_load_options)
print("Google Gemma model loaded successfully!")
print("\n--- Setup is complete and all components are ready! ---")


# ===================================================================
# CELL 4: The RAG Core Functions (Updated for Gemma)
# ===================================================================

def retrieve_relevant_knowledge(query_text, top_k=3):
    """
    Return the knowledge base documents closest to a query.

    The query is embedded with the module-level ``retrieval_model``, the
    module-level FAISS ``index`` is searched for the ``top_k`` nearest
    vectors, and the matching entries of ``knowledge_base_docs`` are returned
    in the order FAISS reports them.

    Args:
        query_text: The text to find relevant documents for.
        top_k: Maximum number of documents to return.

    Returns:
        A list with at most ``top_k`` documents. It is shorter when the index
        holds fewer than ``top_k`` vectors, and empty when ``top_k`` is not
        positive.
    """
    # Nothing to retrieve; also avoids asking FAISS for a non-positive k.
    if top_k <= 0:
        return []

    query_vector = retrieval_model.encode([query_text])
    # FAISS expects float32 input; the distances are not needed here.
    _distances, indices = index.search(query_vector.astype('float32'), top_k)

    # FAISS pads the result with label -1 when it finds fewer than top_k
    # neighbours. Indexing the docs with -1 would silently return the *last*
    # document as if it were a match, so padded slots are dropped.
    return [knowledge_base_docs[i] for i in indices[0] if i >= 0]

def generate_compliant_requirement_with_gemma(original_requirement, retrieved_docs):
    """
    Ask the Gemma model to rewrite a requirement so it follows the given rules.

    Builds a single-turn chat prompt containing the retrieved rules and the
    original requirement, runs it through the module-level ``tokenizer`` and
    ``gemma_model``, and returns only the text the model generated.

    Args:
        original_requirement: The requirement text to refine.
        retrieved_docs: Iterable of rule/template strings to include in the
            prompt (joined as a bulleted list).

    Returns:
        The model's reply as a string, without the prompt and with
        surrounding whitespace removed.
    """
    retrieved_knowledge = "\n- ".join(retrieved_docs)

    # Gemma uses a specific chat template format. We must follow it precisely.
    chat = [
        { "role": "user", "content": f"""
You are an expert Software Requirements Analyst. Your task is to refine a given software requirement to ensure it is compliant with a set of rules and well-formed according to a template.

**Compliance Rules and Template Guide to Follow:**
- {retrieved_knowledge}

**Original Requirement to Refine:**
"{original_requirement}"

**Your Task:**
Rewrite the original requirement to be fully compliant with the rules provided above.
- Ensure the output strictly follows the format: "The system shall [action description] for the [user role]."
- The final output must be a single, refined sentence and nothing else. Do not add any extra explanations.
"""
        }
    ]

    # Apply the chat template and convert to tensor inputs.
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # The template already contains the special tokens, hence add_special_tokens=False.
    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    # Follow wherever the model was actually placed instead of assuming "cuda",
    # so the function also works when the model ended up on another device.
    inputs = inputs.to(gemma_model.device)

    # Generate the output
    outputs = gemma_model.generate(input_ids=inputs, max_new_tokens=150)

    # The returned sequence is prompt tokens followed by new tokens. Slice by
    # token count rather than by character offset: the decoded text drops the
    # template's special tokens, so its length does not line up with `prompt`.
    prompt_token_count = inputs.shape[-1]
    generated_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# ===================================================================
# CELL 5: Run the End-to-End Demo on a Sample Requirement
# ===================================================================

# Pick one requirement row from the DataFrame to push through the pipeline.
sample_index = 2
original_req_text = df_requirements.loc[sample_index, 'text']

# Separator lines reused by the report below.
banner_rule = "====================================================================="
output_rule = "---------------------------------------------------------------------"

print(banner_rule)
print("             RAG PIPELINE DEMO with Google GEMMA                     ")
print(banner_rule)

# --- Input: show the requirement exactly as stored ---
print(f"\n[INPUT] Original Requirement:\n'{original_req_text}'")

# --- Retrieval: look up the rules closest to this requirement ---
print("\n[STEP 1 - RETRIEVAL] Finding the most relevant rules from the knowledge base...")
relevant_rules = retrieve_relevant_knowledge(original_req_text)
print("  - Found the following rules:")
for rule in relevant_rules:
    print(f"    - {rule}")

# --- Generation: let Gemma rewrite the requirement using those rules ---
print("\n[STEP 2 - GENERATION] Sending the original requirement and retrieved rules to Gemma for refinement...")
refined_requirement = generate_compliant_requirement_with_gemma(
    original_req_text,
    relevant_rules,
)

# --- Output: the refined requirement ---
print("\n" + output_rule)
print(f"[OUTPUT] Final, Compliant Requirement:\n'{refined_requirement}'")
print(output_rule)

Installing required libraries for the full RAG pipeline with Gemma...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hLibraries installed successfully.
Hugging Face token loaded successfully.
Successfully logged in to Hugging Face.

Loading all necessary components...
Loaded 115 requirements from the CSV file.
Loaded FAISS index and knowledge base documents.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence Transformer model loaded.

Loading Google Gemma model... This will take a few minutes and use significant RAM.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Google Gemma model loaded successfully!

--- Setup is complete and all components are ready! ---
             RAG PIPELINE DEMO with Google GEMMA                     

[INPUT] Original Requirement:
'The solution should provide an interface for the user to log any defects or enhancement requests on the application and track thereafter.'

[STEP 1 - RETRIEVAL] Finding the most relevant rules from the knowledge base...
  - Found the following rules:
    - All extracted functional requirements must strictly follow this format: "The system shall [action description] for the [user role]." The requirement must be a complete, standalone sentence. For example: "The system shall generate a monthly report for the administrator."
    - Rule 102: Any requirement handling personally identifiable information (PII) or sensitive data must mention encryption or secure handling.
    - Rule 104: Requirements must be written in a clear, active voice (e.g., "The system shall do X" not "X should be done").

[