<a href="https://colab.research.google.com/github/Ebenezer-Lezdo/ai_learnings/blob/master/RAG_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample knowledge base: one self-contained fact per list entry.
# NOTE: every entry must end with a comma -- adjacent string literals are
# implicitly concatenated by Python, which would silently merge two documents
# into one and skew both the TF-IDF vectors and the retrieval results.
documents = [
    "The current President of the United States is Donald Truimph, who took office on November 20, 2024.",
    "Diabetes is a condition where the body does not properly process food for use as energy.",
    "High blood pressure can lead to serious health issues, including heart disease and stroke.",
    "Vaccination helps to prevent the spread of infectious diseases.",
    "Mental health is just as important as physical health, and therapy can be a great treatment for various disorders.",
    "Chronic pain is ongoing pain that can last for months or even years.",
]

# Turn every document into a TF-IDF vector, ignoring English stop words
# (any other vectorization / embedding method could be swapped in here).
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(documents)

# Densify the sparse matrix and cast to float32 for FAISS compatibility.
X = tfidf_matrix.toarray().astype(np.float32)

# Flat L2-distance index whose dimensionality equals the TF-IDF vocabulary
# size; all document vectors are added up front.
index = faiss.IndexFlatL2(X.shape[1])
index.add(X)

def retrieve_relevant_documents(query, k=1):
    """Return up to ``k`` knowledge-base documents nearest to ``query``.

    The query is vectorized with the module-level ``vectorizer`` and looked
    up in the module-level FAISS ``index``; hits are mapped back to the
    texts in ``documents``.

    Args:
        query: Free-text question to search for.
        k: Maximum number of documents to return.

    Returns:
        A list of document strings in the order FAISS returned them. The
        list is shorter than ``k`` when the index holds fewer vectors, and
        empty when ``k`` is not positive or the index is empty.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, and documents[-1] would silently return the *last*
    # document instead of failing.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vector = vectorizer.transform([query]).toarray().astype(np.float32)
    _, neighbor_ids = index.search(query_vector, k)

    # Defensive filter in case any -1 padding still comes back.
    return [documents[i] for i in neighbor_ids[0] if i >= 0]


from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and language model from Hugging Face
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token so that tokenizer
# calls with padding=True have a pad token to work with
tokenizer.pad_token = tokenizer.eos_token

def generate_answer(context, question, max_new_tokens=64):
    """Generate an answer to ``question`` grounded in ``context``.

    Builds a ``Context / Question / Answer:`` prompt, feeds it to the
    module-level ``model`` via the module-level ``tokenizer`` and returns
    only the text the model produced after the prompt.

    Args:
        context: Retrieved background text to place in the prompt.
        question: The user's question.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The generated continuation as a string, with special tokens removed,
        surrounding whitespace stripped and the prompt itself excluded.
    """
    # Combine retrieved context with the question
    input_text = f"Context: {context}\nQuestion: {question}\nAnswer:"

    # Tokenize, truncating the prompt so that prompt + generated tokens stay
    # within the tokenizer's declared maximum sequence length.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max(1, tokenizer.model_max_length - max_new_tokens),
    )
    input_ids = inputs["input_ids"]

    outputs = model.generate(
        input_ids,
        attention_mask=inputs["attention_mask"],
        # Budget counts generated tokens only, so a long context can no
        # longer use up the whole length allowance before generation starts.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # top_p / top_k / temperature only take effect when sampling is on;
        # without do_sample=True they are ignored and decoding is greedy.
        do_sample=True,
        top_p=0.9,
        top_k=50,
        temperature=0.7,
        # Pass the pad token explicitly instead of relying on the implicit
        # "Setting pad_token_id to eos_token_id" fallback (and its warning).
        pad_token_id=tokenizer.pad_token_id,
    )

    # The returned sequence starts with the prompt tokens; drop them so the
    # caller gets the answer rather than an echo of the whole prompt.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


def rag_system(query, k=5):
    """Answer ``query`` with a retrieve-then-generate pipeline.

    Args:
        query: The user's question.
        k: Number of knowledge-base documents to retrieve as context.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        Whatever ``generate_answer`` returns for the retrieved context and
        the query.
    """
    # Step 1: Retrieve the k most relevant documents and merge them into a
    # single space-separated context string.
    relevant_docs = retrieve_relevant_documents(query, k=k)
    context = " ".join(relevant_docs)

    # Step 2: Let the language model answer using that context.
    return generate_answer(context, query)

# Smoke-test the full pipeline end to end with a sample question
query = "Name of US President?"
answer = rag_system(query=query)
print(f"Answer: {answer}")



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: Context: The current President of the United States is Donald Truimph, who took office on November 20, 2024.Diabetes is a condition where the body does not properly process food for use as energy. High blood pressure can lead to serious health issues, including heart disease and stroke. Vaccination helps to prevent the spread of infectious diseases. Mental health is just as important as physical health, and therapy can be a great treatment for various disorders. Chronic pain is ongoing pain that can last for months or even years.
Question: Name of US President?
Answer: Donald Trump. The US president is the first person to be elected to the presidency. He is also the only person in the world to have been elected President.
