<a href="https://colab.research.google.com/github/Aparnamol-KS/CodeCompanion-GroqAI/blob/main/Part_B_VectorDB_Embeddings_GenAI_Bootcamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day 2: Vector Databases & Embeddings
Welcome to Day 2 of the GenAI Bootcamp! Today, we'll learn how to work with embeddings and vector databases. This notebook includes theory, working examples, and a mini-project.


## 1. What are Embeddings?
Embeddings are vector representations of data like text or images. They capture semantic meaning.

We can generate embeddings using:
- HuggingFace's `sentence-transformers`
- OpenAI's `text-embedding-ada-002` model

In [None]:
# Install required packages
!pip install -q sentence-transformers faiss-cpu chromadb pandas

In [None]:
# Generate embeddings using HuggingFace
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Three sample questions to embed.
sentences = [
    "What is AI?",
    "Tell me about machine learning",
    "What is deep learning?",
]

# Encode all sentences in a single call and report the shape of the result.
embeddings = model.encode(sentences)
print("Shape of embeddings:", embeddings.shape)

In [None]:
from sentence_transformers import SentenceTransformer
# Reuse the 'all-MiniLM-L6-v2' model that the earlier cell stored in `model`
# instead of constructing a second copy of the same model.
embedding = model.encode("What is artificial intelligence?")
embedding

## 2. Using FAISS for Vector Search
FAISS is Facebook's library for efficient similarity search over embeddings.

In [None]:
import faiss
import numpy as np

# Size the flat L2 index to the embedding width, then load every sentence vector.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.array(embeddings))

# Query with the first sentence's own vector and ask for the two closest entries.
# D receives the distances, I the positions of the matches within `sentences`.
query_vectors = np.array([embeddings[0]])
D, I = index.search(query_vectors, k=2)

print("Nearest neighbors for:", sentences[0])
for neighbor_pos in I[0]:
    print(" -", sentences[neighbor_pos])

## 3. Chunking Strategies
To index a large document, first split it into chunks, using either fixed-size chunks or sentence-level splits. The example below uses a simple sentence-level split.

In [None]:
# Simple chunking example: split on full stops and keep the non-empty pieces.
text = """Artificial Intelligence is a vast field. It includes machine learning and deep learning. Embeddings are a powerful way to represent text."""

# Whitespace around each piece is trimmed; pieces that are empty after
# trimming (e.g. the one following the final '.') are dropped.
chunks = [piece.strip() for piece in text.split('.') if piece.strip()]
chunks

## 4. Mini Project: Vector-based Product FAQ Assistant
- Load the Q&A pairs into a DataFrame (defined inline here; a CSV could be loaded the same way with `pd.read_csv`)
- Generate embeddings for questions
- Search for similar questions using FAISS

In [None]:
import pandas as pd
# FAQ knowledge base: one record per question/answer pair.
faq_rows = [
    {
        'question': 'How do I reset my password?',
        'answer': 'Go to settings and click reset password.',
    },
    {
        'question': 'Where can I find the user manual?',
        'answer': 'The user manual is available on the product page.',
    },
    {
        'question': 'How to contact support?',
        'answer': 'Contact us at support@example.com.',
    },
    {
        'question': 'How do I update the software?',
        'answer': 'Visit settings > update to install latest software.',
    },
]
data = pd.DataFrame(faq_rows, columns=['question', 'answer'])

# Embed every FAQ question and load the vectors into a flat L2 index.
faq_questions = data['question'].tolist()
faq_embeddings = model.encode(faq_questions)
faq_dim = faq_embeddings.shape[1]
faq_index = faiss.IndexFlatL2(faq_dim)
faq_index.add(faq_embeddings)

# Sample query: embed it the same way and fetch the two closest questions.
query = "I forgot my password"
query_embedding = model.encode([query])
D, I = faq_index.search(np.array(query_embedding), k=2)

# I[0] holds row positions into `data`, so the matching answers can be read back.
print("Query:", query)
for row_pos in I[0]:
    print("Answer:", data['answer'].iloc[row_pos])

# Mini-Project: FAQ Querying Using Pinecone

In [None]:
import pandas as pd

# Location of the FAQ dataset
file_path = '/content/train.json'

# The file holds one JSON object per line, hence lines=True
df = pd.read_json(path_or_buf=file_path, lines=True)

In [None]:
# Preview the first rows only; the full frame is not needed to check the load.
df.head()

In [None]:
import json

# Load your file
input_path = "train.json"
output_path = "train_fixed.json"

# Parse every non-blank line as one JSON object. Blank lines (for example a
# trailing empty line at the end of the file) are skipped, because
# json.loads("") raises JSONDecodeError.
with open(input_path, "r", encoding="utf-8") as infile:
    data = [json.loads(line) for line in infile if line.strip()]

# Write as a proper JSON array
with open(output_path, "w", encoding="utf-8") as outfile:
    json.dump(data, outfile, indent=2)

print(f"Fixed JSON saved to: {output_path}")


In [None]:
import json

# JSON-array version of the dataset
file_path = "train_fixed.json"

# Read the whole file and decode it into Python objects
with open(file_path, mode="r", encoding="utf-8") as fixed_file:
    data = json.loads(fixed_file.read())

# Peek at the first record
print(data[0])


In [None]:

# Give every record a positional id of the form "rec-<index>".
for position in range(len(data)):
    data[position]["id"] = f"rec-{position}"


In [None]:
# Records sent to Pinecone: the question text is stored under "chunk_text".
pinecone_records = [
    {"_id": item["id"], "chunk_text": item["question"]}
    for item in data
]

# Answers stay local, keyed by the same id, so a search hit can be mapped back.
answer_lookup = {item["id"]: item["answer"] for item in data}


In [None]:
pip install -q pinecone

In [None]:
import os
from getpass import getpass

# Import the Pinecone library
from pinecone import Pinecone

# Never commit an API key to the notebook. Read it from the PINECONE_API_KEY
# environment variable, or prompt for it without echoing when it is not set.
# NOTE(review): the key that used to be hardcoded in this cell is exposed in
# the repository history and should be revoked in the Pinecone console.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# Initialize a Pinecone client with your API key
pc = Pinecone(api_key=pinecone_api_key)


In [None]:
index_name = "product-faq"

# Embedding configuration for the index: the record field "chunk_text" is
# mapped to the model's "text" input.
embed_config = {
    "model": "llama-text-embed-v2",
    "field_map": {"text": "chunk_text"},
}

# Create the index only on the first run; later runs reuse it.
index_exists = pc.has_index(index_name)
if not index_exists:
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embed_config,
    )

In [None]:
dense_index = pc.Index(index_name)

# Upsert in batches rather than in one request: Pinecone limits how many text
# records a single upsert_records call may carry (96 per the documented limits
# at the time of writing; confirm against the current Pinecone limits page).
UPSERT_BATCH_SIZE = 96
for batch_start in range(0, len(pinecone_records), UPSERT_BATCH_SIZE):
    dense_index.upsert_records(
        namespace="example-namespace",
        records=pinecone_records[batch_start:batch_start + UPSERT_BATCH_SIZE],
    )


In [None]:
# Fetch the index statistics and print their text form
stats = dense_index.describe_index_stats()
print(str(stats))

In [None]:
# Show how many records were built and preview the first few, instead of
# dumping the whole list into the cell output.
print(f"{len(pinecone_records)} records in total; first 5 shown below")
pinecone_records[:5]

In [None]:
# Define the query
query = "How to cancel my order"

# Search the dense index
results = dense_index.search(
    namespace="example-namespace",
    query={
        "top_k": 2,
        "inputs": {
            'text': query
        }
    }
)

hits = results['result']['hits']

if not hits:
    # Indexing hits[0] would raise IndexError when the search returns nothing
    # (for example when the namespace holds no records), so report it instead.
    print("No matching FAQ found for:", query)
else:
    # Access the first hit and use its _id to get the answer from the lookup dictionary
    hit = hits[0]
    print(answer_lookup[hit['_id']])

    # Then list every returned hit with its score, matched question and answer
    for hit in hits:
        print(f"id: {hit['_id']:<5} | score: {round(hit['_score'], 2):<5} | text: {hit['fields']['chunk_text']:<50} | answer: {answer_lookup[hit['_id']]}")


In [None]:
pip install -q gradio


In [None]:
import gradio as gr

def query_faq(user_question):
    """Answer a user question from the indexed FAQ.

    Searches the "example-namespace" namespace of ``dense_index`` for the
    stored questions closest to ``user_question`` and maps the best hit back
    to its answer through ``answer_lookup``.

    Args:
        user_question: The question text typed by the user.

    Returns:
        A string with the matched FAQ question and its answer, a prompt to
        enter a question when the input is empty or blank, or
        "No relevant answer found." when the search returns no hits.
    """
    # Do not send an empty or whitespace-only query to the search service.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    # Search Pinecone with the input query
    results = dense_index.search(
        namespace="example-namespace",
        query={
            "top_k": 2,
            "inputs": {
                'text': user_question
            }
        }
    )

    hits = results.get("result", {}).get("hits", [])
    if not hits:
        return "No relevant answer found."

    # Only the best-ranked hit is reported.
    best_hit = hits[0]
    answer = answer_lookup.get(best_hit['_id'], "Answer not found.")
    question_match = best_hit['fields']['chunk_text']
    return f"Matched Question: {question_match}\n\nAnswer: {answer}"

import gradio as gr

# Example prompts offered in the UI
sample_questions = [
    "How can I create an account?",
    "What payment methods do you accept?",
    "How can I track my order?",
    "What is your return policy?",
    "Can I cancel my order?",
    "How long does shipping take?",
    "Do you offer international shipping?",
    "What should I do if my package is lost or damaged?"
]

# Each example is wrapped in its own single-element list for `examples`.
example_rows = [[question] for question in sample_questions]

# Input and output widgets
question_box = gr.Textbox(
    label="Ask a Question",
    placeholder="e.g., How can I create an account?",
)
answer_box = gr.Textbox(label="Answer", lines=8, interactive=False)

# Wire query_faq between the two text boxes and start the app
faq_interface = gr.Interface(
    fn=query_faq,
    inputs=question_box,
    outputs=answer_box,
    title="Product FAQ Assistant",
    description="Ask your product-related question and get an instant answer from the indexed FAQ database.",
    examples=example_rows,
)
faq_interface.launch()




In [None]:
pip install -q gradio pinecone

In [None]:
import json
import gradio as gr
from pinecone import Pinecone

import os
from getpass import getpass

# Initialize Pinecone
# Never commit an API key to the notebook. Read it from the PINECONE_API_KEY
# environment variable, or prompt for it without echoing when it is not set.
# NOTE(review): the key that used to be hardcoded in this cell is exposed in
# the repository history and should be revoked in the Pinecone console.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

# Create the index on first use; the record field "chunk_text" is mapped to
# the embedding model's "text" input.
index_name = "product-faq"
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model": "llama-text-embed-v2",
            "field_map": {"text": "chunk_text"}
        }
    )

dense_index = pc.Index(index_name)

# Global variables for answer lookup and records; both start empty and are
# filled by upload_and_index.
answer_lookup = {}
pinecone_records = []

# Function to process uploaded JSON
import os

def upload_and_index(file):
    """Parse an FAQ JSON upload and index its questions in Pinecone.

    Args:
        file: Either an open file-like object (text or binary mode) or a
            string path to an existing file. The content may be a JSON array
            of records or line-delimited JSON (one record per line). Every
            record must be an object with "question" and "answer" fields.

    Returns:
        A status string: "<n> FAQs indexed successfully." on success, or a
        message describing why the input was rejected.

    Side effects:
        On success, upserts the records into the "example-namespace"
        namespace of ``dense_index`` and then replaces the module-level
        ``answer_lookup`` and ``pinecone_records``.
    """
    global answer_lookup, pinecone_records

    # Pinecone limits how many text records one upsert_records call may carry
    # (96 per the documented limits at the time of writing; confirm against
    # the current Pinecone limits page).
    batch_size = 96

    # Handle Gradio file input or manual file object
    if hasattr(file, "read"):
        raw_data = file.read()
        # Binary-mode handles yield bytes, text-mode handles yield str.
        if isinstance(raw_data, bytes):
            raw_data = raw_data.decode("utf-8")
    elif isinstance(file, str) and os.path.exists(file):
        with open(file, "r", encoding="utf-8") as f:
            raw_data = f.read()
    else:
        return "Invalid file input."

    raw_data = raw_data.strip()
    if not raw_data:
        return "No FAQ records found in the uploaded file."

    # Try the whole document first. Parsing line by line first would turn a
    # JSON array written on a single line into one bogus "record".
    try:
        data = json.loads(raw_data)
    except json.JSONDecodeError:
        # Fall back to line-delimited JSON, ignoring blank lines.
        try:
            data = [json.loads(line) for line in raw_data.splitlines() if line.strip()]
        except json.JSONDecodeError as exc:
            return f"Invalid JSON: {exc}"

    # A one-line JSONL file parses as a single object; treat it as one record.
    if isinstance(data, dict):
        data = [data]

    if not isinstance(data, list) or not all(
        isinstance(rec, dict) and "question" in rec and "answer" in rec
        for rec in data
    ):
        return 'Invalid FAQ data: every record must be an object with "question" and "answer" fields.'

    if not data:
        return "No FAQ records found in the uploaded file."

    # Assign positional IDs and build the Pinecone records plus the local
    # id -> answer lookup.
    # NOTE(review): ids restart at rec-0 on every upload, so records from an
    # earlier, larger upload may remain in the namespace; confirm whether the
    # namespace should be cleared first.
    new_lookup = {}
    new_records = []
    for i, rec in enumerate(data):
        rec_id = f"rec-{i}"
        new_lookup[rec_id] = rec["answer"]
        new_records.append({"_id": rec_id, "chunk_text": rec["question"]})

    # Upload to Pinecone in batches
    for start in range(0, len(new_records), batch_size):
        dense_index.upsert_records(
            namespace="example-namespace",
            records=new_records[start:start + batch_size],
        )

    # Publish the new state only after the upload went through.
    answer_lookup = new_lookup
    pinecone_records = new_records

    return f"{len(data)} FAQs indexed successfully."


# Query function
def query_faq(user_question):
    """Return the stored answer for the FAQ entry closest to the question.

    Searches the "example-namespace" namespace of ``dense_index`` and maps
    the best hit's id back to its answer through ``answer_lookup``.

    Args:
        user_question: The question text typed by the user.

    Returns:
        The answer text of the best hit ("Answer not found." when its id is
        not in ``answer_lookup``), a prompt to enter a question when the
        input is empty or blank, or "No relevant answer found." when the
        search returns no hits.
    """
    # Do not send an empty or whitespace-only query to the search service.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    results = dense_index.search(
        namespace="example-namespace",
        query={
            "top_k": 2,
            "inputs": {
                'text': user_question
            }
        }
    )

    hits = results.get("result", {}).get("hits", [])
    if not hits:
        return "No relevant answer found."

    # Only the id of the best hit is needed; the matched question text is not
    # part of the reply, so it is no longer read from the hit.
    top_id = hits[0]['_id']
    return str(answer_lookup.get(top_id, "Answer not found."))



# Build the two-part UI: an upload/index section followed by a question/answer section.
with gr.Blocks() as demo:
    gr.Markdown("# AI-Powered FAQ Assistant\nUpload a JSON file and ask your questions.")

    # File picker and its trigger button, grouped in one gr.Row
    with gr.Row():
        json_file = gr.File(label="Upload FAQ JSON (line-delimited or array)")
        upload_btn = gr.Button("Upload & Index")

    # Read-only box that receives the status string returned by upload_and_index
    upload_status = gr.Textbox(label="Upload Status", interactive=False)

    # Clicking the button passes the selected file to upload_and_index
    upload_btn.click(fn=upload_and_index, inputs=json_file, outputs=upload_status)

    # Question input and read-only answer output
    user_input = gr.Textbox(label="Ask a Question", placeholder="e.g., How can I create an account?")
    output = gr.Textbox(label="Answer", lines=5, interactive=False)

    # Clicking "Get Answer" passes the question to query_faq and shows its return value
    ask_button = gr.Button("Get Answer")
    ask_button.click(fn=query_faq, inputs=user_input, outputs=output)


# Start the app
demo.launch()


In [None]:
# Index the converted dataset directly, without going through the UI:
# hand upload_and_index a binary file handle and print its status message.
with open("/content/train_fixed.json", mode="rb") as faq_file:
    result = upload_and_index(faq_file)

print(result)
