# Notebook cell In [9]:
# Install packages (uncomment if needed)
# !pip install transformers torch faiss-cpu sentence-transformers datasets

import numpy as np
import torch
from datasets import Array2D, Dataset, Features, Sequence, Value
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    RagTokenizer,
)

# Step 1: Prepare documents and titles
documents = [
    "The capital of France is Paris.",
    "The largest planet in the solar system is Jupiter.",
    "Artificial Intelligence is transforming many industries.",
]
titles = ["doc1", "doc2", "doc3"]

# Step 2: Create embeddings with sentence-transformers
encoder = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = encoder.encode(documents, convert_to_numpy=True)  # shape: (3, 384)
# NOTE(review): the retriever searches this index with query vectors produced
# by the RAG checkpoint's own question encoder. Those vectors must have the
# same dimensionality as the document embeddings stored here -- confirm this
# holds for the encoder above, otherwise embed the documents with the context
# encoder that matches the RAG checkpoint.

# Step 3: Define dataset features.
# Each row holds ONE embedding, i.e. a flat list of floats, so the column is a
# Sequence of float32 values. Array2D requires a 2-D shape; giving it the 1-D
# shape (384,) is what raised "shape=(384,) and ndims=2 don't match".
features = Features({
    "title": Value("string"),
    "text": Value("string"),
    "embeddings": Sequence(Value("float32")),
})

# Step 4: Create the dataset with features
dataset = Dataset.from_dict(
    {
        "title": titles,
        "text": documents,
        # One plain list of floats per document, matching the Sequence feature.
        "embeddings": doc_embeddings.tolist(),
    },
    features=features,
)

# Step 5: Save dataset and build FAISS index
dataset_path = "/tmp/my_docs_dataset"
index_path = "/tmp/my_faiss_index.faiss"
# The passages are written first; the index is persisted separately below.
dataset.save_to_disk(dataset_path)

# Build FAISS index on embeddings column.
# get_index() only returns an index that was attached beforehand, so the index
# has to be created with add_faiss_index() before it can be fetched and saved.
dataset.add_faiss_index(column="embeddings")
index = dataset.get_index("embeddings")
index.save(index_path)

# Step 6: Load the RAG tokenizer and generator from one shared checkpoint
rag_checkpoint = "facebook/rag-sequence-nq"
rag_tokenizer = RagTokenizer.from_pretrained(rag_checkpoint)
rag_model = RagSequenceForGeneration.from_pretrained(rag_checkpoint)

# Step 7: Build a retriever that reads the custom passages and FAISS index
# written to disk earlier in the script
retriever_options = {
    "index_name": "custom",
    "passages_path": dataset_path,
    "index_path": index_path,
}
retriever = RagRetriever.from_pretrained(rag_checkpoint, **retriever_options)

# Step 8: Attach the retriever so generation can look up passages
rag_model.set_retriever(retriever)

# Step 9: Query
query = "What is the capital of France?"
# Call the tokenizer directly: prepare_seq2seq_batch is deprecated in
# transformers in favour of the regular __call__ interface, which returns the
# same "input_ids" / "attention_mask" entries used below.
input_dict = rag_tokenizer(query, truncation=True, return_tensors="pt")

# Step 10: Generate answer using RAG with retriever
rag_model.eval()  # inference mode: disables dropout
with torch.no_grad():  # no gradients are needed for generation
    generated = rag_model.generate(
        input_ids=input_dict["input_ids"],
        attention_mask=input_dict["attention_mask"],
        max_new_tokens=50,
    )
# generate() returns a batch of token-id sequences; decode the first one.
answer = rag_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
print("RAG Answer:", answer)

# === Fine-tuning demo ===
# Runs a few full-batch gradient steps of t5-small on two toy paraphrase
# pairs, then generates from the updated model for one of the inputs.
print("\n=== Fine-tuning Demo ===")
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Toy paraphrasing data
inputs = ["The capital of France is Paris.", "The sky is blue."]
labels = ["Paris is the capital of France.", "The sky has a blue color."]

tokenized_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=20, return_tensors="pt")
tokenized_labels = tokenizer(labels, padding="max_length", truncation=True, max_length=20, return_tensors="pt").input_ids
# Padding positions in the labels must not contribute to the loss: -100 is the
# index the model's cross-entropy loss ignores.
tokenized_labels[tokenized_labels == tokenizer.pad_token_id] = -100

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
model.train()
for epoch in range(3):
    optimizer.zero_grad()
    outputs = model(
        input_ids=tokenized_inputs.input_ids,
        # Inputs are padded to max_length, so the mask is needed to keep the
        # encoder from attending to the padding tokens.
        attention_mask=tokenized_inputs.attention_mask,
        labels=tokenized_labels,
    )
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

model.eval()
# A single unpadded sentence, so no attention mask is required here.
test_input = tokenizer("The capital of France is Paris.", return_tensors="pt").input_ids
output = model.generate(test_input, max_new_tokens=20)
prediction = tokenizer.decode(output[0], skip_special_tokens=True)
print("Fine-tuned output:", prediction)


# Error reported when the original cell was run:
# ValueError: shape=(384,) and ndims=2 don't match