<a href="https://colab.research.google.com/github/AmmarJamshed/LLM_GDG_training/blob/main/T5_vs_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Sample raw data (replace this with your own text data)
raw_data = [
    "Paris is the capital of France.",
    "The Eiffel Tower is one of the most famous landmarks in Paris.",
    "Python is a programming language.",
    "Artificial intelligence is a branch of computer science."
]

# Initialize the tokenizer and model (Google's T5)
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Manually encode the documents into vectors (using TF-IDF as a simple example)
vectorizer = TfidfVectorizer(stop_words="english")
doc_vectors = vectorizer.fit_transform(raw_data).toarray()

# Initialize the retriever (we'll manually handle the vectorization part)
class SimpleRetriever:
    def __init__(self, vectors, raw_data):
        self.vectors = vectors
        self.raw_data = raw_data

    def retrieve(self, query, top_k=1):
        query_vector = vectorizer.transform([query]).toarray()
        scores = np.dot(self.vectors, query_vector.T).flatten()  # Compute cosine similarity
        top_indices = np.argsort(scores)[::-1][:top_k]
        return [self.raw_data[i] for i in top_indices]

# Create a custom retriever instance
retriever = SimpleRetriever(doc_vectors, raw_data)

# Sample query
query = "What is the capital of France?"

# Retrieve relevant documents
retrieved_docs = retriever.retrieve(query, top_k=1)

# Print the retrieved documents
print("Retrieved Documents:", retrieved_docs)

# Concatenate query and the retrieved document
context = " ".join(retrieved_docs)
input_text = f"Question: {query} Context: {context}"

# Encode the input text
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Generate the response based on the query and the retrieved context
outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

# Decode the response
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Retrieved Documents: ['Paris is the capital of France.']
Generated Text: Paris


# RAG - Haystack

In [14]:
pip install farm-haystack[all]

Collecting farm-haystack[all]
  Downloading farm_haystack-1.26.4-py3-none-any.whl.metadata (31 kB)
Collecting boilerpy3 (from farm-haystack[all])
  Downloading boilerpy3-1.0.7-py3-none-any.whl.metadata (5.8 kB)
Collecting events (from farm-haystack[all])
  Downloading Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Collecting lazy-imports==0.3.1 (from farm-haystack[all])
  Downloading lazy_imports-0.3.1-py3-none-any.whl.metadata (10 kB)
Collecting posthog (from farm-haystack[all])
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting prompthub-py==4.0.0 (from farm-haystack[all])
  Downloading prompthub_py-4.0.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pydantic<2 (from farm-haystack[all])
  Downloading pydantic-1.10.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (152 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.6/152.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting quantulum3 (from farm-haystac

In [1]:
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import FARMReader, DenseRetriever
from haystack.pipelines import GenerativeQAPipeline
from haystack.utils import fetch_archive_from_http
from transformers import pipeline

# 1. Set up the document store (FAISS in this case)
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

# 2. Load some documents (you can replace this with your own dataset)
# Here we are fetching a sample dataset
fetch_archive_from_http(
    url="https://huggingface.co/datasets/nq_open/blob/main/nq_open.json",
    path_dir="data"
)

# 3. Write documents to the document store
# Assuming you already have a collection of documents to index
# You can use your own dataset here
from haystack.utils import convert_files_to_docs
docs = convert_files_to_docs(dir_path="data/nq_open")
document_store.write_documents(docs)

# 4. Initialize Dense Retriever and Reader
retriever = DenseRetriever(document_store=document_store)
reader = FARMReader("deepset/roberta-base-squad2", use_gpu=True)

# 5. Create a pipeline with the retriever and reader
pipeline = GenerativeQAPipeline(reader, retriever)

# 6. Ask a question
query = "What is the capital of France?"
result = pipeline.run(query=query, params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 1}})

# 7. Print the result
print(f"Answer: {result['answers'][0].answer}")

ImportError: cannot import name 'GenerativeQAPipeline' from 'haystack.pipelines' (/usr/local/lib/python3.10/dist-packages/haystack/pipelines/__init__.py)

In [8]:
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import fetch_archive_from_http
from transformers import pipeline

# 1. Set up the document store (FAISS in this case)
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

# 2. Load some documents (you can replace this with your own dataset)
fetch_archive_from_http(
    url="https://huggingface.co/datasets/nq_open/blob/main/nq_open.json",
    output_dir="data"
)

# 3. Write documents to the document store
from haystack.utils import convert_files_to_docs
docs = convert_files_to_docs(dir_path="data/nq_open")
document_store.write_documents(docs)

# 4. Initialize Dense Passage Retriever and Reader (T5 as Reader for generative QA)
retriever = DensePassageRetriever(document_store=document_store)
reader = FARMReader("t5-base", use_gpu=True)

# 5. Create a pipeline with the retriever and reader (use DocumentSearchPipeline)
pipeline = DocumentSearchPipeline(retriever)

# 6. Ask a question
query = "What is the capital of France?"
# First, use the retriever to get documents
retrieved_docs = pipeline.run(query=query, params={"Retriever": {"top_k": 5}})

# Debug: Print retrieved documents
print(f"Retrieved documents: {retrieved_docs['documents']}")

# 7. Pass the retrieved documents to the reader for QA
answers = reader.predict(query=query, documents=retrieved_docs['documents'])

# Debug: Check the answers
print(f"Answers: {answers['answers']}")

# 8. Print the result, handle empty answers case
if answers['answers']:
    print(f"Answer: {answers['answers'][0].answer}")
else:
    print("No answer found.")

ERROR:haystack.modeling.model.language_model:Model type not understood for 't5-base' (model_type not set). Either supply the local path for a saved model, or the name of a model that can be downloaded from the Model Hub. Ensure that the model class name can be inferred from the directory name when loading a Transformers model.
ERROR:haystack.modeling.model.language_model:Using the AutoModel class for 't5-base'. This can cause crashes!
Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retrieved documents: []
Answers: []
No answer found.
