Import the necessary Python libraries

In [1]:
import wikipedia
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


Retrieving Knowledge

In [2]:
def get_wikipedia_content(topic):
    """Fetch the text content of the Wikipedia page for ``topic``.

    Args:
        topic: Title of the Wikipedia page to look up.

    Returns:
        The page's ``content`` string, or ``None`` when ``topic`` is empty
        or whitespace-only, when no page matches, or when the title is
        ambiguous (the candidate options are printed in that case).
    """
    # A blank answer at the prompt cannot name a page; treat it like any
    # other "nothing retrieved" outcome instead of issuing a blank lookup.
    if not topic or not topic.strip():
        return None

    try:
        page = wikipedia.page(topic.strip())
        return page.content
    except wikipedia.exceptions.PageError:
        # No page exists under this title.
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        # handle cases where the topic is ambiguous
        print(f"Ambiguous topic. Please be more specific. Options: {e.options}")
        return None

# user input
topic = input("Enter a topic to learn about: ")
document = get_wikipedia_content(topic)

if not document:
    # Raise instead of calling exit(): inside a Jupyter kernel exit() requests
    # a kernel shutdown, discarding all notebook state. An exception halts
    # this cell (and a "Run All") while keeping the kernel alive for a retry.
    raise RuntimeError("Could not retrieve information.")

Split the text into smaller, overlapping chunks for better retrieval.

In [3]:
# Tokenizer used to measure and cut chunks; it is loaded from the same
# checkpoint name that the embedding model is created from further down.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="sentence-transformers/all-mpnet-base-v2",
)

def split_text(text, chunk_size=256, chunk_overlap=20):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    The text is tokenized with the module-level ``tokenizer``; consecutive
    windows of ``chunk_size`` tokens are taken, each starting
    ``chunk_size - chunk_overlap`` tokens after the previous one, and every
    window is converted back to a string.

    Note: chunks are rebuilt from tokens, so they reflect whatever
    normalization the tokenizer applies rather than the original text.

    Args:
        text: The document to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Number of tokens shared between neighbouring chunks;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty when ``text`` yields no tokens).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
            is negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # An overlap >= chunk_size would never advance the window (endless loop);
    # a negative overlap would silently skip tokens between chunks.
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
        )

    tokens = tokenizer.tokenize(text)
    stride = chunk_size - chunk_overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(window))
        # Stop once a window reaches the end, so no trailing chunk made up
        # solely of already-covered overlap tokens is emitted.
        if start + chunk_size >= len(tokens):
            break
    return chunks

chunks = split_text(document)
# Fail here with a clear message rather than in the next cell, where an empty
# embedding matrix would be indexed by its second dimension.
assert chunks, "Document produced no chunks; nothing to index."
print(f"Number of chunks: {len(chunks)}")

Token indices sequence length is longer than the specified maximum sequence length for this model (17488 > 512). Running this sequence through the model will result in indexing errors


Number of chunks: 75


Storing and Retrieving Knowledge

In [4]:
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Encode every chunk once and make the float32, C-contiguous layout explicit
# before handing the matrix to FAISS (no second copy via np.array needed).
embeddings = np.ascontiguousarray(embedding_model.encode(chunks), dtype=np.float32)
assert embeddings.ndim == 2 and embeddings.shape[0] == len(chunks), (
    "Expected one embedding row per chunk."
)

# Flat (exhaustive) L2 index sized to the embedding dimension.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

Querying the RAG Pipeline

In [5]:
query = input("Ask a question about the topic: ")
query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)

# Number of nearest chunks to retrieve.
k = 3
distances, indices = index.search(query_embedding, k)
# When the index holds fewer than k vectors FAISS pads the result with -1;
# drop those so chunks[-1] (the last chunk) is not picked up by accident.
retrieved_chunks = [chunks[i] for i in indices[0] if i >= 0]
print("Retrieved chunks:")
for chunk in retrieved_chunks:
    print("- " + chunk)

Retrieved chunks:
- . microsoft, motorola mobility v. apple inc., and apple corps v. apple computer. apple has also had to defend itself against charges on numerous occasions of violating intellectual property rights. most have been dismissed in the courts as shell companies known as patent trolls, with no evidence of actual use of patents in question. on december 21, 2016, nokia announced that in the u. s. and germany, it has filed a suit against apple, claiming that the latter ' s products infringe on nokia ' s patents. most recently, in november 2017, the united states international trade commission announced an investigation into allegations of patent infringement in regards to apple ' s remote desktop technology ; aqua connect, a company that builds remote desktop software, has claimed that apple infringed on two of its patents. in january 2022, ericsson sued apple over payment of royalty of 5g technology. on june 24, 2024, the european commission accused apple of violating the di

Answering the Question with a Question-Answering Model (RoBERTa fine-tuned on SQuAD 2.0)

In [6]:
# Merge the retrieved passages into one context string for the reader model.
context = " ".join(retrieved_chunks)

# Load the question-answering checkpoint and wrap it in a pipeline.
qa_model_name = "deepset/roberta-base-squad2"
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_pipeline = pipeline(
    task="question-answering",
    model=qa_model,
    tokenizer=qa_tokenizer,
)

# Ask the user's question against the retrieved context and show the answer.
answer = qa_pipeline(question=query, context=context)
print(f"Answer: {answer['answer']}")

Device set to use cpu


Answer: apple corps v. apple computer
