## 0. Install and import the required libraries

In [None]:
# Install required libraries
# !pip install wikipedia rank-bm25 transformers numpy tiktoken     ## uncomment if running in a new environment

# Import necessary libraries
import re

import numpy as np
import wikipedia
from rank_bm25 import BM25Okapi
from transformers import pipeline


## 1. Fetch and preprocess English Wikipedia articles

In [39]:
def fetch_and_split_articles(titles, lang="en"):
    """Download Wikipedia articles and split their text into paragraphs.

    Args:
        titles: Iterable of article titles. Each one is looked up with
            ``auto_suggest=False``.
        lang: Wikipedia language-edition code handed to ``wikipedia.set_lang``
            (for example "en" or "simple"). Defaults to "en". Available codes:
            https://meta.wikimedia.org/wiki/List_of_Wikipedias

    Returns:
        A flat list of non-empty, whitespace-stripped text chunks gathered from
        every article that could be fetched, in the order the titles were given.
        A title that fails is reported with ``print`` and skipped.
    """
    wikipedia.set_lang(lang)
    paragraphs = []
    for title in titles:
        try:
            content = wikipedia.page(title, auto_suggest=False).content
            # Paragraphs are taken to be the chunks separated by double newlines;
            # chunks that are empty after stripping are dropped.
            for chunk in content.split('\n\n'):
                chunk = chunk.strip()
                if chunk:
                    paragraphs.append(chunk)
        except Exception as e:
            # Deliberately best-effort: one failing title must not abort the
            # whole fetch, so report it and move on to the next title.
            print(f"Could not fetch {title}: {e}")
    return paragraphs

# Articles that make up the demo corpus
titles = [
    'Dog',
    'Moon',
    'Computer',
    'Solar System',
    'Animals',
]

# Fetch every article and flatten the text into one list of paragraphs
paragraphs = fetch_and_split_articles(titles)
print(f"Collected {len(paragraphs)} paragraphs from {len(titles)} articles.")


Collected 366 paragraphs from 5 articles.


## 2. Set up the BM25 retriever and load the question-answering model

In [40]:
# Fail early with a clear message if the fetch step produced nothing to index.
assert paragraphs, "No paragraphs to index - check that the articles were fetched."

# Tokenize paragraphs into lowercase word tokens. Matching runs of word
# characters (instead of splitting on whitespace) drops attached punctuation,
# so "system." and "(system" are both indexed as "system" rather than as
# separate terms. One token list is produced per paragraph, so positions in
# tokenized_paragraphs line up with positions in paragraphs.
tokenized_paragraphs = [re.findall(r"\w+", para.lower()) for para in paragraphs]
bm25 = BM25Okapi(tokenized_paragraphs)

# Question-answering pipeline used to pull an answer out of a retrieved paragraph
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")


Device set to use cuda:0


## 3. Run example queries

In [None]:
# Function to process a query
def process_query(query):
    """Retrieve the best-scoring paragraph for a question and extract an answer.

    Uses the module-level ``bm25``, ``paragraphs`` and ``qa_pipeline`` objects.

    Args:
        query: The question, as a plain string.

    Returns:
        A ``(retrieved_para, answer_text)`` tuple: the entry of ``paragraphs``
        whose BM25 score for the query is highest, and the ``'answer'`` field of
        what ``qa_pipeline`` returned for that paragraph.
    """
    # Lowercase and keep only runs of word characters, so punctuation attached
    # to a word ("system?", "computer?") does not stop it from matching the
    # plain word in the index.
    query_tokens = re.findall(r"\w+", query.lower())
    # Get relevance scores for all paragraphs
    scores = bm25.get_scores(query_tokens)
    # Pick the highest-scoring position. argmax returns the first index on
    # ties, so when every score is equal the first paragraph is used.
    # Assumes scores[i] belongs to paragraphs[i], i.e. bm25 was built from
    # `paragraphs` in the same order.
    top_idx = int(np.argmax(scores))
    retrieved_para = paragraphs[top_idx]
    # Extract the answer from the retrieved paragraph
    answer = qa_pipeline(question=query, context=retrieved_para)
    return retrieved_para, answer['answer']

# Demo driver: push a few sample questions through retrieval + QA
if __name__ == "__main__":
    example_queries = [
        "How long have humans and dogs been together?",
        "How many planets are in our solar system?",
        "Who invented the computer?"
    ]

    print("=== RAG System Demo ===")
    for query in example_queries:
        retrieved_para, answer = process_query(query)
        # One print call, newline-separated, emits the same three lines per query.
        print(
            f"\n**Query**: {query}",
            f"**Retrieved Paragraph**: {retrieved_para[:200]}...",  # first 200 characters only
            # f"**Answer**: {answer}",
            "-" * 50,
            sep="\n",
        )


=== RAG System Demo ===

**Query**: How long have humans and dogs been together?
**Retrieved Paragraph**: There are around 450 official dog breeds, the most of any mammal. Dogs began diversifying in the Victorian era, when humans took control of their natural selection. Most breeds were derived from small...
--------------------------------------------------

**Query**: How many planets are in our solar system?
**Retrieved Paragraph**: Besides solar energy, the primary characteristic of the Solar System enabling the presence of life is the heliosphere and planetary magnetic fields (for those planets that have them). These magnetic f...
--------------------------------------------------

**Query**: Who invented the computer?
**Retrieved Paragraph**: The Antikythera mechanism is believed to be the earliest known mechanical analog computer, according to Derek J. de Solla Price. It was designed to calculate astronomical positions. It was discovered ...
-------------------------------------