In [None]:
# Install the required libraries
!pip install -U langchain langchain-community faiss-cpu sentence-transformers transformers wikipedia-api
!pip install requests==2.32.4


Collecting requests<3,>=2 (from langchain)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Installing collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.32.4
    Uninstalling requests-2.32.4:
      Successfully uninstalled requests-2.32.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0mSuccessfully installed requests-2.32.5
Collecting requests==2.32.4
  Using cached requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Using cached requests-2.32.4-py3-none-any.whl (64 kB)
Installing collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.32.5
    Uninstalling requests-2.32.5:


In [None]:
# Core libraries
import wikipediaapi
import re
from collections import Counter
import pandas as pd

# LangChain community modules
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline

# Hugging Face transformers
from transformers import pipeline


In [None]:
# Load a Wikipedia article (you can change the title, e.g., "Artificial intelligence")
wiki_wiki = wikipediaapi.Wikipedia(language='en', user_agent='MyRAGBot/1.0 (anjutos-genai-consultant)')
page = wiki_wiki.page("Artificial intelligence")
text = page.text

print("Loaded article:", page.title)
print("First 500 chars:\n", text[:500])


Loaded article: Artificial intelligence
First 500 chars:
 Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.
High-profile applications of AI incl


In [None]:
# Split text into bigger chunks with overlap
# Larger chunks help keep context together (like McCarthy, Minsky, etc.)
text_splitter = CharacterTextSplitter(chunk_size=1200, chunk_overlap=200)
docs = text_splitter.create_documents([text])

print(f"Number of chunks created: {len(docs)}")




Number of chunks created: 58


In [None]:
# Use a stronger embedding model for semantic search
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Store embeddings in FAISS vector DB
db = FAISS.from_documents(docs, embeddings)


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Use a larger LLM for better answers
# google/flan-t5-large is still free and good balance of accuracy vs speed
hf_pipeline = pipeline("text2text-generation", model="google/flan-t5-large", max_length=512)

# Wrap it for LangChain
llm = HuggingFacePipeline(pipeline=hf_pipeline)


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=hf_pipeline)


In [None]:
# Build RetrievalQA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 8}),  # retrieve top 8 chunks
    return_source_documents=True
)


In [None]:
# Gold answers (manually defined for evaluation)
gold_dataset = [
    {
        "query": "Who are considered pioneers of AI?",
        "answer": "John McCarthy, Marvin Minsky, Allen Newell, and Herbert Simon"
    },
    {
        "query": "What are the main goals of Artificial Intelligence?",
        "answer": "Learning, reasoning, knowledge representation, planning, natural language processing, perception, and robotics"
    },
    {
        "query": "What are some applications of AI mentioned in the article?",
        "answer": "Search engines, recommendation systems, virtual assistants, autonomous vehicles, generative tools, and strategy games"
    }
]


In [None]:
from sentence_transformers import SentenceTransformer, util

# Load embedding model for semantic similarity scoring
semantic_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def compute_f1(pred, gold):
    pred_tokens = re.findall(r"\w+", pred.lower())
    gold_tokens = re.findall(r"\w+", gold.lower())
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)

def semantic_score(pred, gold):
    emb1 = semantic_model.encode(pred, convert_to_tensor=True)
    emb2 = semantic_model.encode(gold, convert_to_tensor=True)
    return float(util.cos_sim(emb1, emb2).item())


In [None]:
eval_results = []

for item in gold_dataset:
    q, gold_answer = item["query"], item["answer"]
    result = qa.invoke({"query": q})
    pred = result["result"]
    sources = " ".join([doc.page_content for doc in result["source_documents"]])

    exact = int(pred.strip().lower() == gold_answer.strip().lower())
    f1 = compute_f1(pred, gold_answer)
    grounded = int(any(word.lower() in sources.lower() for word in pred.split()))
    relevance = int(len(pred.strip()) > 0)
    sem_sim = semantic_score(pred, gold_answer)

    eval_results.append({
        "Query": q,
        "Gold": gold_answer,
        "Predicted": pred,
        "Exact Match": exact,
        "F1": round(f1, 2),
        "Semantic Sim": round(sem_sim, 2),
        "Grounded": grounded,
        "Relevance": relevance
    })

df_metrics = pd.DataFrame(eval_results)
display(df_metrics)

# Summary
summary = {
    "Exact Match %": df_metrics["Exact Match"].mean() * 100,
    "Avg F1 %": df_metrics["F1"].mean() * 100,
    "Avg Semantic Sim %": df_metrics["Semantic Sim"].mean() * 100,
    "Groundedness %": df_metrics["Grounded"].mean() * 100,
    "Relevance %": df_metrics["Relevance"].mean() * 100
}
print("\nEvaluation Summary:")
for k, v in summary.items():
    print(f"{k}: {v:.1f}%")


Token indices sequence length is longer than the specified maximum sequence length for this model (3346 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,Query,Gold,Predicted,Exact Match,F1,Semantic Sim,Grounded,Relevance
0,Who are considered pioneers of AI?,"John McCarthy, Marvin Minsky, Allen Newell, an...","McCulloch and Pitts design for ""artificial neu...",0,0.05,0.46,1,1
1,What are the main goals of Artificial Intellig...,"Learning, reasoning, knowledge representation,...","learning, reasoning, knowledge representation,...",0,0.92,0.9,1,1
2,What are some applications of AI mentioned in ...,"Search engines, recommendation systems, virtua...",High-profile applications of AI include advanc...,0,0.36,0.72,1,1



Evaluation Summary:
Exact Match %: 0.0%
Avg F1 %: 44.3%
Avg Semantic Sim %: 69.3%
Groundedness %: 100.0%
Relevance %: 100.0%
