In [1]:
# RankAI MVP
# Goal: Compare multiple AI models on text summarization
# using accuracy, latency, and hallucination proxies


In [2]:
!pip install transformers datasets rouge-score torch pandas numpy




In [3]:
import time
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import pipeline
from rouge_score import rouge_scorer


In [4]:
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:20]")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
models = {
    "T5": pipeline("summarization", model="t5-small"),
    "DistilBART": pipeline("summarization", model="sshleifer/distilbart-cnn-12-6"),
    "Pegasus": pipeline("summarization", model="google/pegasus-arxiv")
}


Device set to use cuda:0
Device set to use cuda:0
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-arxiv and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [6]:
results = []

for model_name, model in models.items():
    for row in dataset:
        article = row["article"][:3000]  # safety cap

        start = time.time()
        summary = model(
            article,
            max_length=130,
            min_length=30,
            truncation=True
        )[0]["summary_text"]

        latency = time.time() - start

        results.append({
            "model": model_name,
            "generated": summary,
            "reference": row["highlights"],
            "latency": latency,
            "length": len(summary.split())
        })


Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [7]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)

# Compute accuracy for each result
for r in results:
    score = scorer.score(r["reference"], r["generated"])
    r["accuracy"] = score["rouge1"].fmeasure


In [8]:
# Directional Adjustment

# --- Latency (lower is better â†’ convert to higher-is-better score)
max_latency = max(r["latency"] for r in results)
min_latency = min(r["latency"] for r in results)

for r in results:
    r["latency_score"] = (max_latency - r["latency"]) / (max_latency - min_latency)


# --- Hallucination proxy using output length (lower is better)
max_length = max(r["length"] for r in results)

for r in results:
    hallucination_rate = r["length"] / max_length
    r["hallucination_score"] = 1 - hallucination_rate


In [9]:
# Final Score (Weighted Decision Rule)

W_ACC = 0.5   # accuracy matters most
W_LAT = 0.3   # speed matters
W_HALL = 0.2  # trust matters

for r in results:
    r["final_score"] = (
        W_ACC * r["accuracy"]
        + W_LAT * r["latency_score"]
        + W_HALL * r["hallucination_score"]
    )


In [10]:
import pandas as pd

df = pd.DataFrame(results)

ranking = (
    df.groupby("model")[[
        "accuracy",
        "latency_score",
        "hallucination_score",
        "final_score"
    ]]
    .mean()
    .sort_values("final_score", ascending=False)
)

ranking


Unnamed: 0_level_0,accuracy,latency_score,hallucination_score,final_score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DistilBART,0.355551,0.965408,0.657422,0.598882
T5,0.324965,0.921972,0.658594,0.570793
Pegasus,0.100541,0.621374,0.175391,0.271761


In [11]:
# Simple explanation generator

top_model = ranking.index[0]

explanation = f"""
RankAI Recommendation: {top_model}

Why:
- Highest overall final score
- Balanced accuracy, speed, and hallucination risk
- Other models traded accuracy for latency or safety
"""

print(explanation)



RankAI Recommendation: DistilBART

Why:
- Highest overall final score
- Balanced accuracy, speed, and hallucination risk
- Other models traded accuracy for latency or safety

