In [1]:
!pip install -q --upgrade transformers datasets pandas evaluate rouge_score sentencepiece bert-score accelerate

# Install BLEURT directly from its GitHub repository
!pip install -q git+https://github.com/google-research/bleurt.git

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'c:\\users\\admin\\anaconda3\\envs\\summarizer_env\\lib\\site-packages\\pandas-2.3.2.dist-info\\METADATA'



In [2]:
import pandas as pd
import torch
import evaluate
from tqdm.notebook import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, MT5ForConditionalGeneration

# --- Configuration ---
MODEL_PATH = "mbart-large-50-cnn-summarizer-en-hi_v11\final_model"
FULL_DATA_PATH = "../Dataset/filtered_articles_CNN.csv"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

PREFIX_ENG = "summarize English: "
PREFIX_HIN = "summarize Hindi: "

TypeError: expected string or bytes-like object

In [None]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

print(f"Loading model and moving to {DEVICE}...")

model = MT5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(DEVICE)
model.eval()  # Set model to evaluation mode

print("Model loaded successfully.")

Loading tokenizer...


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Loading model and moving to cuda...
Model loaded successfully.


Quantitative Evaluation - Trial 1

In [None]:
# Load the full original dataset
print(f"Loading full dataset from: {FULL_DATA_PATH}")
df = pd.read_csv(FULL_DATA_PATH, engine="python", on_bad_lines="skip")
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
raw_dataset = Dataset.from_pandas(df)


# Define the same formatting function used in training
def format_dataset(batch):
    inputs, targets = [], []
    for article, eng_summary, hin_summary in zip(
        batch["raw_news_article"], batch["english_summary"], batch["hindi_summary"]
    ):
        if isinstance(article, str):
            inputs.append(PREFIX_ENG + article)
            targets.append(eng_summary)
            inputs.append(PREFIX_HIN + article)
            targets.append(hin_summary)
    return {"inputs": inputs, "targets": targets}


# Process and split the dataset
print("Processing and splitting the dataset...")
processed_dataset = raw_dataset.map(
    format_dataset, batched=True, remove_columns=raw_dataset.column_names
).flatten()

# IMPORTANT: Use the same test_size and seed to get the identical test set
train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
test_dataset = train_test_split["test"]

# For a quick test, let's use a smaller sample. Remove .select() for the full evaluation.
test_sample = test_dataset.select(range(100))

print(f"Recreated test set with {len(test_sample)} samples for evaluation.")

Loading full dataset from: ../Dataset/final_cleaned_dataset_CNN.csv
Processing and splitting the dataset...


Map:   0%|          | 0/4919 [00:00<?, ? examples/s]

Recreated test set with 100 samples for evaluation.


In [7]:
# Load metrics
print("Loading evaluation metrics...")
rouge_metric = evaluate.load("rouge")
bleurt_metric = evaluate.load("bleurt", module_type="metric", checkpoint="BLEURT-20")
bertscore_metric = evaluate.load("bertscore")

# Generate predictions
predictions = []
references = []
print("Generating summaries for the test set...")

for example in tqdm(test_sample):
    inputs = tokenizer(
        example["inputs"], return_tensors="pt", max_length=1024, truncation=True
    ).to(DEVICE)
    summary_ids = model.generate(
        inputs.input_ids, max_length=256, num_beams=4, early_stopping=True
    )
    prediction = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    predictions.append(prediction)
    references.append(example["targets"])

# Separate by language (even indices are English, odd are Hindi)
eng_preds = predictions[::2]
hin_preds = predictions[1::2]
eng_refs = references[::2]
hin_refs = references[1::2]

# Compute and display results
print("\n--- Computing English Metrics ---")
rouge_eng = rouge_metric.compute(predictions=eng_preds, references=eng_refs)
bleurt_eng = bleurt_metric.compute(predictions=eng_preds, references=eng_refs)
bert_eng = bertscore_metric.compute(
    predictions=eng_preds, references=eng_refs, lang="en"
)

print(f"  ROUGE-2: {rouge_eng['rouge2'] * 100:.2f}")
print(f"  BLEURT Score: {sum(bleurt_eng['scores']) / len(bleurt_eng['scores']):.4f}")
print(
    f"  BERTScore Precision: {sum(bert_eng['precision']) / len(bert_eng['precision']):.4f}"
)


print("\n--- Computing Hindi Metrics ---")
rouge_hin = rouge_metric.compute(predictions=hin_preds, references=hin_refs)
bleurt_hin = bleurt_metric.compute(predictions=hin_preds, references=hin_refs)
bert_hin = bertscore_metric.compute(
    predictions=hin_preds, references=hin_refs, lang="hi"
)

print(f"  ROUGE-2: {rouge_hin['rouge2'] * 100:.2f}")
print(f"  BLEURT Score: {sum(bleurt_hin['scores']) / len(bleurt_hin['scores']):.4f}")
print(
    f"  BERTScore Precision: {sum(bert_hin['precision']) / len(bert_hin['precision']):.4f}"
)

Loading evaluation metrics...


Downloading builder script: 0.00B [00:00, ?B/s]

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]


INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\default\downloads\extracted\64a145a740562dda9fae1ce4fb71155ccaf922d41c2355bee049709b8590e973\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.

INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


Downloading builder script: 0.00B [00:00, ?B/s]

Generating summaries for the test set...


  0%|          | 0/100 [00:00<?, ?it/s]


--- Computing English Metrics ---


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ROUGE-2: 17.94
  BLEURT Score: -0.3045
  BERTScore Precision: 0.8991

--- Computing Hindi Metrics ---


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

  ROUGE-2: 18.13
  BLEURT Score: -0.2792
  BERTScore Precision: 0.7319


Quantitative Evaluation - Trial 2

In [3]:
# Load the full original dataset
print(f"Loading full dataset from: {FULL_DATA_PATH}")
df = pd.read_csv(FULL_DATA_PATH, engine="python", on_bad_lines="skip")
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
raw_dataset = Dataset.from_pandas(df)


# Define the same formatting function used in training
def format_dataset(batch):
    inputs, targets = [], []
    for article, eng_summary, hin_summary in zip(
        batch["raw_news_article"], batch["english_summary"], batch["hindi_summary"]
    ):
        if isinstance(article, str):
            inputs.append(PREFIX_ENG + article)
            targets.append(eng_summary)
            inputs.append(PREFIX_HIN + article)
            targets.append(hin_summary)
    return {"inputs": inputs, "targets": targets}


# Process and split the dataset
print("Processing and splitting the dataset...")
processed_dataset = raw_dataset.map(
    format_dataset, batched=True, remove_columns=raw_dataset.column_names
).flatten()

# IMPORTANT: Use the same test_size and seed to get the identical test set
train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)
test_dataset = train_test_split["test"]

# For a quick test, let's use a smaller sample. Remove .select() for the full evaluation.
test_sample = test_dataset.select(range(100))

print(f"Recreated test set with {len(test_sample)} samples for evaluation.")

NameError: name 'FULL_DATA_PATH' is not defined

In [11]:
# Load metrics
print("Loading evaluation metrics...")
rouge_metric = evaluate.load("rouge")
bleurt_metric = evaluate.load("bleurt", module_type="metric", checkpoint="BLEURT-20")
bertscore_metric = evaluate.load("bertscore")

# Generate predictions
predictions = []
references = []
print("Generating summaries for the test set...")

for example in tqdm(test_sample):
    inputs = tokenizer(
        example["inputs"], return_tensors="pt", max_length=1024, truncation=True
    ).to(DEVICE)
    summary_ids = model.generate(
        inputs.input_ids, max_length=256, num_beams=4, early_stopping=True
    )
    prediction = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    predictions.append(prediction)
    references.append(example["targets"])

# Separate by language (even indices are English, odd are Hindi)
eng_preds = predictions[::2]
hin_preds = predictions[1::2]
eng_refs = references[::2]
hin_refs = references[1::2]

# Compute and display results
print("\n--- Computing English Metrics ---")
rouge_eng = rouge_metric.compute(predictions=eng_preds, references=eng_refs)
bleurt_eng = bleurt_metric.compute(predictions=eng_preds, references=eng_refs)
bert_eng = bertscore_metric.compute(
    predictions=eng_preds, references=eng_refs, lang="en"
)

print(f"  ROUGE-2: {rouge_eng['rouge2'] * 100:.2f}")
print(f"  BLEURT Score: {sum(bleurt_eng['scores']) / len(bleurt_eng['scores']):.4f}")
print(
    f"  BERTScore Precision: {sum(bert_eng['precision']) / len(bert_eng['precision']):.4f}"
)


print("\n--- Computing Hindi Metrics ---")
rouge_hin = rouge_metric.compute(predictions=hin_preds, references=hin_refs)
bleurt_hin = bleurt_metric.compute(predictions=hin_preds, references=hin_refs)
bert_hin = bertscore_metric.compute(
    predictions=hin_preds, references=hin_refs, lang="hi"
)

print(f"  ROUGE-2: {rouge_hin['rouge2'] * 100:.2f}")
print(f"  BLEURT Score: {sum(bleurt_hin['scores']) / len(bleurt_hin['scores']):.4f}")
print(
    f"  BERTScore Precision: {sum(bert_hin['precision']) / len(bert_hin['precision']):.4f}"
)

Loading evaluation metrics...




INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\default\downloads\extracted\64a145a740562dda9fae1ce4fb71155ccaf922d41c2355bee049709b8590e973\bleurt-base-128.


INFO:tensorflow:Reading checkpoint C:\Users\admin\.cache\huggingface\metrics\bleurt\default\downloads\extracted\64a145a740562dda9fae1ce4fb71155ccaf922d41c2355bee049709b8590e973\bleurt-base-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


Generating summaries for the test set...


  0%|          | 0/100 [00:00<?, ?it/s]


--- Computing English Metrics ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ROUGE-2: 17.94
  BLEURT Score: -0.3045
  BERTScore Precision: 0.8991

--- Computing Hindi Metrics ---
  ROUGE-2: 18.13
  BLEURT Score: -0.2792
  BERTScore Precision: 0.7319


Interactive Summarization

In [4]:
def summarize_article(article_text):
    """
    Generates and prints English and Hindi summaries for a given article text.
    """
    print("=" * 50)
    print("               SOURCE ARTICLE")
    print("=" * 50)
    print(article_text)

    # --- Generate English Summary ---
    english_input = PREFIX_ENG + article_text
    eng_inputs = tokenizer(
        english_input, return_tensors="pt", max_length=1024, truncation=True
    ).to(DEVICE)
    eng_summary_ids = model.generate(
        eng_inputs.input_ids, max_length=150, num_beams=5, early_stopping=True
    )
    english_summary = tokenizer.decode(eng_summary_ids[0], skip_special_tokens=True)

    print("\n" + "=" * 50)
    print("               ENGLISH SUMMARY")
    print("=" * 50)
    print(english_summary)

    # --- Generate Hindi Summary ---
    hindi_input = PREFIX_HIN + article_text
    hin_inputs = tokenizer(
        hindi_input, return_tensors="pt", max_length=1024, truncation=True
    ).to(DEVICE)
    hin_summary_ids = model.generate(
        hin_inputs.input_ids, max_length=200, num_beams=5, early_stopping=True
    )
    hindi_summary = tokenizer.decode(hin_summary_ids[0], skip_special_tokens=True)

    print("\n" + "=" * 50)
    print("                 HINDI SUMMARY")
    print("=" * 50)
    print(hindi_summary)
    print("\n" + "=" * 50)

In [5]:
# Paste any news article here to test the model
article_to_test = """
India's Chandrayaan-3 mission has successfully soft-landed on the lunar surface, making it the fourth country to achieve this feat. The Vikram lander touched down near the Moon's south pole, an unexplored region believed to contain water ice. The successful landing is a historic moment for India's space program, demonstrating advanced capabilities in landing technology. The Pragyan rover will now descend from the lander to explore the lunar terrain and conduct scientific experiments for one lunar day, which is equivalent to 14 Earth days. The mission aims to study the Moon's geology and the potential for a sustained human presence.
"""

summarize_article(article_to_test)

               SOURCE ARTICLE

India's Chandrayaan-3 mission has successfully soft-landed on the lunar surface, making it the fourth country to achieve this feat. The Vikram lander touched down near the Moon's south pole, an unexplored region believed to contain water ice. The successful landing is a historic moment for India's space program, demonstrating advanced capabilities in landing technology. The Pragyan rover will now descend from the lander to explore the lunar terrain and conduct scientific experiments for one lunar day, which is equivalent to 14 Earth days. The mission aims to study the Moon's geology and the potential for a sustained human presence.



NameError: name 'PREFIX_ENG' is not defined

In [12]:
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import textwrap

# --- Configuration ---
MODEL_PATH = "mbart-large-50-cnn-summarizer-en-hi_v11/final_model"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Global variables to hold the loaded model and tokenizer ---
model = None
tokenizer = None

def load_model():
    """
    Loads the fine-tuned mBART model and tokenizer into memory.
    """
    global model, tokenizer
    
    if model is not None and tokenizer is not None:
        print("Model and tokenizer are already loaded.")
        return

    print(f"Using device: {DEVICE}")
    try:
        print(f"Loading model and tokenizer from: {MODEL_PATH}...")
        model = MBartForConditionalGeneration.from_pretrained(MODEL_PATH).to(DEVICE)
        tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_PATH)
        print("Model and tokenizer loaded successfully and are ready to use.")
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Please ensure the MODEL_PATH is set correctly.")
        model, tokenizer = None, None

def generate_high_quality_summary(article_text):
    """
    Takes a news article string and prints high-quality English and Hindi summaries.
    Assumes the model and tokenizer have already been loaded by load_model().
    """
    if model is None or tokenizer is None:
        print("Model is not loaded. Please run the `load_model()` function in the setup cell first.")
        return

    # --- FINAL Generation Hyperparameters for high-quality abstractive summaries ---
    NUM_BEAMS = 10
    NO_REPEAT_NGRAM_SIZE = 3
    MIN_SUMMARY_LENGTH = 30
    MAX_SUMMARY_LENGTH = 150
    REPETITION_PENALTY = 3.0 # Increase penalty to strongly discourage repetition
    LENGTH_PENALTY = 1.0    # Use a neutral length penalty
    
    # Hybrid settings: Use sampling within the beam search for more creativity
    DO_SAMPLE = True
    EARLY_STOPPING = True
    TOP_K = 50
    TOP_P = 0.95
    TEMPERATURE = 0.8 # Control randomness for more factual but creative output
    
    # --- Print Source Article ---
    print("\n" + "="*80)
    print("SOURCE ARTICLE:")
    print("="*80)
    wrapped_article = "\n".join(textwrap.wrap(article_text, width=80))
    print(wrapped_article)

    # --- Tokenize the article (source language is English) ---
    tokenizer.src_lang = "en_XX"
    inputs = tokenizer(article_text, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)

    # --- Generate English Summary ---
    eng_summary_ids = model.generate(
        inputs.input_ids,
        num_beams=NUM_BEAMS,
        max_length=MAX_SUMMARY_LENGTH,
        min_length=MIN_SUMMARY_LENGTH,
        length_penalty=LENGTH_PENALTY,
        no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
        repetition_penalty=REPETITION_PENALTY,
        do_sample=DO_SAMPLE,
        early_stopping=EARLY_STOPPING,
        top_k=TOP_K,
        top_p=TOP_P,
        temperature=TEMPERATURE,
        forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
    )
    # english_summary = tokenizer.decode(eng_summary_ids[0], skip_special_tokens=True)
    
    # print("\n" + "="*80)
    # print("GENERATED ENGLISH SUMMARY:")
    # print("="*80)
    # wrapped_eng_summary = "\n".join(textwrap.wrap(english_summary, width=80))
    # print(wrapped_eng_summary)

    # --- Generate Hindi Summary ---
    hin_summary_ids = model.generate(
        inputs.input_ids,
        num_beams=NUM_BEAMS,
        max_length=MAX_SUMMARY_LENGTH,
        min_length=MIN_SUMMARY_LENGTH,
        length_penalty=LENGTH_PENALTY,
        no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
        repetition_penalty=REPETITION_PENALTY,
        do_sample=DO_SAMPLE,
        early_stopping=EARLY_STOPPING,
        top_k=TOP_K,
        top_p=TOP_P,
        temperature=TEMPERATURE,
        forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
    )
    hindi_summary = tokenizer.decode(hin_summary_ids[0], skip_special_tokens=True)
    
    print("\n" + "="*80)
    print("GENERATED HINDI SUMMARY:")
    print("="*80)
    wrapped_hin_summary = "\n".join(textwrap.wrap(hindi_summary, width=80))
    print(wrapped_hin_summary)
    print("\n" + "="*80)

# --- Load the model automatically when this cell is run ---
load_model()



Using device: cuda
Loading model and tokenizer from: mbart-large-50-cnn-summarizer-en-hi_v11/final_model...
Model and tokenizer loaded successfully and are ready to use.


In [14]:
article_to_test = """
India secured a decisive victory over Australia in the final match of the T20 series, winning by a margin of 35 runs in Bengaluru. Batting first, India posted a competitive total of 198 for 4, thanks to a powerful half-century from captain Suryakumar Yadav, who scored 78 off just 45 balls. In response, Australia's chase faltered early as they lost key wickets to India's fast bowlers.
"""

generate_high_quality_summary(article_to_test)



SOURCE ARTICLE:
 India secured a decisive victory over Australia in the final match of the T20
series, winning by a margin of 35 runs in Bengaluru. Batting first, India posted
a competitive total of 198 for 4, thanks to a powerful half-century from captain
Suryakumar Yadav, who scored 78 off just 45 balls. In response, Australia's
chase faltered early as they lost key wickets to India's fast bowlers.

GENERATED HINDI SUMMARY:
भारत ने बेंगलुरू में टी20 सीरीज़ के अंतिम मैच में ऑस्ट्रेलिया पर 35 रनों की
अंतराल के साथ निर्णायक जीत हासिल की। पहले बल्लेबाजी करते हुए, भारत का
प्रतिस्पर्धी कुल स्कोर 198 रन (4 विकेट) रहा। कप्तान सूर्यकुमार यादव ने एक
शक्तिशाली अर्धशतकीय पारी खेली, जिसमें केवल 45 गेंदों पर 78 विकेट शामिल थे। जवाब
में, ऑस्ट्रेलिया की गेंदबाजी शुरुआती रूप से कमजोर रही क्योंकि उन्होंने भारत के
तेज गेंदबाजों को महत्वपूर्ण विकेट गंवा दिए।



In [15]:
article_to_test = """
A landmark international treaty to combat plastic pollution has been agreed upon by delegates from over 170 countries at a United Nations Environment Assembly session held in Nairobi. Hailed as the most significant environmental pact since the Paris Agreement, the resolution establishes an Intergovernmental Negotiating Committee (INC) tasked with drafting a legally binding agreement by the end of 2026. The future treaty aims to address the full lifecycle of plastic, from its production and design to its disposal and recycling. The negotiations were complex, with debates centering on whether the treaty should focus solely on plastic waste management or include caps on virgin plastic production. Major plastic-producing nations and fossil fuel companies had advocated for a focus on recycling, while a coalition of environmental groups and many developing nations pushed for stricter controls on production itself. The final resolution provides a broad mandate for the INC to consider all options.
"""

generate_high_quality_summary(article_to_test)


SOURCE ARTICLE:
 A landmark international treaty to combat plastic pollution has been agreed
upon by delegates from over 170 countries at a United Nations Environment
Assembly session held in Nairobi. Hailed as the most significant environmental
pact since the Paris Agreement, the resolution establishes an Intergovernmental
Negotiating Committee (INC) tasked with drafting a legally binding agreement by
the end of 2026. The future treaty aims to address the full lifecycle of
plastic, from its production and design to its disposal and recycling. The
negotiations were complex, with debates centering on whether the treaty should
focus solely on plastic waste management or include caps on virgin plastic
production. Major plastic-producing nations and fossil fuel companies had
advocated for a focus on recycling, while a coalition of environmental groups
and many developing nations pushed for stricter controls on production itself.
The final resolution provides a broad mandate for the INC to

In [16]:
article_to_test = """
India's Chandrayaan-3 mission has successfully soft-landed on the lunar surface, making it the fourth country to achieve this feat. The Vikram lander touched down near the Moon's south pole, an unexplored region believed to contain water ice. The successful landing is a historic moment for India's space program, demonstrating advanced capabilities in landing technology. The Pragyan rover will now descend from the lander to explore the lunar terrain and conduct scientific experiments for one lunar day, which is equivalent to 14 Earth days. The mission aims to study the Moon's geology and the potential for a sustained human presence.
"""

generate_high_quality_summary(article_to_test)


SOURCE ARTICLE:
 India's Chandrayaan-3 mission has successfully soft-landed on the lunar
surface, making it the fourth country to achieve this feat. The Vikram lander
touched down near the Moon's south pole, an unexplored region believed to
contain water ice. The successful landing is a historic moment for India's space
program, demonstrating advanced capabilities in landing technology. The Pragyan
rover will now descend from the lander to explore the lunar terrain and conduct
scientific experiments for one lunar day, which is equivalent to 14 Earth days.
The mission aims to study the Moon's geology and the potential for a sustained
human presence.

GENERATED HINDI SUMMARY:
भारत का चंद्रयान-3 मिशन सफलतापूर्वक बुध के दक्षिण ध्रुव के पास सॉफ्ट-लैंड किया।
यह उपलब्धि भारत की अंतरिक्ष प्रणाली के लिए एक ऐतिहासिक क्षण है, जिससे वह चौथा
देश बन गया है जो इस उपलब्धि को हासिल कर रहा है। मंगल के एक अexplored क्षेत्र में
उतरने के बाद, रासायनिक प्रौद्योगिकियां विकसित करने के लिए डिज़ाइन किया गया यह

In [17]:
article_to_test = """
The Indian Space Research Organisation (ISRO) has successfully completed a critical test for its ambitious Gaganyaan mission, which aims to send Indian astronauts to space. The test involved the final integrated validation of the crew module's parachute system at a facility in Chandigarh. The parachutes are essential for ensuring the safe return and landing of the crew module. Officials confirmed that the system performed flawlessly under simulated flight conditions. This milestone moves India one step closer to launching its first crewed spaceflight, which is currently scheduled for late 2025. The Gaganyaan programme is a top priority for the nation's space agency, marking its entry into human space exploration.
"""

generate_high_quality_summary(article_to_test)


SOURCE ARTICLE:
 The Indian Space Research Organisation (ISRO) has successfully completed a
critical test for its ambitious Gaganyaan mission, which aims to send Indian
astronauts to space. The test involved the final integrated validation of the
crew module's parachute system at a facility in Chandigarh. The parachutes are
essential for ensuring the safe return and landing of the crew module. Officials
confirmed that the system performed flawlessly under simulated flight
conditions. This milestone moves India one step closer to launching its first
crewed spaceflight, which is currently scheduled for late 2025. The Gaganyaan
programme is a top priority for the nation's space agency, marking its entry
into human space exploration.

GENERATED HINDI SUMMARY:
भारतीय अंतरिक्ष अनुसंधान संगठन (ISRO) ने अपने महत्वाकांक्षी Gaganyaan मिशन के
लिए एक महत्वपूर्ण परीक्षण सफलतापूर्वक पूरा किया। शिकागो में एक सुविधा द्वारा
आयोजित इस परीक्षण में चालक दल मॉड्यूल के पैराशूट प्रणाली का अंतिम एकीकृत सत्या

In [18]:
article_to_test = """
Google has announced a significant upgrade to its core AI model, Gemini. The new version, named Gemini 1.5 Pro, is designed to handle a much larger amount of information at once. The company claims it can process up to 1 million tokens, which is equivalent to an entire feature-length movie or over 700,000 words of text. This massive context window allows the model to understand and reason about very large documents, codebases, or hours of video content without forgetting earlier details. The new model is initially being made available to developers and enterprise customers through Google's AI Studio and Vertex AI platforms. This development is seen as a major step in the competition against other leading AI models like OpenAI's GPT-4."""

generate_high_quality_summary(article_to_test)


SOURCE ARTICLE:
 Google has announced a significant upgrade to its core AI model, Gemini. The
new version, named Gemini 1.5 Pro, is designed to handle a much larger amount of
information at once. The company claims it can process up to 1 million tokens,
which is equivalent to an entire feature-length movie or over 700,000 words of
text. This massive context window allows the model to understand and reason
about very large documents, codebases, or hours of video content without
forgetting earlier details. The new model is initially being made available to
developers and enterprise customers through Google's AI Studio and Vertex AI
platforms. This development is seen as a major step in the competition against
other leading AI models like OpenAI's GPT-4.

GENERATED HINDI SUMMARY:
Google ने अपनी कोर AI मॉडल Gemini 1.5 Pro के लिए महत्वपूर्ण उन्नति की घोषणा की
है, जिसका उद्देश्य एक साथ बड़े पैमाने पर जानकारी संभालना है। यह संस्करण 1 मिलियन
टूकेन तक पहुंच प्रदान करता है, जो एक पूरी फीचर-लंबे

In [19]:
article_to_test = """TThe Reserve Bank of India (RBI) has announced that it will keep the repo rate unchanged at 6.5% for the eighth consecutive time. The decision was made by the Monetary Policy Committee (MPC) following its recent three-day meeting. RBI Governor Shaktikanta Das stated that the committee is focused on ensuring inflation aligns with the target of 4% while supporting economic growth. The central bank also retained its GDP growth forecast for the current fiscal year at 7.2%. The decision was widely expected by economists, who believe that a stable policy rate is necessary to manage potential food price inflation and global economic uncertainties before considering any rate cuts later in the year."""
generate_high_quality_summary(article_to_test)


SOURCE ARTICLE:
TThe Reserve Bank of India (RBI) has announced that it will keep the repo rate
unchanged at 6.5% for the eighth consecutive time. The decision was made by the
Monetary Policy Committee (MPC) following its recent three-day meeting. RBI
Governor Shaktikanta Das stated that the committee is focused on ensuring
inflation aligns with the target of 4% while supporting economic growth. The
central bank also retained its GDP growth forecast for the current fiscal year
at 7.2%. The decision was widely expected by economists, who believe that a
stable policy rate is necessary to manage potential food price inflation and
global economic uncertainties before considering any rate cuts later in the
year.

GENERATED HINDI SUMMARY:
भारतीय राष्ट्रीय बैंक (RBI) ने लगातार आठवीं बार रिपो दर को 6.5% तक बरकरार रखने
की घोषणा की है। यह निर्णय Monetary Policy Committee (MPC) द्वारा अपनी हालिया तीन
दिवसीय बैठक के बाद लिया गया है, जिसमें RBI के गवर्नर शाकिंटा डेस ने कहा कि समिति
का ध्यान 4% के ल

In [None]:
article_to_test = """The United States has long dominated the global technology market, but China is determined to challenge that supremacy by investing heavily in artificial intelligence, robotics, and the production of high-end chips that power these advanced technologies. With Beijing pouring billions into semiconductor development, the gap between the two nations is narrowing rapidly. Nvidia CEO Jensen Huang recently warned that China is just “nanoseconds behind” the US in chip progress, highlighting the growing competitiveness of Chinese firms. The rise of DeepSeek in 2024 marked a turning point — the Chinese startup launched a ChatGPT rival that was cheaper to train and used fewer high-end chips, momentarily shaking Nvidia’s market value. Since then, China’s momentum in the tech sector has only accelerated, with giants like Alibaba and Huawei unveiling powerful new chips that reportedly rival Nvidia’s processors. Alibaba’s latest chip was said to match Nvidia’s H20 in performance while using less energy, while Huawei announced its strongest chips yet alongside a three-year plan to challenge Nvidia’s AI market dominance and open its technology to the public to reduce dependence on US systems. Other domestic players, including MetaX and Cambricon Technologies, are also gaining traction—MetaX now supplies chips to state-owned China Unicom, while Cambricon’s stock value has more than doubled as investors bet on Beijing’s push for self-reliance. Tech giants such as Tencent have joined the movement, shifting toward Chinese-made chips, and state-backed trade shows are promoting local innovations to attract global investors. Nvidia has acknowledged the growing competition but maintains confidence that customers will choose the best technology available. However, experts caution that China’s claims should be treated carefully due to limited public testing data. According to computer scientist Jawad Haj-Yahya, Chinese semiconductors are approaching US performance levels in predictive AI but still lag in complex analytics. While the technological gap is clearly narrowing, it remains uncertain whether China can fully catch up with American innovation in the near future."""
generate_summary(article_to_test)


SOURCE ARTICLE (truncated):
The United States has long dominated the global technology market, but China is
determined to challenge that supremacy by investing heavily in artificial
intelligence, robotics, and the production of high-end chips that power these
advanced technologies. With Beijing pouring billions into semiconductor
development, the gap between the two nations is narrowing rapidly. Nvidia CEO
Jensen Huang recently warned that China is just “nanoseconds behind” the US in
chip progress, highlighting the growing competitiveness of Chinese firms. The
rise of DeepSeek in 2024 marked a turning point — the Chinese startup launched a
ChatGPT rival that was cheaper to train and used fewer high-end chips,
momentarily shaking Nvidia’s market value. Since then, China’s momentum in the
tech sector has only accelerated, with giants like Alibaba and Huawei unveiling
powerful new chips that reportedly rival Nvidia’s processors. Alibaba’s latest
chip was said to match Nvidia’s H20 in per