In [2]:
# pipeline_cnn_hindi_test_long.py
import torch
import pandas as pd
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration
from indic_trans import transliterate
from tqdm import tqdm
import nltk
nltk.download('punkt')

# ------------------ CONFIG ------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_ROWS = 10          # Test with 10 articles
TOKEN_LIMIT = 2048     # Increased input token length for longer articles
MAX_SUMMARY_LEN = 300  # Increased maximum summary length
MIN_SUMMARY_LEN = 100  # Increased minimum summary length

# ------------------ LOAD DATASET ------------------
print("Loading CNN/DailyMail dataset...")
dataset = load_dataset("cnn_dailymail", "3.0.0")
df = pd.DataFrame(dataset['train'][:NUM_ROWS])
print(f"Loaded {len(df)} articles for testing.")

# ------------------ LOAD BART MODEL ------------------
print("Loading BART model...")
bart_model_name = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name).to(DEVICE)

# ------------------ BART SUMMARISER ------------------
def bart_summarize(text):
    paragraphs = nltk.sent_tokenize(text)
    summaries = []
    for para in paragraphs:
        inputs = bart_tokenizer.encode(para, return_tensors="pt", truncation=True, max_length=TOKEN_LIMIT).to(DEVICE)
        summary_ids = bart_model.generate(
            inputs,
            max_length=MAX_SUMMARY_LEN,
            min_length=MIN_SUMMARY_LEN,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return " ".join(summaries)

# ------------------ INDICTRANS2 TRANSLATION ------------------
def translate_to_hindi(text):
    try:
        return transliterate(text, "hi")
    except Exception as e:
        print(f"Translation failed: {e}")
        return ""

# ------------------ PROCESS DATA ------------------
results = []

print("Processing test articles: summarising and translating...")
for idx, row in tqdm(df.iterrows(), total=len(df)):
    article_text = row['article']
    
    # Step 1: English abstractive summary
    eng_summary = bart_summarize(article_text)
    
    # Step 2: Hindi translation
    hi_summary = translate_to_hindi(eng_summary)
    
    results.append({
        "article": article_text,
        "summary_en": eng_summary,
        "summary_hi": hi_summary
    })

# ------------------ SAVE TO CSV ------------------
output_csv = "cnn_dailymail_bart_hindi_test_long.csv"
pd.DataFrame(results).to_csv(output_csv, index=False)
print(f"Saved {len(results)} rows to {output_csv}")

ModuleNotFoundError: No module named 'indic_trans'

In [3]:
# pipeline_cnn_hindi_test_indictrans2.py
import torch
import pandas as pd
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import nltk
nltk.download('punkt')

# ------------------ CONFIG ------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_ROWS = 10          # Test with 10 articles
BART_TOKEN_LIMIT = 2048
BART_MAX_SUMMARY_LEN = 300
BART_MIN_SUMMARY_LEN = 100
INDICTRANS_MAX_LEN = 512

# ------------------ LOAD CNN/DailyMail ------------------
print("Loading CNN/DailyMail dataset...")
dataset = load_dataset("cnn_dailymail", "3.0.0")
df = pd.DataFrame(dataset['train'][:NUM_ROWS])
print(f"Loaded {len(df)} articles for testing.")

# ------------------ LOAD BART MODEL ------------------
print("Loading BART model for English summarisation...")
bart_model_name = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name).to(DEVICE)

# ------------------ LOAD INDICTRANS2 MODEL ------------------
print("Loading IndicTrans2 model for English->Hindi translation...")
indic_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicTrans2-en-hi")
indic_model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicTrans2-en-hi").to(DEVICE)

# ------------------ FUNCTIONS ------------------
def bart_summarize(text):
    paragraphs = nltk.sent_tokenize(text)
    summaries = []
    for para in paragraphs:
        inputs = bart_tokenizer.encode(para, return_tensors="pt", truncation=True, max_length=BART_TOKEN_LIMIT).to(DEVICE)
        summary_ids = bart_model.generate(
            inputs,
            max_length=BART_MAX_SUMMARY_LEN,
            min_length=BART_MIN_SUMMARY_LEN,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return " ".join(summaries)

def translate_to_hindi(text):
    # Split text if longer than max length
    inputs = indic_tokenizer(text, return_tensors="pt", truncation=True, max_length=INDICTRANS_MAX_LEN).to(DEVICE)
    outputs = indic_model.generate(**inputs, max_length=INDICTRANS_MAX_LEN)
    hi_text = indic_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return hi_text

# ------------------ PROCESS DATA ------------------
results = []
print("Processing test articles: summarising and translating...")

for idx, row in tqdm(df.iterrows(), total=len(df)):
    article_text = row['article']
    
    # Step 1: English abstractive summary
    eng_summary = bart_summarize(article_text)
    
    # Step 2: Hindi translation using IndicTrans2
    hi_summary = translate_to_hindi(eng_summary)
    
    # Append results
    results.append({
        "article": article_text,
        "summary_en": eng_summary,
        "summary_hi": hi_summary
    })

# ------------------ SAVE TO CSV ------------------
output_csv = "cnn_dailymail_bart_hindi_indictrans2_test.csv"
pd.DataFrame(results).to_csv(output_csv, index=False)
print(f"Saved {len(results)} rows to {output_csv}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading CNN/DailyMail dataset...
Loaded 10 articles for testing.
Loading BART model for English summarisation...
Loading IndicTrans2 model for English->Hindi translation...


OSError: ai4bharat/IndicTrans2-en-hi is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [2]:
# pipeline_cnn_hindi_test_long_fixed.py
import torch
import pandas as pd
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# ------------------ CONFIG ------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
NUM_ROWS = 10         # Test with 10 articles
TOKEN_LIMIT = 1024    # BART's maximum input token length is 1024
MAX_SUMMARY_LEN = 250 # Max length for the generated summary
MIN_SUMMARY_LEN = 50  # Min length for the generated summary

# ------------------ LOAD DATASET ------------------
print("Loading CNN/DailyMail dataset...")
dataset = load_dataset("cnn_dailymail", "3.0.0", trust_remote_code=True)
df = pd.DataFrame(dataset['train'][:NUM_ROWS])
print(f"Loaded {len(df)} articles for testing.")

# ------------------ LOAD MODELS ------------------
# 1. Summarization Model (BART)
print("Loading BART summarization model...")
bart_model_name = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name).to(DEVICE)

# 2. Translation Model (English to Hindi)
print("Loading English-to-Hindi translation model...")
translator_model_name = "Helsinki-NLP/opus-mt-en-hi"
translator_tokenizer = AutoTokenizer.from_pretrained(translator_model_name)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(translator_model_name).to(DEVICE)

# ------------------ SUMMARIZER FUNCTION ------------------
def bart_summarize(text):
    """Summarizes the entire article text at once."""
    inputs = bart_tokenizer.encode(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=TOKEN_LIMIT
    ).to(DEVICE)
    
    summary_ids = bart_model.generate(
        inputs,
        max_length=MAX_SUMMARY_LEN,
        min_length=MIN_SUMMARY_LEN,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# ------------------ TRANSLATION FUNCTION ------------------
def translate_to_hindi(text):
    """Translates English text to Hindi using a dedicated model."""
    try:
        inputs = translator_tokenizer.encode(text, return_tensors="pt", truncation=True).to(DEVICE)
        generated_tokens = translator_model.generate(inputs, num_beams=5, max_length=512)
        return translator_tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Translation failed: {e}")
        return ""

# ------------------ PROCESS DATA ------------------
results = []

print("Processing articles: summarising and translating...")
for idx, row in tqdm(df.iterrows(), total=len(df)):
    article_text = row['article']
    
    # Step 1: Generate English abstractive summary from the whole article
    eng_summary = bart_summarize(article_text)
    
    # Step 2: Translate the English summary to Hindi
    hi_summary = translate_to_hindi(eng_summary)
    
    results.append({
        "article": article_text,
        "summary_en": eng_summary,
        "summary_hi": hi_summary
    })

# ------------------ SAVE TO CSV ------------------
output_csv = "cnn_dailymail_bart_hindi_test_long_fixed.csv"
pd.DataFrame(results).to_csv(output_csv, index=False, encoding='utf-8-sig')
print(f"Saved {len(results)} rows to {output_csv}")

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'cnn_dailymail' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


Using device: cuda
Loading CNN/DailyMail dataset...
Loaded 10 articles for testing.
Loading BART summarization model...
Loading English-to-Hindi translation model...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Processing articles: summarising and translating...


100%|██████████| 10/10 [00:26<00:00,  2.69s/it]


Saved 10 rows to cnn_dailymail_bart_hindi_test_long_fixed.csv
