In [10]:
!pip install -q transformers accelerate sentencepiece tqdm evaluate sacrebleu


import torch, os

# Check for CUDA once and pick the device used by the rest of the notebook.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
DEVICE = "cuda" if cuda_available else "cpu"

if DEVICE == "cpu":
    # Use about half of the logical cores for CPU inference.
    # os.cpu_count() can return None when the count is undetermined; fall back
    # to 1 so that `None // 2` never raises TypeError.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
print("Running on:", DEVICE)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCUDA available: False
Running on: cpu


# FLAN-T5 Small

In [11]:
# Smoke-test google/flan-t5-small on a single dummy headline.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_NAME = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Move the model to DEVICE so it lives on the accelerator when one is
# available; inputs placed on DEVICE would otherwise hit a device mismatch
# against a CPU-resident model.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()  # inference only

text = "India's data centre boom confronts a looming water challenge"

# Prepare input on the same device as the model.
inputs = tokenizer("summarize: " + text, return_tensors="pt", truncation=True).to(model.device)

# Generate summary (num_beams=1; gradients are not needed for inference).
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=80, num_beams=1)

# Decode output
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(summary)


India's data centre boom faces a looming water challenge


# Summarize Function

In [12]:
#helper functions for summary

#function to create chunks of long articles
def chunk_text(text: str, chunk_tokens=380, overlap_tokens=20):
    """Split ``text`` into overlapping pieces measured in tokenizer tokens.

    Args:
        text: The document to split.
        chunk_tokens: Maximum number of tokens per chunk (must be > 0).
        overlap_tokens: Number of tokens shared between consecutive chunks
            (must satisfy ``0 <= overlap_tokens < chunk_tokens``).

    Returns:
        A list of decoded text chunks in document order; empty when ``text``
        tokenizes to nothing.

    Raises:
        ValueError: If the sizes would stop the window from advancing
            (previously an infinite loop) or would skip tokens.
    """
    if chunk_tokens <= 0:
        raise ValueError("chunk_tokens must be a positive integer")
    if not 0 <= overlap_tokens < chunk_tokens:
        raise ValueError(
            "overlap_tokens must satisfy 0 <= overlap_tokens < chunk_tokens"
        )

    # Encode without special tokens: a trailing EOS id would count towards the
    # length and could produce a final chunk holding only overlap plus EOS.
    ids = tokenizer.encode(text, truncation=False, add_special_tokens=False)

    # Each window starts `stride` tokens after the previous one.
    stride = chunk_tokens - overlap_tokens
    windows = []
    for start in range(0, len(ids), stride):
        end = start + chunk_tokens
        windows.append(ids[start:end])
        if end >= len(ids):
            # This window already reaches the end of the document.
            break
    return [tokenizer.decode(w, skip_special_tokens=True) for w in windows]

#function to summarize chunks of texts
def summarize_text(text, max_new_tokens=110):
    """Summarize a single piece of text with the loaded seq2seq model.

    The text is prefixed with ``"summarize: "`` and tokenized with
    ``truncation=True``, so input that is too long for the tokenizer is cut
    rather than rejected.

    Args:
        text: The text to summarize.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    prompt = "summarize: " + text
    # Send the inputs to wherever the model actually is, instead of relying on
    # a separate device constant that may disagree with the model's placement
    # (which raises a device-mismatch error inside generate()).
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():  # inference only, no gradients needed
        generated = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=1,
        )
    return tokenizer.decode(generated[0], skip_special_tokens=True)



# main summary function for long texts

def summarize(text, chunk_tokens=380):
    """Summarize text of any length, chunking it when it is too long.

    Text of at most ``chunk_tokens`` tokens is summarized in one pass. Longer
    text is split with ``chunk_text``, each chunk is summarized on its own
    (up to 80 new tokens), and the joined partial summaries are summarized
    once more (up to 110 new tokens).

    Args:
        text: The document to summarize.
        chunk_tokens: Token budget used both as the single-pass threshold and
            as the chunk size handed to ``chunk_text``.

    Returns:
        The summary string, or ``""`` when ``text`` is None, empty or
        whitespace-only (the model is not called in that case).
    """
    # Nothing to summarize: avoid prompting the model with a bare prefix.
    if text is None or not str(text).strip():
        return ""

    n_tokens = len(tokenizer.encode(text, truncation=False))
    if n_tokens <= chunk_tokens:
        return summarize_text(text)

    parts = chunk_text(text, chunk_tokens=chunk_tokens)
    partial_summaries = [summarize_text(p, max_new_tokens=80) for p in parts]
    stitched = " ".join(partial_summaries)
    # NOTE(review): for very long documents the stitched text may itself be
    # longer than the model input; confirm how summarize_text handles that.
    return summarize_text(stitched, max_new_tokens=110)

In [13]:
# Sanity check: run the full summarize() entry point on one short headline.
sample = "India's data centre boom confronts a looming water challenge"
sample_summary = summarize(sample)
print("SUMMARY:", sample_summary)


SUMMARY: India's data centre boom faces a looming water challenge


In [14]:
import pandas as pd
from google.colab import drive

# Where the dataset CSV lives on the mounted Google Drive.
DATA_FOLDER = "/content/drive/MyDrive/BTTAI - News AI Agent/Data"
CSV_PATH = f"{DATA_FOLDER}/CLEANED_BBC_News_Train.csv"

# Mount Drive, then read the CSV from it into `df`.
drive.mount('/content/drive')
df = pd.read_csv(CSV_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Pull the article text out as plain Python strings; missing values become "".
# NOTE(review): `cleaned_txt` looks lemmatised compared with `Text` in the
# preview below - confirm that summarizing it rather than `Text` is intended.
cleaned_column = df["cleaned_txt"].fillna("")
texts = cleaned_column.astype(str).tolist()
print("Rows:", df.shape[0])
df.head(2)

Rows: 1460


Unnamed: 0,ArticleId,Text,Category,cleaned_txt,word_count
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex-boss launch defence lawyer defend ...,191
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,203


Demo on a small slice first

In [16]:
# Size of the demo slice. A single constant keeps the list of texts and the
# DataFrame slice the same length; otherwise assigning the summaries column
# would fail on a length mismatch.
N_DEMO = 5

subset = texts[:N_DEMO]
summaries = [summarize(t) for t in subset]

# assign() returns a new frame, so `df` itself is left untouched.
df_out = df.head(N_DEMO).assign(t5_model=MODEL_NAME, t5_summary=summaries)
df_out.head(3)


Unnamed: 0,ArticleId,Text,Category,cleaned_txt,word_count,t5_model,t5_summary
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex-boss launch defence lawyer defend ...,191,google/flan-t5-small,mci agree buyout verizon communication deal va...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,203,google/flan-t5-small,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicate economic gloom citizen major...,290,google/flan-t5-small,october 2005 across 22 country face-to-face te...


# Test and Evaluate

In [17]:
import re, evaluate

# Evaluation inputs: the size of the demo slice and the summaries produced for it.
N = len(subset)
preds = summaries

def lead3(text: str, lim=600, n_sentences=3):
    """Return the leading sentences of ``text`` as a baseline reference.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
    Text without such punctuation counts as a single sentence.

    Args:
        text: Source document. Non-string values are converted with ``str``.
        lim: Maximum number of characters in the result.
        n_sentences: How many leading sentences to keep (default 3).

    Returns:
        The first ``n_sentences`` sentences joined by single spaces and cut to
        ``lim`` characters; ``""`` for ``None`` or a float NaN, so that missing
        values never become the literal strings "None" or "nan".
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    sentences = re.split(r'(?<=[.!?])\s+', str(text).strip())
    return " ".join(sentences[:n_sentences])[:lim]

# One single-element reference list per document, built from its leading sentences.
# NOTE(review): these references are derived from the model's own input text,
# not from human-written summaries - confirm this is the intended baseline.
refs = [[lead3(doc)] for doc in subset]

bleu = evaluate.load("sacrebleu")
bleu_result = bleu.compute(predictions=preds, references=refs)
bleu_score = bleu_result["score"]
print(f"BLEU on {N} docs:", round(bleu_score, 2))

BLEU on 5 docs: 26.73
