## Model Load

In [None]:
# model_path = "/content/drive/MyDrive/T5_e10a5"

In [None]:
# Checkpoint directory handed to `from_pretrained` for both the tokenizer and the model.
model_path = "/content/drive/MyDrive/T5_Headline_Model"

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [None]:
# Load the tokenizer and the seq2seq model from the same checkpoint directory
# so their vocabularies match.
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

## Load Data

In [None]:
import pandas as pd

In [None]:
# News CSV shards 2 through 7, all stored in one Google Drive folder.
DATA_DIR = '/content/drive/MyDrive/Inshort-News-DataSet'
SHUFFLE_SEED = 42  # fixed so the shuffled row order is the same on every run

paths = [f'{DATA_DIR}/inshort_news_data-{shard}.csv' for shard in range(2, 8)]
dataframes = [pd.read_csv(path) for path in paths]  # one DataFrame per shard
data = pd.concat(dataframes)
# frac=1 keeps every row and only shuffles their order; the seed makes the
# shuffle (and therefore "row 0" in the cells below) reproducible.
df = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# Article text of the first row after shuffling
df['news_article'].iloc[0]

'Tesla was ordered by a Chinese court to pay over ₹1 crore to the buyer of a used Model S car after concluding it concealed structural damage on the vehicle it sold on its official website. It was reportedly discovered part of the vehicle had been cut and welded back together. Tesla will appeal the ruling to a higher court.'

In [None]:
# Human-written headline of the first row after shuffling
df['news_headline'].iloc[0]

'Tesla asked to give ₹1 cr to used car buyer for hiding damage in China'

In [None]:
# Reference (human-written) headline for the demo article, hard-coded as a string literal.
actual_headline = "Tesla asked to give ₹1 cr to used car buyer for hiding damage in China"

# Headline Generators

In [None]:
import torch
from transformers import LogitsProcessor
from typing import Dict

In [None]:
# 1. Keyword Extraction (manual/simple for demo)
# The article text is hard-coded here; the commented-out line below would read
# it from the first row of `df` instead.
article = """Tesla was ordered by a Chinese court to pay over ₹1 crore to the buyer of a used Model S car after concluding it concealed structural damage on the vehicle it sold on its official website. It was reportedly discovered part of the vehicle had been cut and welded back together. Tesla will appeal the ruling to a higher court."""
# article = df.iloc[0]['news_article']
keyword = "Chinese"  # manually extracted for this demo

In [None]:
# 2. Prepare input: prepend the task prefix and tokenize, truncating to at most 512 tokens
input_text = f"summarize: {article}"
input_ids = tokenizer.encode(
    input_text,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

## Updated LogitsProcessor with Sampling instead of Beam Search  
Status: **Functional**  
Conclusion: Works for a single-token keyword.  

In [None]:
# 3. Map every token ID of the keyword to the same additive bias ===
keyword_ids = tokenizer.encode(keyword, add_special_tokens=False)
scores_map = dict.fromkeys(keyword_ids, 8.0)  # strong additive bias per token

In [None]:
# 4. Define Additive SEOLogitsProcessor ===
class SEOLogitsProcessor(LogitsProcessor):
    """Logits processor that adds a fixed bonus to selected token IDs.

    Args:
        scores_map: maps a token ID to the value added to that token's score
            at every generation step.
    """

    def __init__(self, scores_map: Dict[int, float]):
        self.scores_map = scores_map

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Add each configured bonus to its token's column, in place, for every row of `scores`."""
        boosted_ids = list(self.scores_map)
        if boosted_ids:
            bonuses = torch.tensor(
                [self.scores_map[token_id] for token_id in boosted_ids],
                dtype=scores.dtype,
                device=scores.device,
            )
            # Dict keys are unique, so one indexed in-place add is equivalent
            # to adding the bonuses one column at a time.
            scores[:, boosted_ids] += bonuses
        return scores

In [None]:
# 5. Generate WITHOUT SEO biasing (baseline) ===
# The processor is built here but is not passed to this baseline call.
seo_processor = SEOLogitsProcessor(scores_map)

# Sampling is stochastic; fixing the seed makes the baseline title reproducible.
torch.manual_seed(42)
output_ids_plain = model.generate(
    input_ids,
    max_length=20,
    do_sample=True,
    top_k=50,
    temperature=0.9
)
title_plain = tokenizer.decode(output_ids_plain[0], skip_special_tokens=True)

In [None]:
# 6. Generate WITH SEO biasing ===
# Sampling is stochastic; fixing the seed makes the biased title reproducible.
# NOTE(review): the baseline run samples at temperature=0.9 while this run uses
# 0.6, so differences between the two titles are not attributable to the
# logits processor alone.
torch.manual_seed(42)
output_ids_seo = model.generate(
    input_ids,
    max_length=20,
    do_sample=True,
    top_k=50,
    temperature=0.6,
    logits_processor=[seo_processor]
)
title_seo = tokenizer.decode(output_ids_seo[0], skip_special_tokens=True)

In [None]:
# === 7. Show Results ===
# Print the baseline title, the biased title, and the keyword, one per line.
for label, value in (
    ("🔹 Without SEO Biasing:", title_plain),
    ("🔹 With SEO Biasing   :", title_seo),
    ("🔹 Keyword Biased Toward:", keyword),
):
    print(label, value)

🔹 Without SEO Biasing: Tesla ordered to pay over 1 cr to used car buyer for hiding damage in China
🔹 With SEO Biasing   : Tesla ordered to pay 1 cr to Chinese buyer for concealing damage to Model S
🔹 Keyword Biased Toward: Chinese


## Logits Processor  

Conclusion: With beam search, the bias does not change the result; the biased and unbiased titles are identical.

In [None]:
# 3. Tokenize the keyword and give each of its token IDs a score of 1.0
keyword_ids = tokenizer.encode(keyword, add_special_tokens=False)
scores_map = dict.fromkeys(keyword_ids, 1.0)  # 1.0 is arbitrary; can experiment

In [None]:
# 4. Define SEOLogitsProcessor
class SEOLogitsProcessor(LogitsProcessor):
    """Rescale the scores of selected token IDs by a per-token factor.

    Each token ID ``k`` in ``scores_map`` gets a base factor of
    ``(10 / max(v, 0.0001)) * temperature``; every other token has factor 1.
    At the i-th decoding step of a generation (i = 1 for the first step) the
    factors of the listed tokens are additionally multiplied by ``1.1 ** i``.

    The step factor is derived from the current sequence length instead of
    being accumulated into ``self.mask``, so the processor gives the same
    result when the same instance is reused for several ``generate()`` calls.

    NOTE(review): the scaling is multiplicative, so a factor above 1 raises a
    positive score but lowers a negative one. Confirm the sign of the scores
    the chosen decoding strategy passes in before relying on this as a boost.

    Args:
        scores_map: token ID -> score; larger scores give smaller factors.
        temperature: multiplier applied to every base factor; a value of
            exactly 1 disables the processor.
        vocab_size: length of the factor vector (must match the last
            dimension of the scores passed to ``__call__``).
    """

    # Per-step growth applied to the factors of the listed tokens.
    _STEP_GROWTH = 1.1

    def __init__(self, scores_map: Dict[int, float], temperature: float, vocab_size: int):
        self.temperature = temperature
        self.mask = torch.ones(vocab_size)  # base factors; never mutated after __init__
        self.seo_words_ids = list(scores_map.keys())
        for k, v in scores_map.items():
            v = max(v, 0.0001)  # guard against division by zero / non-positive scores
            self.mask[k] = (10 / v) * temperature
        # Sequence-length bookkeeping used to detect the start of a new generation.
        self._start_len = None
        self._prev_len = None

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores`` multiplied element-wise by the step-adjusted factors."""
        if self.temperature == 1:
            return scores

        # A sequence that did not grow since the previous call means a new
        # generation has started, so the step counter restarts.
        # NOTE(review): a new generation whose first sequence is longer than
        # the previous generation's last one is not detected by this check.
        cur_len = input_ids.shape[-1]
        if self._prev_len is None or cur_len <= self._prev_len:
            self._start_len = cur_len
        self._prev_len = cur_len
        step = cur_len - self._start_len + 1

        # Clone so the stored base factors are left untouched.
        mask = self.mask.to(scores.device).clone()
        if self.seo_words_ids:
            mask[self.seo_words_ids] *= self._STEP_GROWTH ** step
        return scores * mask

In [None]:
# 6. Baseline: beam-search decoding with no logits processor attached
output_ids_plain = model.generate(
    input_ids,
    num_beams=4,
    max_length=20,
)
title_plain = tokenizer.decode(output_ids_plain[0], skip_special_tokens=True)

In [None]:
# 7. Same beam-search settings, this time routed through the SEO processor
seo_processor = SEOLogitsProcessor(
    scores_map,
    temperature=0.9,
    vocab_size=model.config.vocab_size,
)
output_ids_seo = model.generate(
    input_ids,
    logits_processor=[seo_processor],
    num_beams=4,
    max_length=20,
)
title_seo = tokenizer.decode(output_ids_seo[0], skip_special_tokens=True)

In [None]:
# 8. Print results: baseline title, biased title, and the keyword, one per line
for label, value in (
    ("🔹Original Title: ", title_plain),
    ("🔹SEO-Biased Title: ", title_seo),
    ("🔹Keyword Biased Toward: ", keyword),
):
    print(label, value)

🔹Original Title:  Tesla asked to give 1 cr to used car buyer for hiding damage in China
🔹SEO-Biased Title:  Tesla asked to give 1 cr to used car buyer for hiding damage in China
🔹Keyword Biased Toward:  Chinese


## Multi-token Logits Processor  
Conclusion: Not working.

In [None]:
class MultiKeywordBiasLogitsProcessor(LogitsProcessor):
    """Add a per-keyword bonus to the scores of several single-token keywords.

    Args:
        tokenizer: object whose ``encode(word, add_special_tokens=False)``
            returns the token IDs of ``word``.
        keyword_weights: keyword -> additive bonus. Keywords that do not
            encode to exactly one token are reported and ignored.
    """

    def __init__(self, tokenizer, keyword_weights: Dict[str, float]):
        self.tokenizer = tokenizer
        self.token_bias_map = {}

        for word in keyword_weights:
            pieces = tokenizer.encode(word, add_special_tokens=False)
            if len(pieces) != 1:
                # Only keywords that map to a single token can be biased here.
                print(f"Skipping multi-token phrase '{word}'. Use phrase biasing instead.")
                continue
            self.token_bias_map[pieces[0]] = keyword_weights[word]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Add each keyword's bonus to its token column, in place, for every row."""
        for token_id in self.token_bias_map:
            scores[:, token_id] += self.token_bias_map[token_id]
        return scores

In [None]:
# Additive bonus per keyword; a larger value adds more to that keyword's score.
keywords = {
    "Chinese": 5.0,
    "damage": 4.0,
    "court": 3.0
}

logits_processor = MultiKeywordBiasLogitsProcessor(tokenizer, keywords)

# Sampling is stochastic; fixing the seed makes this run reproducible.
torch.manual_seed(42)
generated = model.generate(
    input_ids,
    max_length=20,
    do_sample=True,
    temperature=0.9,
    top_k=50,
    logits_processor=[logits_processor]
)

print(tokenizer.decode(generated[0], skip_special_tokens=True))

Tesla ordered to pay 1 cr to used car buyer for concealing damage to car


## Phrase Biasing Logits Processor  
Conclusion: Not working

In [None]:
class PhraseBiasingLogitsProcessor(LogitsProcessor):
    """Boost the last token of a phrase once the tokens before it were generated.

    Every multi-token phrase is split into a prefix (all tokens but the last)
    and a continuation (the last token). At each step, every row of
    ``input_ids`` whose tail equals a prefix gets the phrase's score added to
    the continuation token in that same row of ``scores``.

    Args:
        tokenizer: object whose ``encode(phrase, add_special_tokens=False)``
            returns the token IDs of ``phrase``.
        phrase_bias_map: phrase -> additive score for its last token. Phrases
            that encode to fewer than two tokens are reported and ignored.
        boost: stored on the instance as ``self.boost``; it is not used when
            scoring, where the per-phrase scores apply.
    """

    def __init__(self, tokenizer, phrase_bias_map: dict, boost=5.0):
        self.tokenizer = tokenizer
        # prefix token tuple -> list of (continuation token, score). A list is
        # kept so phrases sharing a prefix (e.g. "used car" / "used truck")
        # do not overwrite one another.
        self.phrase_bias_map = {}
        self.boost = boost

        # Convert each phrase to its prefix / continuation token IDs
        for phrase, score in phrase_bias_map.items():
            token_ids = tokenizer.encode(phrase, add_special_tokens=False)
            if len(token_ids) > 1:
                prefix = tuple(token_ids[:-1])
                self.phrase_bias_map.setdefault(prefix, []).append((token_ids[-1], score))
            else:
                print(f"Skipping single-token phrase: '{phrase}'")

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Add phrase scores, in place, to the rows whose tail matches a prefix."""
        cur_len = input_ids.shape[-1]

        for prefix, continuations in self.phrase_bias_map.items():
            prefix_len = len(prefix)
            if cur_len < prefix_len:
                continue  # sequence is still shorter than the prefix

            # Match per row so that, with several beams or batch items, only
            # the hypotheses that actually end with the prefix are boosted.
            expected = torch.tensor(prefix, dtype=input_ids.dtype, device=input_ids.device)
            matches = (input_ids[:, -prefix_len:] == expected).all(dim=-1)
            if not matches.any():
                continue

            for next_token, bias_score in continuations:
                scores[matches, next_token] += bias_score
        return scores

In [None]:
# Phrases to encourage, each mapped to the score used for its final token.
phrase_bias_map = {
    "Chinese court": 6.0,
    "used car": 5.0,
    "vehicle damage": 4.0
}

logits_processor = PhraseBiasingLogitsProcessor(tokenizer, phrase_bias_map)

# Sampling is stochastic; fixing the seed makes this run reproducible.
torch.manual_seed(42)
generated = model.generate(
    input_ids,
    max_length=20,
    do_sample=True,
    temperature=0.9,
    top_k=50,
    logits_processor=[logits_processor]
)

print(tokenizer.decode(generated[0], skip_special_tokens=True))

Tesla ordered to pay 1 cr to used car buyer for hiding damage in China


## N-gram Phrase Biasing with Custom LogitsProcessor  
Conclusion: not working

In [None]:
class PhraseBiasingLogitsProcessor(LogitsProcessor):
    """Boost the token that completes a phrase once its earlier tokens appear.

    Each phrase is tokenized once. At every step, each row of ``input_ids``
    whose last ``len(phrase) - 1`` tokens equal the phrase's leading tokens
    gets ``boost`` added to the phrase's final token in that row of
    ``scores``. A single-token phrase has no leading tokens, so its token is
    boosted in every row at every step.

    Args:
        tokenizer: object whose ``encode(phrase, add_special_tokens=False)``
            returns the token IDs of ``phrase``.
        phrase_bias_map: phrase -> additive boost for its final token.
    """

    def __init__(self, tokenizer, phrase_bias_map: Dict[str, float]):
        self.tokenizer = tokenizer
        # token-ID tuple of the whole phrase -> boost
        self.phrase_token_ids = {
            tuple(tokenizer.encode(phrase, add_special_tokens=False)): boost
            for phrase, boost in phrase_bias_map.items()
        }

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Apply the phrase boosts, in place, to the matching rows of ``scores``."""
        cur_len = input_ids.size(1)

        for phrase_ids, boost in self.phrase_token_ids.items():
            if not phrase_ids:
                continue  # phrase produced no tokens

            prefix, next_token_id = phrase_ids[:-1], phrase_ids[-1]

            if not prefix:
                # Single-token phrase: the empty prefix matches trivially.
                # (Slicing with `-0:` would return the whole sequence, so this
                # case must be handled explicitly.)
                scores[:, next_token_id] += boost
                continue

            if cur_len < len(prefix):
                continue  # not enough generated tokens to contain the prefix

            # Compare the tail of every row, not only row 0, so that each beam
            # or batch item is boosted based on its own history.
            expected = torch.tensor(prefix, dtype=input_ids.dtype, device=input_ids.device)
            matches = (input_ids[:, -len(prefix):] == expected).all(dim=-1)
            if matches.any():
                scores[matches, next_token_id] += boost
        return scores


In [None]:
# Phrases to encourage, each mapped to the boost passed to the processor.
phrase_bias_map = {
    "Chinese court": 8.0,
    "structural damage": 6.0
}

# Build the processor from the phrase map; it is handed to `generate` below.
seo_processor = PhraseBiasingLogitsProcessor(tokenizer, phrase_bias_map)

In [None]:
# 6. Generate WITH SEO biasing ===
# Sampling is stochastic (even at temperature=0.1); fixing the seed makes the
# biased title reproducible.
torch.manual_seed(42)
output_ids_seo = model.generate(
    input_ids,
    max_length=20,
    do_sample=True,
    top_k=50,
    temperature=0.1,
    logits_processor=[seo_processor]
)
title_seo = tokenizer.decode(output_ids_seo[0], skip_special_tokens=True)

In [None]:
# === 7. Show Results ===
# NOTE(review): `title_plain` is not recomputed in this section; it holds
# whatever baseline title the most recently run earlier cell produced.
print("🔹 Without SEO Biasing:", title_plain)
print("🔹 With SEO Biasing   :", title_seo)
# Report the phrases that were actually biased in this section rather than the
# unrelated single `keyword` variable set at the top of the notebook.
print("🔹 Keyword Biased Toward:", ", ".join(phrase_bias_map))

🔹 Without SEO Biasing: Tesla to pay 1 cr to Used Car Buyer by Chinese court over damage in S
🔹 With SEO Biasing   : Tesla ordered to pay 1 cr to used car buyer for concealing damage in China
🔹 Keyword Biased Toward: structural


## Hard Constrained Decoding  

Conclusion: Results are suboptimal.

In [None]:
# Words that must appear in the generated output
keywords = ["Chinese"]

# Union of the token IDs of every keyword (a keyword may split into several subword tokens)
required_token_ids = {
    token_id
    for word in keywords
    for token_id in tokenizer(word, add_special_tokens=False).input_ids
}

# Shared record of the tokens observed so far during constrained decoding
seen_token_ids = set()

# Define the constraint function
def prefix_allowed_tokens_fn(batch_id, input_ids):
    """Return the token IDs allowed at the next step for one hypothesis.

    Until every ID in ``required_token_ids`` occurs in this hypothesis's own
    ``input_ids``, the end-of-sequence token is withheld, so the hypothesis
    cannot finish before the required tokens have been produced. Afterwards
    the whole tokenizer vocabulary is allowed.

    The decision uses only ``input_ids`` (no shared global state), so one
    beam's tokens cannot lift the constraint for another beam, and repeated
    ``generate()`` calls start from a clean slate.

    Args:
        batch_id: index of the batch item (unused).
        input_ids: the tokens generated so far for this hypothesis
            (indexed as a 1-D tensor).

    Returns:
        List of allowed token IDs.
    """
    # Build the two candidate lists once per function definition; rebuilding
    # the full vocabulary list on every call is needlessly expensive.
    cache = prefix_allowed_tokens_fn.__dict__
    if "_all_ids" not in cache:
        all_ids = sorted(set(tokenizer.get_vocab().values()))
        eos_id = tokenizer.eos_token_id
        cache["_all_ids"] = all_ids
        cache["_non_eos_ids"] = [token_id for token_id in all_ids if token_id != eos_id]

    # All required tokens present in this hypothesis: no restriction.
    if required_token_ids.issubset(input_ids.tolist()):
        return cache["_all_ids"]

    # Otherwise allow everything except finishing the sequence. (Ranking
    # `model.lm_head.weight[token]` would rank hidden dimensions, not
    # vocabulary entries, so it cannot supply "likely next tokens".)
    return cache["_non_eos_ids"]

# Beam-search generation with the per-step token constraint applied
output_ids = model.generate(
    input_ids=input_ids,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
    num_beams=5,
    max_length=30,
    early_stopping=True,
)

# Turn the first returned sequence back into text and show it
output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated:", output)


Generated: 1 cr ordered to pay Tesla to used car buyer for hiding damage in China
