In [1]:
from transformers import T5Tokenizer, T5Model

# Load the "t5-small" checkpoint: tokenizer plus a T5Model instance.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5Model.from_pretrained("t5-small")

# Encoder-side input: one sentence, returned as PyTorch tensors.
input_ids = tokenizer(
    "Studies have been shown that owning a dog is good for you", return_tensors="pt"
).input_ids  # Batch size 1
# Decoder-side input: a second, shorter sentence tokenized the same way.
decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

# forward pass (both encoder and decoder ids are supplied explicitly)
outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
# Kept as a global because a later cell displays `last_hidden_states`.
last_hidden_states = outputs.last_hidden_state


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [60]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Rebinds the notebook-level `tokenizer` / `model` names to the "t5-base" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# The prompt contains two sentinel markers: <extra_id_0> inside the sentence and <extra_id_1> at its end.
input_ids = tokenizer("Studies have been shown that <extra_id_0> is a great way to stay healthy.<extra_id_1>", return_tensors="pt").input_ids
# Beam search with 5 beams, returning all 5 hypotheses, each capped at 30 new tokens.
outputs = model.generate(input_ids=input_ids, max_new_tokens=30, num_return_sequences=5, num_beams=5)
# skip_special_tokens=True is requested, so the decoded strings carry no special-token markers.
# NOTE(review): presumably this also removes the sentinel markers, which would make the text
# generated for each sentinel run together in one string — confirm against the printed output.
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(decoded)


['exercise Click here to learn more.', 'exercise Click here for more information.', 'exercise Learn more here.', 'exercise Learn more.', 'exercise Read more about it here.']


In [61]:
def t5_model_generation(
    model,
    tokenizer,
    text,
    max_new_tokens=30,
    num_return_sequences=5,
    num_beams=5,
):
    """
    Generate beam-search continuations for `text` and return them decoded.

    Arguments:
      - model: object exposing `generate(...)` (e.g. T5ForConditionalGeneration)
      - tokenizer: the matching tokenizer; called on `text` and used for `batch_decode`
      - text: input string (may contain T5 sentinel markers such as <extra_id_0>)
      - max_new_tokens: cap on generated tokens per hypothesis (default 30)
      - num_return_sequences: number of hypotheses to return (default 5)
      - num_beams: beam width passed to `generate` (default 5)

    Returns:
      The list produced by `tokenizer.batch_decode(..., skip_special_tokens=True)`,
      one entry per returned hypothesis.

    The three decoding settings used to be hard-coded; the defaults reproduce
    the previous behaviour exactly, so existing three-argument calls are unchanged.
    """
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        num_return_sequences=num_return_sequences,
        num_beams=num_beams,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
raw_text = """if they actually censor anything is another question.
unlike others, medvedev is an internationally recognized historian.
he tells that these people are governmental bureaucrats although some of them have degrees.
main point this is a can anyone clarify, please, where the 21 million number for three countries in version 3 comes from?
i hope you do not suggest to replace three large sections about kirov assassination by his single paragraph?
of course, if there is any sourced information in his text that currently missing, it might be 'added' to current version.
perhaps this article should be merged, but this must be properly done.
yes, that was one of the reasons why she left big sport so early.
of course he did not write about his abuse in reports to kgb superiors.
i like book by radzinsky, but this source provides much more details with a lot of references radzinsky works on a bigger 3-volume biography of stalin right now.
so, you are very welcome to improve this and 'other' articles on the subject using this book.
just remember to focus on content, not contributors.
unfortunately, this is not what sources tell as was already discussed above.
higher estimates include not only mass murders or executions but also avoidable lives lost due to famine and disease due to confiscation or destruction of property, in addition to deaths in forced labor camps or during forced relocation.
the number also does not include civilians executed during russian civil war, etc.
unfortunately i do not have time right now, but i will check some sources that describe repression of children in the soviet union to make changes in relevant articles accordingly - as time allows.
just wanted to tell you about this case.
the grandson of stalin went to court against the radio because the remark was a slander.
the radio's lawyer and editor in chef didn't manage to prove in court that children were executed in ussr, simply because there was no proof.
i hope you reconsider your position in the light of this info."""

In [None]:

masked_text = """if they actually censor anything is another question.
unlike others, medvedev is an internationally recognized historian.
he tells that these people are governmental bureaucrats although some of them have degrees.
main point this is a can anyone clarify, please, where the 21 million number for three countries in version 3 comes from?
i hope you do not suggest to replace three large sections about kirov assassination by his single paragraph?
of course, if there is any sourced information in his text that currently missing, it might be 'added' to current version.
perhaps this article should be merged, but this must be properly done.
yes, that was one of the reasons why she left big sport so early.
of course he did not write about his abuse in reports to kgb superiors.
i like book by radzinsky, but this source provides much more details with a lot of references radzinsky works on a bigger 3-volume biography of stalin right now.
so, you are very welcome to improve this and 'other' articles on the subject using this book.
just remember to focus on content, not contributors.
unfortunately, <extra_id_0> what sources tell as was already discussed above.
higher estimates include not only mass murders or executions but also avoidable lives lost due to famine and disease due to confiscation or destruction of property, in addition to deaths in forced labor camps or during forced relocation.
the number also does not include civilians executed during russian civil war, etc.
unfortunately i do not have time right now, but i will check some sources that describe repression of children in the soviet union to make changes in relevant articles accordingly - as time allows.
just wanted to tell you about this case.
the grandson of stalin went to court against the radio because the remark was a slander.
the radio's lawyer and editor in chef didn't manage to prove in court that children were executed in ussr, simply because there was no proof.
i hope you reconsider your position in the light of this info."""

In [72]:
masked_text = """
i like book by radzinsky, but this source provides much more details with a lot of references radzinsky works on a bigger 3-volume biography of stalin right now.
so, you are very welcome to improve this and 'other' articles on the subject using this book.
just remember to focus on content, not contributors.
unfortunately, <extra_id_0> what sources tell as was already discussed above.
higher estimates include not only mass murders or executions but also avoidable lives lost due to famine and disease due to confiscation or destruction of property, in addition to deaths in forced labor camps or during forced relocation.
the number also does not include civilians executed during russian civil war, etc.
unfortunately i do not have time right now, but i will check some sources that describe repression of children in the soviet union to make changes in relevant articles accordingly - as time allows.
just wanted to tell you about this case.<extra_id_1>"""

In [73]:
t5_model_generation(model, tokenizer, masked_text)

['this is only i am very sorry to hear about it.',
 'this is only i am very sorry for this.',
 'this is only i am very sorry about that.',
 'this is only i am very sorry to hear about this.',
 'this is only i am very sorry to hear this.']

In [26]:
outputs

tensor([[    0, 32099,    25,    54,   103,   424, 32098,   533,     5,     1]])

In [5]:
outputs.encoder_last_hidden_state

tensor([[[ 0.2828,  0.0979,  0.1090,  ..., -0.2794,  0.2154,  0.0370],
         [ 0.1593,  0.0956, -0.0922,  ...,  0.1854,  0.0993,  0.1548],
         [ 0.0076,  0.0090, -0.0092,  ...,  0.0050,  0.0019,  0.0076],
         ...,
         [-0.0119, -0.1228, -0.0638,  ..., -0.0602, -0.1791, -0.0499],
         [-0.0342, -0.0511, -0.1488,  ...,  0.3007, -0.2612, -0.1452],
         [ 0.0670,  0.1300, -0.0356,  ..., -0.0100,  0.0690, -0.0531]]],
       grad_fn=<MulBackward0>)

In [None]:
last_hidden_states

tensor([[[ 2.5253e-01,  1.5952e-01, -1.9853e-01,  ...,  1.0275e-01,
          -3.6560e-04, -8.1293e-03],
         [ 1.8470e-01,  1.0938e-01, -1.7418e-01,  ...,  3.2740e-02,
          -5.3408e-04, -5.0218e-02],
         [ 2.8846e-01,  2.3717e-01, -7.3225e-02,  ...,  5.7853e-02,
          -3.4904e-04, -9.3356e-02],
         [ 4.6172e-02,  4.3064e-01, -7.4659e-02,  ...,  5.6104e-02,
          -3.3899e-04, -1.2440e-01]]], grad_fn=<MulBackward0>)

In [20]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, set_seed

# Generation below samples (do_sample=True); seed the RNGs first so that
# re-running this cell reproduces the same candidates.
set_seed(42)

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

text = "The results <extra_id_0> beyond doubt."  # masked text
inputs = tokenizer(text, return_tensors='pt')
# Draw 5 sampled hypotheses (top-k sampling, k=50, temperature 1.0), 10 new tokens each at most.
outputs = model.generate(**inputs, max_new_tokens=10, num_return_sequences=5,
                         do_sample=True, top_k=50, temperature=1.0)
# Special tokens are kept so the sentinel markers remain visible in the decoded strings.
candidates = tokenizer.batch_decode(outputs, skip_special_tokens=False)


In [21]:
candidates

['<pad> <extra_id_0> cannot be challenged <extra_id_1> . ... <extra_id_2> are',
 '<pad> <extra_id_0> were <extra_id_1> are <extra_id_2> are <extra_id_3> are <extra_id_4> are',
 '<pad> <extra_id_0> are <extra_id_1> are, and still are, <extra_id_2>',
 '<pad> <extra_id_0> of all these evaluations are <extra_id_1> .</s>',
 '<pad> <extra_id_0> are <extra_id_1> are definitive and are beyond doubt.']

In [17]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

def paraphrase_span_t5(
    model,
    tokenizer,
    prefix: str,
    suffix: str,
    num_return_sequences: int = 5,
    num_beams: int = None,
    max_new_tokens: int = 30,
    do_sample: bool = True,
    top_k: int = 50,
    top_p: float = 0.95,
    temperature: float = 0.9,
    device: str = None,
):
    """
    Use a T5-like model to fill ONLY the missing span between prefix and suffix.
    Returns a list of candidate spans (not full sentences).

    Arguments:
      - model: a seq2seq model (e.g. T5ForConditionalGeneration) already loaded
      - tokenizer: the corresponding tokenizer
      - prefix: text before the span to paraphrase
      - suffix: text after the span to paraphrase
      - num_return_sequences: how many candidates to request from `generate`
      - num_beams: if specified, enables beam search (sampling is turned off);
        else the sampling parameters below are used
      - max_new_tokens: max length for generated span
      - do_sample, top_k, top_p, temperature: sampling parameters for diversity
        (ignored when num_beams is given)
      - device: optionally the torch device (e.g. 'cuda' or 'cpu'). When given,
        both the model and the inputs are moved there. If None, the inputs are
        moved to `model.device` when the model exposes that attribute.

    Returns:
      List of non-empty strings, one per generated sequence from which a
      complete span could be extracted; may be shorter than
      num_return_sequences.
    """

    # Build the T5-style infilling input: a single sentinel marks the gap that
    # sits BETWEEN prefix and suffix. (Placing <extra_id_0> in front of the
    # prefix would make the text extracted below the fill for a slot before
    # the prefix rather than for the gap.)
    # Note: space / punctuation is sensitive — ensure prefix/suffix punctuation/spacing is correct
    inp = f"{prefix.strip()} <extra_id_0> {suffix.strip()}".strip()

    # Encode
    inputs = tokenizer(inp, return_tensors="pt")
    if device is not None:
        model = model.to(device)
    else:
        # Honour the documented default: follow the model's own device.
        device = getattr(model, "device", None)
    if device is not None:
        inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generation arguments
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "num_return_sequences": num_return_sequences,
    }

    if num_beams is not None:
        gen_kwargs["num_beams"] = num_beams
        # Beam search path: sampling is disabled and sampling knobs are not passed.
        gen_kwargs["do_sample"] = False
    else:
        gen_kwargs["do_sample"] = do_sample
        gen_kwargs["top_k"] = top_k
        gen_kwargs["top_p"] = top_p
        gen_kwargs["temperature"] = temperature

    outputs = model.generate(
        **inputs,
        **gen_kwargs
    )

    # Special tokens are kept so the sentinel markers can be used as delimiters.
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=False)

    eos_token = getattr(tokenizer, "eos_token", None) or "</s>"

    paraphrases = []
    for seq in decoded:
        if "<extra_id_0>" not in seq:
            continue
        tail = seq.split("<extra_id_0>", 1)[1]

        if "<extra_id_1>" in tail:
            # Normal case: the span is closed by the next sentinel.
            span = tail.split("<extra_id_1>", 1)[0]
        elif eos_token in tail:
            # The model finished the sequence without emitting the next
            # sentinel; the span is still complete.
            span = tail.split(eos_token, 1)[0]
        else:
            # Neither terminator present: generation was cut off mid-span
            # (e.g. by max_new_tokens), so the fragment is discarded.
            continue

        span = span.strip()
        if span:
            paraphrases.append(span)

    return paraphrases


In [92]:
def get_changed_span_tokens(
    original_ids: list[int],
    new_ids: list[int],
    span_token_idxs: list[int],
    tokenizer
):
    """
    Report which tokens inside a span were replaced.

    Compares `original_ids` and `new_ids` position by position, looking only
    at the positions listed in `span_token_idxs`, and returns the decoded,
    whitespace-stripped text of every token in `new_ids` that differs from
    the original at that position (in the order the positions are listed).

    Example output:
        ["awesome", "movie"]
    """
    replaced_ids = [
        new_ids[pos]
        for pos in span_token_idxs
        if original_ids[pos] != new_ids[pos]
    ]

    # Each id is decoded on its own so every subword piece yields its own entry.
    pieces = []
    for token_id in replaced_ids:
        piece = tokenizer.decode([token_id], skip_special_tokens=True)
        pieces.append(piece.strip())

    return pieces

In [96]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Rebinds the notebook-level `tokenizer` / `model` names (bound to T5 objects by
# earlier cells) to the "roberta-base" masked-LM checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForMaskedLM.from_pretrained("roberta-base")
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def incremental_mlm_paraphrase(
    sentence: str,
    span_token_idxs: list[int],  # indices of tokens of the target n‑gram (after tokenization)
    top_k: int = 10,
):
    """
    Try to paraphrase a multi‑token span by masking and filling one token at a time.
    Returns a list of candidate paraphrased sentences (full decoded sentences).

    Relies on the notebook-level globals `tokenizer`, `model` and `device`.

    For each index in `span_token_idxs`, every sequence built so far is masked
    at that index, run through the model, and forked into `top_k` new
    sequences (one per top-scoring token at the masked position). The result
    therefore holds top_k ** len(span_token_idxs) sentences, and the model is
    called once per intermediate sequence.

    Note: a later cell of this notebook defines another function with the same
    name and a different signature; once that cell runs it shadows this one.
    """
    # Tokenize once
    encoding = tokenizer(sentence, return_tensors="pt")
    input_ids = encoding.input_ids[0].tolist()

    # Candidates grow combinatorially with the span length, so keep spans short.
    sentences = [input_ids]  # list of token id sequences

    for idx in span_token_idxs:
        new_sentences = []
        for seq in sentences:
            masked = seq.copy()
            masked[idx] = tokenizer.mask_token_id
            input_ids_batch = torch.tensor([masked], device=device)
            with torch.no_grad():
                logits = model(input_ids=input_ids_batch).logits
            # get top_k predictions for masked position
            probs = torch.softmax(logits[0, idx], dim=-1)
            topk_ids = torch.topk(probs, top_k).indices.tolist()
            for tid in topk_ids:
                new_seq = seq.copy()
                new_seq[idx] = tid
                new_sentences.append(new_seq)
        sentences = new_sentences

    # Decode all final candidates. (The per-candidate changed-token list that
    # used to be computed here was never used, so it is no longer built.)
    return [
        tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for seq in sentences
    ]

In [97]:
from typing import List, Tuple, Union
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer

def find_subtoken_spans(
    # Quoted so the annotation is not evaluated when the function is defined.
    tokenizer: "Union[PreTrainedTokenizer, PreTrainedTokenizerFast]",
    text: str,
    target: Union[str, List[str]],
    *,
    normalize_spaces: bool = True
) -> List[Tuple[int,int]]:
    """
    Tokenize `text`, then find all spans of tokens matching `target`.
    Returns list of (start_idx, end_idx) inclusive token spans.
    If target is a string, it will be tokenized by tokenizer first;
    otherwise it is taken as a ready-made sequence of token strings.

    Matching is an exact comparison of token strings. `normalize_spaces` is
    accepted for interface compatibility but currently has no effect, so
    tokenizer-specific prefixes ('▁', 'Ġ', '##') must already agree between
    the text tokens and the target tokens.

    An empty target yields an empty list.
    """
    # 1. Tokenize full text and map ids back to token strings. Going through
    #    convert_ids_to_tokens (rather than encoding.tokens()) only needs the
    #    ids, so it does not depend on the encoding object offering .tokens().
    encoding = tokenizer(text, return_attention_mask=False, return_tensors=None)
    tokens = list(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))

    # 2. Prepare target token list
    if isinstance(target, str):
        target_tokens = list(tokenizer.tokenize(target))
    else:
        # Copy into a list so tuples/other sequences compare equal to list slices.
        target_tokens = list(target)

    m = len(target_tokens)
    if m == 0:
        return []

    # 3. Slide a window of length m over the tokens and keep exact matches.
    return [
        (i, i + m - 1)
        for i in range(len(tokens) - m + 1)
        if tokens[i:i + m] == target_tokens
    ]

In [145]:
def find_phrase_token_spans(text: str, phrase_pretty: str, tokenizer):
    """
    Find all token spans for phrase_pretty inside text using tokenizer-consistent tokenization.

    `text` is tokenized with special tokens included, so the returned indices
    refer to positions in that full id sequence. `phrase_pretty` is tokenized
    on its own with `tokenizer.tokenize` and matched by exact token-string
    equality.

    Returns:
      A list of spans, each a list of consecutive token indices
      (e.g. [[122, 123, 124], [172, 173, 174]]). Returns [] when the phrase
      produces no tokens or never occurs.

    NOTE(review): because the phrase is tokenized in isolation, its first token
    may differ from the in-context token for tokenizers that encode a leading
    space in the token string — verify for phrases that start with a word.
    """

    # Tokenize full text WITH special tokens
    enc = tokenizer(text, add_special_tokens=True)
    text_tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])

    # Tokenize phrase using model rules
    phrase_tokens = tokenizer.tokenize(phrase_pretty)
    n = len(phrase_tokens)

    # An empty phrase would "match" at every position and yield a list of
    # empty spans, which callers cannot index into; report no match instead.
    if n == 0:
        return []

    spans = []

    for i in range(len(text_tokens) - n + 1):
        if text_tokens[i:i+n] == phrase_tokens:
            spans.append(list(range(i, i+n)))

    return spans

In [146]:
raw_text = """if they actually censor anything is another question.
unlike others, medvedev is an internationally recognized historian.
he tells that these people are governmental bureaucrats although some of them have degrees.
main point this is a can anyone clarify, please, where the 21 million number for three countries in version 3 comes from?
i hope you do not suggest to replace three large sections about kirov assassination by his single paragraph?
of course, if there is any sourced information in his text that currently missing, it might be 'added' to current version.
perhaps this article should be merged, but this must be properly done.
yes, that was one of the reasons why she left big sport so early.
of course he did not write about his abuse in reports to kgb superiors.
i like book by radzinsky, but this source provides much more details with a lot of references radzinsky works on a bigger 3-volume biography of stalin right now.
so, you are very welcome to improve this and 'other' articles on the subject using this book.
just remember to focus on content, not contributors.
unfortunately, this is not what sources tell as was already discussed above.
higher estimates include not only mass murders or executions but also avoidable lives lost due to famine and disease due to confiscation or destruction of property, in addition to deaths in forced labor camps or during forced relocation.
the number also does not include civilians executed during russian civil war, etc.
unfortunately i do not have time right now, but i will check some sources that describe repression of children in the soviet union to make changes in relevant articles accordingly - as time allows.
just wanted to tell you about this case.
the grandson of stalin went to court against the radio because the remark was a slander.
the radio's lawyer and editor in chef didn't manage to prove in court that children were executed in ussr, simply because there was no proof.
i hope you reconsider your position in the light of this info."""

raw_phrase=", but this"


In [147]:
# Locate every occurrence of `raw_phrase` in `raw_text` as lists of token indices,
# using whatever `tokenizer` is currently bound at notebook level.
location = find_phrase_token_spans(
    tokenizer=tokenizer,
    text=raw_text,
    phrase_pretty=raw_phrase
)

In [148]:
location

[[122, 123, 124], [172, 173, 174]]

In [137]:
def incremental_mlm_paraphrase(
    sentence: str,
    span_token_idxs: list[int],
    original_phrase: str,
    model,
    tokenizer,
    device,
    top_k: int = 10
):
    """
    Propose replacements for a token span by masking and refilling it one
    token at a time with a masked language model.

    Arguments:
      - sentence: full text containing the span
      - span_token_idxs: token indices of the span within the tokenized
        sentence (as produced by `tokenizer(sentence)`)
      - original_phrase: the span's original text; candidates equal to it
        (case-insensitively, after stripping) are dropped
      - model: masked-LM model; called as `model(input_ids=...)` and expected
        to return an object with `.logits`
      - tokenizer: matching tokenizer (needs `mask_token_id` and `decode`)
      - device: device on which the input tensors are created
      - top_k: number of top-scoring tokens kept per masked position; clamped
        to the vocabulary size

    Returns:
      A list of dicts {"phrase": str, "tokens": list[str]}, lower-cased, with
      empty, unchanged and duplicate phrases removed. Returns [] when
      `span_token_idxs` is empty.

    Cost: every position multiplies the number of sequences by `top_k`, and
    the model is run once per intermediate sequence, so the number of forward
    passes grows as 1 + top_k + top_k**2 + ... over the span length.
    """
    # Nothing to paraphrase (and no anchors to compute) for an empty span.
    if not span_token_idxs:
        return []

    # Normalize original phrase for comparison
    original_phrase = original_phrase.lower().strip()

    encoding = tokenizer(sentence, return_tensors="pt")
    original_ids = encoding.input_ids[0].tolist()

    # Slice bounds of the span inside any candidate sequence. Replacements are
    # one-for-one, so every candidate has the same length as the original and
    # the bounds can be fixed up front. Clamped to the sequence limits.
    start = max(min(span_token_idxs), 0)
    end = min(max(span_token_idxs) + 1, len(original_ids))

    # Start from the original token ID sequence
    sequences = [original_ids]

    # ---- Incremental masked-token replacement ----
    for idx in span_token_idxs:
        new_sequences = []

        for seq in sequences:
            masked = seq.copy()
            masked[idx] = tokenizer.mask_token_id

            input_ids_batch = torch.tensor([masked], device=device)
            with torch.no_grad():
                logits = model(input_ids=input_ids_batch).logits

            # Softmax is monotonic, so ranking the raw logits selects the same
            # tokens without normalising over the whole vocabulary.
            position_logits = logits[0, idx]
            k = min(top_k, position_logits.shape[-1])
            topk_ids = torch.topk(position_logits, k).indices.tolist()

            # Each option sprouts a new sequence
            for tid in topk_ids:
                new_seq = seq.copy()
                new_seq[idx] = tid
                new_sequences.append(new_seq)

        sequences = new_sequences

    # ---- Extract paraphrased spans ----
    seen_phrases = set()
    results = []

    for seq in sequences:
        span_ids = seq[start:end]

        # Decode the phrase
        phrase = tokenizer.decode(
            span_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        ).lower().strip()

        # Skip empty, unchanged, or duplicate phrases
        if not phrase or phrase == original_phrase or phrase in seen_phrases:
            continue
        seen_phrases.add(phrase)

        # Token-level decoding: one entry per token id in the span.
        token_list = [
            tokenizer.decode([tid], skip_special_tokens=True).strip().lower()
            for tid in span_ids
        ]

        results.append({
            "phrase": phrase,
            "tokens": token_list
        })

    return results


In [138]:
# (Re)load the "roberta-base" tokenizer and masked-LM model; these are the same
# statements as the setup that precedes the first incremental_mlm_paraphrase definition.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForMaskedLM.from_pretrained("roberta-base")
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [139]:
# Paraphrase the first located occurrence of `raw_phrase`. With top_k=50 the candidate
# set is multiplied by 50 at every span position, and the model runs once per intermediate sequence.
cands = incremental_mlm_paraphrase(raw_text, location[0], raw_phrase, top_k=50, model=model, tokenizer=tokenizer, device=device)

In [140]:
len(cands)

2305

In [141]:
cands

[{'phrase': ', but it', 'tokens': [',', 'but', 'it']},
 {'phrase': ', but that', 'tokens': [',', 'but', 'that']},
 {'phrase': ', but everything', 'tokens': [',', 'but', 'everything']},
 {'phrase': ', but editing', 'tokens': [',', 'but', 'editing']},
 {'phrase': ', but something', 'tokens': [',', 'but', 'something']},
 {'phrase': ', but so', 'tokens': [',', 'but', 'so']},
 {'phrase': ', but they', 'tokens': [',', 'but', 'they']},
 {'phrase': ', but things', 'tokens': [',', 'but', 'things']},
 {'phrase': ', but correction', 'tokens': [',', 'but', 'correction']},
 {'phrase': ', but work', 'tokens': [',', 'but', 'work']},
 {'phrase': ', but translation', 'tokens': [',', 'but', 'translation']},
 {'phrase': ', but corrections', 'tokens': [',', 'but', 'corrections']},
 {'phrase': ', but its', 'tokens': [',', 'but', 'its']},
 {'phrase': ', but revision', 'tokens': [',', 'but', 'revision']},
 {'phrase': ', but all', 'tokens': [',', 'but', 'all']},
 {'phrase': ', but both', 'tokens': [',', 'but'