In [1]:
pip install transformers sentence-transformers datasets trl torch


Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.11.0-py3-none-any.whl.metadata (12 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.11-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading sentence_transformers-3.1.1-py3-no

In [2]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... done
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2024.9.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans==3.1.0a0

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from googletrans import Translator
from dask import delayed, compute
import torch
import numpy as np
from trl import PPOTrainer, PPOConfig
from datasets import Dataset
import cloudpickle as pickle


# Checkpoints used by the pipeline, named once so they are easy to swap.
PARAPHRASE_CHECKPOINT = 'Vamsi/T5_Paraphrase_Paws'
NLI_CHECKPOINT = 'facebook/bart-large-mnli'
SIMILARITY_CHECKPOINT = 'paraphrase-MiniLM-L6-v2'

# Paraphrase generator (T5) and its tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained(PARAPHRASE_CHECKPOINT)
t5_tokenizer = T5Tokenizer.from_pretrained(PARAPHRASE_CHECKPOINT)

# Natural-language-inference classifier used to check meaning preservation
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_CHECKPOINT)
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_CHECKPOINT)

# Sentence encoder used for the cosine-similarity part of the reward
similarity_model = SentenceTransformer(SIMILARITY_CHECKPOINT)

# Translator for back-translation
# translator = Translator()

# Sample text
# Built from adjacent string literals so the value is one clean line. A
# triple-quoted literal indented to match the code would embed newlines and
# runs of spaces in the text that is fed to the models and printed as fallback.
text = (
    "Before the formation of kamikaze units, pilots had made deliberate crashes as a last resort when their aircraft "
    "had suffered severe damage and they did not want to risk being captured or wanted to do as much damage to "
    "the enemy as possible, since they were crashing anyway. Such situations occurred in both the Axis and Allied "
    "air forces. Axell and Kase see these suicides as 'individual, impromptu decisions by men who were mentally "
    "prepared to die'."
)

# Function to perform paraphrasing using T5
def paraphrase_t5(text):
    """Generate two beam-search paraphrases of ``text`` with the T5 paraphraser.

    Args:
        text: Passage to paraphrase. The encoded prompt is truncated to 512 tokens.

    Returns:
        list: The decoded candidate strings, in the order ``generate`` returned them.
    """
    prompt = f"paraphrase: {text} </s>"
    prompt_ids = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated = t5_model.generate(
        prompt_ids,
        max_length=512,
        num_beams=4,
        num_return_sequences=2,
        early_stopping=True,
    )

    candidates = []
    for sequence in generated:
        decoded = t5_tokenizer.decode(
            sequence,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        candidates.append(decoded)
    return candidates

# Function for back-translation (English -> another language -> English)
# def back_translation(text):
#     translated = translator.translate(text, src='en', dest='fr').text  # Translate to French
#     back_translated = translator.translate(translated, src='fr', dest='en').text  # Back to English
#     return back_translated

# Function for NLI-based validation
def validate_nli(original_text, paraphrased_text, threshold=0.85):
    """Check whether ``paraphrased_text`` is entailed by ``original_text``.

    The pair is scored by the NLI classifier with the original as premise and
    the paraphrase as hypothesis.

    Args:
        original_text: Premise text.
        paraphrased_text: Hypothesis text.
        threshold: Minimum entailment probability required to accept the pair.
            Defaults to 0.85, the value previously hard-coded here.

    Returns:
        A boolean (NumPy bool) that is true when the entailment probability is
        strictly greater than ``threshold``.
    """
    inputs = nli_tokenizer.encode_plus(original_text, paraphrased_text, return_tensors="pt", truncation=True)

    # Pure inference: skip autograd bookkeeping so no graph is kept alive.
    with torch.no_grad():
        logits = nli_model(**inputs).logits
    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    # Prefer the label map shipped with the checkpoint; fall back to index 2,
    # which this notebook has always treated as "entailment".
    label2id = getattr(nli_model.config, "label2id", None) or {}
    entailment_index = label2id.get("entailment", 2)

    entailment_prob = probabilities[entailment_index]
    return entailment_prob > threshold

# Define a reward function for RL
def reward_function(paraphrase, original):
    """Score a paraphrase for RL: cosine similarity gated by NLI entailment.

    Args:
        paraphrase: Candidate paraphrase text.
        original: Source text the paraphrase should preserve.

    Returns:
        float: ``0.0`` when the NLI check rejects the pair or when either
        embedding has zero norm; otherwise the cosine similarity of the two
        sentence embeddings. Cosine similarity lies in [-1, 1], so the reward
        is not strictly confined to [0, 1].
    """
    # Gate first: a rejected pair scores 0 regardless of similarity, so there
    # is no point computing embeddings for it.
    if not validate_nli(original, paraphrase):
        return 0.0

    # Encode both sentences in one call instead of two separate passes.
    original_embedding, paraphrase_embedding = similarity_model.encode([original, paraphrase])

    denominator = np.linalg.norm(original_embedding) * np.linalg.norm(paraphrase_embedding)
    if denominator == 0:
        # Degenerate embedding: avoid a 0/0 division that would yield NaN.
        return 0.0

    similarity_score = np.dot(original_embedding, paraphrase_embedding) / denominator
    return float(similarity_score)

# Reinforcement Learning using PPO (Proximal Policy Optimization)
def train_paraphraser_with_rl(paraphrases, original_text):
    """Run one PPO update of the T5 paraphraser, using the candidates as responses.

    Written against the legacy ``PPOTrainer`` API of trl 0.11 (the version
    recorded in this notebook's install log). That API needs:

    * ``PPOTrainer(config, model, ref_model, tokenizer)`` with a value-head
      wrapped model, not a bare ``T5ForConditionalGeneration``;
    * ``step(queries, responses, scores)`` with three equally long lists of
      tensors whose length equals ``config.batch_size``.

    Args:
        paraphrases: Candidate paraphrase strings generated for ``original_text``.
        original_text: The text the candidates were generated from.

    Returns:
        The ``paraphrases`` list, unchanged. The candidates were produced
        before the update, so training affects only later generations.
    """
    # Imported locally: the file-level trl import only brings in PPOTrainer
    # and PPOConfig.
    from trl import AutoModelForSeq2SeqLMWithValueHead

    if not paraphrases:
        return paraphrases

    # The rest of the notebook feeds tensors on the model's current device;
    # remember it so it can be restored if the trainer relocates the weights.
    original_device = next(t5_model.parameters()).device

    # step() insists on exactly `batch_size` samples; mini-batches of one keep
    # peak memory low and always divide the batch size evenly.
    config = PPOConfig(batch_size=len(paraphrases), mini_batch_size=1)

    # Wrap the existing model object so PPO updates land on t5_model itself.
    policy = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(t5_model)
    trainer = PPOTrainer(config=config, model=policy, ref_model=None, tokenizer=t5_tokenizer)

    # Every candidate answers the same prompt used at generation time.
    query = t5_tokenizer.encode(
        f"paraphrase: {original_text} </s>", return_tensors="pt", max_length=512, truncation=True
    )[0]
    # Responses are consumed as decoder inputs, so they start with the decoder
    # start token, as sequences returned by generate() do.
    decoder_start = torch.tensor([t5_model.config.decoder_start_token_id], dtype=torch.long)

    queries, responses, rewards = [], [], []
    for paraphrase in paraphrases:
        response_ids = t5_tokenizer.encode(
            paraphrase, return_tensors="pt", max_length=512, truncation=True
        )[0]
        queries.append(query)
        responses.append(torch.cat([decoder_start, response_ids]))
        rewards.append(torch.tensor(float(reward_function(paraphrase, original_text))))

    # One PPO optimisation step for demonstration; repeat with freshly
    # generated candidates for real training.
    trainer.step(queries, responses, rewards)

    # Training leaves the model in train mode and possibly on another device;
    # put it back the way the inference helpers expect it.
    t5_model.to(original_device).eval()

    return paraphrases

# Post-Processing: Style Matching
_STYLE_CHECKPOINT = 'mrm8488/t5-base-finetuned-sentence-style-transfer'
_style_components = {}


def _load_style_components():
    """Return the style-transfer (model, tokenizer) pair, loading it only once."""
    if not _style_components:
        # Load both before publishing either, so a failure cannot leave a
        # half-filled cache behind.
        model = T5ForConditionalGeneration.from_pretrained(_STYLE_CHECKPOINT)
        tokenizer = T5Tokenizer.from_pretrained(_STYLE_CHECKPOINT)
        _style_components["model"] = model
        _style_components["tokenizer"] = tokenizer
    return _style_components["model"], _style_components["tokenizer"]


def style_matching(text, target_style='formal'):
    """Rewrite ``text`` in ``target_style`` with the T5 style-transfer model.

    The checkpoint is loaded on first use and cached for later calls instead
    of being reloaded every time.

    Args:
        text: Text to restyle. The encoded prompt is truncated to 512 tokens.
        target_style: Style name inserted into the prompt. Defaults to 'formal'.

    Returns:
        str: The single beam-search output, decoded without special tokens.
    """
    style_model, style_tokenizer = _load_style_components()

    input_text = f"transfer {target_style}: {text} </s>"
    input_ids = style_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = style_model.generate(input_ids, max_length=512, num_beams=4, num_return_sequences=1, early_stopping=True)
    styled_text = style_tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return styled_text

# Main function to process text
@delayed
def process_text(text):
    """Paraphrase ``text``, keep the first candidate that passes NLI, and restyle it.

    Wrapped in ``dask.delayed``: calling it builds a lazy task, and the
    pipeline runs only when the task is computed.

    Args:
        text: Source passage.

    Returns:
        str: The styled version of the first paraphrase accepted by
        ``validate_nli``, or ``text`` itself when no candidate is accepted.
    """
    # Step 1: Paraphrase using T5
    paraphrases = paraphrase_t5(text)

    # Step 2: Back-translation is currently disabled (its helper is commented
    # out), so the candidates pass through unchanged.

    # Step 3: Apply Reinforcement Learning to optimize paraphrasing
    paraphrases = train_paraphraser_with_rl(paraphrases, text)

    # Steps 4-5: Only the first valid candidate is ever returned, so stop at
    # the first one that passes NLI and style just that one.
    for candidate in paraphrases:
        if validate_nli(text, candidate):
            return style_matching(candidate)

    # If no paraphrases pass validation, return the original text
    return text

# Run the pipeline through Dask.
# The tasks close over the large module-level models. With the 'processes'
# scheduler, each task and the globals it references must be serialized and
# shipped to a worker process, and any in-place weight update made there never
# reaches the models held by this kernel. The synchronous scheduler runs the
# tasks in this process instead. For inference-only workloads over many texts,
# 'threads' is the option to try next.
texts = [text]  # Replace with a list of texts if processing multiple
results = compute(*[process_text(t) for t in texts], scheduler='synchronous')

# Output the transformed content
for result in results:
    print("Transformed Content:", result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]