In [4]:
# Use the %pip magic (not bare `pip`) so packages are installed into the
# environment of the running kernel.
%pip install bert-score torch



In [5]:
# Input files shared by the evaluation cells below; each is read line by line.
original_path = "neutral_test.txt"  # loaded as the reference / original text
styled_generated_path = "styled_test_generated.txt"  # loaded as the candidate / generated text

In [6]:
from bert_score import score
import statistics

def load_text_file(filename):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    A missing file is reported on stdout and produces an empty list
    rather than an exception.
    """
    try:
        handle = open(filename, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return []
    with handle:
        stripped = (raw.strip() for raw in handle)
        return [text for text in stripped if text]

def calculate_metrics(rescale_with_baseline=False):
    """Report BERTScore precision/recall/F1 between original and styled text.

    Reads the reference sentences from ``original_path`` and the candidate
    sentences from ``styled_generated_path`` (one sentence per non-blank
    line), scores them pairwise and prints the corpus averages together
    with a coarse verdict.

    Args:
        rescale_with_baseline: Forwarded to ``bert_score.score``. The default
            (False) keeps the raw scores this notebook has always reported.

    Returns:
        None. Results are printed; the function returns early (after
        printing an error) when the inputs are missing, empty or of
        different lengths.
    """
    refs = load_text_file(original_path)
    cands = load_text_file(styled_generated_path)

    # Two missing (or empty) files both load as [], which would pass the
    # length comparison below and hand empty lists to score().
    if not refs or not cands:
        print("Error: Nothing to score - at least one input file is missing or empty.")
        print(f"{original_path}: {len(refs)} lines")
        print(f"{styled_generated_path}:  {len(cands)} lines")
        return

    if len(refs) != len(cands):
        print("Error: The files have different numbers of lines!")
        # Report the files that were actually read, not hard-coded names.
        print(f"{original_path}: {len(refs)} lines")
        print(f"{styled_generated_path}:  {len(cands)} lines")
        return

    print(f"Processing {len(refs)} pairs of sentences")
    print("Downloading model and calculating... (This takes time on the first run)")
    P, R, F1 = score(
        cands,
        refs,
        lang="en",
        verbose=True,
        rescale_with_baseline=rescale_with_baseline,
    )

    # Reduce each per-sentence tensor to a plain float once and reuse it.
    mean_f1 = F1.mean().item()
    mean_precision = P.mean().item()
    mean_recall = R.mean().item()

    print("")
    print("results")
    print("="*30)
    print(f"Average F1 Score: {mean_f1:.4f}")
    print(f"Average Precision: {mean_precision:.4f}")
    print(f"Average Recall:    {mean_recall:.4f}")
    print("="*30)

    # NOTE(review): the 0.8 / 0.6 cut-offs are this notebook's own heuristics.
    # They are applied to whatever scale score() returned, so they likely need
    # re-tuning if rescale_with_baseline is switched on.
    if mean_f1 > 0.8:
        print("Excellent! Content is well preserved.")
    elif mean_f1 > 0.6:
        print("Moderate content preservation.")
    else:
        print("Low preservation. The meaning has changed significantly")

# Entry point of this cell: run the BERTScore evaluation defined above.
if __name__ == "__main__":
    calculate_metrics()

Processing 500 pairs of sentences
Downloading model and calculating... (This takes time on the first run)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/16 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 11.37 seconds, 43.96 sentences/sec

results
Average F1 Score: 0.9191
Average Precision: 0.9180
Average Recall:    0.9206
Excellent! Content is well preserved.


 BERTScore: Content Preservation
Result: F1 Score: 0.9191

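Conclusion: The model preserves the semantic meaning of the original text very well; the core message remains intact. (Caveat: this is the raw, un-rescaled BERTScore, which tends to be high even for loosely related sentences, so the absolute value should be read with some caution.)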

Style check

In [7]:
from transformers import pipeline
import statistics
from tqdm import tqdm
def load_lines(filename):
    """Collect the stripped, non-empty lines of a UTF-8 text file.

    If the file does not exist, an error is printed and an empty list is
    returned.
    """
    kept = []
    try:
        with open(filename, 'r', encoding='utf-8') as source:
            for raw in source:
                text = raw.strip()
                if text:
                    kept.append(text)
    except FileNotFoundError:
        print(f"Error: File {filename} not found.")
        return []
    return kept

def _collect_label_scores(classifier, lines, candidate_labels, target_label):
    """Return the classifier's score for ``target_label`` on each line, in order."""
    label_scores = []
    for line in tqdm(lines):
        result = classifier(line, candidate_labels)
        # Look the label up by name instead of assuming its position in the result.
        target_idx = result['labels'].index(target_label)
        label_scores.append(result['scores'][target_idx])
    return label_scores


def calculate_style_metrics():
    """Estimate style intensity of the original vs. the styled text.

    Every non-blank line of ``original_path`` and ``styled_generated_path``
    is classified zero-shot against two labels; the score assigned to the
    "Donald Trump style tweet" label is averaged per file and the
    difference (styled minus original) is printed as the style change.

    Returns:
        None. Results are printed; the function returns early (after
        printing an error) when either input is missing or empty.
    """
    original_lines = load_lines(original_path)
    styled_lines = load_lines(styled_generated_path)

    # statistics.mean() raises on an empty sequence, so stop before loading
    # the model when there is nothing to classify.
    if not original_lines or not styled_lines:
        print("Error: Nothing to score - at least one input file is missing or empty.")
        return

    print("Loading Zero-Shot Classification model...")
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    target_label = "Donald Trump style tweet"
    candidate_labels = [target_label, "Neutral formal statement"]

    print(" Calculating style scores for origina text (Baseline)")
    original_scores = _collect_label_scores(
        classifier, original_lines, candidate_labels, target_label
    )

    print(" Calculating style scores for styled generated text (Bot Output)...")
    styled_scores = _collect_label_scores(
        classifier, styled_lines, candidate_labels, target_label
    )

    avg_orig = statistics.mean(original_scores)
    avg_styled = statistics.mean(styled_scores)
    style_change = avg_styled - avg_orig

    print("\n" + "="*40)
    print("       STYLE TRANSFER METRICS       ")
    print("="*40)
    print(f" Avg Trump-Score (Input/Original):  {avg_orig:.4f} (Should be low)")
    print(f" Avg Trump-Score (Output/Bot):      {avg_styled:.4f} (Should be high)")
    print("-" * 40)
    print(f" Style Change (SC) Metric:          {style_change:.4f}")
    print("="*40)

    # The verdict depends only on the absolute score of the styled output,
    # not on the gain over the original.
    if avg_styled > 0.7:
        print(" Strong style detected!")
    elif avg_styled > 0.5:
        print(" Moderate style detected.")
    else:
        print(" Weak style transfer.")

# Entry point of this cell: run the zero-shot style evaluation defined above.
if __name__ == "__main__":
    calculate_style_metrics()

Loading Zero-Shot Classification model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


 Calculating style scores for origina text (Baseline)


  2%|▏         | 10/500 [00:01<01:18,  6.25it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 500/500 [00:30<00:00, 16.16it/s]


 Calculating style scores for styled generated text (Bot Output)...


100%|██████████| 500/500 [00:29<00:00, 16.71it/s]


       STYLE TRANSFER METRICS       
 Avg Trump-Score (Input/Original):  0.2463 (Should be low)
 Avg Trump-Score (Output/Bot):      0.3359 (Should be high)
----------------------------------------
 Style Change (SC) Metric:          0.0897
 Weak style transfer.





Zero-Shot Classification: Style Intensity
Result: Shift from 0.2463 to 0.3359 (Gain: +0.0897)

Conclusion: Weak result. The stylistic shift is minimal: the model is not aggressive enough in mimicking the distinctive target style (Donald Trump).

In [8]:
# Use the %pip magic so the install targets the running kernel; versions are
# pinned to the ones recorded in this cell's install log for reproducibility.
%pip install evaluate==0.4.6 sacrebleu==2.5.1

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py

In [14]:
import evaluate
from tqdm import tqdm
import os
import math

def load_lines(filename):
    """Read a UTF-8 text file into a list of stripped, non-empty lines.

    When the path does not exist, an error is printed and an empty list is
    returned.
    """
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as source:
            trimmed = map(str.strip, source)
            return list(filter(None, trimmed))
    print(f" Error: {filename} not found.")
    return []

def calculate_metrics():
    """Report BLEU (vs. the original text) and GPT-2 perplexity of the output.

    BLEU is computed with the lines of ``original_path`` as references and
    the lines of ``styled_generated_path`` as predictions, so it measures
    overlap with the input text. Perplexity is computed on the predictions
    only, using the ``gpt2`` model.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; running this cell replaces that definition in the kernel.

    Returns:
        None. Results are printed; the function returns early (after
        printing an error) when the inputs are missing, empty or of
        different lengths.
    """
    print(" Loading files...")
    refs = load_lines(original_path)
    preds = load_lines(styled_generated_path)

    if not refs or not preds:
        # load_lines only reports a missing file; an existing but empty file
        # would otherwise end the run without any message.
        print(" Error: Nothing to evaluate - at least one input file is missing or empty.")
        return

    # BLEU pairs predictions and references by position, so the two files
    # must contain the same number of sentences.
    if len(refs) != len(preds):
        print(" Error: The files have different numbers of lines!")
        print(f" {original_path}: {len(refs)} lines")
        print(f" {styled_generated_path}: {len(preds)} lines")
        return

    print("-" * 40)
    print(" Calculating BLEU Score")
    bleu = evaluate.load("bleu")
    results_bleu = bleu.compute(predictions=preds, references=refs)
    bleu_score = results_bleu['bleu']

    print(f" BLEU Score: {bleu_score:.4f}")

    print("-" * 40)
    print(" Calculating Perplexity (Fluency)...")
    print("Downloading GPT-2 model for perplexity check...")
    perplexity = evaluate.load("perplexity", module_type="metric")

    results_ppl = perplexity.compute(predictions=preds, model_id='gpt2')

    avg_ppl = results_ppl['mean_perplexity']

    print(f" Average Perplexity: {avg_ppl:.2f}")
    print("-" * 40)

    # Interpretation guide printed alongside the numbers; the ranges are this
    # notebook's own rules of thumb.
    print(f"1. BLEU: {bleu_score:.4f}")
    print("   - > 0.4: Very similar text (weak style transfer).")
    print("   - 0.1 - 0.4: Golden middle (style changed, but structure preserved).")
    print("   - < 0.1: Text is completely changed (risk of losing meaning).")
    print(f"2. Perplexity: {avg_ppl:.2f}")
    print("   - 10-50: Very natural English.")
    print("   - 50-100: Acceptable, but with some odd phrasing (typical of Trump's style!).")
    print("   - > 100: Grammatical errors or nonsense.")

# Entry point of this cell: run the BLEU / perplexity evaluation defined above.
if __name__ == "__main__":
    calculate_metrics()

 Loading files...
----------------------------------------
 Calculating BLEU Score


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

 BLEU Score: 0.3301
----------------------------------------
 Calculating Perplexity (Fluency)...
Downloading GPT-2 model for perplexity check...


Downloading builder script: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/32 [00:00<?, ?it/s]

 Average Perplexity: 27.10
----------------------------------------
1. BLEU: 0.3301
   - > 0.4: Very similar text (weak style transfer).
   - 0.1 - 0.4: Golden middle (style changed, but structure preserved).
   - < 0.1: Text is completely changed (risk of losing meaning).
2. Perplexity: 27.10
   - 10-50: Very natural English.
   - 50-100: Acceptable, but with some odd phrasing (typical of Trump's style!).
   - > 100: Grammatical errors or nonsense.


BLEU Score: Structural Change
Result: BLEU: 0.3301

Conclusion: A BLEU of 0.33 against the original text indicates that the model is actively rewriting the input, changing vocabulary and sentence structure while preserving the overall content frame.


Perplexity (PPL): Fluency
Result: PPL: 27.10

Conclusion: Excellent. The generated text is highly fluent and natural in English (PPL < 50 is desirable).

## Summary

Based on the metrics above, we obtained the following results.


Overall Verdict: High-Quality Paraphraser with Weak Stylization
The model is a reliable, high-quality generator. It produces fluent English (PPL 27.10) and preserves the input meaning very well (BERTScore F1 0.92). However, stylization is the weak point: the model is too conservative, producing only a small shift toward the target style (zero-shot score 0.34 for the output, compared with 0.25 for the original text).
