In [1]:
from google.colab import drive

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive', force_remount=True)

Mounting Google Drive...
Mounted at /content/drive


In [2]:
import subprocess
import sys
import importlib
import pkg_resources

def install_package(package_name, pip_command=None):
    """Install a package and return True if successful, False otherwise."""
    try:
        importlib.import_module(package_name.split('==')[0].replace('-', '_'))
        return True
    except ImportError:
        try:
            cmd = pip_command or f"pip install {package_name}"
            result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
            print(result.stdout)
            return True
        except subprocess.CalledProcessError as e:
            print(f"Error installing {package_name}: {e.stderr}")
            return False

# List of packages to install
packages = [
    ("transformers", None),
    ("datasets", None),
    ("rouge-score", None),
    ("easse", "pip install git+https://github.com/feralvam/easse.git"),
    ("textstat", None),
    ("nltk", None)
]

# Install packages
all_installed = True
for pkg_name, pip_cmd in packages:
    if not install_package(pkg_name, pip_cmd):
        print(f"Failed to install {pkg_name}. Please try manually.")
        all_installed = False

# Verify installed packages and print versions
if all_installed:
    print("All packages installed successfully.")
    print("\nInstalled package versions:")
    for pkg_name, _ in packages:
        try:
            pkg = pkg_name.split('==')[0].replace('-', '_')
            version = pkg_resources.get_distribution(pkg).version
            print(f"{pkg}: {version}")
        except Exception as e:
            print(f"Could not get version for {pkg_name}: {e}")
else:
    print("Some packages failed to install. Check errors above.")

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py): started
  Building wheel for rouge-score (setup.py): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=2afde01509bfce89b7b6ccca2ce8f04c17d0eea33c98740483ba8c08fe78224f
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2

Collecting git+https://github.com/feralvam/easse.git
  Cloning https://github.com/feralvam/easse.git to /tmp/pip-req-build-ohvxk2tt
  Resolved https://github.com/feralvam/easse.git to commit 6a4352ec299ed03fda8ee45445ca43d9c7673e89
  Preparing metadata (setup.py): st

In [3]:
import sys
import importlib
import warnings
warnings.filterwarnings('ignore')

# Import all required modules
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BartTokenizer, BartForConditionalGeneration,
    T5Tokenizer, T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from textstat import flesch_kincaid_grade
from tqdm import tqdm
from easse.sari import corpus_sari

# Download NLTK data
nltk.download('punkt', quiet=True)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Print PyTorch version
print(f"PyTorch version: {torch.__version__}")

# Confirm easse import
print("easse imported successfully.")
print("All libraries and environment setup completed successfully.")

Using device: cuda
PyTorch version: 2.6.0+cu124
easse imported successfully.
All libraries and environment setup completed successfully.


In [4]:
from torch.utils.data import Dataset
from transformers import BartTokenizer

class WikiLargeDataset(Dataset):
    """Custom dataset for WikiLarge text simplification data"""

    def __init__(self, complex_texts, simple_texts, tokenizer, max_length=384):
        try:
            if len(complex_texts) != len(simple_texts):
                raise ValueError(f"Mismatched text lengths: {len(complex_texts)} complex vs {len(simple_texts)} simple")
            if not all(isinstance(t, str) for t in complex_texts):
                raise TypeError("All complex texts must be strings")
            if not all(isinstance(t, str) for t in simple_texts):
                raise TypeError("All simple texts must be strings")

            self.complex_texts = complex_texts
            self.simple_texts = simple_texts
            self.tokenizer = tokenizer
            self.max_length = max_length
        except Exception as e:
            print(f"Error initializing WikiLargeDataset: {e}")
            raise

    def __len__(self):
        return len(self.complex_texts)

    def __getitem__(self, idx):
        try:
            complex_text = str(self.complex_texts[idx]).strip()
            simple_text = str(self.simple_texts[idx]).strip()

            if not complex_text or not simple_text:
                raise ValueError(f"Empty text at index {idx}: complex='{complex_text}', simple='{simple_text}'")

            # Tokenize input (complex text)
            source = self.tokenizer(
                complex_text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            # Tokenize target (simple text)
            target = self.tokenizer(
                simple_text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            return {
                'input_ids': source['input_ids'].flatten(),
                'attention_mask': source['attention_mask'].flatten(),
                'labels': target['input_ids'].flatten()
            }
        except Exception as e:
            print(f"Error getting item at index {idx}: {e}")
            raise

# Test dataset class
try:
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
    sample_complex = ["The quick brown fox jumps over the lazy dog."]
    sample_simple = ["The fox jumps over the dog."]
    dataset = WikiLargeDataset(sample_complex, sample_simple, tokenizer)
    sample_item = dataset[0]
    print("Sample dataset created successfully.")
    print("Sample dataset item keys:", sample_item.keys())
    print("Input IDs shape:", sample_item['input_ids'].shape)
except Exception as e:
    print(f"Error testing WikiLargeDataset: {e}")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Sample dataset created successfully.
Sample dataset item keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs shape: torch.Size([384])


In [5]:
import pandas as pd
import traceback
import psutil
import os

def print_memory_usage(step):
    process = psutil.Process(os.getpid())
    mem = process.memory_info().rss / 1024 ** 2  # MB
    print(f"Memory usage at {step}: {mem:.2f} MB")

def get_sample_data():
    return {
        'complex': [
            "The implementation of sophisticated algorithms facilitates computational efficiency.",
            "Due to unprecedented meteorological circumstances, operations are suspended.",
            "The pharmaceutical intervention ameliorated chronic cardiovascular symptoms."
        ],
        'simple': [
            "Smart algorithms improve computer performance.",
            "Bad weather caused a pause in operations.",
            "The medicine helped heart-related symptoms."
        ]
    }

def load_huggingface_data(max_samples=10000):
    print("🔄 Loading WikiLarge dataset...")
    try:
        df = pd.read_csv("hf://datasets/bogdancazan/wikilarge-text-simplification/wiki.full.aner.ori.train.95.tsv", sep="\t")
        print("✅ Dataset loaded successfully!")
        print(f"Columns: {df.columns}")
        print(f"Raw dataset size: {len(df):,} pairs")
        df = df[['Normal', 'Simple']].rename(columns={'Normal': 'complex', 'Simple': 'simple'})
        df = df.dropna().query('complex.str.len() > 10 and simple.str.len() > 5')
        if len(df) == 0:
            raise ValueError("No valid data after cleaning")
        if max_samples and len(df) > max_samples:
            df = df.sample(n=max_samples, random_state=42).reset_index(drop=True)
        if not (df['complex'].apply(lambda x: isinstance(x, str)).all() and df['simple'].apply(lambda x: isinstance(x, str)).all()):
            raise TypeError("Non-string values in dataset")
        print(f"Loaded {len(df):,} pairs")
        print("\nSample data:")
        for i in range(min(3, len(df))):
            print(f"Complex: {df.iloc[i]['complex']}")
            print(f"Simple: {df.iloc[i]['simple']}")
            print("-" * 50)
        print_memory_usage("After Dataset Loading")
        return df
    except Exception as e:
        print(f"❌ Error loading dataset: {str(e)}\nTraceback:\n{traceback.format_exc()}")
        print("Falling back to sample data...")
        df = pd.DataFrame(get_sample_data())
        print(f"Loaded {len(df):,} sample pairs")
        print("\nSample data:")
        for i in range(len(df)):
            print(f"Complex: {df.iloc[i]['complex']}")
            print(f"Simple: {df.iloc[i]['simple']}")
            print("-" * 50)
        print_memory_usage("After Sample Data Loading")
        return df

try:
    df = load_huggingface_data(max_samples=10000)
except Exception as e:
    print(f"Fatal error loading data: {str(e)}")
    raise

🔄 Loading WikiLarge dataset...
✅ Dataset loaded successfully!
Columns: Index(['Normal', 'Simple'], dtype='object')
Raw dataset size: 148,843 pairs
Loaded 10,000 pairs

Sample data:
Complex: the land before time dvd the film explores issues of prejudice between the different species and the hardships they endure in their journey as they are guided by the spirit of littlefoot s mother.
Simple: in addition to the movies there is the land before time sing along songs lrb o o rrb and
--------------------------------------------------
Complex: to commemorate his death josquin des prez composed the motet la d ploration de la mort de johannes ockeghem a setting of the poem nymphes des bois by jean molinet.
Simple: when ockeghem died josquin des prez composed a motet called la d ploration de la mort de johannes ockeghem in his honour.
--------------------------------------------------
Complex: he suffered serious head injuries and was sidelined for the rest of the season replaced by andrea de c

In [6]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from sklearn.model_selection import train_test_split

class TextSimplificationModel:
    def __init__(self, model_name='facebook/bart-base'):
        print(f"Initializing {model_name} model...")
        try:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.tokenizer = BartTokenizer.from_pretrained(model_name)
            self.model = BartForConditionalGeneration.from_pretrained(model_name)
            self.model.to(self.device)
            print(f"Model loaded on {self.device}")
            print(f"Model parameters: {sum(p.numel() for p in self.model.parameters()):,}")
            if self.device.type == 'cuda':
                print(f"GPU memory allocated: {torch.cuda.memory_allocated(self.device)/1e9:.2f} GB")
        except Exception as e:
            print(f"Error initializing model: {e}")
            raise

    def prepare_data(self, df, test_size=0.2, batch_size=16):
        print("Preparing data...")
        try:
            print(f"Input DataFrame size: {len(df)}")
            df = df.dropna().query('complex.str.len() > 10 and simple.str.len() > 5')
            if len(df) == 0:
                raise ValueError("No valid data after cleaning")
            train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)
            print(f"Train DataFrame size: {len(train_df)}")
            print(f"Test DataFrame size: {len(test_df)}")
            train_dataset = WikiLargeDataset(train_df['complex'].tolist(), train_df['simple'].tolist(), self.tokenizer)
            test_dataset = WikiLargeDataset(test_df['complex'].tolist(), test_df['simple'].tolist(), self.tokenizer)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
            print(f"Training samples: {len(train_dataset)}")
            print(f"Test samples: {len(test_dataset)}")
            print(f"Batch size: {batch_size}, Train batches: {len(train_loader)}, Test batches: {len(test_loader)}")
            return train_loader, test_loader
        except Exception as e:
            print(f"Error preparing data: {e}")
            raise

    def test_inference(self, text):
        print("Running test inference...")
        try:
            self.model.eval()
            with torch.no_grad():
                inputs = self.tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt').to(self.device)
                output = self.model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=128, num_beams=4, early_stopping=True)
                simplified = self.tokenizer.decode(output[0], skip_special_tokens=True).strip()
                print("Test inference successful.")
                print(f"Test input: {text}")
                print(f"Test output: {simplified}")
        except Exception as e:
            print(f"Error during test inference: {e}")
            raise

try:
    simplifier = TextSimplificationModel(model_name='facebook/bart-base')
    train_loader, test_loader = simplifier.prepare_data(df, batch_size=16)
    test_text = "The quick brown fox jumps over the lazy dog."
    simplifier.test_inference(test_text)
except Exception as e:
    print(f"Fatal error: {e}")
    raise

Initializing facebook/bart-base model...


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Model loaded on cuda
Model parameters: 139,420,416
GPU memory allocated: 0.56 GB
Preparing data...
Input DataFrame size: 10000
Train DataFrame size: 8000
Test DataFrame size: 2000
Training samples: 8000
Test samples: 2000
Batch size: 16, Train batches: 500, Test batches: 125
Running test inference...
Test inference successful.
Test input: The quick brown fox jumps over the lazy dog.
Test output: The quick brown fox jumps over the lazy dog.


In [7]:
import os
import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
from google.colab import drive

def train(self, train_loader, test_loader, epochs=3, learning_rate=1e-4, patience=2, min_delta=0.01, accum_steps=4):
    print("Starting training for BART-base...")
    try:
        if not train_loader or not test_loader:
            raise ValueError("Train or test loader is empty")
        print(f"Dataset size: {len(train_loader.dataset)} train samples, {len(test_loader.dataset)} test samples")
        print(f"Expected train batches: {len(train_loader)}, test batches: {len(test_loader)}")
        self.model.gradient_checkpointing_enable()
        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        total_steps = len(train_loader) * epochs // accum_steps
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        self.model.train()
        best_val_loss = float('inf')
        early_stop_count = 0
        save_path = "/content/drive/MyDrive/NLP/bart_simplification_model"
        os.makedirs(save_path, exist_ok=True)
        print(f"Epochs: {epochs}, Learning rate: {learning_rate}, Batch size: {train_loader.batch_size}, Gradient accumulation steps: {accum_steps}")
        for epoch in range(epochs):
            total_loss = 0
            self.model.train()
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
            for batch_idx, batch in enumerate(progress_bar):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss / accum_steps
                loss.backward()
                total_loss += loss.item() * accum_steps
                if (batch_idx + 1) % accum_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                progress_bar.set_postfix({'loss': loss.item() * accum_steps})
            avg_train_loss = total_loss / len(train_loader)
            self.model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in test_loader:
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['labels'].to(self.device)
                    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss += outputs.loss.item()
            avg_val_loss = val_loss / len(test_loader)
            print(f"Epoch {epoch+1} completed. Average Train Loss: {avg_train_loss:.4f}")
            print(f"Validation Loss: {avg_val_loss:.4f}")
            if avg_val_loss < best_val_loss - min_delta:
                best_val_loss = avg_val_loss
                early_stop_count = 0
                try:
                    self.model.save_pretrained(save_path)
                    self.tokenizer.save_pretrained(save_path)
                    print(f"Model checkpoint saved at {save_path}")
                except Exception as e:
                    print(f"Error saving model checkpoint: {e}")
            else:
                early_stop_count += 1
                if early_stop_count >= patience:
                    print(f"Early stopping triggered after epoch {epoch+1}")
                    break
        print("Training completed successfully.")
    except Exception as e:
        print(f"Error during training: {e}")
        raise

TextSimplificationModel.train = train

try:
    simplifier.train(train_loader, test_loader, epochs=3, learning_rate=1e-4)
except Exception as e:
    print(f"Fatal error during training: {e}")
    raise

Starting training for BART-base...
Dataset size: 8000 train samples, 2000 test samples
Expected train batches: 500, test batches: 125
Epochs: 3, Learning rate: 0.0001, Batch size: 16, Gradient accumulation steps: 4


Epoch 1/3: 100%|██████████| 500/500 [18:28<00:00,  2.22s/it, loss=0.135]


Epoch 1 completed. Average Train Loss: 1.5930
Validation Loss: 0.1119
Model checkpoint saved at /content/drive/MyDrive/NLP/bart_simplification_model


Epoch 2/3: 100%|██████████| 500/500 [18:32<00:00,  2.23s/it, loss=0.109]


Epoch 2 completed. Average Train Loss: 0.1094
Validation Loss: 0.1077


Epoch 3/3: 100%|██████████| 500/500 [18:32<00:00,  2.22s/it, loss=0.0933]


Epoch 3 completed. Average Train Loss: 0.0967
Validation Loss: 0.1070
Early stopping triggered after epoch 3
Training completed successfully.


In [10]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from google.colab import drive

class TextSimplificationModel:
    def __init__(self, model_name='t5-small'):
        print(f"Initializing {model_name} model...")
        try:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.tokenizer = T5Tokenizer.from_pretrained(model_name)
            self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(self.device)
            print(f"Model loaded on {self.device}")
            print(f"Model parameters: {sum(p.numel() for p in self.model.parameters()):,}")
            if self.device.type == 'cuda':
                print(f"GPU memory allocated: {torch.cuda.memory_allocated(self.device)/1e9:.2f} GB")
        except Exception as e:
            print(f"Error initializing model: {e}")
            raise

    def prepare_data(self, df, test_size=0.2, batch_size=16):
        print("Preparing data...")
        try:
            print(f"Input DataFrame size: {len(df)}")
            df = df.dropna().query('complex.str.len() > 10 and simple.str.len() > 5')
            if len(df) == 0:
                raise ValueError("No valid data after cleaning")
            train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)
            print(f"Train DataFrame size: {len(train_df)}")
            print(f"Test DataFrame size: {len(test_df)}")
            train_dataset = WikiLargeDataset(train_df['complex'].tolist(), train_df['simple'].tolist(), self.tokenizer)
            test_dataset = WikiLargeDataset(test_df['complex'].tolist(), test_df['simple'].tolist(), self.tokenizer)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
            print(f"Training samples: {len(train_dataset)}")
            print(f"Test samples: {len(test_dataset)}")
            print(f"Batch size: {batch_size}, Train batches: {len(train_loader)}, Test batches: {len(test_loader)}")
            return train_loader, test_loader
        except Exception as e:
            print(f"Error preparing data: {e}")
            raise

    def train(self, train_loader, test_loader, epochs=3, learning_rate=1e-4, patience=2, min_delta=0.01, accum_steps=4):
        print("Starting training for T5-small...")
        try:
            if not train_loader or not test_loader:
                raise ValueError("Train or test loader is empty")
            print(f"Dataset size: {len(train_loader.dataset)} train samples, {len(test_loader.dataset)} test samples")
            print(f"Expected train batches: {len(train_loader)}, test batches: {len(test_loader)}")
            self.model.gradient_checkpointing_enable()
            optimizer = AdamW(self.model.parameters(), lr=learning_rate)
            total_steps = len(train_loader) * epochs // accum_steps
            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
            self.model.train()
            best_val_loss = float('inf')
            early_stop_count = 0
            save_path = "/content/drive/MyDrive/NLP/t5_simplification_model"
            os.makedirs(save_path, exist_ok=True)
            print(f"Epochs: {epochs}, Learning rate: {learning_rate}, Batch size: {train_loader.batch_size}, Gradient accumulation steps: {accum_steps}")
            for epoch in range(epochs):
                total_loss = 0
                self.model.train()
                progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
                for batch_idx, batch in enumerate(progress_bar):
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['labels'].to(self.device)
                    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss / accum_steps
                    loss.backward()
                    total_loss += loss.item() * accum_steps
                    if (batch_idx + 1) % accum_steps == 0:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                        optimizer.step()
                        scheduler.step()
                        optimizer.zero_grad()
                    progress_bar.set_postfix({'loss': loss.item() * accum_steps})
                avg_train_loss = total_loss / len(train_loader)
                self.model.eval()
                val_loss = 0
                with torch.no_grad():
                    for batch in test_loader:
                        input_ids = batch['input_ids'].to(self.device)
                        attention_mask = batch['attention_mask'].to(self.device)
                        labels = batch['labels'].to(self.device)
                        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                        val_loss += outputs.loss.item()
                avg_val_loss = val_loss / len(test_loader)
                print(f"Epoch {epoch+1} completed. Average Train Loss: {avg_train_loss:.4f}")
                print(f"Validation Loss: {avg_val_loss:.4f}")
                if avg_val_loss < best_val_loss - min_delta:
                    best_val_loss = avg_val_loss
                    early_stop_count = 0
                    try:
                        self.model.save_pretrained(save_path)
                        self.tokenizer.save_pretrained(save_path)
                        print(f"Model checkpoint saved at {save_path}")
                    except Exception as e:
                        print(f"Error saving model checkpoint: {e}")
                else:
                    early_stop_count += 1
                    if early_stop_count >= patience:
                        print(f"Early stopping triggered after epoch {epoch+1}")
                        break
            print("Training completed successfully.")
        except Exception as e:
            print(f"Error during training: {e}")
            raise

try:
    simplifier = TextSimplificationModel(model_name='t5-small')
    train_loader, test_loader = simplifier.prepare_data(df, batch_size=16)
    simplifier.train(train_loader, test_loader, epochs=3, learning_rate=1e-4)
except Exception as e:
    print(f"Fatal error: {e}")
    raise

Initializing t5-small model...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded on cuda
Model parameters: 60,506,624
GPU memory allocated: 0.82 GB
Preparing data...
Input DataFrame size: 10000
Train DataFrame size: 8000
Test DataFrame size: 2000
Training samples: 8000
Test samples: 2000
Batch size: 16, Train batches: 500, Test batches: 125
Starting training for T5-small...
Dataset size: 8000 train samples, 2000 test samples
Expected train batches: 500, test batches: 125
Epochs: 3, Learning rate: 0.0001, Batch size: 16, Gradient accumulation steps: 4


Epoch 1/3:   0%|          | 0/500 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Epoch 1/3: 100%|██████████| 500/500 [09:46<00:00,  1.17s/it, loss=0.126]
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 completed. Average Train Loss: 1.1308
Validation Loss: 0.1493
Model checkpoint saved at /content/drive/MyDrive/NLP/t5_simplification_model


Epoch 2/3: 100%|██████████| 500/500 [09:44<00:00,  1.17s/it, loss=0.189]


Epoch 2 completed. Average Train Loss: 0.1490
Validation Loss: 0.1302
Model checkpoint saved at /content/drive/MyDrive/NLP/t5_simplification_model


Epoch 3/3: 100%|██████████| 500/500 [09:43<00:00,  1.17s/it, loss=0.194]


Epoch 3 completed. Average Train Loss: 0.1398
Validation Loss: 0.1284
Training completed successfully.


In [9]:
import numpy as np
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from easse.sari import corpus_sari
from textstat import flesch_kincaid_grade
from transformers import BartTokenizer, BartForConditionalGeneration

def evaluate_and_simplify(self, test_loader, sample_texts=None):
    print("Loading model from Google Drive...")
    try:
        save_path = "/content/drive/MyDrive/NLP/bart_simplification_model"
        self.tokenizer = BartTokenizer.from_pretrained(save_path)
        self.model = BartForConditionalGeneration.from_pretrained(save_path).to(self.device)
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

    print("Evaluating on test set...")
    self.model.eval()
    predictions = []
    references = []
    complex_texts = []
    fkgl_scores = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3,
                repetition_penalty=2.0
            )
            for i in range(len(outputs)):
                pred = self.tokenizer.decode(outputs[i], skip_special_tokens=True).strip()
                ref = self.tokenizer.decode(batch['labels'][i], skip_special_tokens=True).strip()
                orig = self.tokenizer.decode(batch['input_ids'][i], skip_special_tokens=True).strip()
                if pred and ref and orig:
                    predictions.append(pred)
                    references.append(ref)
                    complex_texts.append(orig)
                    fkgl_scores.append(flesch_kincaid_grade(pred))

    print(f"Collected {len(predictions)} predictions, {len(references)} references, {len(complex_texts)} complex texts")

    # BLEU
    bleu_scores = [sentence_bleu([ref.split()], pred.split(), weights=(0.25, 0.25, 0.25, 0.25)) for pred, ref in zip(predictions, references)]
    avg_bleu = np.mean(bleu_scores) if bleu_scores else 0

    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        for key in rouge_scores:
            rouge_scores[key].append(scores[key].fmeasure)
    avg_rouge = {key: np.mean(scores) for key, scores in rouge_scores.items()}

    # SARI
    try:
        sari_score = corpus_sari(orig_sents=complex_texts, sys_sents=predictions, refs_sents=[references])
    except Exception as e:
        print(f"Error calculating SARI: {e}")
        sari_score = 0

    # FKGL
    avg_fkgl = np.mean(fkgl_scores) if fkgl_scores else 0

    print("\n=== EVALUATION RESULTS ===")
    print(f"Average BLEU Score: {avg_bleu:.4f}")
    print(f"Average ROUGE-1: {avg_rouge['rouge1']:.4f}")
    print(f"Average ROUGE-2: {avg_rouge['rouge2']:.4f}")
    print(f"Average ROUGE-L: {avg_rouge['rougeL']:.4f}")
    print(f"SARI Score: {sari_score:.4f}")
    print(f"Average FKGL Score: {avg_fkgl:.4f}")

    print("\nSample Test Set Outputs:")
    for i in range(min(3, len(predictions))):
        print(f"\nExample {i+1}:")
        print(f"Complex: {complex_texts[i]}")
        print(f"Reference: {references[i]}")
        print(f"Prediction: {predictions[i]}")
        print(f"FKGL (Prediction): {fkgl_scores[i]:.2f}")
        print("-" * 60)

    if sample_texts:
        print("\nSample Text Simplification:")
        for text in sample_texts:
            with torch.no_grad():
                inputs = self.tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt').to(self.device)
                output = self.model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=128,
                    num_beams=4,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                    repetition_penalty=2.0
                )
                simplified = self.tokenizer.decode(output[0], skip_special_tokens=True).strip()
                print(f"\nInput: {text}")
                print(f"Simplified: {simplified}")
                print(f"FKGL: {flesch_kincaid_grade(simplified):.2f}")
                print("-" * 60)

    return avg_bleu, avg_rouge, sari_score, avg_fkgl, predictions, references

TextSimplificationModel.evaluate_and_simplify = evaluate_and_simplify

# Sample texts for simplification
sample_texts = [
    "The implementation of sophisticated algorithms facilitates computational efficiency.",
    "Due to unprecedented meteorological circumstances, operations are temporarily suspended.",
    "The pharmaceutical intervention ameliorated chronic cardiovascular symptoms."
]

try:
    avg_bleu, avg_rouge, sari_score, avg_fkgl, predictions, references = simplifier.evaluate_and_simplify(test_loader, sample_texts)
except Exception as e:
    print(f"Fatal error during evaluation: {e}")
    raise

Loading model from Google Drive...
Model loaded successfully.
Evaluating on test set...


Evaluating: 100%|██████████| 125/125 [04:49<00:00,  2.32s/it]


Collected 2000 predictions, 2000 references, 2000 complex texts

=== EVALUATION RESULTS ===
Average BLEU Score: 0.3020
Average ROUGE-1: 0.6144
Average ROUGE-2: 0.4465
Average ROUGE-L: 0.5758
SARI Score: 37.3415
Average FKGL Score: 10.1348

Sample Test Set Outputs:

Example 1:
Complex: rattle was awarded a cbe in and made a knight bachelor in.
Reference: rattle was given the award of cbe in and made a knight bachelor in.
Prediction: rattle was awarded a cbe in and made a knight bachelor in.
FKGL (Prediction): 5.81
------------------------------------------------------------

Example 2:
Complex: originally called federal grove the name was changed in the s to honor auburn new york.
Reference: auburn was originally called federal grove during the s they changed their name to honor auburn new york.
Prediction: originally called federal grove the name was changed in the s to honor auburn new york.
FKGL (Prediction): 8.35
------------------------------------------------------------

Example 

In [12]:
import numpy as np
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from easse.sari import corpus_sari
from textstat import flesch_kincaid_grade
from transformers import T5Tokenizer, T5ForConditionalGeneration

def evaluate_and_simplify(self, test_loader, sample_texts=None):
    print("Loading T5-small from Google Drive...")
    try:
        save_path = "/content/drive/MyDrive/NLP/t5_simplification_model"
        self.tokenizer = T5Tokenizer.from_pretrained(save_path)
        self.model = T5ForConditionalGeneration.from_pretrained(save_path).to(self.device)
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

    print("Evaluating on test set...")
    self.model.eval()
    predictions = []
    references = []
    complex_texts = []
    fkgl_scores = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3,
                repetition_penalty=2.0
            )
            for i in range(len(outputs)):
                pred = self.tokenizer.decode(outputs[i], skip_special_tokens=True).strip()
                ref = self.tokenizer.decode(batch['labels'][i], skip_special_tokens=True).strip()
                orig = self.tokenizer.decode(batch['input_ids'][i], skip_special_tokens=True).strip()
                if pred and ref and orig:
                    predictions.append(pred)
                    references.append(ref)
                    complex_texts.append(orig)
                    fkgl_scores.append(flesch_kincaid_grade(pred))

    print(f"Collected {len(predictions)} predictions, {len(references)} references, {len(complex_texts)} complex texts")

    # BLEU
    bleu_scores = [sentence_bleu([ref.split()], pred.split(), weights=(0.25, 0.25, 0.25, 0.25)) for pred, ref in zip(predictions, references)]
    avg_bleu = np.mean(bleu_scores) if bleu_scores else 0

    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        for key in rouge_scores:
            rouge_scores[key].append(scores[key].fmeasure)
    avg_rouge = {key: np.mean(scores) for key, scores in rouge_scores.items()}

    # SARI
    try:
        sari_score = corpus_sari(orig_sents=complex_texts, sys_sents=predictions, refs_sents=[references])
    except Exception as e:
        print(f"Error calculating SARI: {e}")
        sari_score = 0

    # FKGL
    avg_fkgl = np.mean(fkgl_scores) if fkgl_scores else 0

    print("\n=== EVALUATION RESULTS ===")
    print(f"Average BLEU Score: {avg_bleu:.4f}")
    print(f"Average ROUGE-1: {avg_rouge['rouge1']:.4f}")
    print(f"Average ROUGE-2: {avg_rouge['rouge2']:.4f}")
    print(f"Average ROUGE-L: {avg_rouge['rougeL']:.4f}")
    print(f"SARI Score: {sari_score:.4f}")
    print(f"Average FKGL Score: {avg_fkgl:.4f}")

    print("\nSample Test Set Outputs:")
    for i in range(min(3, len(predictions))):
        print(f"\nExample {i+1}:")
        print(f"Complex: {complex_texts[i]}")
        print(f"Reference: {references[i]}")
        print(f"Prediction: {predictions[i]}")
        print(f"FKGL (Prediction): {fkgl_scores[i]:.2f}")
        print("-" * 60)

    if sample_texts:
        print("\nSample Text Simplification:")
        for text in sample_texts:
            with torch.no_grad():
                # Add simplify prefix but ensure it's not in output
                inputs = self.tokenizer(f"simplify: {text}", max_length=512, padding=True, truncation=True, return_tensors='pt').to(self.device)
                output = self.model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=128,
                    num_beams=4,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                    repetition_penalty=2.0
                )
                simplified = self.tokenizer.decode(output[0], skip_special_tokens=True).strip()
                # Remove 'simplify:' if it appears in output
                simplified = simplified.replace("simplify:", "").strip()
                print(f"\nInput: {text}")
                print(f"Simplified: {simplified}")
                print(f"FKGL: {flesch_kincaid_grade(simplified):.2f}")
                print("-" * 60)

    return avg_bleu, avg_rouge, sari_score, avg_fkgl, predictions, references

TextSimplificationModel.evaluate_and_simplify = evaluate_and_simplify

sample_texts = [
    "The implementation of sophisticated algorithms facilitates computational efficiency.",
    "Due to unprecedented meteorological circumstances, operations are temporarily suspended.",
    "The pharmaceutical intervention ameliorated chronic cardiovascular symptoms."
]

try:
    avg_bleu, avg_rouge, sari_score, avg_fkgl, predictions, references = simplifier.evaluate_and_simplify(test_loader, sample_texts)
except Exception as e:
    print(f"Fatal error during evaluation: {e}")
    raise

Loading T5-small from Google Drive...
Model loaded successfully.
Evaluating on test set...


Evaluating: 100%|██████████| 125/125 [04:35<00:00,  2.20s/it]


Collected 2000 predictions, 2000 references, 2000 complex texts

=== EVALUATION RESULTS ===
Average BLEU Score: 0.2588
Average ROUGE-1: 0.5898
Average ROUGE-2: 0.4098
Average ROUGE-L: 0.5448
SARI Score: 35.3320
Average FKGL Score: 10.0840

Sample Test Set Outputs:

Example 1:
Complex: rattle was awarded a cbe in and made a knight bachelor in.
Reference: rattle was given the award of cbe in and made a knight bachelor in.
Prediction: rattle was awarded a cbe and made a knight bachelor in.
FKGL (Prediction): 5.86
------------------------------------------------------------

Example 2:
Complex: originally called federal grove the name was changed in the s to honor auburn new york.
Reference: auburn was originally called federal grove during the s they changed their name to honor auburn new york.
Prediction: the name was changed in the s to honor auburn new york.
FKGL (Prediction): 2.86
------------------------------------------------------------

Example 3:
Complex: monza is a city and com

In [13]:
print("=== MODEL COMPARISON ===")
print("BART-base Results:")
print("BLEU: 0.3020, ROUGE-1: 0.6144, ROUGE-2: 0.4465, ROUGE-L: 0.5758, SARI: 37.3415, FKGL: 10.1348")
print("T5-small Results:")
print("BLEU: 0.2588, ROUGE-1: 0.5898, ROUGE-2: 0.4098, ROUGE-L: 0.5448, SARI: 35.3320, FKGL: 10.0840")
print("\nObservations:")
print("- BART-base outperforms T5-small across all metrics, with higher BLEU, ROUGE, and SARI.")
print("- Both models struggle with sample text simplification, producing high FKGL scores.")
print("- T5-small shows minimal changes (e.g., 'ameliorated' → 'improved'), while BART-base often reproduces inputs.")
print("\nRecommendations:")
print("- Improve dataset quality (e.g., filter noisy pairs in WikiLarge).")
print("- Increase training epochs or adjust learning rate for T5-small.")
print("- Explore hybrid approaches or better pre-training for simplification.")

=== MODEL COMPARISON ===
BART-base Results:
BLEU: 0.3020, ROUGE-1: 0.6144, ROUGE-2: 0.4465, ROUGE-L: 0.5758, SARI: 37.3415, FKGL: 10.1348
T5-small Results:
BLEU: 0.2588, ROUGE-1: 0.5898, ROUGE-2: 0.4098, ROUGE-L: 0.5448, SARI: 35.3320, FKGL: 10.0840

Observations:
- BART-base outperforms T5-small across all metrics, with higher BLEU, ROUGE, and SARI.
- Both models struggle with sample text simplification, producing high FKGL scores.
- T5-small shows minimal changes (e.g., 'ameliorated' → 'improved'), while BART-base often reproduces inputs.

Recommendations:
- Improve dataset quality (e.g., filter noisy pairs in WikiLarge).
- Increase training epochs or adjust learning rate for T5-small.
- Explore hybrid approaches or better pre-training for simplification.
