In [2]:
!pip install "fsspec<=2023.6.0" --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 req

In [None]:
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GPT2PreTrainedModel,
    GPT2Model,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
from datasets import load_dataset
import numpy as np
import math
import random
import os

# ==================== 1. Initial Setup and Common Functions ====================

def set_seed(seed=42):
    """Set seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# Load dataset and tokenizer
# Using a standard dataset for language modeling
dataset = load_dataset("wikitext", "wikitext-103-v1")
dataset = dataset.filter(lambda example: example['text'].strip() != '')
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Add a padding token to the tokenizer if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Preprocess and tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=False)

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

block_size = 128 # Input sequence length for the model

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_dataset.map(group_texts, batched=True)

train_dataset = lm_dataset["train"]
eval_dataset = lm_dataset["validation"]

# Data Collator for Language Modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


# Define model configuration
# We create the model from scratch and do not use pre-trained weights
model_config = AutoConfig.from_pretrained(
    "distilgpt2",
    vocab_size=len(tokenizer),
    n_ctx=block_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    output_hidden_states=True, # Required to access embeddings
)

# Define base training arguments
base_training_args = {
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 1, # Only one epoch for a quick run
    "weight_decay": 0.01,
    "load_best_model_at_end": True,
    "metric_for_best_model": "loss",
    "greater_is_better": False, # For loss, lower is better
    "seed": 42,
    "data_seed": 42,
}


# ==================== 3. Running the Experiments ====================

results = {}

# --- Mode 1: Standard Training ---
print("="*50)
print("🚀 Starting Mode 1: Standard Training")
print("="*50)
set_seed(42)
# Create model from scratch using the config
standard_model = AutoModelForCausalLM.from_config(model_config)

training_args_standard = TrainingArguments(output_dir="./results/clm_standard", learning_rate=5e-5, **base_training_args)
trainer_standard = Trainer(
    model=standard_model,
    args=training_args_standard,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer_standard.train()
eval_results_standard = trainer_standard.evaluate()
results['standard'] = eval_results_standard



# ==================== 4. Displaying Final Results for Comparison ====================
print("\n\n" + "="*50)
print("🏆 Final Results Comparison on Validation Set 🏆")
print("="*50)

# Calculate Perplexity for each mode
try:
    perplexity_standard = math.exp(results['standard']['eval_loss'])
except OverflowError:
    perplexity_standard = float("inf")




print(f"\n--- Mode 1: Standard Training ---")
print(f"Eval Loss: {results['standard']['eval_loss']:.4f}")
print(f"Perplexity: {perplexity_standard:.4f}")


print("\n" + "="*50)
print("Note: Lower perplexity indicates a better model performance.")

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/1165029 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1059 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/1165029 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🚀 Starting Mode 1: Standard Training


  trainer_standard = Trainer(


Epoch,Training Loss,Validation Loss


In [1]:
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GPT2PreTrainedModel,
    GPT2Model,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
from datasets import load_dataset
import numpy as np
import math
import random
import os

# ==================== 1. Initial Setup and Common Functions ====================

def set_seed(seed=42):
    """Set seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# Load dataset and tokenizer
# Using a standard dataset for language modeling
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset = dataset.filter(lambda example: example['text'].strip() != '')

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Add a padding token to the tokenizer if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Preprocess and tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=False)

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

block_size = 128 # Input sequence length for the model

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_dataset.map(group_texts, batched=True)

train_dataset = lm_dataset["train"]
eval_dataset = lm_dataset["validation"]

# Data Collator for Language Modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


# Define model configuration
# We create the model from scratch and do not use pre-trained weights
model_config = AutoConfig.from_pretrained(
    "distilgpt2",
    vocab_size=len(tokenizer),
    n_ctx=block_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    output_hidden_states=True, # Required to access embeddings
)

# Define base training arguments
base_training_args = {
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 2, # Only one epoch for a quick run
    "weight_decay": 0.01,
    "load_best_model_at_end": True,
    "metric_for_best_model": "loss",
    "greater_is_better": False, # For loss, lower is better
    "seed": 42,
    "data_seed": 42,
}

# ==================== 2. Custom Model Class for the Second Mode ====================

class GPT2WithRegularityLoss(GPT2PreTrainedModel):
    def __init__(self, config, regularity_weight=0.01):
        super().__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.regularity_weight = regularity_weight
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            **kwargs,
        )
        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)

        total_loss = None
        if labels is not None:
            # Calculate the main language modeling loss (Cross-Entropy)
            loss_fct = nn.CrossEntropyLoss()
            lm_loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

            if self.training:
                # Calculate the auxiliary Regularity loss
                # We use embeddings from the first layer (index 1)
                embeddings = transformer_outputs.hidden_states[1]

                # Mask tokens that are ignored in the main loss (usually padding)
                active_embeddings = embeddings[labels != -100]

                if active_embeddings.shape[0] < 2:
                    regularity_loss = torch.tensor(0.0, device=self.device)
                else:
                    direction_vectors = active_embeddings[1:] - active_embeddings[:-1]
                    if direction_vectors.shape[0] < 2:
                        regularity_loss = torch.tensor(0.0, device=self.device)
                    else:
                        # Calculate the mean cosine similarity
                        cos = nn.CosineSimilarity(dim=1)
                        similarities = cos(direction_vectors[:-1], direction_vectors[1:])
                        # We want to maximize similarity, so the loss is 1.0 - similarity
                        regularity_loss = (1.0 - similarities).mean()

                total_loss = lm_loss + self.regularity_weight * regularity_loss
            else:
                total_loss = lm_loss

        return CausalLMOutputWithCrossAttentions(
            loss=total_loss,
            logits=lm_logits,
            hidden_states=transformer_outputs.hidden_states if self.training else None
        )

# ==================== 3. Running the Experiments ====================

results = {}



# --- Mode 2: Training with Regularity Loss ---
print("\n" + "="*50)
print("🚀 Starting Mode 2: Training with Regularity Loss")
print("="*50)
set_seed(42)
# Create custom model from scratch
regularity_model = GPT2WithRegularityLoss(model_config, regularity_weight=0.01)

training_args_regularity = TrainingArguments(output_dir="./results/clm_regularity", learning_rate=5e-5, **base_training_args)
trainer_regularity = Trainer(
    model=regularity_model,
    args=training_args_regularity,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer_regularity.train()
eval_results_regularity = trainer_regularity.evaluate()
results['regularity'] = eval_results_regularity


# ==================== 4. Displaying Final Results for Comparison ====================
print("\n\n" + "="*50)
print("🏆 Final Results Comparison on Validation Set 🏆")
print("="*50)



try:
    perplexity_regularity = math.exp(results['regularity']['eval_loss'])
except OverflowError:
    perplexity_regularity = float("inf")



print(f"\n--- Mode 2: Training with Regularity Loss ---")
print(f"Eval Loss: {results['regularity']['eval_loss']:.4f}")
print(f"Perplexity: {perplexity_regularity:.4f}")

print("\n" + "="*50)
print("Note: Lower perplexity indicates a better model performance.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36718 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]


🚀 Starting Mode 2: Training with Regularity Loss


  trainer_regularity = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mmhranjbar[0m ([33mmhranjbar-postech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.7402,0.468746
2,0.2888,0.282885




🏆 Final Results Comparison on Validation Set 🏆

--- Mode 2: Training with Regularity Loss ---
Eval Loss: 0.2829
Perplexity: 1.3270

Note: Lower perplexity indicates a better model performance.


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2LMHeadModel
import torch.nn as nn
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
import os

# ==================== 1. تعریف مجدد کلاس مدل سفارشی ====================
class GPT2WithRegularityLoss(GPT2LMHeadModel):
    def __init__(self, config, regularity_weight=0.01):
        super().__init__(config)
        self.regularity_weight = regularity_weight

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            **kwargs
        )

        if outputs.loss is not None and self.training:
            lm_loss = outputs.loss
            embeddings = outputs.hidden_states[1]
            batch_regularity_loss = []

            for i in range(embeddings.shape[0]):
                sentence_labels = labels[i]
                sentence_embeddings = embeddings[i]
                active_embeddings = sentence_embeddings[sentence_labels != -100]

                if active_embeddings.shape[0] >= 2:
                    direction_vectors = active_embeddings[1:] - active_embeddings[:-1]
                    if direction_vectors.shape[0] >= 2:
                        cos = nn.CosineSimilarity(dim=1)
                        similarities = cos(direction_vectors[:-1], direction_vectors[1:])
                        regularity_loss = (1.0 - similarities).mean()
                        batch_regularity_loss.append(regularity_loss)

            if batch_regularity_loss:
                final_regularity_loss = torch.stack(batch_regularity_loss).mean()
                outputs.loss = lm_loss + self.regularity_weight * final_regularity_loss

        return outputs

# ==================== 2. بارگیری مدل‌ها و توکنایزر ====================
print("Loading models from checkpoints...")

output_dir_standard = "./results/clm_standard/checkpoint-4670"
output_dir_regularity = "./results/clm_regularity/checkpoint-4670"

# بارگیری توکنایزر - سعی کن از همان مسیر مدل بارگیری کنیم
try:
    tokenizer = AutoTokenizer.from_pretrained(output_dir_standard)
    print("Tokenizer loaded from standard model checkpoint")
except:
    try:
        tokenizer = AutoTokenizer.from_pretrained(output_dir_regularity)
        print("Tokenizer loaded from regularity model checkpoint")
    except:
        tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
        print("Tokenizer loaded from distilgpt2")

# اطمینان از تنظیمات صحیح توکنایزر
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if tokenizer.eos_token is None:
    tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

print(f"Tokenizer info:")
print(f"- pad_token: {tokenizer.pad_token}")
print(f"- eos_token: {tokenizer.eos_token}")
print(f"- pad_token_id: {tokenizer.pad_token_id}")
print(f"- eos_token_id: {tokenizer.eos_token_id}")

model_standard = AutoModelForCausalLM.from_pretrained(output_dir_standard)
model_regularity = GPT2WithRegularityLoss.from_pretrained(output_dir_regularity)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_standard.to(device)
model_regularity.to(device)
print(f"Models loaded successfully on {device}.")

# ==================== 3. تعریف تابع برای تولید متن ====================

def generate_text(model, tokenizer, prompt, max_length=50):
    """Generates text using the input model and prompt."""
    model.eval()  # قرار دادن مدل در حالت ارزیابی

    # ==================== تغییر کلیدی اینجا است ====================
    # به طور موقت output_hidden_states را غیرفعال می‌کنیم تا با متد generate تداخل نداشته باشد
    original_config = getattr(model.config, 'output_hidden_states', False)
    model.config.output_hidden_states = False
    # =============================================================

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
            return_dict_in_generate=True  # این را اضافه می‌کنیم
        )

    # بازگرداندن کانفیگ به حالت اولیه
    model.config.output_hidden_states = original_config

    # ==================== حل مشکل اصلی ====================
    # outputs یک GenerateDecoderOnlyOutput است، نه tensor
    # باید از .sequences استفاده کنیم
    print(f"Debug: outputs type = {type(outputs)}")

    if hasattr(outputs, 'sequences'):
        generated_sequences = outputs.sequences
        print(f"Debug: sequences shape = {generated_sequences.shape}")

        # حالا می‌توانیم به راحتی decode کنیم
        generated_text = tokenizer.decode(generated_sequences[0], skip_special_tokens=True)
        return generated_text.strip()
    else:
        # fallback برای حالت‌های قدیمی
        print("Using fallback method...")
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        return generated_text.strip()


def generate_text_simple(model, tokenizer, prompt, max_length=50):
    """نسخه ساده‌تر تابع تولید متن"""
    model.eval()

    # کاملاً ساده و بدون تغییر config
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # استفاده از batch_decode که معمولاً بهتر کار می‌کند
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return generated_text.strip()


# ==================== 4. تولید و مقایسه خروجی‌ها ====================

prompts = [
    "The best way to learn a new skill is",
    "Once upon a time, in a land far away,",
    "Artificial intelligence is a field that",
    "The future of our planet depends on",
    "My name is"
]

print("\n" + "="*70)
print("🤖 Generating text samples to compare the models...")
print("="*70)

for prompt in prompts:
    print(f"\nPrompt: '{prompt}...'")
    print("-" * 70)

    try:
        output_standard = generate_text(model_standard, tokenizer, prompt)
        print(f"Standard Model Output:\n{output_standard}")
    except Exception as e:
        print(f"Error with standard model: {e}")

    print("-" * 70)

    try:
        output_regularity = generate_text(model_regularity, tokenizer, prompt)
        print(f"Model with Regularity Loss Output:\n{output_regularity}")
    except Exception as e:
        print(f"Error with regularity model: {e}")

    print("="*70)

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading models from checkpoints...
Tokenizer loaded from standard model checkpoint
Tokenizer info:
- pad_token: [PAD]
- eos_token: <|endoftext|>
- pad_token_id: 50257
- eos_token_id: 50256
Models loaded successfully on cuda.

🤖 Generating text samples to compare the models...

Prompt: 'The best way to learn a new skill is...'
----------------------------------------------------------------------
Debug: outputs type = <class 'transformers.generation.utils.GenerateDecoderOnlyOutput'>
Debug: sequences shape = torch.Size([1, 50])
Standard Model Output:
The best way to learn a new skill is a young of its second season . With the second season , a number three weeks of the season and in the chart . The show 's episode was a " One of the second season " , which they
----------------------------------------------------------------------
Debug: outputs type = <class 'transformers.generation.utils.GenerateDecoderOnlyOutput'>
Debug: sequences shape = torch.Size([1, 50])
Model with Regularity Loss