In [None]:
pip install transformers datasets peft accelerate py7zr

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType

# SamSum dialogues; the "train" and "validation" splits are used further down
dataset = load_dataset("samsum")

# RoBERTa-base with a two-label classification head, and the matching tokenizer
base_checkpoint = "roberta-base"
model = RobertaForSequenceClassification.from_pretrained(base_checkpoint, num_labels=2)
tokenizer = RobertaTokenizer.from_pretrained(base_checkpoint)

# Preprocess the dataset: Tokenizing the data and adding labels
def preprocess_function(examples):
    """Tokenize a batch of dialogues and attach a length-based binary label.

    Args:
        examples: Batch mapping that provides a "dialogue" list of strings.

    Returns:
        The tokenizer output for the batch, extended with a "labels" list in
        which 1 marks a dialogue of more than 100 whitespace-separated words
        and 0 marks every other dialogue.
    """
    dialogues = examples["dialogue"]
    # Every sequence is truncated or padded to exactly 512 tokens
    encoded = tokenizer(dialogues, truncation=True, padding="max_length", max_length=512)
    encoded["labels"] = [int(len(text.split()) > 100) for text in dialogues]
    return encoded

# Tokenize and label every split, one batch at a time
tokenized_datasets = dataset.map(preprocess_function, batched=True)
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["validation"]

# LoRA adapter settings for sequence classification
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,              # rank of the low-rank update matrices
    lora_alpha=32,     # alpha scaling factor
    lora_dropout=0.1,  # dropout applied inside the LoRA branch
    bias="none",       # bias strategy
)

# Wrap the base model so that training goes through the LoRA adapters
lora_model = get_peft_model(model, lora_config)

# Hyper-parameters, evaluation cadence and checkpoint/log bookkeeping
training_options = dict(
    output_dir="./results",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=500,
)
training_args = TrainingArguments(**training_options)

# Trainer over the LoRA-wrapped model
trainer = Trainer(
    args=training_args,
    model=lora_model,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run fine-tuning, then write the result to disk; later cells load it from
# this directory
trainer.train()
trainer.save_model("./finetuned_roberta_samsum_lora")

In [None]:
# Archive the saved model directory. The absolute /content paths assume the
# notebook's working directory is /content (e.g. Colab), since the model was
# saved to the relative path ./finetuned_roberta_samsum_lora — TODO confirm.
!zip -r /content/roberta_ft.zip /content/finetuned_roberta_samsum_lora

In [None]:
from google.colab import files
# Hand the zip archive created in the previous cell to the Colab download
# helper; this cell only works inside Google Colab.
files.download("/content/roberta_ft.zip")

In [None]:
pip install evaluate

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from peft import PeftModel
import evaluate

# RoBERTa tokenizer; created first because preprocess_function reads it
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# SamSum dataset
dataset = load_dataset("samsum")


def preprocess_function(examples):
    """Tokenize a batch of dialogues and label each one by its word count.

    Mirrors the preprocessing used for training: sequences are truncated or
    padded to 512 tokens, and the label is 1 when the dialogue has more than
    100 whitespace-separated words, otherwise 0.
    """
    dialogues = examples["dialogue"]
    encoded = tokenizer(dialogues, truncation=True, padding="max_length", max_length=512)
    encoded["labels"] = [int(len(text.split()) > 100) for text in dialogues]
    return encoded


# Tokenize every split, then keep a handle on the validation split
tokenized_datasets = dataset.map(preprocess_function, batched=True)
validation_dataset = tokenized_datasets["validation"]

# Metric objects shared by evaluate_model
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# Helper function to evaluate a model
def evaluate_model(model, tokenizer, dataset):
    """Predict on `dataset` with a fresh `Trainer` and score the result.

    Args:
        model: Model handed to the `Trainer`.
        tokenizer: Tokenizer handed to the same `Trainer`.
        dataset: Dataset passed to `Trainer.predict`.

    Returns:
        Tuple `(accuracy, f1_score)` holding the result dicts returned by the
        module-level `accuracy_metric` and `f1_metric`; F1 uses weighted
        averaging.
    """
    output = Trainer(model=model, tokenizer=tokenizer).predict(dataset)

    # The predicted class is the arg-max over the last axis of the model output
    predicted = torch.tensor(output.predictions).argmax(dim=-1)
    gold = torch.tensor(output.label_ids)

    scores = {
        "accuracy": accuracy_metric.compute(predictions=predicted, references=gold),
        "f1": f1_metric.compute(predictions=predicted, references=gold, average="weighted"),
    }
    return scores["accuracy"], scores["f1"]

# -----------------------------------------------
# 1. Evaluate the base RoBERTa model
# -----------------------------------------------
print("Evaluating Base RoBERTa Model...")

# Plain roberta-base with a two-label head and no fine-tuning applied.
# NOTE(review): the classification head is presumably freshly initialised for
# this checkpoint, so these scores are an untrained baseline — confirm.
base_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Score it on the validation split
base_accuracy, base_f1 = evaluate_model(
    model=base_model,
    tokenizer=tokenizer,
    dataset=validation_dataset,
)

print(f"Base Model Accuracy: {base_accuracy['accuracy']:.4f}")
print(f"Base Model F1 Score: {base_f1['f1']:.4f}")

# -----------------------------------------------
# 2. Evaluate the Fine-Tuned RoBERTa model
# -----------------------------------------------
print("\nEvaluating Fine-Tuned RoBERTa Model...")

# The directory below was written by Trainer.save_model on a PEFT-wrapped
# model, so it is presumably an adapter directory rather than a full RoBERTa
# checkpoint. Load the backbone from "roberta-base" with the same num_labels
# as in training, then attach the saved adapter exactly once. Pointing
# RobertaForSequenceClassification.from_pretrained at the adapter directory
# and wrapping the result again is version-dependent and can apply the
# adapter twice.
fine_tuned_backbone = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
fine_tuned_model = PeftModel.from_pretrained(fine_tuned_backbone, "./finetuned_roberta_samsum_lora")

# Evaluate the fine-tuned model on the validation split
fine_tuned_accuracy, fine_tuned_f1 = evaluate_model(fine_tuned_model, tokenizer, validation_dataset)

print(f"Fine-Tuned Model Accuracy: {fine_tuned_accuracy['accuracy']:.4f}")
print(f"Fine-Tuned Model F1 Score: {fine_tuned_f1['f1']:.4f}")

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer
from peft import PeftModel
import evaluate

# Load the fine-tuned LoRA model; it is not pruned yet, pruning happens below
# (replace the adapter path with your own if needed).
# The saved directory comes from a PEFT-wrapped model, so the backbone is
# loaded from "roberta-base" explicitly and the adapter is attached once,
# instead of treating the adapter directory as a full checkpoint.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = PeftModel.from_pretrained(model, "./finetuned_roberta_samsum_lora")

# Function to prune LoRA weights (as defined before)
def prune_lora_weights(model, percentage=0.5, generator=None):
    """Zero a random fraction of every LoRA linear layer's weights, in place.

    A layer is pruned when it is a `torch.nn.Linear` whose qualified module
    name contains "lora". Biases and all other modules are left untouched.

    Args:
        model: Module whose `named_modules()` are scanned.
        percentage: Fraction in [0, 1] of each matching weight matrix to zero.
        generator: Optional CPU `torch.Generator` used to pick the positions,
            for reproducible pruning. The global RNG is used when omitted.

    Raises:
        ValueError: If `percentage` lies outside [0, 1]. Without this check a
            negative value would silently prune the wrong number of weights.
    """
    if not 0.0 <= percentage <= 1.0:
        raise ValueError(f"percentage must be in [0, 1], got {percentage}")

    for name, module in model.named_modules():
        if not (isinstance(module, torch.nn.Linear) and "lora" in name):
            continue
        weight = module.weight.data
        num_weights_to_prune = int(weight.numel() * percentage)
        # Draw the permutation on CPU so a CPU generator can be supplied, then
        # move the chosen flat indices to wherever the weight lives.
        indices = torch.randperm(weight.numel(), generator=generator)[:num_weights_to_prune]
        weight.view(-1)[indices.to(weight.device)] = 0

# Prune the model weights (e.g., 50% sparsity).
# The positions are drawn from torch's global RNG, so seed it first; otherwise
# the pruned model, and the metrics printed for it, change from run to run.
torch.manual_seed(0)
prune_lora_weights(model, percentage=0.5)

# Tokenizer used by preprocess_function below
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Samsum dataset
dataset = load_dataset("samsum")


def preprocess_function(examples):
    """Tokenize a batch of dialogues and label each one by its word count.

    Sequences are truncated or padded to 512 tokens. The label is 1 for a
    dialogue of more than 100 whitespace-separated words and 0 otherwise.
    """
    dialogues = examples["dialogue"]
    encoded = tokenizer(dialogues, truncation=True, padding="max_length", max_length=512)
    encoded["labels"] = [int(len(text.split()) > 100) for text in dialogues]
    return encoded


# Tokenize all splits and pick out the validation split
tokenized_datasets = dataset.map(preprocess_function, batched=True)
validation_dataset = tokenized_datasets["validation"]

# Metric objects read by evaluate_model
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

# Helper function to evaluate a model (same as before)
def evaluate_model(model, tokenizer, dataset):
    """Score `model` on `dataset` with accuracy and weighted F1.

    Args:
        model: Model wrapped in a new `Trainer` for prediction.
        tokenizer: Tokenizer passed to that `Trainer`.
        dataset: Dataset given to `Trainer.predict`.

    Returns:
        Tuple `(accuracy, f1_score)` of the dicts returned by the module-level
        `accuracy_metric` and `f1_metric`.
    """
    runner = Trainer(tokenizer=tokenizer, model=model)
    result = runner.predict(dataset)

    references = torch.tensor(result.label_ids)
    # Highest-scoring class along the last axis
    predicted_ids = torch.argmax(torch.tensor(result.predictions), dim=-1)

    metric_inputs = dict(predictions=predicted_ids, references=references)
    accuracy = accuracy_metric.compute(**metric_inputs)
    f1_score = f1_metric.compute(average="weighted", **metric_inputs)
    return accuracy, f1_score

# Score the pruned model on the validation split and report both metrics
pruned_accuracy, pruned_f1 = evaluate_model(
    model=model,
    tokenizer=tokenizer,
    dataset=validation_dataset,
)

print(f"Pruned Model Accuracy: {pruned_accuracy['accuracy']:.4f}")
print(f"Pruned Model F1 Score: {pruned_f1['f1']:.4f}")

In [None]:
import torch
import copy
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer
from peft import PeftModel
import evaluate
import matplotlib.pyplot as plt

# Load the fine-tuned model with LoRA.
# The saved directory was written from a PEFT-wrapped model, so the backbone
# is loaded from "roberta-base" explicitly and the adapter is attached once,
# instead of treating the adapter directory as a full checkpoint.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = PeftModel.from_pretrained(model, "./finetuned_roberta_samsum_lora")

# Function to prune LoRA weights randomly
def prune_lora_weights_randomly(model, percentage=0.5, generator=None):
    """Zero a random fraction of every LoRA linear layer's weights, in place.

    A layer is pruned when it is a `torch.nn.Linear` whose qualified module
    name contains "lora". Biases and all other modules are left untouched.

    Args:
        model: Module whose `named_modules()` are scanned.
        percentage: Fraction in [0, 1] of each matching weight matrix to zero.
        generator: Optional CPU `torch.Generator` used to pick the positions,
            for reproducible pruning. The global RNG is used when omitted.

    Raises:
        ValueError: If `percentage` lies outside [0, 1]. Without this check a
            negative value would silently prune the wrong number of weights.
    """
    if not 0.0 <= percentage <= 1.0:
        raise ValueError(f"percentage must be in [0, 1], got {percentage}")

    for name, module in model.named_modules():
        if not (isinstance(module, torch.nn.Linear) and "lora" in name):
            continue
        weight = module.weight.data
        num_weights_to_prune = int(weight.numel() * percentage)
        # Permutation is drawn on CPU (so a CPU generator works) and the
        # selected flat indices are moved to the weight's device.
        indices = torch.randperm(weight.numel(), generator=generator)[:num_weights_to_prune]
        weight.view(-1)[indices.to(weight.device)] = 0

# Tokenizer read by preprocess_function
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Samsum dataset
dataset = load_dataset("samsum")


def preprocess_function(examples):
    """Tokenize a batch of dialogues and label each one by its word count.

    Sequences are truncated or padded to 512 tokens. Dialogues with more than
    100 whitespace-separated words get label 1, all others label 0.
    """
    dialogues = examples["dialogue"]
    encoded = tokenizer(dialogues, truncation=True, padding="max_length", max_length=512)
    encoded["labels"] = [int(len(text.split()) > 100) for text in dialogues]
    return encoded


# Tokenize all splits; only the validation split is evaluated
tokenized_datasets = dataset.map(preprocess_function, batched=True)
validation_dataset = tokenized_datasets["validation"]

# Metric objects read by evaluate_model
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

# Helper function to evaluate a model
def evaluate_model(model, tokenizer, dataset):
    """Return accuracy and weighted-F1 results for `model` on `dataset`.

    A new `Trainer` is built around `model` and `tokenizer` on every call and
    used only for prediction.

    Returns:
        Tuple `(accuracy, f1_score)` of the dicts produced by the module-level
        `accuracy_metric` and `f1_metric`.
    """
    prediction_output = Trainer(model=model, tokenizer=tokenizer).predict(dataset)

    # Class ids: arg-max along the last axis of the predictions
    class_ids = torch.tensor(prediction_output.predictions).argmax(dim=-1)
    true_ids = torch.tensor(prediction_output.label_ids)

    return (
        accuracy_metric.compute(predictions=class_ids, references=true_ids),
        f1_metric.compute(predictions=class_ids, references=true_ids, average="weighted"),
    )

# Store results for plotting
accuracies = []
f1_scores = []

# Pruning percentages to test
pruning_percentages = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for percentage in pruning_percentages:
    # Work on a copy so every sparsity level starts from the unpruned
    # fine-tuned weights
    pruned_model = copy.deepcopy(model)

    # The pruning positions come from torch's global RNG. Seed it before each
    # pass so the sweep gives the same curve on every run.
    torch.manual_seed(0)
    prune_lora_weights_randomly(pruned_model, percentage)

    # Evaluate the pruned model
    pruned_accuracy, pruned_f1 = evaluate_model(pruned_model, tokenizer, validation_dataset)

    print(f"Pruned Model ({percentage*100:.0f}% sparsity) - Accuracy: {pruned_accuracy['accuracy']:.4f}, F1: {pruned_f1['f1']:.4f}")

    accuracies.append(pruned_accuracy['accuracy'])
    f1_scores.append(pruned_f1['f1'])

# Plot both metrics against the pruned fraction
plt.figure(figsize=(10, 5))
plt.plot(pruning_percentages, accuracies, marker='o', label="Accuracy")
plt.plot(pruning_percentages, f1_scores, marker='x', label="F1 Score")
plt.xlabel("Pruning Percentage")
plt.ylabel("Performance")
plt.title("Model Performance vs. Pruning Percentage")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import torch
import copy
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer
from peft import PeftModel
import evaluate
import matplotlib.pyplot as plt

# Load the fine-tuned model with LoRA.
# The saved directory was written from a PEFT-wrapped model, so the backbone
# is loaded from "roberta-base" explicitly and the adapter is attached once,
# instead of treating the adapter directory as a full checkpoint.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = PeftModel.from_pretrained(model, "./finetuned_roberta_samsum_lora")

# Calculate the number of fine-tuning parameters. Only parameters whose name
# contains 'lora' are counted.
total_lora_params = sum(p.numel() for n, p in model.named_parameters() if 'lora' in n)
print(f"Total fine-tuning parameters: {total_lora_params}")

# Function to prune LoRA weights by magnitude (smallest absolute values first).
# The name still says "randomly" so existing calls keep working, but nothing
# here is random.
def prune_lora_weights_randomly(model, percentage=0.5):
    """Zero the smallest-magnitude fraction of each LoRA linear layer, in place.

    A layer is pruned when it is a `torch.nn.Linear` whose qualified module
    name contains "lora". Within each such layer, the `percentage` share of
    weights with the smallest absolute value is set to zero. Biases and all
    other modules are left untouched.

    Args:
        model: Module whose `named_modules()` are scanned.
        percentage: Fraction in [0, 1] of each matching weight matrix to zero.

    Raises:
        ValueError: If `percentage` lies outside [0, 1].
    """
    if not 0.0 <= percentage <= 1.0:
        raise ValueError(f"percentage must be in [0, 1], got {percentage}")

    for name, module in model.named_modules():
        if not (isinstance(module, torch.nn.Linear) and "lora" in name):
            continue
        # Flat view onto the weight storage: writes go straight to the layer
        flat = module.weight.data.view(-1)
        num_weights_to_prune = int(flat.numel() * percentage)
        # largest=False selects the entries with the smallest magnitude
        _, indices = torch.topk(flat.abs(), num_weights_to_prune, largest=False)
        flat[indices] = 0

# Tokenizer read by preprocess_function
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Samsum dataset
dataset = load_dataset("samsum")


def preprocess_function(examples):
    """Tokenize a batch of dialogues and label each one by its word count.

    Sequences are truncated or padded to 512 tokens. The label is 1 when a
    dialogue has more than 100 whitespace-separated words, else 0.
    """
    dialogues = examples["dialogue"]
    encoded = tokenizer(dialogues, truncation=True, padding="max_length", max_length=512)
    encoded["labels"] = [int(len(text.split()) > 100) for text in dialogues]
    return encoded


# Tokenize all splits; only the validation split is evaluated
tokenized_datasets = dataset.map(preprocess_function, batched=True)
validation_dataset = tokenized_datasets["validation"]

# Metric objects read by evaluate_model
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

# Helper function to evaluate a model
def evaluate_model(model, tokenizer, dataset):
    """Predict on `dataset` and compute accuracy plus weighted F1.

    Args:
        model: Model wrapped in a new `Trainer`.
        tokenizer: Tokenizer passed to that `Trainer`.
        dataset: Dataset given to `Trainer.predict`.

    Returns:
        Tuple `(accuracy, f1_score)` of the dicts returned by the module-level
        `accuracy_metric` and `f1_metric`.
    """
    evaluator = Trainer(model=model, tokenizer=tokenizer)
    outcome = evaluator.predict(dataset)

    # Reduce the predictions to class ids along the last axis
    guessed = torch.tensor(outcome.predictions).argmax(dim=-1)
    expected = torch.tensor(outcome.label_ids)

    shared = {"predictions": guessed, "references": expected}
    accuracy = accuracy_metric.compute(**shared)
    f1_score = f1_metric.compute(**shared, average="weighted")
    return accuracy, f1_score

# Sparsity levels to sweep over
pruning_percentages = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# One entry per sparsity level, filled in by the loop below
accuracies = []
f1_scores = []

for percentage in pruning_percentages:
    # Prune a fresh deep copy so the loaded model itself is never modified
    pruned_model = copy.deepcopy(model)
    prune_lora_weights_randomly(pruned_model, percentage)

    # Score the pruned copy on the validation split
    pruned_accuracy, pruned_f1 = evaluate_model(pruned_model, tokenizer, validation_dataset)

    print(f"Pruned Model ({percentage*100:.0f}% sparsity) - Accuracy: {pruned_accuracy['accuracy']:.4f}, F1: {pruned_f1['f1']:.4f}")

    accuracies.append(pruned_accuracy['accuracy'])
    f1_scores.append(pruned_f1['f1'])

# Draw both metric curves against the pruned fraction on one set of axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(pruning_percentages, accuracies, marker='o', label="Accuracy")
ax.plot(pruning_percentages, f1_scores, marker='x', label="F1 Score")
ax.set_xlabel("Pruning Percentage")
ax.set_ylabel("Performance")
ax.set_title("Model Performance vs. Pruning Percentage")
ax.legend()
ax.grid(True)
plt.show()