
# Parameter-Efficient Finetuning of a Language Model
This notebook demonstrates how to perform parameter-efficient finetuning on a dataset of programming problem descriptions using a pre-trained language model.


In [None]:

# Import necessary libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch



## Data Loading and Preprocessing
We'll start by loading the tab-separated dataset into a DataFrame and previewing its first few rows.


In [None]:

# Read the tab-separated dataset into a DataFrame
data_path = './output.tsv'
data = pd.read_table(data_path)  # read_table uses '\t' as its default separator

# Preview the first five rows as the cell's output
data.head()



## Model Selection
Load a pre-trained language model that we will finetune.


In [None]:

# Load the pre-trained checkpoint and its matching tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Binary classification needs two output logits (one per class).
# With num_labels=1 the Hugging Face sequence-classification head is treated
# as a regression head (MSE loss) instead of a classifier (cross-entropy loss).
# The classification head is newly initialised; only the encoder weights come
# from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)



## Parameter-Efficient Finetuning Setup
We will freeze the entire pre-trained base model and finetune only the classification head, which saves compute, memory, and time.


In [None]:

# Turn off gradient tracking for every parameter of the pre-trained backbone;
# parameters outside `base_model` (the classification head) remain trainable.
backbone = model.base_model
for weight in backbone.parameters():
    weight.requires_grad = False



## Training
Set up the training parameters and start training.


In [None]:

# Columns of `data` holding the input text and the target label.
# NOTE(review): the schema of output.tsv is not visible in this notebook --
# confirm these names against the file before running.
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'
MAX_LENGTH = 128  # tokens per example; longer inputs are truncated, shorter ones padded

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Fail early with a readable message rather than deep inside the Trainer
missing_columns = {TEXT_COLUMN, LABEL_COLUMN} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"Columns {sorted(missing_columns)} not found in data; "
        f"available columns: {list(data.columns)}"
    )

# Convert DataFrame to Hugging Face dataset
dataset = Dataset.from_pandas(data)


def tokenize_batch(batch):
    """Tokenize a batch of rows into fixed-length model inputs.

    Padding every example to MAX_LENGTH lets the Trainer's default collator
    stack examples into tensors without a dedicated padding collator.
    """
    return tokenizer(
        batch[TEXT_COLUMN],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )


# The model consumes token ids, not raw strings: without this step the Trainer
# has no `input_ids` / `attention_mask` to feed to the model.
tokenized_dataset = dataset.map(tokenize_batch, batched=True)

# The Trainer reads targets from a column named 'label' or 'labels'
if LABEL_COLUMN not in ('label', 'labels'):
    tokenized_dataset = tokenized_dataset.rename_column(LABEL_COLUMN, 'labels')

# Create a trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # For demonstration the training data doubles as the evaluation set, so
    # the reported evaluation metrics do not measure generalization.
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()



## Evaluation
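Evaluate the model's performance. Note that the evaluation set used here is the training data itself, so these metrics do not measure how well the model generalizes.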


In [None]:

# Run the trainer's evaluation loop and show the resulting metrics as the cell's output
eval_metrics = trainer.evaluate()
eval_metrics
