In [4]:
!pip install numpy
!pip install transformers
!pip install peft
!pip install sklearn
!pip install pandas
!pip install evaluate
!pip install torch

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m:

## Import Packages

In [5]:
import pandas as pd

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

## Load Dataset

In [7]:
# Read the question/answer training pairs from CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/content/data/training_data_elements_of_statistical_learning.csv'
)

In [8]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Question,Answer
0,What is the main purpose of cross-validation?,Cross-validation is used to assess how well a ...
1,How does Lasso Regression differ from Ridge Re...,"Lasso regression applies an L1 penalty, encour..."
2,What is regularization in machine learning?,Regularization is a technique to reduce model ...
3,Explain the bias-variance tradeoff.,The bias-variance tradeoff is the balance betw...
4,What is the purpose of Random Forests?,Random Forests reduce variance by averaging mu...


### Tokenize the data

In [38]:
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 ships without a padding token. Examples below have different lengths, so any
# batch larger than one must be padded; reuse the end-of-sequence token for that.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenized_examples = []

# Iterate over the two text columns directly instead of materialising a Series per row.
for question, answer in zip(df['Question'], df['Answer']):
    qa_pair = f"Question: {question} Answer: {answer}"

    # Tokenize the text
    tokenized = tokenizer(
        qa_pair,
        max_length=512,  # This can be adjusted
        truncation=True,
        return_tensors='pt'  # Returns PyTorch tensors for use in training
    )

    # Drop only the leading batch axis added by `return_tensors='pt'`; a bare
    # `.squeeze()` would also collapse a single-token example to a 0-d tensor.
    input_ids = tokenized['input_ids'].squeeze(0)
    attention_mask = tokenized['attention_mask'].squeeze(0)

    # For causal language modelling the labels equal the inputs. Clone them so the
    # two entries do not share storage.
    tokenized_examples.append({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': input_ids.clone()
    })

tokenized_dataset = tokenized_examples



In [29]:
import torch
from torch.utils.data import Dataset

class QADataset(Dataset):
    """Map-style dataset exposing pre-tokenized examples by integer index."""

    # Keys copied from each stored example into the sample handed to the caller.
    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, tokenized_data):
        """Keep a reference to `tokenized_data`, an indexable collection of dict-like examples."""
        self.data = tokenized_data

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a fresh dict holding only the model-facing fields of example `idx`."""
        example = self.data[idx]
        return {field: example[field] for field in self._FIELDS}

# Wrap the list of tokenized examples so it can be consumed as a PyTorch Dataset.
qa_dataset = QADataset(tokenized_data=tokenized_dataset)



## Setup Model

In [37]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModelForCausalLM, AutoModelForQuestionAnswering

# Load the pretrained GPT-2 checkpoint with a causal language-modelling head.
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path='gpt2')
# Same checkpoint with a question-answering head.
# NOTE(review): `qa_model` is not referenced again in the visible notebook, and its QA head
# is newly initialised rather than loaded from the checkpoint (see the warning printed
# below this cell). Confirm it is needed; otherwise it only costs memory and load time.
qa_model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path='gpt2')


from peft import get_peft_model, LoraConfig, TaskType

# LoRA configuration for GPT-2 fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal Language Model
    inference_mode=False,
    r=16,  # Rank of the LoRA matrices (determines complexity of the adaptation)
    lora_alpha=32,  # Scaling factor to control LoRA update magnitude
    lora_dropout=0.1,  # Dropout rate during fine-tuning
    # Name the adapted layer explicitly instead of relying on PEFT's built-in
    # per-architecture default: GPT-2's fused query/key/value projection.
    target_modules=["c_attn"],
    # GPT-2 implements that projection as `Conv1D`, which stores its weight as
    # (fan_in, fan_out) - the transpose of `nn.Linear` - so LoRA must be told.
    fan_in_fan_out=True
)

# Wrap GPT-2 model with LoRA for parameter-efficient fine-tuning
model = get_peft_model(model, lora_config)



Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from transformers import TrainingArguments, Trainer

# Training arguments
#
# Evaluation is intentionally left at its default (disabled): this notebook never builds
# a held-out split and the Trainer is created without an `eval_dataset`, so requesting
# step-wise evaluation or `load_best_model_at_end=True` makes training error out.
# To re-enable them, first create an evaluation split and pass it to the Trainer.
training_args = TrainingArguments(
    output_dir="./results",  # Directory where the model checkpoints will be saved
    num_train_epochs=3,  # Number of training epochs (you can adjust this)
    per_device_train_batch_size=4,  # Batch size per GPU (you can tweak this to fit in GPU memory)
    learning_rate=5e-5,  # Learning rate
    weight_decay=0.01,  # Weight decay to prevent overfitting
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,  # How often to log progress
    save_steps=100,  # Save model every 100 steps
    save_total_limit=2  # Keep only the last 2 checkpoints
)




## Train the Model

In [None]:
# Language-modelling training needs a `labels` entry; here it simply mirrors `input_ids`.
def preprocess_function(examples):
    """Return a dict with `input_ids` and a `labels` entry that refers to the same object."""
    token_ids = examples['input_ids']
    return dict(input_ids=token_ids, labels=token_ids)

# `qa_dataset` is a torch `Dataset`, which has no `.map()` method (that API belongs to
# Hugging Face `datasets.Dataset`), so calling it raises AttributeError. Every item already
# carries `labels` equal to its `input_ids`, so the dataset can be used for training as-is.
train_dataset = qa_dataset

In [None]:
from transformers import DataCollatorForSeq2Seq

# GPT-2 defines no padding token; reuse end-of-sequence so batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Examples have different lengths and each carries its own `labels`. The Trainer's default
# collator pads only the tokenizer's input fields, so ragged `labels` cannot be batched.
# This collator pads inputs with the tokenizer and pads `labels` with -100, the index the
# loss ignores, so padded positions do not contribute to training.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer  # The tokenizer we previously loaded
)

# Start training
trainer.train()
