In [None]:
# Dependency setup in Google Colab
!pip install numpy
!pip install transformers
!pip install peft
!pip install sklearn
!pip install pandas
!pip install evaluate
!pip install torch

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m:

## Import Packages

In [1]:
import pandas as pd

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [3]:
df = pd.read_csv('./training_data_elements_of_statistical_learning.csv')

In [4]:
df.head()

Unnamed: 0,Question,Answer
0,What is the main purpose of cross-validation?,Cross-validation is used to assess how well a ...
1,How does Lasso Regression differ from Ridge Re...,"Lasso regression applies an L1 penalty, encour..."
2,What is regularization in machine learning?,Regularization is a technique to reduce model ...
3,Explain the bias-variance tradeoff.,The bias-variance tradeoff is the balance betw...
4,What is the purpose of Random Forests?,Random Forests reduce variance by averaging mu...


### Tokenize the data

In [5]:
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenized_examples=[]

for index, row in df.iterrows():
  qa_pair = f"Question: {row['Question']} Answer: {row['Answer']}"

  # Tokenize the text
  tokenized = tokenizer(
      qa_pair,
      max_length=512, # This can be adjusted
      truncation=True,
      return_tensors='pt' # Returns PyTorch tensors for use in training
  )

  # Append tokenized data, where labels are identical to input_ids
  tokenized_examples.append({
      'input_ids': tokenized['input_ids'].squeeze(),
      'attention_mask': tokenized['attention_mask'].squeeze(),
      'labels': tokenized['input_ids'].squeeze()  # The labels here are the expected outputs
  })
tokenized_dataset = tokenized_examples

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Alternate Approach to Tokenization

In [14]:
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the padding token as the EOS token
tokenizer.pad_token = tokenizer.eos_token

class QADataset(Dataset):
    def __init__(self, df):
        self.examples = []
        for index, row in df.iterrows():
            question = f"Question: {row['Question']}"
            answer = f"Answer: {row['Answer']}"
            qa_pair = f"{question} {answer}"

            # Tokenize question-answer pair
            tokenized = tokenizer(
                qa_pair,
                max_length=512,
                truncation=True,
                padding='max_length',  # Pad to max length for uniform batch sizes
                return_tensors='pt'
            )

            # Set labels to ignore question tokens
            labels = tokenized['input_ids'].clone()
            question_length = len(tokenizer(question, truncation=True)['input_ids'])
            labels[:, :question_length] = -100  # Ignore question tokens in the labels

            self.examples.append({
                'input_ids': tokenized['input_ids'].squeeze(),
                'attention_mask': tokenized['attention_mask'].squeeze(),
                'labels': labels.squeeze()
            })

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

# Instantiate the dataset and data loader
qa_dataset = QADataset(df)
# Add pin_memory=True if using GPU, can expirement with num_workers
qa_dataloader = DataLoader(qa_dataset, batch_size=8, shuffle=True)


## Setup Model

In [16]:
from transformers import GPT2LMHeadModel
from peft import get_peft_model, LoraConfig, TaskType

# Load the GPT-2 model for causal language modeling
model = GPT2LMHeadModel.from_pretrained('gpt2')


# LoRA configuration for GPT-2 fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal Language Model
    inference_mode=False,
    r=16,  # Rank of the LoRA matrices (determines complexity of the adaptation)
    lora_alpha=32,  # Scaling factor to control LoRA update magnitude
    lora_dropout=0.1  # Dropout rate during fine-tuning
)

# Wrap GPT-2 model with LoRA for parameter-efficient fine-tuning
model = get_peft_model(model, lora_config)





In [20]:
from transformers import TrainingArguments, Trainer

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory where the model checkpoints will be saved
    num_train_epochs=3,  # Number of training epochs (you can adjust this)
    per_device_train_batch_size=4,  # Batch size per GPU (you can tweak this to fit in GPU memory)
    learning_rate=5e-5,  # Learning rate
    weight_decay=0.01,  # Weight decay to prevent overfitting
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,  # How often to log progress
    save_steps=100,  # Save model every 100 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
    evaluation_strategy='no',
    # eval_steps=100, # For use when a validation dataset is present
    load_best_model_at_end=False  # Load the best model after training, can use this option with validation dataset
)




## Train the Model

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=qa_dataset,
    tokenizer=tokenizer  # The tokenizer we previously loaded
)

# Start training
trainer.train()


  trainer = Trainer(
 13%|█▎        | 10/75 [01:37<10:21,  9.56s/it]

{'loss': 9.4578, 'grad_norm': 4.156976699829102, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}


 27%|██▋       | 20/75 [03:17<09:19, 10.17s/it]

{'loss': 8.9165, 'grad_norm': 5.134925365447998, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


 40%|████      | 30/75 [04:50<07:08,  9.52s/it]

{'loss': 8.2572, 'grad_norm': 5.042084693908691, 'learning_rate': 3e-05, 'epoch': 1.2}


 53%|█████▎    | 40/75 [06:29<05:52, 10.06s/it]

{'loss': 8.1536, 'grad_norm': 6.086359024047852, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}


 67%|██████▋   | 50/75 [07:57<03:18,  7.93s/it]

{'loss': 7.6807, 'grad_norm': 4.993514060974121, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


 80%|████████  | 60/75 [09:28<02:13,  8.91s/it]

{'loss': 7.4703, 'grad_norm': 5.50108528137207, 'learning_rate': 1e-05, 'epoch': 2.4}


 93%|█████████▎| 70/75 [11:01<00:45,  9.17s/it]

{'loss': 7.2218, 'grad_norm': 5.771157741546631, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}


100%|██████████| 75/75 [11:42<00:00,  9.37s/it]

{'train_runtime': 702.6205, 'train_samples_per_second': 0.418, 'train_steps_per_second': 0.107, 'train_loss': 8.09895716349284, 'epoch': 3.0}





TrainOutput(global_step=75, training_loss=8.09895716349284, metrics={'train_runtime': 702.6205, 'train_samples_per_second': 0.418, 'train_steps_per_second': 0.107, 'total_flos': 77352567570432.0, 'train_loss': 8.09895716349284, 'epoch': 3.0})

## Save the Model

In [22]:
# Save model and tokenizer
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")

('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.json',
 './trained_model\\merges.txt',
 './trained_model\\added_tokens.json')

In [1]:
from transformers import pipeline, AutoModelForCausalLM, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./trained_model")
tokenizer = GPT2Tokenizer.from_pretrained("./trained_model")

# Set up the pipeline with the model and tokenizer, pipeline has a lot of hyperparameters that can be adjusted
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


## Generate Answers

In [28]:
question = "What is the purpose of gradient descent in machine learning?"
answer = qa_pipeline(f"Question: {question} Answer:", max_length=200, num_return_sequences=1)[0]['generated_text']

# Print or process the answer
print(answer)

Question: What is the purpose of gradient descent in machine learning? Answer: Machine learning is the process of improving our understanding and solving problems of different types: learning as a process - that leads to new data sets. It is part of the human body: you can read much about it on our website: http://learn.tech.com/learn-learning/

The aim of gradient descent is to improve understanding in our understanding. The principle of gradient descent is to improve to solve various problems and improve the general human understanding of the world, our understanding is to search for knowledge, and our understanding is to know. It is the same principle, because every thing that you say affects our understanding, it's just as important as this one that we explain. If we understand your problem, to try to explain it to solve it, we're not a human. For example, if I tell you how to solve some problem (a problem where you understand the problem - you can come and discuss


In [2]:
question = "What is cross-validation?"
answer = qa_pipeline(f"Question: {question} Answer:", max_length=200, num_return_sequences=1)[0]['generated_text']

print(answer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Question: What is cross-validation? Answer: It's a mechanism to determine whether a user can change the system (i.e., to do something on a system that does not have some system that does). For the purpose of cross-validation, the input should be a system that already has a valid input and uses that system's inputs, or other input.


A user may be able to change the input and create new programs in the past, and their program is validated if they're used by that user instead of a system for their own use. For a typical cross-validation process, the user has to provide the system with input that they would like to change. If, for example, an old system works, then the user can simply change either a new system to produce a program that gets used by another user, or they can either set up a new system or they can change what the previous system was used for and do the current one and create an


In [3]:
question = "Can you explain how Support Vector Machines work?"
answer = qa_pipeline(f"Question: {question} Answer:", max_length=200, num_return_sequences=1)[0]['generated_text']

print(answer)

Question: Can you explain how Support Vector Machines work? Answer: Each Support Vector Machine connects to the next supporting support Vector. Since we are using our support vector machines, we create their own support Vector Matrix.

Example: Using an IDF module.

If you're wondering how we get our Support Vector Machines for the IDFG module we've already described, we have provided a support Vector Matrix to all support Vector Machines.

Supplies on a Support Vector Machine Support Vector Matrix X 1.0 1.4 5.9 13.5 10.9

Supplies on a Support Vector Machine Support Vector Matrix Y 2.0 3.9 14.5 17.4 10.8

Since the support Vector Machines are in use in the program C++, we can't help but notice that we have an IDFG module that has its own support Vector Matrix that does the rest.

Supplies on a Support Vector Machine Support Vector Matrix X 12 1
