In [2]:
!pip install numpy
!pip install transformers
!pip install peft
!pip install scikit-learn
!pip install pandas
!pip install evaluate
!pip install torch
!pip install huggingface_hub
!pip install bitsandbytes
!pip install trl

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [3]:
# Authenticate with the Hugging Face Hub through the interactive login widget.
# NOTE(review): presumably needed because the Llama 3.1 checkpoint is gated and the
# final cells push to a private Hub repo — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Import Packages

In [1]:
import pandas as pd

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

## Load Dataset

In [2]:
# Load the question/answer spreadsheet into a DataFrame.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# no mount cell is visible in this notebook — confirm.
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/data/LLM Dataset - v1.xlsx')

In [3]:
# Show the first row's second column (the answer text) with purely positional indexing.
# The previous `df.iloc[0][1]` looked up the integer key 1 on a label-indexed Series,
# which pandas treats as a deprecated positional fallback and warns about.
print(df.iloc[0, 1])

The main types of statistical learning are supervised learning, where an outcome variable is predicted, and unsupervised learning, which seeks to identify patterns or structures in data without an outcome variable.


  print(df.iloc[0][1])


### Loading the tokenizer and the model

In [None]:
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoModelForCausalLM
)


# The 8B model is large, so it is loaded in 4-bit NF4 precision with bfloat16 compute.
# Set load_in_4bit to false if no CUDA enabled GPU
BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Fast tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)

# Quantized causal-LM weights; device placement is left to `device_map="auto"`
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

### Add Padding Token

Llama 3 tokenizers do not have a padding token by default, so to train the model in batches we need to configure one ourselves. Doing so has also been shown to give better results, even when training with a batch size of one sample.

In [None]:
PAD_TOKEN = "<|pad|>"

# register the new pad token on the tokenizer and pad on the right side of each sequence
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"

# we added a new padding token to the tokenizer, so we have to extend the embeddings
# (the embedding size is rounded up to a multiple of 8)
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

print(tokenizer.pad_token, tokenizer.pad_token_id)
# output: <|pad|> 128256

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


<|pad|> 128256


### Format Training Examples

In [None]:
from textwrap import dedent

def format_example(row: dict):
    """Render one dataset row as a full chat transcript for supervised fine-tuning.

    Args:
        row: Mapping (e.g. a DataFrame row) with a 'Question' and an 'Answer' entry.

    Returns:
        The result of the global ``tokenizer``'s ``apply_chat_template`` called with
        ``tokenize=False`` on a system / user / assistant conversation.
    """
    # the system prompt is very important to adjust/control the behavior of the model,
    # make sure to use it properly according to your task
    system_message = {
        "role": "system",
        "content": "You're a domain expert in machine learning and artificial intelligence, answer any questions about these topics as accurately as possible",
    }

    # The question sits on its own line; dedent removes the template's indentation.
    question_block = dedent(
        f"""
        {row['Question']}
        """
    )
    user_message = {"role": "user", "content": question_block}

    # The reference answer is included so the transcript can serve as a training target.
    assistant_message = {"role": "assistant", "content": row['Answer']}

    conversation = [system_message, user_message, assistant_message]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

# format the training examples into a new text column
# (row-wise apply; this adds the 'text' column to df in place)
df['text'] = df.apply(format_example, axis=1)

In [None]:
# Display the formatted transcript of the row labelled 0 (single label-based lookup)
df.loc[0, 'text']

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou're a domain expert in machine learning and artificial intelligence, answer any questions about these topics as accurately as possible<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are the main types of statistical learning?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe main types of statistical learning are supervised learning, where an outcome variable is predicted, and unsupervised learning, which seeks to identify patterns or structures in data without an outcome variable.<|eot_id|>"

### Prepare Training and Eval Datasets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows, then split that held-out part again:
# 15% of it becomes the test set and the remainder becomes the validation set.
train, temp = train_test_split(df, random_state=24, test_size=0.15)
val, test = train_test_split(temp, random_state=24, test_size=0.15)

# save training-ready data as JSON Lines (one record per line)
for json_path, split_frame in (("train.json", train), ("val.json", val), ("test.json", test)):
    split_frame.to_json(json_path, orient='records', lines=True)

### Create HF Datasets

In [None]:
from datasets import load_dataset

# Read the three JSON Lines files written above into named dataset splits
dataset = load_dataset(
    "json",
    data_files={'train': 'train.json', 'validation': 'val.json', 'test': 'test.json'}
)

# print a training example
print(dataset['train'][0]['text'])

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You're a domain expert in machine learning and artificial intelligence, answer any questions about these topics as accurately as possible<|eot_id|><|start_header_id|>user<|end_header_id|>

What is a test set?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

A test set is a subset of data used to evaluate a model's performance, representing unseen data to assess generalization.<|eot_id|>


In [None]:
from trl import DataCollatorForCompletionOnlyLM

# To train/evaluate only on the text the model should generate, the prompt tokens that
# were already given as input must be masked out of the loss. The end-of-header token is
# used as the marker after which the response starts.
# NOTE(review): "<|end_header_id|>" also follows the system and user headers in every
# formatted example; confirm the collator keys on the final (assistant) occurrence.
response_template = "<|end_header_id|>"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

## Setup Model

In [None]:
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

# LoRA configuration for Llama-3.1 fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal Language Model
    inference_mode=False,
    # adapters are attached to every attention and MLP projection
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj"
    ],
    r=32,  # Rank of the LoRA matrices (determines complexity of the adaptation)
    lora_alpha=16,  # Scaling factor to control LoRA update magnitude
    lora_dropout=0.05  # Dropout rate during fine-tuning
)

# Prepare the quantized base model for k-bit training before the adapters are attached
model = prepare_model_for_kbit_training(model)
# Wrap the Llama 3.1 model with LoRA for parameter-efficient fine-tuning
model = get_peft_model(model, lora_config)

# print_trainable_parameters() prints the summary itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
model.print_trainable_parameters()

trainable params: 83,886,080 || all params: 8,114,212,864 || trainable%: 1.0338
None


In [None]:
from trl import SFTConfig, SFTTrainer

# Checkpoints are written to this directory
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/experiments"

sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    dataset_text_field='text',  # this is the final text example we formatted
    max_seq_length=4096,
    num_train_epochs=1,
    per_device_train_batch_size=2,  # training batch size
    per_device_eval_batch_size=2,  # eval batch size
    gradient_accumulation_steps=4,  # gradients are accumulated over 4 batches before each weight update, so the effective batch size is batch_size * gradient_accum_steps = 2 * 4 = 8 examples
    optim="paged_adamw_8bit",  # paged adamw
    eval_strategy='steps',
    eval_steps=0.2,  # evaluate every 20% of the training steps
    save_steps=0.2,  # save every 20% of the training steps
    logging_steps=10,
    learning_rate=1e-4,
    fp16=True,  # also try bf16=True. NOTE(review): the 4-bit compute dtype configured at model load is bfloat16 — confirm fp16 mixed precision is intended
    save_strategy='steps',
    warmup_ratio=0.1,  # learning rate warmup
    save_total_limit=2,
    lr_scheduler_type="cosine",  # scheduler
    save_safetensors=True,  # saving to safetensors
    dataset_kwargs={
        "add_special_tokens": False,  # we template with special tokens already
        "append_concat_token": False,  # no need to add additional sep token
    },
    report_to='none',
    seed=24
)

# The trainer combines the LoRA-wrapped model, the train/validation splits and the
# completion-only collator defined earlier in the notebook
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    tokenizer=tokenizer,
    data_collator=collator,
)

Map:   0%|          | 0/385 [00:00<?, ? examples/s]

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


## Train the Model

In [None]:
# Start training
# Uses the trainer built in the previous cell; the returned TrainOutput is displayed as the cell result
trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
10,1.7093,1.184671
20,1.0084,1.013584
30,0.9172,0.945958
40,0.9038,0.933743


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=48, training_loss=1.0898939967155457, metrics={'train_runtime': 809.7577, 'train_samples_per_second': 0.475, 'train_steps_per_second': 0.059, 'total_flos': 1632360241152000.0, 'train_loss': 1.0898939967155457, 'epoch': 0.9948186528497409})

## Save The Model

In [None]:
# Save model and tokenizer side by side in one directory so both can be reloaded from it
FINAL_MODEL_DIR = "/content/drive/MyDrive/Colab Notebooks/final_model/trained_model"

trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)



('/content/drive/MyDrive/Colab Notebooks/final_model/trained_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/final_model/trained_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/final_model/trained_model/tokenizer.json')

In [3]:
from peft import PeftModel

# Directory that holds the saved adapter and the resized tokenizer
NEW_MODEL="/content/drive/MyDrive/Colab Notebooks/final_model/trained_model"

# load trained/resized tokenizer
tokenizer = AutoTokenizer.from_pretrained(NEW_MODEL)

# here we are loading the raw model, if you can't load it on your GPU, you can just change device_map to cpu
# we won't need gpu here anyway
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    torch_dtype=torch.float16,
    device_map='cpu',
)

# resize the embeddings the same way as before training (tokenizer length, padded to a
# multiple of 8) before the adapter is attached
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
# attach the adapter stored in NEW_MODEL, then merge it into the base weights so that a
# plain (non-PEFT) model remains
model = PeftModel.from_pretrained(model, NEW_MODEL)
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
# Push Model and Tokenizer to HF (both go to the same private repository)
username = "dtbingh24"
repo_name = "fine_tuning_ml_expert"
repo_id = f"{username}/{repo_name}"

model.push_to_hub(repo_id, tokenizer=tokenizer, max_shard_size="5GB", private=True)
tokenizer.push_to_hub(repo_id, private=True)

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dtbingh24/fine_tuning_ml_expert/commit/b3d1533209465b457bfa6f979147f641a3344f9a', commit_message='Upload tokenizer', commit_description='', oid='b3d1533209465b457bfa6f979147f641a3344f9a', pr_url=None, pr_revision=None, pr_num=None)

## Test the Model

#### Load With Quantization

In [1]:
from textwrap import dedent
import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)

MODEL_NAME = "dtbingh24/fine_tuning_ml_expert"

# 4-bit NF4 quantization with bfloat16 compute, matching the settings used when the
# base model was loaded for training
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_compute_dtype=torch.bfloat16
)

# load trained model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# 4-bit loading needs a CUDA-enabled GPU (as the training cell already notes), so the
# weights must not be pinned to the CPU; "auto" places them on the available GPU.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto"
)

# Text-generation pipeline that returns only the newly generated text (at most 128 new
# tokens), not the prompt
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    return_full_text=False
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:  69%|######9   | 3.44G/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

## Try Loading Model Without Quantization

In [9]:
from textwrap import dedent
import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)

MODEL_NAME = "dtbingh24/fine_tuning_ml_expert"

# load trained model: tokenizer first, then the merged weights in half precision
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Generation pipeline: at most 128 new tokens, and only the continuation is returned
generation_options = dict(max_new_tokens=128, return_full_text=False)
pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, **generation_options)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [10]:
def create_test_prompt(row):
    """Build the inference prompt for one dataset row.

    Args:
        row: Mapping (e.g. a DataFrame row) with a 'Question' entry.

    Returns:
        The result of the global ``tokenizer``'s ``apply_chat_template`` called with
        ``tokenize=False`` and ``add_generation_prompt=True`` on a system / user
        conversation (no assistant turn, so the model is asked to produce it).
    """
    # the system prompt is very important to adjust and control the behavior of the
    # model, make sure to use it properly according to your task
    system_message = {
        "role": "system",
        "content": "You're a domain expert in machine learning and artificial intelligence, answer any questions about these topics as accurately as possible",
    }

    # The question sits on its own line; dedent removes the template's indentation.
    question_block = dedent(
        f"""
        {row['Question']}
        """
    )
    user_message = {"role": "user", "content": question_block}

    return tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

# Generate and print the model's answer to the first dataset question.
# NOTE(review): `df` is not loaded in this cell or in the model-loading cell above; it
# relies on the "Load Dataset" cell having been run in the same kernel — confirm.
prompt = create_test_prompt(df.iloc[0,:])
result = pipe(prompt)[0]['generated_text']
print(result)

There are three main types of statistical learning: supervised, unsupervised, and semi-supervised.


In [11]:
# Same check for the second dataset question (row position 1)
prompt = create_test_prompt(df.iloc[1,:])
result = pipe(prompt)[0]['generated_text']
print(result)

The least squares method in linear regression minimizes the sum of the squared differences between observed and predicted values to determine the best fit line.


In [12]:
# Same check for the third dataset question (row position 2)
prompt = create_test_prompt(df.iloc[2,:])
result = pipe(prompt)[0]['generated_text']
print(result)

The Gauss-Markov theorem states that if the errors in a linear regression model are uncorrelated and have constant variance, then the ordinary least squares (OLS) estimator is the best linear unbiased estimator (BLUE).


In [13]:
# Same check for the fourth dataset question (row position 3)
prompt = create_test_prompt(df.iloc[3,:])
result = pipe(prompt)[0]['generated_text']
print(result)

Subset selection methods, such as stepwise regression, select features based on their contribution to the model's performance.
