# Installing dependencies

In [1]:
# we use the latest version of transformers, peft, and accelerate
!pip install -q accelerate peft transformers

# install bitsandbytes for quantization
!pip install -q bitsandbytes

# install trl for the SFT library
!pip install -q trl

# we need sentencepiece for the llama2 slow tokenizer
!pip install sentencepiece

# we need einops, used by falcon-7b, llama-2 etc
# einops (einsteinops) is used to simplify tensorops by making them readable
!pip install -q -U einops

# we need to install datasets for our training dataset
!pip install -q datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m

# Downloading the base model (google/gemma-2b)

In [1]:
# --- Run configuration: every name a reader may want to tune ---------------

# Hugging Face Hub id of the instruction dataset used for fine-tuning
dataset_name = "KonradSzafer/stackoverflow_python_preprocessed"

# Hugging Face Hub id of the base checkpoint that gets fine-tuned
model_name = "google/gemma-2b"

# Directory handed to TrainingArguments for the run's artifacts
output_dir = "./results"

# Name under which the fine-tuned adapter is saved after training.
# NOTE(review): this says "Mistral-7B" while model_name points at gemma-2b -
# confirm which one is intended before sharing the result.
new_model = "Mistral-7B-Stackoverflow"

In [2]:
# Authenticate this session against the Hugging Face Hub through the
# interactive notebook widget, so the access token is typed in at run time
# instead of being stored in the notebook source.
# NOTE(review): presumably required because the checkpoint in `model_name`
# is access-gated on the Hub - confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging,
    pipeline,
)

# Quantization settings: weights are stored in 4-bit NF4, matmuls are computed
# in float16, and the second (double) quantization pass is left off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the base checkpoint named by `model_name`, quantized on the fly.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # place the whole model on GPU 0
    device_map={"": 0},
)

# Turn off the key/value cache on the model config (it is not wanted while
# the model is being fine-tuned).
model.config.use_cache = False

# Load the tokenizer that belongs to the same checkpoint as the model.
# `trust_remote_code` is deliberately not passed: it permits running Python
# code shipped in the Hub repo, and the model above is loaded without it.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Fall back to EOS as the padding token only when the tokenizer defines none.
# Aliasing pad to EOS unconditionally makes padding and end-of-sequence the
# same id, so any step that masks padding out of the loss would also mask the
# EOS the model is supposed to learn to emit.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pad on the right so real tokens keep their positions from index 0.
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Testing it on some prompts 

In [5]:
# Silence every transformers log record below CRITICAL so the generated text
# is not buried under warnings.
logging.set_verbosity(logging.CRITICAL)

# A sample StackOverflow-style question used to probe the base model.
prompt = "I have this Python application that gets stuck from time to time and I can't find out where. Is there any way to signal Python interpreter to show you the exact code that's running? Some kind of on-the-fly stacktrace? Related questions: Print current call stack from a method in Python code Check what a running process is doing: print stack trace of an uninstrumented Python program"

# `max_new_tokens` budgets only the generated continuation. `max_length` would
# count the prompt tokens as well, leaving a long question like the one above
# little or no room for an answer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)

# Wrap the question in the same [INST] ... [/INST] markers the training text uses.
result = pipe(f"[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

[INST] I have this Python application that gets stuck from time to time and I can't find out where. Is there any way to signal Python interpreter to show you the exact code that's running? Some kind of on-the-fly stacktrace? Related questions: Print current call stack from a method in Python code Check what a running process is doing: print stack trace of an uninstrumented Python program [/INST] [/INST] [/INST] [/INST] [/INST]


# Fine tuning the model 

In [6]:
from datasets import Dataset, load_dataset

# Download the training split of the instruction dataset.
dataset = load_dataset(dataset_name, split="train")

# Work in pandas. Rows missing either half of the question/answer pair are
# dropped: string concatenation with a missing value yields NaN, which would
# put non-text rows into the training column. The index is reset so the
# conversion back to a Dataset does not carry a gappy index along.
df = (
    dataset.to_pandas()
    .dropna(subset=["question", "answer"])
    .reset_index(drop=True)
)

# Take the sequence delimiters from the tokenizer instead of hardcoding
# "<s>" / "</s>": those literals are only special tokens for tokenizers that
# define them that way, while `tokenizer` here belongs to `model_name`.
# For tokenizers whose BOS/EOS are "<s>" / "</s>" the text is unchanged.
# NOTE(review): if the tokenizer also prepends BOS on its own during encoding,
# sequences will start with two BOS tokens (as with the hardcoded template) -
# confirm whether the explicit BOS should be dropped.
bos = tokenizer.bos_token or ""
eos = tokenizer.eos_token or ""

# Build the single training column: [INST] question [/INST] (answer)
df["text"] = bos + "[INST] " + df["question"] + " [/INST] (" + df["answer"] + ") " + eos

# The trainer only needs the formatted text; `new_df` is the Dataset it consumes.
new_df = Dataset.from_pandas(df[["text"]])

We used the whole dataset for training since it is very small. Also, fine-tuning an LLM on a programming language is a distinctive training setup: it requires the model to be trained on every piece of code it can be trained on, unlike other training setups where we can split the dataset into training and validation sets. Finally, to validate the model and measure its performance we would use unit tests written in the target programming language; unfortunately, we don't have that for OPL PSION.

In [7]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error while other processes were holding most of GPU 0's memory.

# LoRA adapter definition: rank-64 adapters with alpha 16 and 10% dropout,
# no bias terms trained, applied through PEFT's "all-linear" shorthand.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

# Number of passes over the training set
num_train_epochs = 5

# Trainer hyperparameters, grouped by concern.
training_arguments = TrainingArguments(
    # --- where results go / what gets reported ---
    output_dir=output_dir,
    report_to="tensorboard",
    logging_steps=10,
    # presumably 0 is meant to switch off intermediate checkpoints - TODO confirm
    save_steps=0,
    # --- run length ---
    num_train_epochs=num_train_epochs,
    # -1 leaves the run length to num_train_epochs
    max_steps=-1,
    # --- batching: 1 sample per device, gradients accumulated over 2 steps ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # --- optimizer and schedule ---
    optim="paged_adamw_32bit",
    # NOTE(review): 2e-3 is an order of magnitude above the 2e-4 commonly used
    # for QLoRA-style runs - confirm this is intentional
    learning_rate=2e-3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- mixed precision: both modes off ---
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning over the "text" column of the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=new_df,
    dataset_text_field="text",
    # NOTE(review): sequences are capped at 100 tokens; longer question/answer
    # pairs presumably lose part of the answer - confirm this is acceptable
    max_seq_length=100,
    packing=False,
)

# Run the fine-tuning loop
trainer.train()

# Persist the trained model under the name configured in `new_model`
trainer.model.save_pretrained(new_model)




OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacty of 31.74 GiB of which 742.88 MiB is free. Process 81628 has 11.72 GiB memory in use. Process 95887 has 5.92 GiB memory in use. Process 96430 has 7.96 GiB memory in use. Process 104114 has 5.12 GiB memory in use. Of the allocated memory 4.16 GiB is allocated by PyTorch, and 73.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Testing the fine tuned model 

In [None]:
# Silence every transformers log record below CRITICAL so the generated text
# is not buried under warnings.
logging.set_verbosity(logging.CRITICAL)

# A sample StackOverflow-style question used to probe the fine-tuned model.
prompt = "I have a multi-line string literal that I want to do an operation on each line, like so: inputString = '''Line 1 Line 2 Line 3''' I want to do something like the following: for line in inputString: doStuff()"

# Generate with the model held by the trainer. `max_new_tokens` budgets only
# the generated continuation; `max_length` would count the prompt tokens too,
# so a long question would leave little or no room for an answer.
pipe = pipeline(task="text-generation", model=trainer.model, tokenizer=tokenizer, max_new_tokens=200)

# Wrap the question in the same [INST] ... [/INST] markers the training text uses.
result = pipe(f"[INST] {prompt} [/INST]")
print(result[0]['generated_text'])