# AI Planet LLM Bootcamp



This project was made as part of the final assessment needed to complete the bootcamp requirements.  
The following code is used to tune an LLM model and have it generate code based on prompts.  
Make sure to connect to a GPU before running.

In [1]:
# import necessary modules
!pip install -q -U trl transformers accelerate sentencepiece git+https://github.com/huggingface/peft.git
!pip install -q -U datasets einops scipy wandb bitsandbytes

import torch
torch.cuda.empty_cache()
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
import sentencepiece
import transformers
import pandas as pd
from datasets import load_dataset
import gc
import os
import warnings

warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'{device=}')

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.9.0 which is incompatible.
fitter 1.6.0 requires pandas<3.0.0,>=2.0.3, but you have pandas 2.0.2 which is incompatible.
momepy 0.6.0 requires shapely>=2, but you have shapely 1.8.5.post1 which is incompatible.
pymc3 3.11.5 requires numpy<1.22.2,>=1.15.0, but you have numpy 1.23.5 which is incompatible.
pymc3 3.11.5 requires scipy<1.8.0,>=1.7.3, but you have scipy 1.11.3 which is incompatible.
ydata-profiling 4.3.1 requires scipy<1.11,>=1.4.1, but you have scipy 1.11.3 which is incompatible.[0m[31m
[0mdevice='cuda'


In [2]:
# Get the (subset of) dataset
dataset_name = 'theblackcat102/evol-codealpaca-v1'
dataset_full = load_dataset(dataset_name, split="train")
dataset = dataset_full.select(range(800))
dataset.shape

Downloading readme:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

(800, 2)

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "psmathur/orca_mini_3b"

bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
trust_remote_code=True
)
model.config.use_cache = False

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/21.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Downloading (…)okenizer_config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/534k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/208 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
from peft import LoraConfig, get_peft_model

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
r=lora_r,
bias="none",
task_type="CAUSAL_LM"
)

In [6]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_steps = 50
logging_steps = 10
learning_rate = 1e-3
max_grad_norm = 0.3
max_steps = -1
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
optim=optim,
save_steps=save_steps,
logging_steps=logging_steps,
learning_rate=learning_rate,
fp16=True,
max_grad_norm=max_grad_norm,
max_steps=max_steps,
warmup_ratio=warmup_ratio,
group_by_length=True,
lr_scheduler_type=lr_scheduler_type,
num_train_epochs=1,
)

In [7]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
model=model,
train_dataset=dataset,
peft_config=peft_config,
dataset_text_field="instruction",
max_seq_length=max_seq_length,
tokenizer=tokenizer,
args=training_arguments,
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [8]:
for name, module in trainer.model.named_modules():
  if "norm" in name:
    module = module.to(torch.float32)

In [9]:
# gc.collect()
# torch.cuda.empty_cache()

For below step, keep your wandb API ready

In [10]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.0165
20,1.4858
30,2.5043
40,2.4744
50,2.5772
60,1.7756
70,1.7917
80,2.7495
90,2.288
100,2.3389


TrainOutput(global_step=800, training_loss=2.264474972486496, metrics={'train_runtime': 700.9432, 'train_samples_per_second': 1.141, 'train_steps_per_second': 1.141, 'total_flos': 2629342587379200.0, 'train_loss': 2.264474972486496, 'epoch': 1.0})

In [11]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model # Take care of distributed/parallel training
model_to_save.save_pretrained("outputs")

In [12]:
lora_config = LoraConfig.from_pretrained('outputs')
model = get_peft_model(model, lora_config)
model = model.to(device)

In [13]:
def generate_response(text):
# text = '''### Write me Python code to add two numbers'''
  inputs = tokenizer(text, return_tensors="pt").to(device)
  # print(inputs)
  outputs = model.generate(**inputs, max_new_tokens=1024)
  print('BOT:\n')
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
  print(f'{response}\n')

## Prmopts

Feel free to add your own prompts in the `prompts` list

In [14]:
print('BOT: Welcome! How may I assist you in your coding journey?\n')

prompts = ['Write a Python code that sorts an array of 10 random numbers using quicksort',
           'write C++ code to find the 2nd most frequent character in a string (example: "turbo Boost")',
           'Write JavaScript code to check if a number is a palindrome or not']

for prompt in prompts:
  print(f"USER: {prompt}\n")
  generate_response(prompt)


BOT: Welcome! How may I assist you in your coding journey?

USER: Write a Python code that sorts an array of 10 random numbers using quicksort

BOT:

Write a Python code that sorts an array of 10 random numbers using quicksort algorithm.

```python
import random

def quicksort(arr):
 if len(arr) <= 1:
 return arr
 else:
 pivot = random.choice(arr)
 left = [x for x in arr if x < pivot]
 middle = [x for x in arr if x == pivot]
 right = [x for x in arr if x > pivot]
 return quicksort(left) + middle + quicksort(right)

arr = [5, 2, 8, 3, 1, 6, 4, 7, 9, 10]
sorted_arr = quicksort(arr)
print(sorted_arr)
```

Output:
```
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
```

USER: write C++ code to find the 2nd most frequent character in a string (example: "turbo Boost")

BOT:

write C++ code to find the 2nd most frequent character in a string (example: "turbo Boost")

A: Here's a C++ code to find the second most frequent character in a string:

```c++
#include <iostream>
#include <string>
#include <map>

int 

In [15]:
torch.cuda.empty_cache() # Clear the cache