<a href="https://colab.research.google.com/github/Aasthapaudel/-Imagine-cup-cloud-skill-Challenge-/blob/main/sectrial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune Llama 2 in Google Colab
> 🗣️ Large Language Model Course

❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne), based on Younes Belkada's [GitHub Gist](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da). Special thanks to Tolga HOŞGÖR for his solution to empty the VRAM.

This notebook runs on a T4 GPU. (Last update: 01 Aug 2023)


In [5]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [7]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
# from trl import SFTTrainer

In [8]:
# The model that you want to train from the Hugging Face hub
model_name = "Hemg/nepaligpt-llama3-8b"

# The instruction dataset to use
dataset_name = "teksingh/NagarGPT-dataset0"

# Fine-tuned model name
new_model = "NagarGPT"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [9]:
from datasets import Dataset
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

README.md:   0%|          | 0.00/321 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/46.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/352 [00:00<?, ? examples/s]

In [10]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['मानव'])):
        text = f"### Question: {example['मानव'][i]}\n ### Answer: {example['सहायक'][i]}"
        output_texts.append(text)
    return output_texts

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM


In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# EOS_TOKEN = '<|endoftext|>'
# # Example data creation to simulate your dataset
# data = {
#     'chosen': [
#         'मानव: डायनासोरले कस्तो आवाज निकाले?\n\nसहायक: मानव र डायनासोरहरू एकै समयमा बाँच्दैनन्, त्यसैले यो भन्न गाह्रो छ। डायनासोरले के आवाज निकाल्छ भनेर पत्ता लगाउनको लागि उत्तम ठाउँ\n\n'
#     ]
# }
# #dataset = Dataset.from_dict(data)

# # Function to split the 'chosen' field and format according to the template
# formatted_data = []
# for index, row in df.iterrows():

#     question = row['मानव']
#     answer = row['सहायक']
#     formatted_text = f"### प्रश्न:\n{question}\n\n### जवाफ:\n{answer}""" + EOS_TOKEN
#     formatted_data.append({'text': formatted_text})


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


In [None]:
!huggingface-cli login


In [14]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['मानव'])):
        text = f"### Question: {example['मानव'][i]}\n ### Answer: {example['सहायक'][i]}"
        output_texts.append(text)
    return output_texts
x=formatting_prompts_func(dataset)

In [None]:
x

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
import torch
from transformers import BitsAndBytesConfig


In [16]:
# Set compute dtype for 4-bit quantization
bnb_4bit_compute_dtype = "float16"  # Could be "float16" or "bfloat16" if supported
compute_dtype = getattr(torch, bnb_4bit_compute_dtype, torch.float16)  # Default to float16 if not found


In [17]:
use_4bit = True  # Set to True if you want 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type="nf4",  # Could be "nf4" or "fp4"
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True  # Enable nested quantization if needed
)


In [18]:
pip install transformers bitsandbytes torch




In [19]:
pip install --upgrade transformers bitsandbytes


Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.46.2-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━

In [20]:
pip install 'accelerate>=0.26.0'

Collecting accelerate>=0.26.0
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m333.2/333.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.21.0
    Uninstalling accelerate-0.21.0:
      Successfully uninstalled accelerate-0.21.0
Successfully installed accelerate-1.1.1


In [None]:
!pip install -q -U git+https://github.com/huggingface/accelerate.git


In [None]:
pip show accelerate


In [3]:
import torch
print(torch.cuda.is_available())  # Should return True if a GPU is available


False


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Define the model name
model_name = "Hemg/nepaligpt-llama3-8b"

# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,    # Use float16 for better compatibility on supported GPUs
    bnb_4bit_use_double_quant=True           # Improves quantization performance if supported
)

# Load the model and tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
    tokenizer.padding_side = "right"           # Ensure padding on the right side

    # Load the model with quantization and device configuration
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",                     # Auto-detect GPU if available
        trust_remote_code=True,                # Enable for custom code in the model repo
        low_cpu_mem_usage=True                 # Optimize for lower CPU memory usage
    )
    model.config.use_cache = False   # Disable caching for more efficient memory use in fp16/4-bit modes
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


Error loading model or tokenizer: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


In [None]:
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q trl xformers wandb datasets einops gradio sentencepiece bitsandbytes

In [None]:
# # Empty VRAM
# del model
# del pipe
# del trainer
# import gc
# gc.collect()
# gc.collect()

In [None]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
!huggingface-cli login

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)