In [1]:
from IPython.display import HTML, display

def set_css(info=None):
    """Emit a CSS rule that makes `<pre>` blocks wrap long lines.

    Intended to be registered as an IPython ``pre_run_cell`` callback so the
    style is displayed again each time a cell is about to run.

    Args:
        info: Execution-info object that IPython hands to ``pre_run_cell``
            callbacks. It is not used; it is accepted (with a default) so the
            function works both as a callback and when called as ``set_css()``.
    """
    display(HTML('''
    <style>
        pre {
            white-space: pre-wrap;
        }
    </style>
    '''))

# Have IPython call set_css before each cell is executed.
# NOTE(review): re-running this cell redefines set_css, so a second copy may get registered — confirm.
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
%%capture
#@title Install software dependencies
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# The %%capture magic on the first line hides the installers' console output.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs only the packages listed here, without their dependencies;
# xformers and peft are held below the stated versions.
# NOTE(review): Unsloth comes from the unpinned git HEAD and trl / accelerate / bitsandbytes
# carry no version constraint, so a fresh run may resolve different releases — consider pinning.
!pip install --no-deps "xformers<0.0.26" trl "peft<0.11.0" accelerate bitsandbytes

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Tokenizer of the base model, loaded ahead of the model itself: it supplies the EOS token
# and is used to count tokens in the training data.
tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-bnb-4bit")

# CSV file read by load_data_from_csv; it must provide 'Question' and 'Answer' columns.
TRAINING_DATA_PATH = 'cosmic_fusion_dynamics_data.csv'
# End-of-sequence marker that combine_texts appends to every training record.
EOS_TOKEN = tokenizer.eos_token

def combine_texts(question, answer):
    """Build one training record from a question/answer pair.

    The record text is ``###`` + question + ``@@@`` + answer, terminated by the
    module-level ``EOS_TOKEN``.

    Args:
        question: Question text placed after the ``###`` marker.
        answer: Answer text placed after the ``@@@`` delimiter.

    Returns:
        A dict with a single ``"text"`` key holding the formatted record.
    """
    record = f"###{question}@@@{answer}{EOS_TOKEN}"
    return {"text": record}

def load_data_from_csv(file_path):
    """Load question/answer pairs from a CSV file.

    Every cell is read as text, so numeric-looking values and literals such as
    "N/A" reach the tokenizer as the strings written in the file. Rows whose
    question or answer is blank are skipped (with a warning) because they
    cannot form a usable training record.

    Args:
        file_path: Path of a CSV file with 'Question' and 'Answer' columns.

    Returns:
        A ``(questions, answers)`` tuple of two equally long lists of strings.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        pd.errors.EmptyDataError: If the file has no content to parse.
        KeyError: If the 'Question' or 'Answer' column is missing.
    """
    import warnings

    try:
        # dtype=str keeps values exactly as written; keep_default_na=False stops pandas
        # from turning blank cells and strings like "N/A" into float NaN.
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
        questions, answers = df['Question'], df['Answer']
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except pd.errors.EmptyDataError as e:
        raise pd.errors.EmptyDataError(f"Empty file: {file_path}") from e
    except KeyError as e:
        raise KeyError(f"Missing column: {str(e)}") from e

    # Whitespace is stripped only to detect blank cells; the returned text is unchanged.
    complete = (questions.str.strip() != "") & (answers.str.strip() != "")
    dropped = int((~complete).sum())
    if dropped:
        warnings.warn(f"Skipped {dropped} row(s) with a blank question or answer in {file_path}")
    return questions[complete].tolist(), answers[complete].tolist()

# Read the question and answer columns from the training CSV
questions, answers = load_data_from_csv(TRAINING_DATA_PATH)

if not (questions and answers):
    # Nothing usable was loaded, so no dataset object is created
    print("Failed to create the fine-tuning dataset.")
else:
    # Format each pair as one training record; the trailing EOS_TOKEN
    # prevents infinite generation during inference
    combined_texts = list(map(combine_texts, questions, answers))

    # The fine-tuning dataset has a single "text" column holding the formatted records
    dataset = Dataset.from_dict({"text": [record["text"] for record in combined_texts]})

    # Show the first record, guarding against an empty dataset
    if len(dataset) == 0:
        print("The fine-tuning dataset is empty.")
    else:
        print("Example training record:\n")
        print(dataset[0]['text'])


In [None]:
def _token_count(text):
    """Return how many tokens the tokenizer yields for `text`, with no special tokens added."""
    encoding = tokenizer.encode_plus(text, add_special_tokens=False, max_length=None)
    return len(encoding["input_ids"])

# Track the longest question, answer, and combined training record (in tokens)
max_q = max_a = max_qna = 0
for question, answer in zip(questions, answers):
    max_q = max(max_q, _token_count(question))
    max_a = max(max_a, _token_count(answer))
    max_qna = max(max_qna, _token_count(combine_texts(question, answer)["text"]))

# Head-room added on top of the longest combined record
buffer = 10
# Unsloth supports RoPE Scaling internally, so max_seq_length can be set up to 2x
# the default context length of the base model. Here it is derived from the data instead.
max_seq_length = buffer + max_qna

# Print the results as a small fixed-width table: title, header row, then the values
table_title = "Training Data Token Counts"
print(f"\n{table_title:-^70}")
print(f"{'Measure':<14}{'Question':<14}{'Answer':<14}{'Combined':<14}")
print(f"{'Maximums':<14}{max_q:<14}{max_a:<14}{max_qna:<14}")
print(f"{'Max Seq Len':<14}{'':<14}{'':<14}{max_seq_length:<14}\n")

print(f"Set max_seq_length in FastLanguageModel to {max_seq_length} to handle the maximum number of tokens required by the input training data (Combined Maximum + Buffer).")


----------------------Training Data Token Counts----------------------
Measure       Question      Answer        Combined      
Maximums      22            39            49            
Max Seq Len                               59            

Set max_seq_length in FastLanguageModel to 59 to handle the maximum number of tokens required by the input training data (Combined Maximum + Buffer).


In [None]:
#@title Show pre-training GPU memory stats
import torch

# Properties (name, total memory) of CUDA device 0
gpu_stats = torch.cuda.get_device_properties(0)
# Baseline of peak reserved memory in GB, rounded to 3 decimals. This cell runs before the
# model is loaded, and the post-training stats cell subtracts this baseline.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Total device memory in GB; used as the denominator for the percentage figures later on.
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
0.0 GB of memory reserved.


In [None]:
#@title Load a 4-bit quantized base model for training
from unsloth import FastLanguageModel
import torch

dtype = None # None for auto detection. Bfloat16 for Ampere+. Float16 for Tesla T4 & V100.
load_in_4bit = True # Use 4-bit quantization to reduce memory usage. Can be False.

# Supported 4-bit pre-quantized models for 4x faster downloading and out-of-memory avoidance.
# Find more models at https://huggingface.co/unsloth
# This list is for reference only: the call below names its model directly and does not read it.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

# Returns the model together with its tokenizer. `tokenizer` is rebound here, replacing the
# AutoTokenizer instance created earlier for the token-count analysis.
# max_seq_length is the value computed from the training data (longest record + buffer).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # add a Hugging Face access token if using a private or gated model
)

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from unsloth import FastLanguageModel

# Attach LoRA adapters to the loaded model; `model` is rebound to the wrapped version
# that the trainer and the inference cells use from here on.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Attention projections (q/k/v/o) and MLP projections (gate/up/down)
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed; the same value is passed as the trainer's seed
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from unsloth import FastLanguageModel

# Baseline check: this cell sits ahead of the training cell, so it shows what the
# model generates for a question before any fine-tuning step has run.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Keep exactly one prompt active. The entries are not separated by commas, so
# several uncommented lines would be concatenated into a single prompt.
prompts = [
    "Who founded Cosmic Fusion Dynamics?"
    # "Where is Cosmic Fusion Dynamics headquartered?"
    # "Who is the current CEO of Cosmic Fusion Dynamics?"
    # "What is the name of Cosmic Fusion Dynamics' flagship product?"
    # "What award did Cosmic Fusion Dynamics earn in 2021?"
    # "What does Cosmic Fusion Dynamics specialize in?"
    # "Describe FinanceAI from Cosmic Fusion Dynamics."
    # "How much Series A funding did Cosmic Fusion Dynamics receive?"
]
inputs = tokenizer(prompts, return_tensors = "pt").to("cuda")

# Generate at most 64 new tokens, then decode the whole batch (prompt included)
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = False)
decoded_output = tokenizer.batch_decode(outputs)
print(decoded_output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Who founded Cosmic Fusion Dynamics? What is Cosmic Fusion Dynamics? What is the purpose of Cosmic Fusion Dynamics? What is the philosophy of Cosmic Fusion Dynamics? What is the methodology of Cosmic Fusion Dynamics? What is the future of Cosmic Fusion Dynamics? What is the past of Cosmic Fusion Dynamics? What is the present of Cosmic Fusion Dynamics? What is the future']


In [None]:
#@title Check the numerical precision supported by the GPU
# Reports the result of torch.cuda.is_bf16_supported(); the same check selects
# bf16 versus fp16 in the trainer's TrainingArguments.
print (f"GPU supports {'brain' if torch.cuda.is_bf16_supported() else 'half-precision'} floating-point.")

GPU supports brain floating-point.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Supervised fine-tuning on the "text" column of `dataset`, using the LoRA-wrapped model.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text", # column holding the records built by combine_texts
    max_seq_length = max_seq_length, # same value the model was loaded with
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 3,
        gradient_accumulation_steps = 2, # 3 x 2 = total batch size of 6 on one GPU
        warmup_steps = 3,
        max_steps = 80, # overrides any num_train_epochs value
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on what the GPU supports
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1, # report the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/30 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


### Train the model

Train for the selected number of steps. Look for `Training Loss` to follow a decreasing trend.

In [None]:
# Run fine-tuning; the returned stats (metrics['train_runtime']) are reported in the
# post-training stats cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 30 | Num Epochs = 16
O^O/ \_/ \    Batch size per device = 3 | Gradient Accumulation steps = 2
\        /    Total batch size = 6 | Total steps = 80
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,4.2242
2,3.8559
3,3.6035
4,3.7991
5,3.1475
6,2.4275
7,2.1128
8,1.551
9,1.872
10,1.3103


In [None]:
#@title Show post-training GPU memory and time stats
# Peak reserved GPU memory so far, in GB (3 decimals)
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Growth since the baseline taken in the pre-training stats cell.
# NOTE(review): that baseline is captured before the base model is loaded, so this
# difference includes the model weights as well as the training overhead.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures expressed as a percentage of the GPU's total memory
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

57.1337 seconds used for training.
0.95 minutes used for training.
Peak reserved memory = 6.811 GB.
Peak reserved memory for training = 6.811 GB.
Peak reserved memory % of max memory = 17.215 %.
Peak reserved memory for training % of max memory = 17.215 %.


<a name="Inference"></a>
### Inference: Generate output token-by-token

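Uncomment only one line at a time from the question list below and run the cell. The list entries are not separated by commas, so Python would join several uncommented lines into a single prompt.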

Output will be generated token-by-token with TextStreamer for continuous inference.

In [None]:
from transformers import TextStreamer
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Keep exactly one prompt active. The entries are not separated by commas, so
# several uncommented lines would be concatenated into a single prompt.
prompts = [
    "Who founded Cosmic Fusion Dynamics?"
    # "Where is Cosmic Fusion Dynamics headquartered?"
    # "Who is the current CEO of Cosmic Fusion Dynamics?"
    # "What is the name of Cosmic Fusion Dynamics' flagship product?"
    # "What award did Cosmic Fusion Dynamics earn?"
    # "What does Cosmic Fusion Dynamics specialize in?"
    # "Describe FinanceAI from Cosmic Fusion Dynamics."
    # "How much Series A funding did Cosmic Fusion Dynamics receive?"
]
inputs = tokenizer(prompts, return_tensors = "pt").to("cuda")

# The streamer emits the text token-by-token during generation, so the
# value returned by generate() is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64, use_cache=False)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Who founded Cosmic Fusion Dynamics?@@@Caspian Inkwell and Sarah Evergreen founded Cosmic Fusion Dynamics in 2010.<|end_of_text|>


### Inference: Generate output all-at-once

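Uncomment only one line at a time from the question list below and run the cell. The list entries are not separated by commas, so Python would join several uncommented lines into a single prompt.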

Output will be generated all-at-once.

In [None]:
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Keep exactly one prompt active. The entries are not separated by commas, so
# several uncommented lines would be concatenated into a single prompt.
prompts = [
    "Who founded Cosmic Fusion Dynamics?"
    # "Where is Cosmic Fusion Dynamics headquartered?"
    # "Who is the current CEO of Cosmic Fusion Dynamics?"
    # "What is the name of Cosmic Fusion Dynamics' flagship product?"
    # "What award did Cosmic Fusion Dynamics earn in 2021?"
    # "What does Cosmic Fusion Dynamics specialize in?"
    # "Describe FinanceAI from Cosmic Fusion Dynamics."
    # "How much Series A funding did Cosmic Fusion Dynamics receive?"
]
inputs = tokenizer(prompts, return_tensors = "pt").to("cuda")

# Generate the full completion first, then decode and print it in one go
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = False)
decoded_output = tokenizer.batch_decode(outputs)
print(decoded_output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Who founded Cosmic Fusion Dynamics?@@@Caspian Inkwell and Sarah Evergreen founded Cosmic Fusion Dynamics in 2010.<|end_of_text|>']


In [None]:
import re

def extract_answer(text):
    """Pull the answer portion out of a decoded generation.

    The ``<|begin_of_text|>`` and ``<|end_of_text|>`` markers are removed, then
    the text is split at the first ``@@@`` delimiter. Splitting only once keeps
    any further ``@@@`` occurrences as part of the answer instead of falling
    back to the whole text.

    Args:
        text: Decoded model output, typically ``question@@@answer`` wrapped in
            the begin/end markers.

    Returns:
        The stripped text after the first ``@@@``; if there is no delimiter,
        the whole cleaned text, stripped.
    """
    # Remove the begin and end tokens
    cleaned = re.sub(r'<\|begin_of_text\|>|<\|end_of_text\|>', '', text)
    # partition() splits on the first delimiter only; `delimiter` is empty when absent
    _, delimiter, answer = cleaned.partition('@@@')
    return answer.strip() if delimiter else cleaned.strip()

from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Keep exactly one prompt active. The entries are not separated by commas, so
# several uncommented lines would be concatenated into a single prompt.
prompts = [
    # "Who founded Cosmic Fusion Dynamics?"
    # "Where is Cosmic Fusion Dynamics headquartered?"
    "Who is the current CEO of Cosmic Fusion Dynamics?"
    # "What is the name of Cosmic Fusion Dynamics' flagship product?"
    # "What award did Cosmic Fusion Dynamics earn in 2021?"
    # "What does Cosmic Fusion Dynamics specialize in?"
    # "Describe FinanceAI from Cosmic Fusion Dynamics."
    # "How much Series A funding did Cosmic Fusion Dynamics receive?"
]
inputs = tokenizer(prompts, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = False)
decoded_output = tokenizer.batch_decode(outputs)

# Post-process the first decoded sequence so that only the answer text is printed
first_answer = extract_answer(decoded_output[0])
print(first_answer)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The current CEO of Cosmic Fusion Dynamics is Michael Nightshade, who joined the company in 2018.


In [None]:
# Disabled by default — change `False` to `True` to save the fine-tuned model and the
# tokenizer into a local directory named after `local_model_name`.
if False:
    local_model_name = "llama3-8b-cosmic-fusion-dynamics-lora"
    model.save_pretrained(local_model_name)
    tokenizer.save_pretrained(local_model_name)

In [None]:
# Disabled by default — change `False` to `True` to push the model and tokenizer to the
# Hugging Face repo below. The access token is read from Colab's `userdata` store rather
# than being hard-coded in the notebook.
if False:
    from google.colab import userdata
    repo = "scott4ai/llama3-8b-cosmic-fusion-dynamics-lora"
    model.push_to_hub(repo, token=userdata.get('HUGGING_FACE_HUB_TOKEN'))
    tokenizer.push_to_hub(repo, token=userdata.get('HUGGING_FACE_HUB_TOKEN'))

README.md:   0%|          | 0.00/575 [00:00<?, ?B/s]

Saved model to https://huggingface.co/scott4ai/llama3-8b-cosmic-fusion-dynamics-lora


In [None]:
# Local GGUF exports. Each export is disabled by default; change its `False` to `True`
# to run it. `model_name` is reassigned before every export, so each guarded call uses
# the name set directly above it.
# NOTE(review): `userdata` is imported but not used in this cell.
from google.colab import userdata

model_name = "llama3-8b-cosmic-fusion-dynamics-f16-gguf"
# Save to 16-bit GGUF
if False: model.save_pretrained_gguf(model_name, tokenizer, quantization_method = "f16")

model_name = "llama3-8b-cosmic-fusion-dynamics-gguf"
# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf(model_name, tokenizer, quantization_method = "q4_k_m")

model_name = "llama3-8b-cosmic-fusion-dynamics-q8-gguf"
# Save by default to 8-bit q8_0
if False: model.save_pretrained_gguf(model_name, tokenizer)

In [None]:
# GGUF uploads to the Hugging Face Hub. Each upload is disabled by default; change its
# `False` to `True` to run it. `model_name` is reassigned before every upload, and the
# access token is read from Colab's `userdata` store.
from google.colab import userdata

model_name = "scott4ai/llama3-8b-cosmic-fusion-dynamics-f16-gguf"
# Save to 16-bit GGUF
if False: model.push_to_hub_gguf(model_name, tokenizer, quantization_method = "f16", token = userdata.get('HUGGING_FACE_HUB_TOKEN'))

model_name = "scott4ai/llama3-8b-cosmic-fusion-dynamics-gguf"
# Save to q4_k_m GGUF
if False: model.push_to_hub_gguf(model_name, tokenizer, quantization_method = "q4_k_m", token = userdata.get('HUGGING_FACE_HUB_TOKEN'))

model_name = "scott4ai/llama3-8b-cosmic-fusion-dynamics-q8-gguf"
# Save by default to 8-bit q8_0
if False: model.push_to_hub_gguf(model_name, tokenizer, token = userdata.get('HUGGING_FACE_HUB_TOKEN'))

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 60.93 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 45.14it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GUUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to f16 will take 20 minutes.
 "-____-"     In total, you will have to wait around 26 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...


Unsloth: We must use f16 for non Llama and Mistral models.


Unsloth: [1] Converting model at scott4ai/llama3-8b-cosmic-fusion-dynamics-f16-gguf into f16 GGUF format.
The output location will be ./scott4ai/llama3-8b-cosmic-fusion-dynamics-f16-gguf-unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: llama3-8b-cosmic-fusion-dynamics-f16-gguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 8192
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 14336
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value head count = 8
INFO:hf-to-gguf:gguf: rope theta = 500000.0
INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-05
INFO:hf-to-gguf:gguf: file type = 1
INFO:hf-to-gguf:Set model tokenizer
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:gguf.vocab:Adding 280147 merge(s).
INFO:gguf.vocab:Setting special token type 

In [None]:
# Local merged-model exports (the names carry a "-vllm" suffix). Each export is disabled
# by default; change its `False` to `True` to run it. `model_name` is reassigned before
# every export, so each guarded call uses the name set directly above it.
# NOTE(review): `userdata` is imported but not used in this cell.
from google.colab import userdata

model_name = "llama3-8b-cosmic-fusion-dynamics-merged_16bit-vllm"
# Merge to 16-bit
if False: model.save_pretrained_merged(model_name, tokenizer, save_method = "merged_16bit")

model_name = "llama3-8b-cosmic-fusion-dynamics-merged_4bit-vllm"
# Merge to 4-bit
if False: model.save_pretrained_merged(model_name, tokenizer, save_method = "merged_4bit")

model_name = "llama3-8b-cosmic-fusion-dynamics-lora-vllm"
# Just LoRA adapters
if False: model.save_pretrained_merged(model_name, tokenizer, save_method = "lora")

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 55.49 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 57.72it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [None]:
# Merged-model uploads to the Hugging Face Hub. Each upload is disabled by default; change
# its `False` to `True` to run it. `model_name` is reassigned before every upload, and the
# access token is read from Colab's `userdata` store.
from google.colab import userdata

model_name = "scott4ai/llama3-8b-cosmic-fusion-dynamics-merged_16bit-vllm"
# Merge to 16-bit
if False: model.push_to_hub_merged(model_name, tokenizer, save_method = "merged_16bit", token = userdata.get('HUGGING_FACE_HUB_TOKEN'))

model_name = "scott4ai/llama3-8b-cosmic-fusion-dynamics-merged_4bit-vllm"
# Merge to 4-bit
if False: model.push_to_hub_merged(model_name, tokenizer, save_method = "merged_4bit", token = userdata.get('HUGGING_FACE_HUB_TOKEN'))

model_name = "scott4ai/llama3-8b-cosmic-fusion-dynamics-lora-vllm"
# Just LoRA adapters
if False: model.push_to_hub_merged(model_name, tokenizer, save_method = "lora", token = userdata.get('HUGGING_FACE_HUB_TOKEN'))

In [None]:
# Disabled by default — change `False` to `True` to reload a saved model and run a
# one-question smoke test against it.
if False:
    from unsloth import FastLanguageModel
    # Every keyword argument ends with a comma so that the optional lines below can be
    # uncommented without causing a syntax error. Keep only one `model_name` active.
    model, tokenizer = FastLanguageModel.from_pretrained(
        # load a model from the local Colab environment
        # model_name = "llama3-8b-cosmic-fusion-dynamics-lora",

        # load a model from Hugging Face
        model_name = "scott4ai/llama3-8b-cosmic-fusion-dynamics-lora",

        # use HF access token for private or gated models
        # token = userdata.get('HUGGING_FACE_HUB_TOKEN'),
    )

    # Run a quick inference test on the model
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    inputs = tokenizer(
    [
        "Who founded Cosmic Fusion Dynamics?"
    ], return_tensors = "pt").to("cuda")

    # Generate at most 64 new tokens and print the decoded batch (prompt included)
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = False)
    decoded_output = tokenizer.batch_decode(outputs)
    print(decoded_output)