In [1]:
%%capture

import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype          = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit   = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype          = dtype,
    load_in_4bit   = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.9: Fast Mistral patching. Transformers: 4.52.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r            = 16,     # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    lora_alpha   = 16,
    lora_dropout = 0,      # Supports any, but = 0 is optimized
    bias         = "none", # Supports any, but = "none" is optimized

    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],

    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state               = 3407,
    use_rslora                 = False,  # We support rank stabilized LoRA
    loftq_config               = None,   # And LoftQ
)

Unsloth 2025.5.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
FastLanguageModel.for_inference(model)

In [5]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping       = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)

Unsloth: Will map <|im_end|> to EOS = </s>.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [6]:
# Define structured conversation
messages = [
    {"from": "system", "value": "You are a helpful and trustworthy financial assistant."},
    {"from": "user", "value": "How can I start budgeting?"}
]

In [7]:
print(tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True))

<|im_start|>system
You are a helpful and trustworthy financial assistant.<|im_end|>
<|im_start|>system
How can I start budgeting?<|im_end|>
<|im_start|>assistant



In [8]:
from transformers import TextStreamer

inputs = tokenizer([tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)

In [9]:
_ = model.generate(
    **inputs,
    streamer             = text_streamer,
    max_new_tokens       = 512,
    temperature          = 0.1,

    # repetition_penalty   = 1.0,
    # no_repeat_ngram_size = 2,
    # top_k                = 50,
    # top_p                = 0.9
)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


<s> <|im_start|>system
You are a helpful and trustworthy financial assistant.<|im_end|>
<|im_start|>system
How can I start budgeting?<|im_end|>
<|im_start|>assistant
To start budgeting, follow these steps:

1. Determine your income: Write down all sources of income, including salary, bonuses, and any other regular or irregular income.

2. List your expenses: Write down all your regular expenses, such as rent, utilities, groceries, transportation, and entertainment. Don't forget to include irregular expenses, such as insurance premiums, car maintenance, and holiday expenses.

3. Categorize your expenses: Group your expenses into categories, such as housing, transportation, food, entertainment, and savings.

4. Set financial goals: Decide what you want to achieve financially, such as paying off debt, saving for a down payment on a house, or building an emergency fund.

5. Create a budget: Allocate a specific amount for each expense category based on your income and financial goals. Make 

In [10]:
_ = model.generate(
    **inputs,
    streamer             = text_streamer,
    max_new_tokens       = 512,
    temperature          = 0.1,

    repetition_penalty   = 1.0,
    no_repeat_ngram_size = 2,
    top_k                = 50,
    top_p                = 0.9
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


<s> <|im_start|>system
You are a helpful and trustworthy financial assistant.<|im_end|>
<|im_start|>system
How can I start budgeting?<|im_end|>
<|im_start|>assistant
To start budeting, follow these steps:

1. Determine your income: Write down all sources of income, including salary, freelance work, and any other regular income.
2. List your expenses: Make a list of all your monthly expenses, such as rent, utilities, groceries, transportation, entertainment, etc. Don't forget to include irregular expenses like insurance premiums or annual subscriptions. Be as specific as possible. For example, instead of "grocerys," write "milk, eggs, bread, fruits, vegetables, snacks."
3. Categorize your spending: Group your expenditures into categories like housing, food, transport, health, personal care, savings, debt repayment, insurance, taxes, recreation, education, donations, gifts, travel, miscellaneous. This will help you understand where your money is going. You can use a budget template or ap

In [29]:
# Define structured conversation
messages = [
    {"from": "system", "value": "You are a helpful and trustworthy financial assistant."},
    {"from": "user", "value": "How can I start investing in the stock market?"}
]

In [31]:
inputs = tokenizer([tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)

In [32]:
_ = model.generate(
    **inputs,
    streamer             = text_streamer,
    max_new_tokens       = 512,
    temperature          = 0.1,
)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


<s> <|im_start|>system
You are a helpful and trustworthy financial assistant.<|im_end|>
<|im_start|>system
How can I start investing in the stock market?<|im_end|>
<|im_start|>assistant
To start investing in the stock market, you can follow these steps:

1. Educate yourself: Learn about the basics of investing, such as stocks, bonds, mutual funds, and ETFs. Understand the risks and potential rewards of each investment type.

2. Set financial goals: Determine your short-term and long-term financial goals, such as saving for retirement, buying a house, or paying for your child's education.

3. Create a budget: Determine how much money you can afford to invest each month.

4. Choose an investment strategy: Decide whether you want to invest actively (picking individual stocks) or passively (investing in index funds or ETFs).

5. Open a brokerage account: Choose a brokerage firm that offers low fees, a user-friendly platform, and the investment options you're interested in. Some popular opt

In [33]:
# Define structured conversation
messages = [
    {"from": "system", "value": "You are a helpful and trustworthy financial assistant."},
    {"from": "user", "value": "How can i protect my financial info from scams?"}
]

In [34]:
inputs = tokenizer([tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)

In [35]:
_ = model.generate(
    **inputs,
    streamer             = text_streamer,
    max_new_tokens       = 512,
    temperature          = 0.1,
)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


<s> <|im_start|>system
You are a helpful and trustworthy financial assistant.<|im_end|>
<|im_start|>system
How can i protect my financial info from scams?<|im_end|>
<|im_start|>assistant
To protect your financial information from scams, follow these steps:

1. Keep your personal and financial information private. Never share your Social Security number, bank account numbers, or other sensitive information with anyone unless you are certain they are legitimate and trustworthy.

2. Use strong, unique passwords for all of your financial accounts. Avoid using the same password for multiple accounts.

3. Enable two-factor authentication (2FA) on your financial accounts whenever possible. This adds an extra layer of security by requiring a second form of verification, such as a text message or an app-generated code, in addition to your password.

4. Be cautious of unsolicited emails, phone calls, or text messages that ask for your financial information. Legitimate financial institutions will