In [1]:
%%capture
!pip install unsloth "xformers==0.0.28.post2"
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --upgrade --no-cache-dir transformers

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 1440 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Mistral-Small-Instruct-2409", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.5: Fast Mistral patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.11.5 patched 56 layers with 56 QKV layers, 56 O layers and 56 MLP layers.


In [4]:
import pandas as pd
import random
splits = {'Train': 'FINNLP-train.csv', 'Dev': 'FINNLP-dev.csv'}
df_train = pd.read_csv("hf://datasets/1-800-SHARED-TASKS/COLING-2025-FINNLP-FMD/" + splits["Train"])
df_dev   = pd.read_csv("hf://datasets/1-800-SHARED-TASKS/COLING-2025-FINNLP-FMD/" + splits["Dev"])
df_test = pd.read_json("hf://datasets/lzw1008/COLING25-FMD/test/FMD_test.json", lines=True)
df_test['sci_digest'] = df_test['sci_digest'].astype(str)
df_train['justification'] = df_train.apply(lambda row: f"{row['sci_digest'][2:-2]} ### {row['justification']}" if (row['sci_digest'] != "[]") else row['justification'], axis=1)
df_dev['justification']   =   df_dev.apply(lambda row: f"{row['sci_digest'][2:-2]} ### {row['justification']}" if (row['sci_digest'] != "[]") else row['justification'], axis=1)
df_test['justification']  =  df_test.apply(lambda row: f"{row['sci_digest'][2:-2]} ### {row['justification']}" if (row['sci_digest'] != "['']") else row['justification'], axis=1)
df_extra = pd.read_json("hf://datasets/amanrangapur/Fin-Fact/finfact.json")
df_extra['sci_digest'] = df_extra['sci_digest'].astype(str)
df_extra['justification']  =  df_extra.apply(lambda row: f"{row['sci_digest'][2:-2]} ### {row['justification']}" if (row['sci_digest'] != "['']") else row['justification'], axis=1)
df_extra['evidence'] = df_extra['evidence'].apply(lambda evidence_list: " ".join([item['sentence'] for item in evidence_list]))
df_train = pd.concat([df_train, df_extra], ignore_index=True)
label_mapping = {"true": "True", "True": "True", "NEI": "Neutral", "neutral": "Neutral", "False": "False", "false": "False"}
df_train['label'] = df_train['label'].map(label_mapping)
df_dev['label'] = df_dev['label'].map(label_mapping)

In [5]:
print(len(df_train))
print(len(df_dev))
print(len(df_test))
print(len(df_extra))

4869
453
1304
3369


In [6]:
df_train = df_train[['claim','justification','issues','label','evidence']]
df_dev = df_dev[['claim','justification','issues','label','evidence']]
df_train['claim'] = df_train['claim'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_train['justification'] = df_train['justification'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_train['issues'] = df_train['issues'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_train['label'] = df_train['label'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_train['evidence'] = df_train['evidence'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_dev['claim'] = df_dev['claim'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_dev['justification'] = df_dev['justification'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_dev['issues'] = df_dev['issues'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_dev['label'] = df_dev['label'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_dev['evidence'] = df_dev['evidence'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_test['claim'] = df_test['claim'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_test['justification'] = df_test['justification'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
df_test['issues'] = df_test['issues'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['claim'] = df_dev['claim'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['justification'] = df_dev['justification'].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [7]:
df_train

Unnamed: 0,claim,justification,issues,label,evidence
0,Checking the Facts About 'Dreamers',An Internet meme cites extremely dubious stati...,['taxes'],True,First introduced in Congress in 2001 and last ...
1,Will Jesus Be Portrayed as Homosexual in an Up...,Long-standing false rumor holds that Jesus and...,['budget'],False,"Contrary to common belief, the entry for Him i..."
2,McCain and Bush are millionaires who are for t...,An ad goes after Sen. John McCain for being ju...,"['National', 'Taxes']",True,An ad goes after Sen. John McCain for being ju...
3,"Today, you can't rely on (the retirement fund ...",Gov. Rick Scott made no secret of his distaste...,"['Retirement', 'State Budget', 'Workers', 'Flo...",False,Gov. Rick Scott made no secret of his distaste...
4,Panty Raider,Does a new video game involving stealing under...,['lien'],True,Dads and Daughters suggests that other parents...
...,...,...,...,...,...
4864,The reality concerning ANWR,E-mail reports the truth about the environment...,['economy'],False,debates has been the Arctic National Wildlife ...
4865,Man who sells wedding dresses,Did a man lists his ex-wife's wedding dress on...,['profit'],True,"Over the years, our readers have queried us ab..."
4866,Idaho clergy members compelled to conduct same...,Rumor: Two Idaho pastors were threatened with ...,['income'],False,The difference between churches and businesses...
4867,Did Billboards Displayed in the U.S. Promote t...,A billboard campaign aimed at dispelling rumor...,['profit'],True,This is a genuine photograph of a billboard se...


In [8]:
# prompt: # prompt: df_train : new column 'input' = 'claim' column + "<###>" + 'justification' column    , new column 'output' = 'label' column in uppercase + "<###>" + 'evidence' column
df_train['input'] = df_train['claim'] + " <###> " + df_train['justification']
df_train['output'] = df_train['label'].str.upper() + " <###> " + df_train['evidence']

In [9]:
df_dev['input'] = df_dev['claim'] + " <###> " + df_dev['justification']
df_dev['output'] = df_dev['label'].str.upper() + " <###> " + df_dev['evidence']

df_test['input'] = df_test['claim'] + " <###> " + df_test['justification']

In [10]:
from datasets import Dataset
train_dataset    = Dataset.from_pandas(df_train)
dev_dataset      = Dataset.from_pandas(df_dev)
test_dataset     = Dataset.from_pandas(df_test)

In [11]:
df_train

Unnamed: 0,claim,justification,issues,label,evidence,input,output
0,Checking the Facts About 'Dreamers',An Internet meme cites extremely dubious stati...,['taxes'],True,First introduced in Congress in 2001 and last ...,Checking the Facts About 'Dreamers' <###> An I...,TRUE <###> First introduced in Congress in 200...
1,Will Jesus Be Portrayed as Homosexual in an Up...,Long-standing false rumor holds that Jesus and...,['budget'],False,"Contrary to common belief, the entry for Him i...",Will Jesus Be Portrayed as Homosexual in an Up...,"FALSE <###> Contrary to common belief, the ent..."
2,McCain and Bush are millionaires who are for t...,An ad goes after Sen. John McCain for being ju...,"['National', 'Taxes']",True,An ad goes after Sen. John McCain for being ju...,McCain and Bush are millionaires who are for t...,TRUE <###> An ad goes after Sen. John McCain f...
3,"Today, you can't rely on (the retirement fund ...",Gov. Rick Scott made no secret of his distaste...,"['Retirement', 'State Budget', 'Workers', 'Flo...",False,Gov. Rick Scott made no secret of his distaste...,"Today, you can't rely on (the retirement fund ...",FALSE <###> Gov. Rick Scott made no secret of ...
4,Panty Raider,Does a new video game involving stealing under...,['lien'],True,Dads and Daughters suggests that other parents...,Panty Raider <###> Does a new video game invol...,TRUE <###> Dads and Daughters suggests that ot...
...,...,...,...,...,...,...,...
4864,The reality concerning ANWR,E-mail reports the truth about the environment...,['economy'],False,debates has been the Arctic National Wildlife ...,The reality concerning ANWR <###> E-mail repor...,FALSE <###> debates has been the Arctic Nation...
4865,Man who sells wedding dresses,Did a man lists his ex-wife's wedding dress on...,['profit'],True,"Over the years, our readers have queried us ab...",Man who sells wedding dresses <###> Did a man ...,"TRUE <###> Over the years, our readers have qu..."
4866,Idaho clergy members compelled to conduct same...,Rumor: Two Idaho pastors were threatened with ...,['income'],False,The difference between churches and businesses...,Idaho clergy members compelled to conduct same...,FALSE <###> The difference between churches an...
4867,Did Billboards Displayed in the U.S. Promote t...,A billboard campaign aimed at dispelling rumor...,['profit'],True,This is a genuine photograph of a billboard se...,Did Billboards Displayed in the U.S. Promote t...,TRUE <###> This is a genuine photograph of a b...


In [12]:
alpaca_prompt = """
Please determine if the claim is True, False, or Not Enough Information based on the context. Extract only the sentences most relevant to the claim for your evidence.

### Input:
Claim and Context : {input}

### Expected Output:
Label and evidence : {output}
"""
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    input = examples["input"]
    output = examples["output"]
    texts = []
    for input, output in zip(input, output):
      text = alpaca_prompt.format(input=input, output=output) + EOS_TOKEN
      texts.append(text)
    return { "text" : texts, }
formatted_train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
formatted_dev_dataset   = dev_dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/4869 [00:00<?, ? examples/s]

Map:   0%|          | 0/453 [00:00<?, ? examples/s]

In [13]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_train_dataset,
    eval_dataset = formatted_dev_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 2,
        num_train_epochs = 3,
        warmup_steps = 5,
        evaluation_strategy="steps",
        eval_steps = 25,
        learning_rate = 1e-3,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 1024,
        output_dir = "outputs",
    ),
)



Map (num_proc=2):   0%|          | 0/4869 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/453 [00:00<?, ? examples/s]

In [14]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
12.486 GB of memory reserved.


In [None]:
from unsloth import unsloth_train
trainer_stats = unsloth_train(trainer)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,869 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 912
 "-____-"     Number of trainable parameters = 95,420,416
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m1024-m[0m ([33m1024m[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
!pip install tqdm
from tqdm.auto import tqdm
model = FastLanguageModel.for_inference(model)
import pandas as pd
val_df = pd.DataFrame(dev_dataset)
val_df.info()

In [None]:
alpaca_prompt = """
Please determine if the claim is True, False, or Not Enough Information based on the context. Extract only the sentences most relevant to the claim for your evidence.

### Input:
Claim and Context : {}

### Expected Output:
Label and evidence :
"""
EOS_TOKEN = tokenizer.eos_token

In [None]:
trail_df = val_df[:1]
for index, row in tqdm(trail_df.iterrows(), total=trail_df.shape[0], desc="Making Predictions"):
  inputs = tokenizer([prompt_template.format(row['claim'],row['justification'])],return_tensors="pt").to("cuda")
  outputs = model.generate(**inputs, max_new_tokens=128, num_return_sequences=1, do_sample=False, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id,)
  decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
  print(decoded_output)

In [None]:
"""from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.1",)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
messages = [{"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},]
inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt",).to("cuda")
outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True, temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)"""

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nContinue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe Fibonacci sequence is a series of numbers in which each number is the sum of the two preceding numbers. The sequence is: 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144,']

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")
# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")
# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")
# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")
# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")