In [8]:
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

In [9]:
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
dataset_id = "medalpaca/medical_meadow_medical_flashcards"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [17]:
import pandas as pd
dataset = load_dataset(dataset_id)
df = pd.DataFrame(dataset['train'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33955 entries, 0 to 33954
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   input        33955 non-null  object
 1   output       33955 non-null  object
 2   instruction  33955 non-null  object
dtypes: object(3)
memory usage: 795.9+ KB


# How dataset look like as a dataframe

In [18]:
df.to_csv('dataset.csv', index=False)

#Preprocessing Pipeline

In [19]:
def format_dataset(dataset, keys, instruction_col_name, response_col_name):
    """Format the dataset by retaining only necessary columns and renaming them."""
    cols_to_remove = [key for key in keys if key not in [instruction_col_name, response_col_name]]
    dataset = dataset.remove_columns(cols_to_remove)
    dataset = dataset.rename_column(instruction_col_name, "instruction")
    dataset = dataset.rename_column(response_col_name, "response")
    return dataset

def prepare_datasets(dataset, instruction_col_name, response_col_name):
    """Format and split the dataset for training and evaluation."""
    available_cols = list(dataset["train"].features.keys())
    formatted_dataset = format_dataset(
        dataset, available_cols, instruction_col_name, response_col_name
    )

    if "valid" in formatted_dataset:
        train_dataset = formatted_dataset["train"]
        eval_dataset = formatted_dataset["valid"]
    elif "test" in formatted_dataset:
        train_dataset = formatted_dataset["train"]
        eval_dataset = formatted_dataset["test"]
    else:
        split_dataset = formatted_dataset["train"].train_test_split(test_size=0.2)
        train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

    return train_dataset, eval_dataset


In [20]:
train_dataset, eval_dataset = prepare_datasets(
    dataset, instruction_col_name="input", response_col_name="output"
)

In [21]:
print(f"{train_dataset = }")
print(f"{eval_dataset = }")

train_dataset = Dataset({
    features: ['instruction', 'response'],
    num_rows: 27164
})
eval_dataset = Dataset({
    features: ['instruction', 'response'],
    num_rows: 6791
})


In [22]:
def generate_response(model, tokenizer, instruction, device="cpu"):
    """Generate a response from the model based on an instruction."""
    messages = [{"role": "user", "content": instruction}]
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs, max_new_tokens=128, temperature=0.2, top_p=0.9, do_sample=True
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def print_example(example):
    """Print an example from the dataset."""
    print(f"Original Dataset Example:")
    print(f"Instruction: {example['instruction']}")
    print(f"Response: {example['response']}")
    print("-" * 100)

def print_response(response):
    """Print the model's response."""
    print(f"Model response:")
    print(response.split("assistant\n")[-1])
    print("-" * 100)


In [23]:
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

In [24]:
# Define a test example
example1 = eval_dataset[1]

response = generate_response(model, tokenizer, example1["instruction"], device)

print_example(example1)
print_response(response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Original Dataset Example:
Instruction: What are some common causes of aortic stenosis?
Response: Aortic stenosis is typically caused by calcification (old, normal wear and tear), congenital bicuspid valve (middle age), and rheumatic fever.
----------------------------------------------------------------------------------------------------
Model response:
Aortic stenosis, also known as aortic regurgitation, is a common condition that affects the heart valves, causing them to become narrowed or blocked. Here are some common causes of aortic stenosis:

1. **Heart valve disease**: The most common cause of aortic stenosis is heart valve disease, which occurs when the valves in the heart are damaged or diseased. This can be caused by a variety of conditions, including valve replacement, valve replacement with a prosthetic valve, or valve replacement with a prosthetic valve.
2. **Valve regurgitation**: When the valves in the heart are not functioning properly, they can become narrowed or bloc

In [None]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none"
)

In [26]:
def formatting_prompts_func(example: dict) -> str:
    """Format prompt for training."""
    text = f"<|im_start|>user\n{example['instruction']}<|im_end|>\n<|im_start|>assistant\n{example['response']}<|im_end|>"
    return text

In [31]:
num_train_epochs = 5

output_dir = f"{model_id.split('/')[-1]}-{dataset_id.split('/')[-1]}-{num_train_epochs}epochs"

sft_config = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    max_seq_length=512,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    save_steps=500,  # save checkpoints every n training steps
    logging_steps=500,
    learning_rate=1e-3,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    warmup_ratio=0.05,
    lr_scheduler_type="constant",
    packing=True
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    formatting_func=formatting_prompts_func,
    peft_config=peft_config,
    args=sft_config,
)

In [30]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [29]:
trainer.save_model()