### Prepare Dataset

In [1]:
import pandas as pd
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
path = "/data/nvme6n1p1/adal_workspace/small_llm/datasets/sft_dataset_conversation"
df_train = pd.read_csv(f"{path}/everyday-conversations_train-kk.csv")
df_test = pd.read_csv(f"{path}/everyday-conversations_test-kk.csv")

In [2]:
from datasets import Dataset, DatasetDict, concatenate_datasets
ds_train = Dataset.from_pandas(df_train)
ds_test = Dataset.from_pandas(df_test)

ds_dict = DatasetDict({
    "train":ds_train.select_columns(["messages"]),
    "test":ds_test.select_columns(["messages"])
})

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds_dict

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 119
    })
})

In [4]:
def apply_chatting_template_full(sample):
    sample["text"]=sample['messages'].replace("<#>", "<|im_start|>user:").replace("<*>", "<|im_end|>\n<|im_start|>assistant:") + "<|im_end|>"
    return sample

In [5]:
ds_input = ds_dict.map(apply_chatting_template_full).select_columns(["text"])

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2260/2260 [00:00<00:00, 18762.63 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 119/119 [00:00<00:00, 24442.81 examples/s]


In [6]:
print(ds_input['train'][0]['text'])

 <|im_start|>user:
–°”ô–ª–µ–º–µ—Ç—Å—ñ–∑ –±–µ
<|im_end|>
<|im_start|>assistant:
–°”ô–ª–µ–º–µ—Ç—Å—ñ–∑ –±–µ! –ë“Ø–≥—ñ–Ω –º–µ–Ω —Å—ñ–∑–≥–µ “õ–∞–ª–∞–π –∫”©–º–µ–∫—Ç–µ—Å–µ –∞–ª–∞–º—ã–Ω?
<|im_start|>user:
–ú–µ–Ω –∫–µ–ª–µ—Å—ñ –¥–µ–º–∞–ª—ã—Å—ã–º–∞ –∂–∞“ì–∞–∂–∞–π –∫—É—Ä–æ—Ä—Ç—ã–Ω —ñ–∑–¥–µ–ø –∂“Ø—Ä–º—ñ–Ω. –ö–µ–π–±—ñ—Ä —Ç–∞–Ω—ã–º–∞–ª –∂–∞“ì–∞–∂–∞–π–ª–∞—Ä–¥—ã “±—Å—ã–Ω–∞ –∞–ª–∞—Å—ã–∑ –±–∞?
<|im_end|>
<|im_start|>assistant:
–ö–µ–π–±—ñ—Ä —Ç–∞–Ω—ã–º–∞–ª –∂–∞“ì–∞–∂–∞–π –∫—É—Ä–æ—Ä—Ç—Ç–∞—Ä—ã–Ω–∞ –ì–∞–≤–∞–π–∏–¥–µ–≥—ñ –ú–∞—É–∏, –ú–∞–ª—å–¥–∏–≤ –∞—Ä–∞–ª–¥–∞—Ä—ã –∂”ô–Ω–µ –ë–∞–≥–∞–º –∞—Ä–∞–ª–¥–∞—Ä—ã –∫—ñ—Ä–µ–¥—ñ. –û–ª–∞—Ä ”ô–¥–µ–º—ñ –∂–∞“ì–∞–∂–∞–π–ª–∞—Ä—ã–º–µ–Ω –∂”ô–Ω–µ –º”©–ª–¥—ñ—Ä —Å—É–ª–∞—Ä—ã–º–µ–Ω —Ç–∞–Ω—ã–º–∞–ª.
<|im_start|>user:
–ë“±–ª –∫–µ—Ä–µ–º–µ—Ç –µ—Å—Ç—ñ–ª–µ–¥—ñ. –ö–∞—Ä–∏–± —Ç–µ“£—ñ–∑—ñ–Ω–¥–µ –æ—Ç–±–∞—Å—ã–ª–∞—Ä “Ø—à—ñ–Ω “õ–æ–ª–∞–π–ª—ã –∫—É—Ä–æ—Ä—Ç—Ç–∞—Ä –±–∞—Ä –º–∞?
<|im_end|>
<|im_start|>assistant:
–ò”ô, –ö–∞—Ä–∏–± —Ç–µ“£—ñ–∑—ñ–Ω–¥–µ–≥—ñ –æ—Ç–±–∞—Å—ã–ª–∞—Ä“ì–∞ “õ–æ–ª–∞–π–ª—ã –∫—É—Ä–æ—Ä—Ç—Ç–∞—Ä “Ø—à—ñ

### Model preparation

In [7]:
path = "/data/nvme6n1p1/adal_workspace/small_llm/models/small_lm_test1"

from transformers import LlamaForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LlamaForCausalLM.from_pretrained(path).to(device)
tokenizer = AutoTokenizer.from_pretrained(path)

In [8]:
tokenized_output = tokenizer(
    ds_input['train'][0]['text'], 
    padding="max_length",  # You can also use padding=True for dynamic padding
    truncation=True,  # Ensures the text does not exceed max length
    max_length=512,  # Adjust based on your model
    return_tensors="pt"  # Returns PyTorch tensors (or use "tf" for TensorFlow)
)

print(tokenized_output)

{'input_ids': tensor([[  237,     1,   518,   316,    42,   215, 22582,  1754, 14572,  8150,
           215,     2,   215,     1,  1262, 24939,    42,   215, 22582,  1754,
         14572,  8150,    17, 19726,   691, 28952,  5327,  6058,   719, 34041,
            47,   215,     1,   518,   316,    42,   215,  8842,  5079,  8659,
          4946,   471, 36521, 39895,  1399, 20160, 29804,   379,    30, 13514,
          5550, 36521,  2852, 31680, 26133,   701,    47,   215,     2,   215,
             1,  1262, 24939,    42,   215, 31024,  5550, 36521, 39895, 18142,
         22432,   431, 35184,   531, 37462,    28, 26594, 12506, 27628,   456,
          3747, 44306, 27628,  8840,    30,  4648, 20158, 36521, 17430,   456,
         44413,  6770, 32967,  5550,    30,   215,     1,   518,   316,    42,
           215,  4612, 14464,  7684,  2053,    30, 42537,  5397,   561, 40009,
           820, 14896, 39895, 11939,   964,   835,    47,   215,     2,   215,
             1,  1262, 24939,    42,  

In [12]:
# Configure the SFTTrainer
finetune_name = "SmolLM2-FT-MyDataset"
finetune_tags = ["smol-course", "module_1"]

sft_config = SFTConfig(
    report_to=None, # To disable wandb
    output_dir="/data/nvme6n1p1/adal_workspace/small_llm/models/chatting_llm/try_1",
    adam_beta1=0.9,
    adam_beta2=0.95,
    max_seq_length=1024,
    # max_steps=141,  # Adjust based on dataset size and desired training duration
    per_device_train_batch_size=8,  # Set according to your GPU memory capacity
    learning_rate=3e-6,  # Common starting point for fine-tuning
    logging_steps=10,  # Frequency of logging training metrics
    save_steps=100,  # Frequency of saving model checkpoints
    evaluation_strategy="steps",  # Evaluate the model at regular intervals
    eval_steps=100,  # Frequency of evaluation
    num_train_epochs=2,
    use_mps_device=(
        True if device == "mps" else False
    ),  # Use MPS for mixed precision training
    hub_model_id=finetune_name,  # Set a unique name for your model
)



In [13]:
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=ds_input["train"],
    tokenizer=tokenizer,
    eval_dataset=ds_input["test"],
)

  trainer = SFTTrainer(
Converting train dataset to ChatML: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2260/2260 [00:00<00:00, 29030.15 examples/s]
Applying chat template to train dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2260/2260 [00:00<00:00, 55569.98 examples/s]
Tokenizing train dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2260/2260 [00:01<00:00, 1746.52 examples/s]
Truncating train dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2260/2260 [00:00<00:00, 4732.22 examples/s]
Converting eval dataset to ChatML: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 119/119 [00:00<00:00, 40307.05 examples/s]
Applying chat template to eval dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 119/119 [00:00<00:00, 28934.62 examples/s]
Tokenizing eval dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 119/119 [00:00<00:00, 1674.63 examples/s]
Truncating eval dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 119/119 [00:00<00:00, 4346.43 examples/s]


In [14]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_LAUNCH_BLOCKING"]="1"
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mmamubieke-parehati[0m ([33mmamubieke-parehati-ISSAI[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,1.3556,1.362775
200,1.2567,1.251368
300,1.1613,1.210384
400,1.1733,1.194203
500,1.1491,1.18645


TrainOutput(global_step=566, training_loss=1.2867608087222904, metrics={'train_runtime': 386.8576, 'train_samples_per_second': 11.684, 'train_steps_per_second': 1.463, 'total_flos': 9979924473114624.0, 'train_loss': 1.2867608087222904})

### Run model's inference

In [15]:
prompt = "\n–°”ô–ª–µ–º\n"

messages = [{'role':'user', 'content':prompt}]

In [16]:
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

In [17]:
formatted_prompt_2 = formatted_prompt.replace("user : ", "user:\n") + "<|im_start|>assistant:\n"

inputs_2 = tokenizer(formatted_prompt_2, return_tensors="pt").to(device)

In [18]:
print(formatted_prompt_2)

<|im_start|>user

–°”ô–ª–µ–º
<|im_end|>
<|im_start|>assistant:



In [20]:
tokenizer.encode("<|im_end|>\n<|im_start|>")

[2, 215, 1]

In [19]:
from transformers import StoppingCriteria
# https://github.com/huggingface/trl/issues/921
class EosListStoppingCriteria(StoppingCriteria):
    def __init__(self, eos_sequence = [2, 215, 1]):
        self.eos_sequence = eos_sequence

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
        return self.eos_sequence in last_ids

In [24]:
new_model = LlamaForCausalLM.from_pretrained("/data/nvme6n1p1/adal_workspace/small_llm/models/chatting_llm/try_1/checkpoint-566").to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00,  7.34it/s]


In [25]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
streamer = TextStreamer(tokenizer)
ouputs  = new_model.generate(**inputs_2,
    do_sample=True,
    max_new_tokens=100, 
    temperature=0.3,
    stopping_criteria=[EosListStoppingCriteria()],
    # streamer=streamer, 
    repetition_penalty=1.17, 
    # top_p=1.0,
    eos_token_id=tokenizer.encode("<|im_start|>"))

In [27]:
ouputs_2  = new_model.generate(**inputs_2, max_new_tokens=100, do_sample=True, eos_token_id=tokenizer.encode("<|im_end|>"), repetition_penalty=2.07, temperature=0.1, top_p=1.0, streamer=streamer)

<|im_start|>user

–°”ô–ª–µ–º
<|im_end|>
<|im_start|>assistant:
“ö–æ—à –∫–µ–ª–¥—ñ“£—ñ–∑! );- –¥–µ–ø –∂–∞—É–∞–ø –±–µ—Ä–¥—ñ. | (“õ–æ—Å –Ω“Ø–∫—Ç–µ–¥–µ–Ω –∫–µ–π—ñ–Ω) –∂”ô–Ω–µ —Ç–µ—Ä—É–¥—ñ –∂–∞–ª“ì–∞—Å—Ç—ã—Ä—ã“£—ã–∑). –ë“±–ª –∫”©–º–µ–∫—Ç–µ—Å–µ–¥—ñ, —Ä–∞—Ö–º–µ—Ç; “õ–∞–∂–µ—Ç –µ–º–µ—Å.)>><h2 ><p class="title" id="" name = "–ú—ã—Å—ã“õ—Ç—ã“£ –∞—Ç—ã"></a href_lengths` </heading style ‚ÄúTail Length‚Äù) # –ú—ã—Å–∞–ª–¥–∞—Ä –º–µ–Ω –∫–µ“£–µ—Å—Ç–µ—Ä “Ø—à—ñ–Ω –∞—Ç–∞—É–¥—ã –ø–∞–π–¥–∞–ª–∞–Ω—ã“£—ã–∑ –Ω–µ–º–µ—Å–µ –æ–Ω—ã ”©–∑–≥–µ—Ä—Ç—ñ“£—ñ–∑): –º—ã—Å—ã“õ—Ç–∞—Ä —Ç—É—Ä–∞–ª—ã


In [62]:
import json
from argparse import ArgumentParser
from pathlib import Path
from typing import Literal, Optional
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers import LlamaConfig as HFLlamaConfig

TEST_PROMPT = "“ö–∞–∑–∞“õ—Å—Ç–∞–Ω —Ç—É—Ä–∞–ª—ã –Ω–µ –±—ñ–ª–µ—Å—ñ–Ω?"

def check_converted_model_generation(save_path: Path):
    """Loads a huggingface model and tokenizer from `save_path` and
    performs a dummy text generation."""

    tokenizer = AutoTokenizer.from_pretrained(save_path)
    messages = [{'role':'user', 'content':TEST_PROMPT}]
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    formatted_prompt_2 = formatted_prompt.replace("user : ", "user:\n") + "<|im_start|>assistant:\n"
    input_ids = tokenizer(formatted_prompt_2, return_tensors="pt")["input_ids"].cuda()
    print("Inputs:", tokenizer.batch_decode(input_ids))

    model = LlamaForCausalLM.from_pretrained(save_path).cuda().bfloat16()
    out = model.generate(input_ids, max_new_tokens=1000, do_sample=True, temperature=0.3, top_p=0.98, repetition_penalty=1.17)
    
    print("Generation (converted): ", tokenizer.decode(out[0], skip_special_tokens=True))
    # print("Generation (Not-converted): ", out[0])

In [69]:
TEST_PROMPT = "¬´“ö–∞–∑–∞“õ—Å—Ç–∞–Ω –æ—Ç–∞—Ä –±–æ–ª—ã–ø –∫–µ–ª–¥—ñ –∂”ô–Ω–µ —Å–æ–ª–∞–π –±–æ–ª—ã–ø “õ–∞–ª–¥—ã¬ª –¥–µ–ø –∞–π—Ç“õ–∞–Ω “õ–∞–π—Ä–∞—Ç–∫–µ—Ä?"
check_converted_model_generation("/data/nvme6n1p1/adal_workspace/small_llm/models/chatting_llm/try_1/checkpoint-566")

Inputs: ['<|im_start|>user\n¬´“ö–∞–∑–∞“õ—Å—Ç–∞–Ω –æ—Ç–∞—Ä –±–æ–ª—ã–ø –∫–µ–ª–¥—ñ –∂”ô–Ω–µ —Å–æ–ª–∞–π –±–æ–ª—ã–ø “õ–∞–ª–¥—ã¬ª –¥–µ–ø –∞–π—Ç“õ–∞–Ω “õ–∞–π—Ä–∞—Ç–∫–µ—Ä?<|im_end|>\n<|im_start|>assistant:\n']


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00,  7.99it/s]


Generation (converted):  user
¬´“ö–∞–∑–∞“õ—Å—Ç–∞–Ω –æ—Ç–∞—Ä –±–æ–ª—ã–ø –∫–µ–ª–¥—ñ –∂”ô–Ω–µ —Å–æ–ª–∞–π –±–æ–ª—ã–ø “õ–∞–ª–¥—ã¬ª –¥–µ–ø –∞–π—Ç“õ–∞–Ω “õ–∞–π—Ä–∞—Ç–∫–µ—Ä?
assistant:
–û–ª 1920 –∂—ã–ª–¥–∞—Ä—ã –ö–µ“£–µ—Å ”©–∫—ñ–º–µ—Ç—ñ–Ω–µ “õ–∞—Ä—Å—ã —à—ã“õ“õ–∞–Ω “õ–∞–∑–∞“õ –∂–∞—Å—Ç–∞—Ä—ã–Ω “õ–æ–ª–¥–∞“ì–∞–Ω, –±—ñ—Ä–∞“õ –∫–µ–π—ñ–Ω –æ–ª ¬´—Ö–∞–ª—ã“õ –∂–∞—É—ã¬ª –¥–µ–≥–µ–Ω –∞–π—ã–ø–ø–µ–Ω —Ç“±—Ç“õ—ã–Ω–¥–∞–ª—ã–ø, –∞—Ç—É –∂–∞–∑–∞—Å—ã–Ω–∞ –∫–µ—Å—ñ–ª–≥–µ–Ω.


In [97]:
TEST_PROMPT = "–°–∏–Ω–≥–∞–ø—É—Ä —Ç—É—Ä–∞–ª—ã –∞–π—Ç—ã–ø –±–µ—Ä—à—ñ?"

tokenizer = AutoTokenizer.from_pretrained("/data/nvme6n1p1/adal_workspace/small_llm/models/chatting_llm/try_1/checkpoint-566")
messages = [{'role':'user', 'content':TEST_PROMPT}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
formatted_prompt_2 = formatted_prompt.replace("user : ", "user:\n") + "<|im_start|>assistant:\n"

In [105]:
from openai import OpenAI
client = OpenAI(
    base_url="http://0.0.0.0:8009/v1",
    api_key="token-abc123",
)

completion = client.chat.completions.create(
  model="/data/nvme6n1p1/adal_workspace/small_llm/models/chatting_llm/try_1/checkpoint-566",
  messages=[
        {"role": "user", "content": formatted_prompt}
    ],
  stream=True,
  temperature=0.4,
  top_p=0.95,
  stop=["<|endoftext|>","<|im_end|>", "<|im_start|>"],
    extra_body={
        "skip_special_tokens": False,
        # "repetition_penalty":1.17
    },
)
cool = []
for chunk in completion:
    if chunk.choices[0].delta.content is not None:
        cool.append(chunk.choices[0].delta.content)

print("".join(cool))

–°–∏–Ω–≥–∞–ø—É—Ä - –û“£—Ç“Ø—Å—Ç—ñ–∫-–®—ã“ì—ã—Å –ê–∑–∏—è–¥–∞“ì—ã –∞—Ä–∞–ª–¥—ã“õ –º–µ–º–ª–µ–∫–µ—Ç. –û–ª 1963 –∂—ã–ª—ã 9 –∂–µ–ª—Ç–æ“õ—Å–∞–Ω–¥–∞ “∞–ª—ã–±—Ä–∏—Ç–∞–Ω–∏—è–¥–∞–Ω —Ç”ô—É–µ–ª—Å—ñ–∑–¥—ñ–∫ –∞–ª–¥—ã. –û–ª ”©–∑—ñ–Ω—ñ“£ –µ—Ä–µ–∫—à–µ –º”ô–¥–µ–Ω–∏–µ—Ç—ñ –º–µ–Ω —Ç—ñ–ª—ñ–º–µ–Ω, —Ç–∞“£“ì–∞–∂–∞–π—ã–ø –∂–∞“ì–∞–∂–∞–π–ª–∞—Ä—ã–º–µ–Ω –∂”ô–Ω–µ –∂–∞–Ω–¥—ã —Ç“Ø–Ω–≥—ñ ”©–º—ñ—Ä—ñ–º–µ–Ω —Ç–∞–Ω—ã–º–∞–ª.

