In [None]:
! pip install unsloth

In [None]:
! pip install huggingface-hub

In [None]:
! pip install datasets

In [None]:
import os
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer  # trainer for supervised fine-tuning (SFT)
from unsloth import is_bfloat16_supported  # checks if the hardware supports bfloat16 operations

from huggingface_hub import login
from transformers import TrainingArguments
from datasets import load_dataset  # Lets you load fine-tuning datasets in huggingface


# Read the Hugging Face access token from the HF_TOKEN environment variable
# (a KeyError is raised if it is not set) and log in to the Hub with it.
hugging_face_token = os.environ["HF_TOKEN"]
login(token=hugging_face_token)

In [None]:
# Parameters used to load the pre-trained model
max_seq_length = 1024  # maximum number of tokens handled in a single sequence
dtype = None  # None: do not force a dtype, keep the loader's default choice
load_in_4bit = True  # load the weights with 4-bit quantization to save memory

# Load the distilled R1 model and its tokenizer through Unsloth's FastLanguageModel.
# ref: https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B
model_og, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
    token=hugging_face_token,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)


==((====))==  Unsloth 2025.1.8: Fast Qwen2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Inference prompt template with two positional "{}" slots: the first receives the question,
# the second sits directly after the opening <think> tag (this notebook fills it with "").
# The wording is runtime text sent to the model; keep it in sync with the training template.
prompt_style = """Given a question think and try to answer the question,
Before answering, think when needed based in the question and create a step-by-step chain of thoughts.
Do reverify your answers and reasonsing and correct any mistakes.

### Instruction:
Only think when required

### Question:
{}

### Response:
<think>{}"""

In [None]:
question = "What is (7154 + -92) * 1936 / -1928 ?"

# Baseline check: query the base model (before any fine-tuning) with the inference prompt.
FastLanguageModel.for_inference(model_og)
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")
# Generate a response. The token budget is set explicitly, to the same value used for the
# fine-tuned model later in this notebook, so both runs share one generation limit instead
# of this one depending on the model's default generation settings.
outputs = model_og.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
# Decode the generated tokens back to text and print only the part after "### Response:".
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])


<think>

</think>

To solve the expression \((7154 + -92) \times 1936 / -1928\), we will follow the order of operations (PEMDAS/BODMAS):

1. **Parentheses/Brackets**: First, evaluate the expression inside the parentheses.
   \[
   7154 + (-92) = 7154 - 92 = 7062
   \]
   
2. **Multiplication and Division**: Next, perform multiplication and division from left to right.
   \[
   7062 \times 1936 = ?
   \]
   Let's compute this step by step:
   \[
   7062 \times 1936 = 7062 \times (1000 + 900 + 30 + 6) = 7062 \times 1000 + 7062 \times 900 + 7062 \times 30 + 7062 \times 6
   \]
   \[
   = 7,062,000 + 6,355,800 + 211,860 + 42,372 = 13,079,032
   \]
   Now, divide the result by \(-1928\):
   \[
   13,079,032 \div (-1928) = -6768
   \]
   
3. **Final Answer**: The result of the expression is \(-6768\).

\[
\boxed{-6768}
\]<｜end▁of▁sentence｜>


In [None]:
# Training prompt template for supervised fine-tuning on examples that already contain reasoning.
# It has three positional "{}" slots, filled in order with: the question, the chain of thought
# (placed between <think> and </think>), and the final response.
train_prompt_style = """Given a question think and try to answer the question,
Before answering, think when needed based in the question and create a step-by-step chain of thoughts.
Do reverify your answers and reasonsing and correct any mistakes.

### Instruction:
Only think when required

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

EOS_TOKEN = tokenizer.eos_token  # end-of-sequence token; appended to every formatted training text
def formatting_prompts_func(examples):
    """Build the training text for every example in a batch.

    Args:
        examples: batch mapping with parallel "Question", "Complex_CoT" and
            "Response" sequences.

    Returns:
        A dict with a single "text" key holding one string per example: the
        training template filled with question, chain of thought and response,
        followed by EOS_TOKEN.
    """
    columns = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    return {
        "text": [
            train_prompt_style.format(question_text, chain_of_thought, answer) + EOS_TOKEN
            for question_text, chain_of_thought, answer in columns
        ],
    }


In [None]:
from datasets import load_dataset
# Load the cold-start examples from a local JSONL file.
dataset = load_dataset("json", data_files="dummy_coldstart.jsonl")

# No split is named when loading, so the examples land in the default "train" split.
print(dataset)

# Add a "text" column holding the fully formatted training prompt for each row.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Show one formatted example. A bare `dataset['train'][0]` expression is only rendered when it
# is the last statement of a cell, so print it explicitly to make it visible here.
print(dataset["train"][0])

# Wrap the loaded base model with LoRA adapters through Unsloth's PEFT helper.
model = FastLanguageModel.get_peft_model(
    model_og,
    r=8,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Projection layers that receive adapters: attention (q/k/v/o) and MLP (gate/up/down).
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=2025,
)


DatasetDict({
    train: Dataset({
        features: ['Question', 'Complex_CoT', 'Response'],
        num_rows: 381
    })
})


Unsloth 2025.1.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
# Supervised fine-tuning run over the formatted "text" column of the training split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch size per optimizer step: 2 * 4 = 8
        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        warmup_steps=5,
        max_steps=100,
        learning_rate=2e-4,
        # Mixed precision: bf16 when the hardware supports it, otherwise fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # `report_to` is a TrainingArguments field rather than an SFTTrainer argument;
        # "none" turns off all external logging integrations.
        report_to="none",
    ),
)

# Run training; the returned object carries the run's summary statistics.
trainer_stats = trainer.train()


Map (num_proc=2):   0%|          | 0/381 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 381 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 9,232,384


Step,Training Loss
10,2.2186
20,0.9941
30,0.6411
40,0.5866
50,0.577
60,0.4732
70,0.443
80,0.4567
90,0.4895
100,0.5376


In [None]:
question = "What is (7154 + -92) * 1936 / -1928 ?"

# Put the fine-tuned model into Unsloth's inference mode, then tokenize the filled-in prompt.
FastLanguageModel.for_inference(model)
inputs = tokenizer(
    [prompt_style.format(question, "")],
    return_tensors="pt",
).to("cuda")

# Generate up to 1200 new tokens for the prompt.
outputs = model.generate(
    use_cache=True,
    max_new_tokens=1200,
    attention_mask=inputs.attention_mask,
    input_ids=inputs.input_ids,
)

# Decode the generated tokens and print only the text that follows "### Response:".
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])


<think>
Problem: What is (7154 + -92) * 1936 / -1928?  
Reasoning:  
1. Simplify the expression inside the parentheses: 7154 + (-92) = 7062  
2. Multiply the result by 1936: 7062 * 1936 = 136,162,872  
3. Divide the result by -1928: 136,162,872 / -1928 = -7062  

Recheck the steps and correct any mistakes:
1. The initial operation inside the parentheses is correct (7154 - 92 = 7062).  
2. Multiplying 7062 by 1936 gives 136,162,872, which is accurate.  
3. Finally, dividing this value by -1928 results in -7062, which is correct.
</think>
<think> 
Problem: What is (7154 + -92) * 1936 / -1928?  
Reasoning:  
1. Simplify the expression inside the parentheses: 7154 + (-92) = 7062  
2. Multiply the result by 1936: 7062 * 1936 = 136,162,872  
3. Divide the result by -1928: 136,162,872 / -1928 = -7062  

Recheck the steps and correct any mistakes:
1. The initial operation inside the parentheses is correct (7154 - 92 = 7062).  
2. Multiplying 7062 by 1936 gives 136,162,872, which is accurate. 

In [None]:
new_model_local = "DeepSeek-R1-Test-Qwen-COT"

# Save the model and the tokenizer files into the local output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(new_model_local)

# Additionally write the merged 16-bit model into the same directory.
# NOTE(review): adapter files and merged weights end up side by side in one folder —
# confirm that downstream loaders pick the intended set of weights.
model.save_pretrained_merged(
    new_model_local,
    tokenizer,
    save_method="merged_16bit",
)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.8G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.89 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 53.96it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving DeepSeek-R1-Test-Qwen-COT/pytorch_model.bin...
Done.


In [None]:
#### -------- PUSH TO HUB -------- ####
# new_model_online = "***/DeepSeek-R1-Test-COT"
# model.push_to_hub(new_model_online)
# tokenizer.push_to_hub(new_model_online)

# model.push_to_hub_merged(new_model_online, tokenizer, save_method = "merged_16bit")

In [None]:
# Bundle the saved model directory (recursively) into a single zip archive.
! zip -r DeepSeek-R1-Test-Qwen-COT.zip DeepSeek-R1-Test-Qwen-COT/

  adding: DeepSeek-R1-Test-Qwen-COT/ (stored 0%)
  adding: DeepSeek-R1-Test-Qwen-COT/tokenizer_config.json (deflated 84%)
  adding: DeepSeek-R1-Test-Qwen-COT/adapter_model.safetensors (deflated 8%)
  adding: DeepSeek-R1-Test-Qwen-COT/special_tokens_map.json (deflated 70%)
  adding: DeepSeek-R1-Test-Qwen-COT/config.json (deflated 50%)
  adding: DeepSeek-R1-Test-Qwen-COT/README.md (deflated 66%)
  adding: DeepSeek-R1-Test-Qwen-COT/adapter_config.json (deflated 55%)
  adding: DeepSeek-R1-Test-Qwen-COT/generation_config.json (deflated 37%)
  adding: DeepSeek-R1-Test-Qwen-COT/tokenizer.json (deflated 81%)
  adding: DeepSeek-R1-Test-Qwen-COT/pytorch_model.bin (deflated 12%)
