In [1]:
import pandas as pd
import torch, os, json, re
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, TextIteratorStreamer, AutoConfig)
from datasets import load_dataset, DatasetDict
from peft import (LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel)
from trl import SFTConfig
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from tensorboard import program
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import psutil
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# PyTorch / CPU runtime tuning.
# Ask the CUDA caching allocator for expandable segments and fix the number of
# CPU threads PyTorch uses.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.set_num_threads(16)
# Optional CPU pinning (disabled):
# num_cores = psutil.cpu_count(logical=False)
# cpu_affinity = list(range(num_cores))
# p = psutil.Process(os.getpid())
# p.cpu_affinity(cpu_affinity)

# Local directory holding the base checkpoint.
model_path = r"/mnt/models/llm_storage/DeepSeek-R1-Distill-Llama-8B"

# 4-bit NF4 weight quantization with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the quantized base model and let `device_map="auto"` decide placement.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)


Loading checkpoint shards: 100%|██████████| 7/7 [00:07<00:00,  1.01s/it]


In [4]:
# LoRA adapter settings: rank 16, alpha 16, no dropout, no bias training,
# applied to the q/k/v/o and gate/up/down projection modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Enable gradient checkpointing on the base model, prepare the quantized model
# for k-bit training, then wrap it with the LoRA adapters.
model.gradient_checkpointing_enable()
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

# Report how many parameters are actually trainable after wrapping.
model.print_trainable_parameters()

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196


In [5]:
# Prompt template used to render every training sample. It has three positional
# `{}` slots which `formatting_prompts_func` fills, in order, with the question,
# the chain of thought (placed between the <think> tags) and the final answer.
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are an advanced AI assistant specializing in mathematics, science, engineering, and technology. Your expertise includes problem-solving, theorem proofs, numerical computations, and logical reasoning. Ensure that your responses are precise, well-structured, and aligned with formal STEM methodologies.

### Question:
{}

### Response:
<think>
{}
</think>
{}
"""


In [6]:
# End-of-sequence marker taken from the tokenizer; `formatting_prompts_func`
# appends it to every rendered training sample.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into full training prompts.

    Args:
        examples: Batched mapping with parallel sequences under the keys
            "problem", "reannotated_assistant_content" and "solution".

    Returns:
        A dict with a single key "text" holding one rendered string per row:
        ``train_prompt_style`` filled with (problem, chain of thought,
        solution) followed by ``EOS_TOKEN``.
    """
    rows = zip(
        examples["problem"],
        examples["reannotated_assistant_content"],
        examples["solution"],
    )
    return {
        "text": [
            train_prompt_style.format(question, reasoning, answer) + EOS_TOKEN
            for question, reasoning, answer in rows
        ],
    }

In [7]:
# @misc{slam-distillation-from-r1,  
#     author = {Sathwik Tejaswi Madhusudhan and Shruthan Radhakrishna and Jash Mehta and Toby Liang},  
#     title = {Millions scale dataset distilled from R1-32b},  
#     howpublished = {https://huggingface.co/datasets/ServiceNow-AI/R1-Distill-SFT},
#     publisher = {SLAM - ServiceNow Language Models Lab},
#     year = {2025}
# }

# Load the first 30,000 rows of the 'v0' config and render each row into the
# training prompt (adds a "text" column via `formatting_prompts_func`).
# NOTE(review): trust_remote_code=True lets the dataset repository run code at
# load time — confirm it is actually required for this dataset.
dataset = load_dataset("ServiceNow-AI/R1-Distill-SFT", 'v0', split="train[:30000]", trust_remote_code=True)
dataset = dataset.map(formatting_prompts_func, batched=True)

# 80/10/10 split: hold out 20%, then halve the held-out part into
# validation and test. Both splits are seeded for reproducibility.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
temp_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)

split_dataset = DatasetDict({
    "train": split_dataset["train"],
    "validation": temp_split["train"],
    "test": temp_split["test"],
})

# Every row must land in exactly one split.
assert sum(len(part) for part in split_dataset.values()) == len(dataset)

# Persist the splits as JSON lines so later cells can reload the exact same
# partition. Create the target directory up front instead of relying on the
# writer to do it.
os.makedirs("split_sets", exist_ok=True)
split_dataset["train"].to_json("split_sets/train.json", orient="records", lines=True)
split_dataset["validation"].to_json("split_sets/val.json", orient="records", lines=True)
split_dataset["test"].to_json("split_sets/test.json", orient="records", lines=True)
print(f"Train dataset size: {len(split_dataset['train'])}")
print(f"Validation dataset size: {len(split_dataset['validation'])}")
print(f"Test dataset size: {len(split_dataset['test'])}")

Creating json from Arrow format: 100%|██████████| 24/24 [00:02<00:00,  9.20ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  9.43ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  9.94ba/s]

Train dataset size: 24000
Validation dataset size: 3000
Test dataset size: 3000





In [7]:
# Sanity check: show the fully rendered "text" field of the first training row.
print(split_dataset["train"].select([0])["text"])

['Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are an advanced AI assistant specializing in mathematics, science, engineering, and technology. Your expertise includes problem-solving, theorem proofs, numerical computations, and logical reasoning. Ensure that your responses are precise, well-structured, and aligned with formal STEM methodologies.\n\n### Question:\nReading comprehension:<br/>Definition: The values of unknowns that satisfy both equations and inequalities are called "ideal solutions" of these equations and inequalities. For example, given the equation $2x-1=1$ and the inequality $x+1 \\gt 0$, when $x=1$, $2x-1=2\\times 1-1=1$ and $1+1=2 \\gt 0$ both hold true. In this case, "$x=1$" is considered an

In [None]:
# Switch to the project directory; relative paths used from here on
# resolve against it.
os.chdir("/home/joshua/llms/deepseekr1")

# Start a W&B run whose name carries a freshly generated id.
run_name = f"Finetune-R1-8B-run-{wandb.util.generate_id()}"
wandb.init(project="DeepSeek-Finetune", name=run_name)

In [None]:
# Supervised fine-tuning configuration for the LoRA adapters.
# Per device, one optimizer step covers 12 samples x 4 accumulation steps.
sft_config = SFTConfig(
    output_dir="outputs",
    dataset_text_field="text",
    # NOTE(review): the rendered samples contain a long prompt plus chain of
    # thought; presumably anything beyond 512 tokens is truncated, which would
    # drop the </think> block, the answer and the EOS token — confirm.
    max_seq_length=512,
    num_train_epochs=2,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=4,
    optim="adamw_8bit",
    evaluation_strategy="steps",
    # NOTE(review): evaluating on the full validation split and checkpointing
    # every 10 steps is very frequent — consider a larger interval.
    eval_steps=10,
    save_steps=10,
    logging_steps=10,
    disable_tqdm=False,
    learning_rate=2e-4,  # suggested range: 5e-5 to 1e-4; original value: 5e-6
    # The base model is loaded with torch_dtype=torch.bfloat16 and
    # bnb_4bit_compute_dtype=torch.bfloat16, so train under bf16 mixed
    # precision instead of fp16 to keep a single half-precision format.
    bf16=True,
    fp16=False,
    save_strategy="steps",
    save_total_limit=2,
    lr_scheduler_type="linear",
    report_to="wandb",
    save_safetensors=True,
    # The "text" field already ends with EOS_TOKEN (added in
    # formatting_prompts_func), so no extra special tokens are requested here.
    dataset_kwargs={"add_special_tokens": False, "append_concat_token": False},
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
)

# Trainer over the train/validation splits; the test split is kept untouched.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["validation"],
    tokenizer=tokenizer,
)

  trainer = SFTTrainer(
Map: 100%|██████████| 3000/3000 [00:01<00:00, 1879.53 examples/s]

[2025-02-11 16:04:30,124] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)



/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: /usr/local/cuda-12.1/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'
/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: /usr/local/cuda-12.1/lib64/libstdc++.so.6: undefined reference to `fesetround@GLIBC_2.2.5'
/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: /usr/local/cuda-12.1/lib64/libstdc++.so.6: undefined reference to `fegetround@GLIBC_2.2.5'
collect2: error: ld returned 1 exit status


In [None]:
# Restart or load from checkpoint
# When True, training resumes from the newest checkpoint if one with a
# trainer_state.json exists; otherwise (or when False) training starts fresh.
load_check = True

def get_latest_checkpoint(output_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Args:
        output_dir: Directory to scan. Relative paths are resolved against the
            current working directory.

    Returns:
        Absolute path of the checkpoint directory with the largest step
        number, or ``None`` when ``output_dir`` does not exist or contains no
        such directory.
    """
    output_dir = os.path.abspath(output_dir)
    if not os.path.isdir(output_dir):
        # Nothing has been written yet (e.g. very first run): that simply
        # means "no checkpoint", not an error.
        return None

    step_to_name = {}
    for entry in os.listdir(output_dir):
        match = re.fullmatch(r"checkpoint-(\d+)", entry)
        # Ignore look-alikes such as "checkpoint-tmp" and plain files; only
        # real "checkpoint-<digits>" directories are candidates.
        if match and os.path.isdir(os.path.join(output_dir, entry)):
            step_to_name[int(match.group(1))] = entry

    if not step_to_name:
        return None
    return os.path.join(output_dir, step_to_name[max(step_to_name)])

# Newest checkpoint left behind by earlier runs (None if there is none).
latest_checkpoint = get_latest_checkpoint("/home/joshua/llms/deepseekr1/outputs")

# Resuming requires: a checkpoint, the load_check switch, and a saved
# trainer_state.json inside that checkpoint.
state_file = os.path.join(latest_checkpoint, "trainer_state.json") if latest_checkpoint else None
resumable = bool(state_file) and load_check and os.path.exists(state_file)

if not resumable:
    print("No valid checkpoint found. Training from scratch.")
    trainer.train()
else:
    # Show where the previous run stopped before continuing from it.
    with open(state_file, "r") as f:
        trainer_state = json.load(f)
    print("Trainer State Loaded from Checkpoint:")
    print("Epoch:", trainer_state["epoch"])
    print("Global Step:", trainer_state["global_step"])

    print(f"Resuming from latest checkpoint: {latest_checkpoint}")
    trainer.train(resume_from_checkpoint=latest_checkpoint)

Trainer State Loaded from Checkpoint:
Epoch: 0.1
Global Step: 50
Resuming from latest checkpoint: outputs/checkpoint-50


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after pa

Step,Training Loss,Validation Loss


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from peft import PeftModel

# Base weights that the LoRA adapters get merged into.
# NOTE(review): this path differs from `model_path` used when the model was
# loaded for training — confirm both point at the same base checkpoint.
BASE_MODEL_PATH = "/home/joshua/llms/deepseekr1/DeepSeek-R1-Distill-Llama-8B"
# Destination directory for the merged (base + adapter) model and tokenizer.
MERGED_MODEL_PATH = "/home/joshua/llms/deepseekr1/DeepSeek-R1-8B-FINETINED-v0-3"

def get_latest_checkpoint(output_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Args:
        output_dir: Directory to scan. Relative paths are resolved against the
            current working directory.

    Returns:
        Absolute path of the checkpoint directory with the largest step
        number, or ``None`` when ``output_dir`` does not exist or contains no
        such directory.
    """
    output_dir = os.path.abspath(output_dir)
    if not os.path.isdir(output_dir):
        # A missing directory simply means "no checkpoint", not an error.
        return None

    step_to_name = {}
    for entry in os.listdir(output_dir):
        match = re.fullmatch(r"checkpoint-(\d+)", entry)
        # Ignore look-alikes such as "checkpoint-tmp" and plain files; only
        # real "checkpoint-<digits>" directories are candidates.
        if match and os.path.isdir(os.path.join(output_dir, entry)):
            step_to_name[int(match.group(1))] = entry

    if not step_to_name:
        return None
    return os.path.join(output_dir, step_to_name[max(step_to_name)])

# Pick the newest adapter checkpoint produced by training.
latest_checkpoint = get_latest_checkpoint("outputs")
if latest_checkpoint is None:
    # Fail early with an actionable message instead of handing None to PEFT.
    raise FileNotFoundError(
        f"No checkpoint-<step> directory found under {os.path.abspath('outputs')}; "
        "run training before merging."
    )

print("Merging model...")
# Load the unquantized base weights in fp16, attach the trained adapters, then
# fold the adapter weights into the base model.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    torch_dtype=torch.float16
)
peft_model = PeftModel.from_pretrained(
    base_model,
    latest_checkpoint,
    torch_dtype=torch.float16,
)
merged_model = peft_model.merge_and_unload()

# NOTE(review): `config` is loaded from the base model and passed as a keyword;
# confirm save_pretrained actually consumes it (the merged model carries its
# own config).
config = AutoConfig.from_pretrained(BASE_MODEL_PATH)
merged_model.save_pretrained(MERGED_MODEL_PATH, config=config)
# NOTE(review): relies on `tokenizer` created in an earlier cell; this cell
# fails on a fresh kernel unless that cell has been run.
tokenizer.save_pretrained(MERGED_MODEL_PATH)

print(f"Training complete. Fine-tuned model saved at {MERGED_MODEL_PATH}")

Merging model...


Loading checkpoint shards: 100%|██████████| 7/7 [00:22<00:00,  3.21s/it]


[2025-02-15 11:15:15,815] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: /usr/local/cuda-12.1/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'
/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: /usr/local/cuda-12.1/lib64/libstdc++.so.6: undefined reference to `fesetround@GLIBC_2.2.5'
/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: /usr/local/cuda-12.1/lib64/libstdc++.so.6: undefined reference to `fegetround@GLIBC_2.2.5'
collect2: error: ld returned 1 exit status


Training complete. Fine-tuned model saved at /home/joshua/llms/deepseekr1/DeepSeek-R1-8B-FINETINED-v0


In [None]:
# Loading model and data
wandb.init(project="DeepSeek-Finetune", name="Finetune-R1-8B-Eval", resume="allow")

# Reload the three persisted splits (order: train, validation, test). Each
# JSON file is exposed by `load_dataset` under its "train" split.
train_dataset, val_dataset, test_dataset = (
    load_dataset("json", data_files=split_file, split="train")
    for split_file in (
        "split_sets/train.json",
        "split_sets/val.json",
        "split_sets/test.json",
    )
)

# Merged model in fp16, switched to eval mode for generation.
tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MERGED_MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
).eval()


# Generate and compare prompts to golden then log them
log_file = "eval_results.log"

def evaluate_model(dataset, dataset_name, num_samples=5, max_new_tokens=128):
    """Greedy-decode answers for the first rows of a dataset and score exact match.

    Uses the notebook-level ``model``, ``tokenizer`` and ``log_file``. Results
    are printed, logged to W&B and appended to ``log_file`` as JSON lines.

    Args:
        dataset: Indexable dataset whose rows have "problem" and "solution" fields.
        dataset_name: Label used in printed output, W&B metric keys and log entries.
        num_samples: Maximum number of leading rows to evaluate; capped at
            ``len(dataset)``.
        max_new_tokens: Generation budget per sample.

    Returns:
        Fraction of evaluated samples whose generated continuation equals the
        reference solution after stripping surrounding whitespace.

    Raises:
        ValueError: If there are no samples to evaluate.
    """
    # Never index past the end of a small dataset.
    num_samples = min(num_samples, len(dataset))
    if num_samples <= 0:
        raise ValueError(f"No samples to evaluate for dataset '{dataset_name}'.")

    num_correct = 0
    generated_outputs = []
    gold_outputs = []
    eval_logs = []

    for i in range(num_samples):
        example = dataset[i]
        prompt = example["problem"]
        gold_answer = example["solution"].strip()

        # NOTE(review): training samples were rendered with train_prompt_style,
        # whereas the bare problem text is used as the prompt here — confirm
        # this mismatch is intended.
        # Follow the model's own device instead of assuming "cuda".
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            # Greedy decoding: no sampling parameters (a temperature would
            # contradict do_sample=False).
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        # generate() returns prompt + continuation; keep only the newly
        # generated tokens so the comparison is against the answer alone.
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        is_correct = generated_text == gold_answer
        num_correct += int(is_correct)
        generated_outputs.append(generated_text)
        gold_outputs.append(gold_answer)

        eval_logs.append({
            "dataset": dataset_name,
            "index": i,
            "prompt": prompt,
            "generated_output": generated_text,
            "gold_output": gold_answer,
            "correct": is_correct
        })

        print(f"\n--- {dataset_name} Example {i} ---")
        print(f"Prompt: {prompt}")
        print(f"Generated:\n{generated_text}")
        print(f"Gold:\n{gold_answer}")
        print(f"Correct: {is_correct}")
        print("-" * 40)

    accuracy = num_correct / num_samples
    wandb.log({
        f"eval/{dataset_name}_accuracy": accuracy,
        f"eval/{dataset_name}_num_samples": num_samples,
        f"eval/{dataset_name}_generated_samples": generated_outputs,
        f"eval/{dataset_name}_gold_samples": gold_outputs
    })
    print(f"\nExact-match accuracy on {num_samples} {dataset_name} samples: {accuracy * 100:.2f}%")

    # Append so results from several splits / runs accumulate in one file.
    with open(log_file, "a", encoding="utf-8") as f:
        for entry in eval_logs:
            f.write(json.dumps(entry) + "\n")

    return accuracy

# Evaluate for each set
# Runs in order train -> validation -> test, five samples per split.
train_acc, val_acc, test_acc = (
    evaluate_model(split, label, num_samples=5)
    for split, label in (
        (train_dataset, "train"),
        (val_dataset, "validation"),
        (test_dataset, "test"),
    )
)

# Log the three accuracies together, then close the W&B run.
summary_metrics = {
    "eval/train_accuracy": train_acc,
    "eval/validation_accuracy": val_acc,
    "eval/test_accuracy": test_acc
}
wandb.log(summary_metrics)

wandb.finish()

print(f"\nEvaluation results saved to {log_file}")

In [None]:
# DISABLED (every line below is commented out): draft for converting the quantized model to full precision.
# MERGED_MODEL_PATH = "/home/joshua/llms/deepseekr1/DeepSeek-R1-8B-FINETINED-v0-3"
# UNQUANTIZED_MODEL_PATH = "/home/joshua/llms/deepseekr1/DeepSeek-R1-8B-FINETINED-FULL"

# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_quant_type="nf4",
# )

# model = AutoModelForCausalLM.from_pretrained(
#     MERGED_MODEL_PATH,
#     device_map="auto",
#     quantization_config=quant_config,
# )

# model = model.to(torch.float16)

# model.save_pretrained(UNQUANTIZED_MODEL_PATH)
# tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)
# tokenizer.save_pretrained(UNQUANTIZED_MODEL_PATH)
# print(f"Full precision model saved at: {UNQUANTIZED_MODEL_PATH}")