In [None]:
import pandas as pd
import torch, os, json, re, time
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, TextIteratorStreamer, AutoConfig)
from datasets import load_dataset, DatasetDict
from peft import (LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel)
from trl import SFTConfig
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from tensorboard import program
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import psutil
import wandb
from rouge_score import rouge_scorer
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local directory the tokenizer and model weights are loaded from.
model_path = r"/mnt/models/llm_storage/DeepSeek-R1-Distill-Llama-8B"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# 4-bit NF4 quantization with the compute dtype set to bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)


Loading checkpoint shards: 100%|██████████| 7/7 [01:27<00:00, 12.46s/it]


In [None]:
# Enable gradient checkpointing on the quantized base model before wrapping it.
model.gradient_checkpointing_enable()

# LoRA adapter settings: rank 16, alpha 16, no dropout, no bias training.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=[
        # q/k/v/o projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # gate/up/down projections
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Prepare the k-bit model for training, then attach the adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)
model.print_trainable_parameters()

: 

In [None]:
# Training prompt template. The three positional `{}` placeholders are filled,
# in order, with the question, the chain-of-thought (placed between the
# <think> tags) and the final answer -- see formatting_prompts_func.
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are an advanced AI assistant specializing in mathematics, science, engineering, and technology. Your expertise includes problem-solving, theorem proofs, numerical computations, and logical reasoning. Ensure that your responses are precise, well-structured, and aligned with formal STEM methodologies.

### Question:
{}

### Response:
<think>
{}
</think>
{}
"""


In [None]:
# Pad and truncate on the right-hand side of each sequence.
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"
# End-of-sequence marker appended to every formatted training example below.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into full training prompts.

    Args:
        examples: Batched mapping with the columns ``problem``,
            ``deepseek_reasoning`` and ``deepseek_solution`` (equal-length
            sequences).

    Returns:
        ``{"text": [...]}`` with one string per row: ``train_prompt_style``
        filled with (problem, reasoning, solution) followed by ``EOS_TOKEN``.
    """
    rows = zip(
        examples["problem"],
        examples["deepseek_reasoning"],
        examples["deepseek_solution"],
    )
    return {
        "text": [
            train_prompt_style.format(question, reasoning, answer) + EOS_TOKEN
            for question, reasoning, answer in rows
        ],
    }

In [None]:
# NOTE: this BibTeX entry cites ServiceNow-AI/R1-Distill-SFT, whereas the dataset
# loaded in this cell is open-thoughts/OpenThoughts-114k.
# @misc{slam-distillation-from-r1,
#     author = {Sathwik Tejaswi Madhusudhan and Shruthan Radhakrishna and Jash Mehta and Toby Liang},
#     title = {Millions scale dataset distilled from R1-32b},
#     howpublished = {https://huggingface.co/datasets/ServiceNow-AI/R1-Distill-SFT},
#     publisher = {SLAM - ServiceNow Language Models Lab},
#     year = {2025}
# }

# Load the "metadata" configuration and add the formatted "text" column.
dataset = load_dataset(
    "open-thoughts/OpenThoughts-114k",
    "metadata",
    split="train",
    # NOTE(review): this flag permits running code shipped with the dataset
    # repository -- confirm it is actually required.
    trust_remote_code=True,
)
dataset = dataset.map(formatting_prompts_func, batched=True)

# Hold out 20% of the rows, then halve the hold-out into validation and test.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
temp_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)
split_dataset = DatasetDict(
    {
        "train": split_dataset["train"],
        "validation": temp_split["train"],
        "test": temp_split["test"],
    }
)

# Persist every split as JSON Lines so later cells can reload them from disk.
for split_name, json_path in (
    ("train", "split_sets/train.json"),
    ("validation", "split_sets/val.json"),
    ("test", "split_sets/test.json"),
):
    split_dataset[split_name].to_json(json_path, orient="records", lines=True)

print(f"Train dataset size: {len(split_dataset['train'])}")
print(f"Validation dataset size: {len(split_dataset['validation'])}")
print(f"Test dataset size: {len(split_dataset['test'])}")

Creating json from Arrow format: 100%|██████████| 92/92 [01:25<00:00,  1.08ba/s]
Creating json from Arrow format: 100%|██████████| 12/12 [00:08<00:00,  1.44ba/s]
Creating json from Arrow format: 100%|██████████| 12/12 [00:08<00:00,  1.44ba/s]


Train dataset size: 91165
Validation dataset size: 11396
Test dataset size: 11396


In [None]:
# Sanity check: show the formatted "text" field of the first training example.
print(split_dataset["train"].select([0])["text"])

["Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are an advanced AI assistant specializing in mathematics, science, engineering, and technology. Your expertise includes problem-solving, theorem proofs, numerical computations, and logical reasoning. Ensure that your responses are precise, well-structured, and aligned with formal STEM methodologies.\n\n### Question:\nFind the sum of the first seven prime numbers that have a units digit of 7.\n\n### Response:\n<think>\nOkay, let's see. I need to find the sum of the first seven prime numbers that have a units digit of 7. Hmm, units digit of 7 means that each prime number ends with 7. So, primes like 7, 17, 37, etc. Right?\n\nFirst, I should start by listing prime num

In [None]:
# Switch to the project directory; relative paths in later cells resolve from here.
os.chdir("/home/joshua/llms/deepseekr1")

# Fixed run id together with resume="allow".
# NOTE(review): presumably this re-attaches to the same W&B run on re-execution -- confirm.
wandb.init(
    id="oj6k0ysj",
    resume="allow",
    project="DeepSeek-Finetune",
    name="Finetune-R1-8B-OT-4",
)

[34m[1mwandb[0m: Currently logged in as: [33mjoshuawlod2003[0m ([33mjoshuawlod2003-qut[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [None]:
sft_config = SFTConfig(
    # --- data ---
    dataset_text_field="text",
    max_seq_length=512,
    dataset_kwargs={"add_special_tokens": True, "append_concat_token": False},
    dataloader_pin_memory=True,
    # dataloader_num_workers=8,
    # --- schedule / optimisation ---
    num_train_epochs=0.25,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    gradient_accumulation_steps=2,
    optim="adamw_torch_fused",
    learning_rate=5e-5,  # tried range 5e-5 to 1e-4; original value was 5e-6
    lr_scheduler_type="cosine_with_restarts",
    # NOTE(review): fp16 mixed precision here, while the model is loaded with
    # bfloat16 dtypes -- confirm the mismatch is intentional.
    fp16=True,
    bf16=False,
    # --- evaluation / checkpointing / logging (all every 10 steps) ---
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    save_safetensors=True,
    output_dir="outputs",
    logging_steps=10,
    disable_tqdm=False,
    report_to="wandb",
)

# Supervised fine-tuning on the pre-formatted "text" column of each split.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["validation"],
)

[2025-03-05 02:58:32,418] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/joshua/miniconda3/envs/llm-env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [None]:
# Restart or load from checkpoint
load_check = True

def get_latest_checkpoint(output_dir):
    """Return the highest-numbered ``checkpoint-<step>`` directory, if any.

    Args:
        output_dir: Directory to search; resolved to an absolute path first.

    Returns:
        Absolute path of the ``checkpoint-<step>`` sub-directory with the
        largest step number, or ``None`` when ``output_dir`` does not exist or
        contains no such sub-directory.
    """
    output_dir = os.path.abspath(output_dir)
    # On a fresh run the output directory may not have been created yet.
    if not os.path.isdir(output_dir):
        return None

    step_by_name = {}
    for name in os.listdir(output_dir):
        # Skip entries without a numeric step (e.g. "checkpoint-best") and
        # stray files; only real checkpoint directories can be resumed from.
        match = re.match(r"checkpoint-(\d+)", name)
        if match and os.path.isdir(os.path.join(output_dir, name)):
            step_by_name[name] = int(match.group(1))

    if not step_by_name:
        return None

    latest_checkpoint = max(step_by_name, key=step_by_name.get)
    return os.path.join(output_dir, latest_checkpoint)

latest_checkpoint = get_latest_checkpoint("/home/joshua/llms/deepseekr1/outputs")

# Resume only when resuming is enabled and the newest checkpoint carries a
# trainer_state.json file.
can_resume = (
    bool(latest_checkpoint)
    and load_check
    and os.path.exists(os.path.join(latest_checkpoint, "trainer_state.json"))
)

if not can_resume:
    print("No valid checkpoint found. Training from scratch.")
    trainer.train()
else:
    with open(os.path.join(latest_checkpoint, "trainer_state.json"), "r") as f:
        trainer_state = json.load(f)

    # Report where the previous run stopped before continuing from it.
    print("Trainer State Loaded from Checkpoint:")
    print("Epoch:", trainer_state["epoch"])
    print("Global Step:", trainer_state["global_step"])

    print(f"Resuming from latest checkpoint: {latest_checkpoint}")
    trainer.train(resume_from_checkpoint=latest_checkpoint)

Trainer State Loaded from Checkpoint:
Epoch: 0.9997367728349565
Global Step: 1899
Resuming from latest checkpoint: /home/joshua/llms/deepseekr1/outputs/checkpoint-1899


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Step,Training Loss,Validation Loss


In [None]:
import os, re, torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from peft import PeftModel

# Base weights the adapter is merged into, and the destination of the merged model.
BASE_MODEL_PATH = "/home/joshua/llms/deepseekr1/DeepSeek-R1-Distill-Llama-8B"
MERGED_MODEL_PATH = "/home/joshua/llms/deepseekr1/DeepSeek-R1-8B-OpenThought-4"

def get_latest_checkpoint(output_dir):
    """Return the highest-numbered ``checkpoint-<step>`` directory, if any.

    Args:
        output_dir: Directory to search (used as given, not made absolute).

    Returns:
        ``output_dir`` joined with the name of the ``checkpoint-<step>``
        sub-directory that has the largest step number, or ``None`` when
        ``output_dir`` does not exist or contains no such sub-directory.
    """
    # The directory is absent when no training run has written there yet.
    if not os.path.isdir(output_dir):
        return None

    step_by_name = {}
    for name in os.listdir(output_dir):
        # Skip entries without a numeric step (e.g. "checkpoint-best") and
        # stray files; an adapter can only be loaded from a checkpoint directory.
        match = re.match(r"checkpoint-(\d+)", name)
        if match and os.path.isdir(os.path.join(output_dir, name)):
            step_by_name[name] = int(match.group(1))

    if not step_by_name:
        return None

    latest_checkpoint = max(step_by_name, key=step_by_name.get)
    return os.path.join(output_dir, latest_checkpoint)

latest_checkpoint = get_latest_checkpoint("outputs")
if latest_checkpoint is None:
    # Fail before the slow base-model load instead of handing None to PEFT.
    raise FileNotFoundError(
        "No checkpoint-* directory found under 'outputs'; there is no adapter to merge."
    )

print("Merging model into full precision format...")
# Load the unquantized base weights in float16 to merge the adapter into.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    torch_dtype=torch.float16,
)

# Attach the LoRA adapter stored in the newest checkpoint.
peft_model = PeftModel.from_pretrained(
    base_model,
    latest_checkpoint,
    torch_dtype=torch.float16,
)

# Fold the adapter into the base weights, then upcast the result to float32.
merged_model = peft_model.merge_and_unload()
merged_model = merged_model.to(torch.float32)

# Drop any leftover quantization marker so the saved model is treated as a plain one.
if hasattr(merged_model, "quantization_method"):
    del merged_model.quantization_method

# NOTE(review): `config=` is not a documented `save_pretrained` argument --
# confirm that passing the base config here has any effect.
config = AutoConfig.from_pretrained(BASE_MODEL_PATH)
merged_model.save_pretrained(MERGED_MODEL_PATH, config=config)
# Ship the base tokenizer alongside the merged weights.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
tokenizer.save_pretrained(MERGED_MODEL_PATH)

print(f"Training complete. Full precision fine-tuned model saved at {MERGED_MODEL_PATH}")

Merging model into full precision format...


Loading checkpoint shards: 100%|██████████| 7/7 [00:26<00:00,  3.72s/it]


Training complete. Full precision fine-tuned model saved at /home/joshua/llms/deepseekr1/DeepSeek-R1-8B-OpenThought-1-2


In [None]:
cache_file = "/home/joshua/llms/deepseekr1/TRAIN_CACHE.json"

# Replay the cached (timestamp -> epoch) samples into the active wandb run.
with open(cache_file, "r") as f:
    data = json.load(f)
start_time = data.get("start_time")
last_epoch = data.get("last_epoch")
total_training_time = data.get("total_training_time", 0)
epoch_dict = data.get("epoch_list", {})
points = [(float(ts), float(ep)) for ts, ep in epoch_dict.items()]
points.sort(key=lambda x: x[0])

if points and start_time is None:
    # Without the origin the elapsed time cannot be computed; say so explicitly
    # instead of failing with a TypeError on `ts - None`.
    raise ValueError(
        f"{cache_file} contains epoch timestamps but no 'start_time'."
    )

for ts, ep in points:
    # No explicit `step`: the run may already hold steps logged during
    # training, and wandb rejects steps lower than the run's current step, so
    # restarting at 0 would lose these rows. Let wandb assign the next step.
    wandb.log(
        {
            "time_since_start": ts - start_time,
            "epoch": ep,
        }
    )

wandb.log({"training_duration_seconds": total_training_time})
print("Logged epochs and time to wandb.")

# Empty the cache so the same samples are not replayed on the next execution.
with open(cache_file, "w") as f:
    json.dump({}, f)

In [None]:
# Loading model and data
# Reload the three splits that were saved as JSON Lines under split_sets/.
train_dataset, val_dataset, test_dataset = (
    load_dataset("json", data_files=path, split="train")
    for path in (
        "split_sets/train.json",
        "split_sets/val.json",
        "split_sets/test.json",
    )
)

BASE_MODEL_PATH = "/home/joshua/llms/deepseekr1/DeepSeek-R1-Distill-Llama-8B"
MERGED_MODEL_PATH = "/home/joshua/llms/deepseekr1/DeepSeek-R1-8B-OpenThought-4"

# 4-bit NF4 quantization with the compute dtype set to bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)
# NOTE(review): torch_dtype is float16 here but the quantization compute dtype
# is bfloat16 -- confirm the mismatch is intentional.
model = AutoModelForCausalLM.from_pretrained(
    MERGED_MODEL_PATH,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)
model.eval()


# Generate and compare prompts to the golden
log_file = "eval_results.log"

def evaluate_model(dataset, dataset_name, num_samples=5):
    """Greedy-decode answers for the first rows of ``dataset`` and score them.

    Each row's ``problem`` is tokenized and passed to the module-level
    ``model``; the generated continuation (prompt tokens removed) is compared
    with the row's ``deepseek_solution`` by exact string match and ROUGE-L F1.
    The per-sample table and the accuracy are logged to the active wandb run.

    Args:
        dataset: Indexable dataset whose rows provide ``problem`` and
            ``deepseek_solution``.
        dataset_name: Label used in the wandb keys and the printed summary.
        num_samples: Maximum number of rows to evaluate, taken from the start
            of ``dataset``; clamped to the range ``[0, len(dataset)]``.

    Returns:
        Exact-match accuracy in ``[0, 1]``; ``0.0`` when no row was evaluated.
    """
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    table = wandb.Table(columns=["index", "prompt", "generated_output", "gold_output", "rougeL_f1", "exact_match"])

    # Never index past the end of a short split, and tolerate a zero request.
    num_samples = max(0, min(num_samples, len(dataset)))

    num_correct = 0
    for i in range(num_samples):
        example = dataset[i]
        # NOTE(review): the raw problem is used as the prompt, whereas training
        # text was built with train_prompt_style -- confirm this is intended.
        prompt = example["problem"]
        gold_answer = example["deepseek_solution"]

        # Follow the model's device rather than assuming "cuda" (device_map="auto").
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]
        with torch.no_grad():
            # Greedy decoding; `temperature` only applies when sampling, so it
            # is not passed.
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=False,
            )

        # generate() returns prompt + continuation for causal LMs; score only
        # the continuation, otherwise the prompt text pollutes both metrics.
        completion_ids = outputs[0][prompt_length:]
        generated_text = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
        gold_text = gold_answer.strip()

        exact_match = (generated_text == gold_text)
        num_correct += int(exact_match)
        rouge_score = scorer.score(gold_text, generated_text)["rougeL"].fmeasure

        table.add_data(i, prompt, generated_text, gold_text, rouge_score, exact_match)

    accuracy = num_correct / num_samples if num_samples else 0.0

    wandb.log({
        f"eval/{dataset_name}_accuracy": accuracy,
        f"eval/{dataset_name}_samples": table
    })
    print(f"\nExact-match accuracy on {num_samples} {dataset_name} samples: {accuracy * 100:.2f}%")

    return accuracy

# Score the first five rows of every split.
train_acc = evaluate_model(train_dataset, "train", num_samples=5)
val_acc = evaluate_model(val_dataset, "validation", num_samples=5)
test_acc = evaluate_model(test_dataset, "test", num_samples=5)

# evaluate_model() has already logged eval/<split>_accuracy for each split, so
# the values are not logged to wandb a second time here. Append them to
# log_file instead, so the closing message below is accurate.
with open(log_file, "a") as log_handle:
    log_handle.write(
        json.dumps(
            {
                "eval/train_accuracy": train_acc,
                "eval/validation_accuracy": val_acc,
                "eval/test_accuracy": test_acc,
            }
        )
        + "\n"
    )

wandb.finish()

print(f"\nEvaluation results saved to {log_file}")

NameError: name 'quantization_config' is not defined