In [1]:
import os
from datasets import load_dataset, load_dataset_builder


In [82]:
# Hugging Face Hub repository id of the dataset to fine-tune on.
DATASET_NAME = "OpenCoder-LLM/opc-sft-stage2"
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
DATA_FILES_PATTERN = "data/*.parquet" # This pattern loads all Parquet files in the 'data' folder


In [83]:
# Load the "educational_instruct" configuration of DATASET_NAME (all of its splits).
dataset_dict = load_dataset(DATASET_NAME, name="educational_instruct")


In [84]:
# Work with the 'train' split only.
train_dataset = dataset_dict['train']

In [85]:
# Display one raw record (index 1431) as a spot check of the available columns.
train_dataset[1431]


{'seq_id': 26622393304,
 'instruction': 'Write a function to find the kth largest elements in an unsorted array using a min heap.',
 'output': 'Here is the code to solve this problem: \n```python\nimport heapq\n\ndef kthLargestElement(nums, k):\n    min_heap = []\n    for num in nums:\n        heapq.heappush(min_heap, num)\n        if len(min_heap) > k:\n            heapq.heappop(min_heap)\n    return min_heap[0]\n```',
 'code': 'import heapq\n\ndef kthLargestElement(nums, k):\n    min_heap = []\n    for num in nums:\n        heapq.heappush(min_heap, num)\n        if len(min_heap) > k:\n            heapq.heappop(min_heap)\n    return min_heap[0]',
 'entry_point': 'kthLargestElement',
 'testcase': ['assert kthLargestElement([9,8,7,6,5,4,3,2,1],1)==9',
  'assert kthLargestElement([3,2,3,1,2,4,5,5,6],4)==4',
  'assert kthLargestElement([3,2,1,5,6,4],2)==5']}

In [86]:
# Chat layout used for every training example: a fixed system prompt, the user
# instruction and the assistant reply, each turn closed with <|eot_id|>.
# Placeholders: {instruction} and {full_response}.
# NOTE(review): format_code_example (next cell) defines its own identical local
# copy, so this module-level constant is not read by any visible cell.
LLAMA_3_CODE_CHAT_TEMPLATE = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "You are an expert programming assistant.<|eot_id|>" # Keep the system prompt simple and general
    "<|start_header_id|>user<|end_header_id|>\n\n{instruction}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n{full_response}<|eot_id|>"
)


In [87]:
def format_code_example(example, template=None):
    """
    Render one dataset record as a single chat-formatted training string.

    The record's 'instruction' becomes the user turn and its 'output' column
    is used verbatim as the assistant's full response.

    Args:
        example: Mapping with at least the keys 'instruction' and 'output'.
        template: Optional ``str.format`` template containing the
            ``{instruction}`` and ``{full_response}`` placeholders. When it is
            omitted, the Llama 3 chat layout with a fixed system prompt is
            used, so existing callers such as
            ``Dataset.map(format_code_example)`` get exactly the same output
            as before. A different layout can be supplied through
            ``Dataset.map(..., fn_kwargs={"template": ...})``.

    Returns:
        dict: ``{"text": <formatted string>}``.

    Raises:
        KeyError: If 'instruction' or 'output' is missing from ``example``.
    """
    if template is None:
        # The default lives inside the function so the function does not
        # depend on any notebook-level global.
        # NOTE(review): this layout already starts with <|begin_of_text|>; if
        # the tokenizer used downstream also prepends a BOS token the sequence
        # would start with two of them — confirm.
        template = (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
            "You are an expert programming assistant.<|eot_id|>"  # Keep the system prompt simple and general
            "<|start_header_id|>user<|end_header_id|>\n\n{instruction}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n{full_response}<|eot_id|>"
        )

    # Only the template is parsed by str.format; braces inside the record's
    # own text are inserted literally.
    full_text = template.format(
        instruction=example['instruction'],
        full_response=example['output'],
    )
    return {"text": full_text}


In [88]:
# Apply the formatter to every row in parallel, keeping only the new 'text' column.
# The worker count is capped at the number of logical CPUs: a fixed 50 spawns far
# more processes than cores on smaller machines without making the map any faster.
code_formatted = train_dataset.map(
    format_code_example,
    remove_columns=train_dataset.column_names, # Only keep the newly created 'text' column
    num_proc=min(50, os.cpu_count() or 1),
)


In [89]:
# Display one formatted example (index 150) to verify the rendered chat layout.
code_formatted[150]

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an expert programming assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWrite a function that takes two lists as input and returns a new list with elements from the first list that are not present in the second list. The function should maintain the order of elements in the first list.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHere is the code to solve this problem: \n```python\ndef list_difference(list1, list2):\n    """\n    Returns a new list with elements from list1 that are not present in list2, preserving the order of elements in list1.\n    """\n    return [x for x in list1 if x not in list2]\n```<|eot_id|>'}

In [90]:
# Export the formatted examples as JSON Lines (one {"text": ...} object per line).
# The same file name is used as DATASET_PATH in the training part of the notebook.
OUTPUT_FILE = "llama31_code_finetune_simple.jsonl"
code_formatted.to_json(OUTPUT_FILE, orient="records", lines=True)


Creating json from Arrow format:   0%|          | 0/119 [00:00<?, ?ba/s]

92135568

In [1]:
# Base model id and training-data file.
# NOTE(review): both names are assigned again, with the same values, in the
# configuration cell further down.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DATASET_PATH = "llama31_code_finetune_simple.jsonl"  

In [2]:
import gc
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

In [3]:
# Hub id of the base model that is quantized and fine-tuned.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
DATASET_PATH = "llama31_code_finetune_simple.jsonl" # Your correctly formatted JSONL file
# Directory for trainer checkpoints; the final adapter is saved in a sub-folder of it.
OUTPUT_DIR = "./llama31_code_lora_adapter"
# NOTE(review): not referenced by any cell visible in this notebook — presumably
# intended for a later merge step; confirm.
MERGED_MODEL_DIR = os.path.join(OUTPUT_DIR, "merged_model")
SAFE_WINDOWS_WORKERS = 50 # Max workers for datasets.map() to avoid Windows handle error


In [None]:
# MODEL_NAME = "Qwen/Qwen3-0.6B"
# DATASET_PATH = "llama31_code_finetune_simple.jsonl" # Your correctly formatted JSONL file
# OUTPUT_DIR = "./qwen_code_lora_adapter"
# MERGED_MODEL_DIR = os.path.join(OUTPUT_DIR, "merged_model")
# SAFE_WINDOWS_WORKERS = 50 # Max workers for datasets.map() to avoid Windows handle error

In [4]:
# Prefer bfloat16 when the GPU reports support for it, otherwise fall back to float16.
COMPUTE_DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16


In [28]:
# Show which compute dtype was selected on this machine.
COMPUTE_DTYPE

torch.bfloat16

In [5]:
# 4-bit quantization settings used when loading the base model (QLoRA setup).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",               # NormalFloat4 (NF4) data type for the 4-bit weights
    bnb_4bit_compute_dtype=COMPUTE_DTYPE,    # Dtype for math operations: COMPUTE_DTYPE (BF16 if supported, else FP16)
    bnb_4bit_use_double_quant=True,          # Nested quantization for extra memory saving
)

In [6]:
# LoRA adapter settings; handed to the trainer through `peft_config`.
lora_config = LoraConfig(
    r=16,                                    # LoRA attention dimension (Rank)
    lora_alpha=32,                           # Scaling factor (usually 2*r is a good start)
    target_modules=[                         # QLoRA best practice: target all linear layers
        "q_proj", "k_proj", "v_proj", "o_proj", 
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,                       # Dropout applied on the LoRA layers
    bias="none",                             # Do not train bias terms
    task_type="CAUSAL_LM",
)


In [7]:
# Place the whole model on GPU 0: the "" key is the root module and therefore
# covers every sub-module.
# Keys of a device_map are module names, not devices, so the former "cpu": 0
# entry never provided a CPU fallback and has been removed.
CUSTOM_DEVICE_MAP = {"": 0}

In [8]:
# Load the base model with 4-bit quantization, placed according to CUSTOM_DEVICE_MAP.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,

    # Explicit placement instead of automatic device-map inference.
    # (CPU offload is not enabled; if it were needed it would be configured on
    # BitsAndBytesConfig via llm_int8_enable_fp32_cpu_offload=True, not here.)
    device_map=CUSTOM_DEVICE_MAP,

    # trust_remote_code is deliberately not set: the Llama architecture ships
    # with transformers, so there is no reason to allow code from the Hub
    # repository to be executed locally.
    token=os.environ.get("HF_TOKEN"),   # Hub token comes from the environment, never hard-coded
    attn_implementation="sdpa",
)
# The generation KV cache is not needed for training, so it is switched off.
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [9]:
# Tokenizer for MODEL_NAME; it is saved next to the adapter at the end of the notebook.
# NOTE(review): the trainer cell leaves `processing_class=tokenizer` commented
# out, so as far as this notebook shows this tokenizer object is not handed to
# SFTTrainer — confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    padding_side="right", # Required for Llama models during training
    add_eos_token=True,
    token=os.environ.get("HF_TOKEN")
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [19]:
from datasets import load_dataset, Features, Value # <-- Ensure Features and Value are imported

# --- 4. Load and Prepare Dataset ---

# Define the expected feature structure: a single column named 'text' with string values
DATASET_FEATURES = Features({"text": Value("string")})

print(f"Loading dataset from {DATASET_PATH} with explicit features...")
dataset = load_dataset(
    'json', 
    data_files=DATASET_PATH, 
    split="train",
    # Pass the schema explicitly so the 'text' column is always read as a string
    features=DATASET_FEATURES
) 

Loading dataset from llama31_code_finetune_simple.jsonl with explicit features...


In [20]:
# Number of examples in the full training file.
len(dataset)

118278

In [21]:
# Train on a subset to keep the run short: the first 10,000 rows, or the whole
# dataset when it holds fewer rows (a plain range(10000) would raise in that case).
dataset = dataset.select(range(min(10000, len(dataset))))

In [22]:
# Number of examples after sub-sampling.
len(dataset)

10000

In [23]:
# NOTE(review): this object is never used — the next cell rebinds `training_args`
# to an SFTConfig with the same values before the trainer is created.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,             # Per-GPU batch size
    gradient_accumulation_steps=4,             # Accumulate gradients over 4 steps (4 * 4 = 16)
    optim="paged_adamw_8bit",                  # Optimized optimizer for QLoRA
    logging_steps=25,
    learning_rate=2e-4,
    fp16=not torch.cuda.is_bf16_supported(),   # Use FP16 if BF16 is not supported
    bf16=torch.cuda.is_bf16_supported(),       # Use BF16 if supported (RTX 4080 does)
    save_strategy="epoch",
    do_train=True,
    report_to="none",
    # Add gradient checkpointing for extra memory efficiency if needed
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant':False} 
)

In [24]:
from trl import SFTTrainer, SFTConfig
# Training configuration actually passed to SFTTrainer (replaces the
# TrainingArguments object built in the previous cell).
# Batch of 4 per device with 4 accumulation steps, i.e. 16 sequences per
# optimizer step on one device.
# NOTE(review): constructing the trainer with these settings logs a warning that
# packing / padding-free batches are only known to work reliably with
# flash-attention implementations, while the model is loaded with
# attn_implementation="sdpa" — confirm this combination is acceptable.
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,             
    gradient_accumulation_steps=4,             
    optim="paged_adamw_8bit",                  
    logging_steps=25,
    learning_rate=2e-4,
    # Exactly one of fp16 / bf16 is enabled, depending on GPU support.
    fp16=not torch.cuda.is_bf16_supported(),   
    bf16=torch.cuda.is_bf16_supported(),       
    save_strategy="epoch",
    do_train=True,
    report_to="none",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant':False},
    # Column of the JSONL dataset that holds the fully formatted training text.
    dataset_text_field="text",
    max_length=2048,                     # Renamed from max_seq_length
    packing=True,
    # Worker processes for the trainer's dataset preprocessing steps.
    dataset_num_proc=SAFE_WINDOWS_WORKERS 
    
    
)

In [25]:
# Build the supervised fine-tuning trainer; the LoRA settings are supplied via
# `peft_config` and all data-processing options come from the SFTConfig.
# NOTE(review): `processing_class` is commented out, so the `tokenizer` object
# created earlier is not passed in here — presumably the trainer falls back to
# loading its own tokenizer for the model; confirm.
trainer = SFTTrainer(
    model=model,
    args=training_args,        # This is your SFTConfig object
    train_dataset=dataset,
    peft_config=lora_config,
    #processing_class=tokenizer,       # Keep the tokenizer here
    # All data/processing parameters are gone from here!
)


Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-fla

Adding EOS to train dataset (num_proc=50):   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=50):   0%|          | 0/10000 [00:00<?, ? examples/s]

Packing train dataset (num_proc=50):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [26]:
# Run training with scaled-dot-product attention restricted to the flash and
# memory-efficient kernels (the math fallback stays disabled).
# torch.backends.cuda.sdp_kernel is deprecated and emits a FutureWarning;
# torch.nn.attention.sdpa_kernel is its replacement and takes the list of
# backends that are allowed inside the context.
with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]):
    trainer.train()

  self.gen = func(*args, **kwds)
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009}.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss
25,2.1503


In [27]:
#trainer.train()


# Save the trained model and the tokenizer files under OUTPUT_DIR/final_adapter.
# Presumably only the LoRA adapter weights are written, since the trainer's
# model is PEFT-wrapped — verify the folder contents.
ADAPTER_PATH = os.path.join(OUTPUT_DIR, "final_adapter")
trainer.model.save_pretrained(ADAPTER_PATH)
# NOTE(review): this is the tokenizer object created earlier in the notebook,
# which was not passed to the trainer — confirm it is the one to ship.
tokenizer.save_pretrained(ADAPTER_PATH)




('./llama31_code_lora_adapter\\final_adapter\\tokenizer_config.json',
 './llama31_code_lora_adapter\\final_adapter\\special_tokens_map.json',
 './llama31_code_lora_adapter\\final_adapter\\chat_template.jinja',
 './llama31_code_lora_adapter\\final_adapter\\tokenizer.json')

In [None]:
# Drop the references to the trainer and the model, then force a garbage
# collection pass and release PyTorch's cached CUDA memory.
# Not re-runnable as is: a second run raises NameError because the names are gone.
del trainer, model
gc.collect()
torch.cuda.empty_cache()

In [6]:
# Scratch check unrelated to the fine-tuning flow: nested lists are ordered
# element by element (first item, then second).
l = sorted([[1, 4], [3, 6], [2, 8]])
l

[[1, 4], [2, 8], [3, 6]]