In [1]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())

for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")



CUDA available: True
Number of GPUs: 4
GPU 0: NVIDIA GeForce RTX 3090
GPU 1: NVIDIA GeForce RTX 3090
GPU 2: NVIDIA GeForce RTX 4090
GPU 3: NVIDIA GeForce RTX 4090


In [2]:
import json
import os
from pprint import pprint
 
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from accelerate import load_checkpoint_and_dispatch, infer_auto_device_map
import torch
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512,expandable_segments:True'
# 1. Free up GPU memory
torch.cuda.empty_cache()

# 2. Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True  # Add this line
)

# 3. Model info
model_id = "meta-llama/Llama-3.3-70B-Instruct"
local_model_dir = "./llama3_70b_checkpoint"

# 4. Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# 5. Calculate max memory for each GPU
n_gpus = torch.cuda.device_count()
# max_memory = {
#     0: "17GiB",
#     1: "17GiB",
#     2: "0GiB",  # Add this with minimal memory
#     3: "19GiB"

# }
max_memory = {
    0: "17GiB",
    1: "17GiB",
    2: "19GiB"  # Add this with minimal memory

}
# 6. Load model with device map inference
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="sequential",  # Let transformers handle device mapping
    max_memory=max_memory,
    trust_remote_code = True 
)

print("Model loaded successfully!")
print("\nCurrent device map:")
for name, device in model.hf_device_map.items():
    print(f"{name}: {device}")

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Model loaded successfully!

Current device map:
model.embed_tokens: 0
model.layers.0: 0
model.layers.1: 0
model.layers.2: 0
model.layers.3: 0
model.layers.4: 0
model.layers.5: 0
model.layers.6: 0
model.layers.7: 0
model.layers.8: 0
model.layers.9: 0
model.layers.10: 0
model.layers.11: 0
model.layers.12: 0
model.layers.13: 0
model.layers.14: 0
model.layers.15: 0
model.layers.16: 0
model.layers.17: 0
model.layers.18: 0
model.layers.19: 0
model.layers.20: 0
model.layers.21: 0
model.layers.22: 0
model.layers.23: 0
model.layers.24: 0
model.layers.25: 0
model.layers.26: 0
model.layers.27: 0
model.layers.28: 0
model.layers.29: 0
model.layers.30: 0
model.layers.31: 0
model.layers.32: 1
model.layers.33: 1
model.layers.34: 1
model.layers.35: 1
model.layers.36: 1
model.layers.37: 1
model.layers.38: 1
model.layers.39: 1
model.layers.40: 1
model.layers.41: 1
model.layers.42: 1
model.layers.43: 1
model.layers.44: 1
model.layers.45: 1
model.layers.46: 1
model.layers.47: 1
model.layers.48: 1
model.lay

In [3]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [4]:
# Configure LoRA
lora_config =  LoraConfig(
    r=32, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05  # Conventional
)
model = get_peft_model(model, lora_config)
# Function to print trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable parameters: {trainable_params} / {total_params} ({100 * trainable_params / total_params:.2f}%)")

# Print the trainable parameters
print_trainable_parameters(model)


Trainable parameters: 89128960 / 36417314816 (0.24%)


In [5]:
import json
from datasets import load_dataset, Dataset, Features, Sequence, Value
from transformers import AutoTokenizer

def generate_and_tokenize_prompt(examples):
    prompts = []
    solutions = []
    
    for title, problem, solution in zip(examples['title'], examples['problem_statement'], examples['solution']):
        prompt = f"""<human>: Below is a control systems engineering problem. Solve it step by step.
        
Title: {title}
Problem: {problem}

<assistant>:""".strip()
        
        prompts.append(prompt)
        solutions.append(solution)

    # Tokenize prompts and solutions
    tokenized_inputs = tokenizer(prompts, padding='max_length', truncation=True, max_length=2048)
    tokenized_labels = tokenizer(solutions, padding='max_length', truncation=True, max_length=2048)

    # Convert lists to tensors first
    input_ids = torch.tensor(tokenized_inputs['input_ids'])
    labels = torch.full_like(input_ids, -100)
    
    # Get prompt lengths
    prompt_lens = [len(tokenizer(p)['input_ids']) for p in prompts]
    
    # Fill in the solution labels
    for i, prompt_len in enumerate(prompt_lens):
        # Convert to tensor before assigning
        solution_ids = torch.tensor(tokenized_labels['input_ids'][i][:(2048-prompt_len)])
        labels[i, prompt_len:prompt_len + len(solution_ids)] = solution_ids

    # Return dictionary with all required fields
    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': labels.tolist()  # Convert back to list for dataset
    }

# Load and preprocess function
def load_and_preprocess_data(file_path):
    # Load the raw JSON data
    with open(file_path, "r") as f:
        raw_data = json.load(f)["problems"]
    
    # Convert to Dataset
    dataset = Dataset.from_list(raw_data)
    
    # Process data
    processed_data = dataset.map(
        generate_and_tokenize_prompt,
        batched=True,
        batch_size=32,
        load_from_cache_file=False
    )
    
    # Set format
    processed_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    
    return processed_data
# Main execution
file_path = "generated_problems_ControlBench_o1.json"
data = load_and_preprocess_data(file_path)
# Verify
print(data)
print(data[0])

Map:   0%|          | 0/9952 [00:00<?, ? examples/s]

Dataset({
    features: ['number', 'title', 'problem_statement', 'solution', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 9952
})
{'input_ids': tensor([128000,  10174,   7282,  ..., 128009, 128009, 128009]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'labels': tensor([  -100,   -100,   -100,  ..., 128009, 128009, 128009])}


In [6]:
OUTPUT_DIR = "experiments"
 
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=80,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
)

In [7]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
1,2.0227
2,1.7266
3,1.8064
4,2.017
5,1.8566
6,2.1493
7,1.894
8,1.9248
9,1.5772
10,1.3739


TrainOutput(global_step=80, training_loss=0.7198976818472147, metrics={'train_runtime': 5908.4294, 'train_samples_per_second': 0.054, 'train_steps_per_second': 0.014, 'total_flos': 2.7364751692529664e+17, 'train_loss': 0.7198976818472147, 'epoch': 0.03215434083601286})

In [8]:
model.save_pretrained("trained-model")

In [9]:
# Replace "trained-model" with the directory where your model is saved
PEFT_MODEL = "trained-model"

# Load the PEFT configuration
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,  # Base model path from the PEFT config
    return_dict=True,
    device_map="auto",
    trust_remote_code=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Load the PEFT model
model = PeftModel.from_pretrained(model, PEFT_MODEL)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [3]:
# 1. First, install required additional packages if not already installed

# 2. Import additional required libraries
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import json

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    formatted_data = []
    for problem in data['problems']:
        prompt = (
            f"Below is a control systems engineering problem. Solve it step by step.\n\n"
            f"Title: {problem['title']}\n"
            f"Problem: {problem['problem_statement']}\n"
            f"Solution:"
        )
        completion = problem['solution']

        # Store raw text in "text" instead of "input_ids"
        formatted_text = {
            "text": f"{prompt}\n{completion}</s>",
            "title": problem['title'],
            "number": problem.get('number', None)
        }
        formatted_data.append(formatted_text)

    return formatted_data

train_data = load_data("generated_problems_ControlBench_o1.json")
dataset = Dataset.from_list(train_data)

# 1. Tokenize
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=1024)

# 2. Apply to entire dataset
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9952 [00:00<?, ? examples/s]

In [4]:
# Import required libraries
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
import json
import os
import torch
from tensorboard import notebook
from transformers import TrainerCallback

class ConsoleLossCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and "loss" in logs:
            print(f"Step {state.global_step} - Loss: {logs['loss']}")


# Configure LoRA
lora_config = LoraConfig(
    r=64,                # Even more capacity
    lora_alpha=128,       # Matched with higher rank
    target_modules=[     # All major modules
        "q_proj", "k_proj", "v_proj", "o_proj"

    ],
    lora_dropout=0.1
)

# Prepare the model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Create output directory if it doesn't exist
os.makedirs("./control_system_tuned_model", exist_ok=True)

# Set up training arguments with TensorBoard logging
training_args = TrainingArguments(
    output_dir="./control_system_tuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=500,
    eval_steps =500,
    logging_steps=1,  # Changed from 10 to 1 to log more frequently
    logging_first_step=True,  # Add this to log the first step
    logging_nan_inf_filter=False,  # Add this to show all loss values
    learning_rate=5e-6,
    weight_decay=0.01,
    fp16=True,
    bf16=False,
    max_grad_norm=1.0,
    warmup_ratio=0.1,
    group_by_length=True,
    lr_scheduler_type="cosine_with_restarts",
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    report_to=["tensorboard"],
    logging_dir="./control_system_tuned_model/runs",
    disable_tqdm=False,  # Enables progress bar and logs
    gradient_checkpointing=True 
)

# Split dataset into train and validation
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_data = split_dataset["train"]
eval_data  = split_dataset["test"]

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[ConsoleLossCallback()],  # Add the callback here
)



In [5]:

# Cell 1: Start TensorBoard in a separate cell
%load_ext tensorboard
%tensorboard --logdir ./control_system_tuned_model/runs

Reusing TensorBoard on port 6006 (pid 286474), started 20:06:41 ago. (Use '!kill 286474' to kill it.)

In [6]:
# Cell 2: Then run training
print("Starting training...")
train_result = trainer.train()

print("Final training metrics:", train_result.metrics)
trainer.save_model("./final_control_system_model")


Starting training...


Step,Training Loss,Validation Loss


Step 1 - Loss: 0.59
Step 2 - Loss: 0.5206
Step 3 - Loss: 0.7153
Step 4 - Loss: 0.6475
Step 5 - Loss: 0.7831
Step 6 - Loss: 0.7622
Step 7 - Loss: 0.8032
Step 8 - Loss: 0.7077
Step 9 - Loss: 0.789
Step 10 - Loss: 0.8429
Step 11 - Loss: 0.8321
Step 12 - Loss: 0.7997
Step 13 - Loss: 0.7496
Step 14 - Loss: 0.6279
Step 15 - Loss: 0.7245
Step 16 - Loss: 0.9579
Step 17 - Loss: 0.7972
Step 18 - Loss: 0.9679
Step 19 - Loss: 0.7951
Step 20 - Loss: 0.7273
Step 21 - Loss: 0.843
Step 22 - Loss: 0.9088
Step 23 - Loss: 0.8539
Step 24 - Loss: 0.8979
Step 25 - Loss: 0.7965
Step 26 - Loss: 0.8654
Step 27 - Loss: 0.8524
Step 28 - Loss: 0.9925
Step 29 - Loss: 0.8508
Step 30 - Loss: 0.8155
Step 31 - Loss: 0.94
Step 32 - Loss: 0.9262
Step 33 - Loss: 0.9008
Step 34 - Loss: 0.9217
Step 35 - Loss: 1.0107
Step 36 - Loss: 1.1129
Step 37 - Loss: 1.0977
Step 38 - Loss: 1.1489
Step 39 - Loss: 0.9306
Step 40 - Loss: 1.0087
Step 41 - Loss: 1.0219
Step 42 - Loss: 1.0427
Step 43 - Loss: 1.0439
Step 44 - Loss: 1.3035
Ste

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Step 500 - Loss: 0.564


OutOfMemoryError: CUDA out of memory. Tried to allocate 896.00 MiB. GPU 0 has a total capacity of 23.59 GiB of which 630.44 MiB is free. Including non-PyTorch memory, this process has 22.96 GiB memory in use. Of the allocated memory 20.81 GiB is allocated by PyTorch, and 1.84 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./control_system_tuned_model/runs

In [None]:
# Import necessary libraries
from vllm import LLM, SamplingParams  # vLLM tools for efficient language model inference
import os  # For setting environment variables

# Configure PyTorch memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'  
# This sets the maximum size of a memory block that PyTorch will allocate to 512MB, helping prevent memory fragmentation

def generate_response(prompt, max_tokens=100):
    # Function that takes a prompt and max tokens as input
    # max_tokens=50 is the default value if not specified
    
    # Format the prompt using the chat template
    formatted_prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],  # Creates a chat message format
        add_generation_prompt=True,  # Adds any necessary generation prompts
        tokenize=False,  # Returns string instead of tokens
    )
    
    # Configure generation parameters
    sampling_params = SamplingParams(
        temperature=0.7,  # Controls randomness (higher = more random)
        top_p=0.9,  # Nucleus sampling parameter (higher = more diverse)
        max_tokens=max_tokens,  # Maximum number of tokens to generate
        presence_penalty=0.0,  # Penalize new tokens based on presence in text
        frequency_penalty=0.0  # Penalize new tokens based on frequency in text
    )
    
    # Generate the response with memory-efficient settings
    with torch.inference_mode():  # Disables gradient computation to save memory
        # Convert input text to tokens and move to same device as model
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
        
        # Generate output using the model
        outputs = model.generate(
            **inputs,  # Pass tokenized inputs
            max_new_tokens=max_tokens,  # Maximum length of generated text
            do_sample=True,  # Enable sampling (vs greedy decoding)
            temperature=0.7,  # Controls randomness
            top_p=0.9,  # Nucleus sampling parameter
            num_beams=1,  # Disable beam search to save memory
            pad_token_id=tokenizer.pad_token_id,  # ID for padding token
            eos_token_id=tokenizer.eos_token_id,  # ID for end of sequence token
            use_cache=True  # Enable KV-caching for faster generation
        )
    
    # Convert output tokens back to text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def batch_generate(prompts, max_tokens=100):
    # Function to process multiple prompts
    responses = []
    
    for prompt in prompts:
        try:
            # Try to generate a response for each prompt
            response = generate_response(prompt, max_tokens)
            responses.append({"prompt": prompt, "response": response})
        except Exception as e:
            # Handle any errors that occur during generation
            print(f"Error processing prompt '{prompt}': {str(e)}")
            responses.append({"prompt": prompt, "response": "Error generating response"})
    return responses

# Example usage
test_prompts = [
    "What is the capital of France?",
    "Explain quantum computing in simple terms"
]

# Clear GPU memory before generation
torch.cuda.empty_cache()  # Free up any unused GPU memory

# Generate responses for all prompts
results = batch_generate(test_prompts)

# Print results
for result in results:
    print(f"Prompt: {result['prompt']}")  # Print the original prompt
    print(f"Response: {result['response']}")  # Print the generated response
    print("-" * 50)  # Print a separator line

In [None]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

def process_questions(questions, tokenizer, model, sampling_params):
    # Format questions using chat template
    questions_processed = [tokenizer.apply_chat_template(
        [{"role": "user", "content": q}],
        add_generation_prompt=True,
        tokenize=False,
    ) for q in questions]
    
    # Generate answers with memory-efficient settings
    answers = []
    for prompt in questions_processed:
        with torch.inference_mode():
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            output = model.generate(
                **inputs,
                max_new_tokens=sampling_params.max_tokens,
                do_sample=sampling_params.do_sample,
                temperature=sampling_params.temperature,
                top_p=sampling_params.top_p,
                num_beams=1,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True
            )
            answers.append(tokenizer.decode(output[0], skip_special_tokens=True))
    
    return answers

# Define evaluation parameters
class SamplingParams:
    def __init__(self, n=1, temperature=0, top_p=1, seed=123, max_tokens=2048, stop_token_ids=None):
        self.n = n
        self.temperature = temperature
        self.top_p = top_p
        self.seed = seed
        self.max_tokens = max_tokens
        self.stop_token_ids = stop_token_ids
        self.do_sample = temperature > 0

# Template for evaluation
prompt_template = """Given a question, an model-generated answer and a reasoning step from the ground-truth answer. 
You are required to analyze and tell if the model-generated answer contains the given reasoning step. 
End your answer with [[Yes]] or [[No]].

Question: {}
Model-generated answer: {}
Reasoning Step: {}"""

def evaluate_answers(questions, solutions, answers, tokenizer, model, eval_params):
    dump_data_full = []
    
    for question, gt_steps, gen_answer in zip(questions, solutions, answers):
        dump_data = [{
            'question': question,
            'answer': gen_answer,
            'contamination': []
        }]
        
        prompt_set = []
        for gt_step in gt_steps:
            prompt_set.append(prompt_template.format(question, gen_answer, gt_step))
            
        if not prompt_set:
            continue
            
        prompt_set_processed = [tokenizer.apply_chat_template(
            [{"role": "user", "content": p}],
            add_generation_prompt=True,
            tokenize=False,
        ) for p in prompt_set]
        
        # Generate evaluations
        eval_results = []
        for prompt in prompt_set_processed:
            with torch.inference_mode():
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                output = model.generate(
                    **inputs,
                    max_new_tokens=eval_params.max_tokens,
                    do_sample=eval_params.do_sample,
                    temperature=eval_params.temperature,
                    top_p=eval_params.top_p,
                    num_beams=1,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    use_cache=True
                )
                eval_results.append(tokenizer.decode(output[0], skip_special_tokens=True).strip())
        
        # Process evaluation results
        for i, eval_result in enumerate(eval_results):
            if eval_result[-1] == '.':
                eval_result = eval_result[:-1]
                
            contain = None
            if '[[Yes]]' in eval_result:
                contain = True
            elif '[[No]]' in eval_result:
                contain = False
                
            dump_data[0]['contamination'].append({
                'gt_step': gt_steps[i],
                'is_contain': contain,
                'analysis': eval_result
            })
            
        dump_data_full.extend(dump_data)
    
    return dump_data_full

# Main execution
def main():
    # Your existing model setup code here
    
    # Setup parameters
    sampling_params = SamplingParams(n=1, temperature=0.7, top_p=0.9, max_tokens=2048)
    eval_params = SamplingParams(n=1, temperature=0, top_p=1, seed=123, max_tokens=2048, 
                               stop_token_ids=[128009])  # for llama 3
    
    # Process questions and generate answers
    answers = process_questions(questions, tokenizer, model, sampling_params)
    
    # Print first input-output example
    print(f"Prompt: {questions[0]!r}")
    print(f"Generated text: {answers[0]!r}")
    print("-" * 50)
    
    # Evaluate answers
    dump_data_full = evaluate_answers(questions, solutions, answers, tokenizer, model, eval_params)
    
    # Save results
    print(f"Total processed items: {len(dump_data_full)}")
    with open('dump.json', 'w') as f:
        json.dump(dump_data_full, f, indent=4)

if __name__ == "__main__":
    main()