In [1]:
import os
import json
import time
import torch
import torch.profiler
from typing import Callable
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    TrainingArguments, 
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from functools import wraps
import csv


# Custom data collator for dynamic padding
class CustomDataCollatorWithPadding:
    def __init__(self, tokenizer, label_pad_token_id=-100):
        self.tokenizer = tokenizer
        self.label_pad_token_id = label_pad_token_id

    def __call__(self, features):
        # Extract and pad labels dynamically
        labels = [feature.pop("labels") for feature in features]
        batch = self.tokenizer.pad(features, return_tensors="pt")
        
        max_label_length = max(len(l) for l in labels)
        padded_labels = [
            l + [self.label_pad_token_id] * (max_label_length - len(l)) 
            for l in labels
        ]
        batch["labels"] = torch.tensor(padded_labels)
        return batch

# -----------------------------
# Set cache directory and load model/tokenizer
# -----------------------------
os.environ['HF_HOME'] = '/cs/student/projects2/aisd/2024/shekchu/snlp'
cache_dir = os.getenv('HF_HOME', 'Cache directory not set')
print(f"Model weights are stored in: {cache_dir}")

model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
)
model.config.use_cache = False  # Disable caching for gradient checkpointing
model.gradient_checkpointing_enable()
model.eval()

# -----------------------------
# Setup LoRA configuration
# -----------------------------
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                   
    lora_alpha=32,         
    lora_dropout=0.1,      
    target_modules=["qkv_proj"]  
)
model = get_peft_model(model, lora_config)

# -----------------------------
# Custom Dataset
# -----------------------------
from dataset.dataset import FinQADataset
train_dataset = FinQADataset("./dataset/train.json")

class TokenizedFinQADataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=4096):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        # Concatenate prompt and label (the answer)
        full_text = item["prompt"].strip() + "\n" + item["label"].strip()
        
        # Tokenize without converting to tensors so the collator can handle padding.
        tokenized = self.tokenizer(
            full_text,
            truncation=True,
            max_length=self.max_length,
            # No static padding and no return_tensors.
        )
        
        # Create labels from input_ids (copying the list)
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

tokenized_train_dataset = TokenizedFinQADataset(train_dataset, tokenizer, max_length=2048)

# -----------------------------
# Use our custom collator for dynamic padding
# -----------------------------
data_collator = CustomDataCollatorWithPadding(tokenizer, label_pad_token_id=-100)

# -----------------------------
# Training Arguments and Trainer
# -----------------------------
training_args = TrainingArguments(
    output_dir="./lora_finetuned_model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    max_steps=30,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    evaluation_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# -----------------------------
# Train the model
# -----------------------------
import torch
import torch.profiler
import csv
import os
from typing import Callable
from functools import wraps

def profile_training_flops_by_steps(csv_path="flops_per_step.csv", log_every_n_steps=10):
    """
    A decorator/wrapper function to profile FLOPs during model training,
    logging results every N steps to a CSV file.
    
    Args:
        csv_path (str): Path to save the CSV log file
        log_every_n_steps (int): How frequently to log FLOP measurements
        
    Returns:
        A decorator function that profiles the training process
    """
    def decorator(training_func: Callable):
        @wraps(training_func)
        def wrapper(*args, **kwargs):
            # Get the trainer from args (assuming first arg is trainer)
            trainer = args[0]
            original_training_step = trainer.training_step
            
            # Create CSV file and write header
            os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
            with open(csv_path, 'w', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow(['Step', 'FLOPs', 'CUDA Time (ms)', 'CPU Time (ms)', 'Memory (MB)'])
            
            # Step counter
            step_counter = [0]  # Using list for nonlocal access
            
            # Override the training_step method to add profiling
            def profiled_training_step(*step_args, **step_kwargs):
                step_counter[0] += 1
                current_step = step_counter[0]
                
                # Only profile every N steps
                if current_step % log_every_n_steps == 0:
                    with torch.profiler.profile(
                        activities=[
                            torch.profiler.ProfilerActivity.CPU,
                            torch.profiler.ProfilerActivity.CUDA
                        ],
                        record_shapes=True,
                        profile_memory=True,
                        with_flops=True
                    ) as prof:
                        result = original_training_step(*step_args, **step_kwargs)
                    
                    # Calculate total FLOPs
                    total_flops = 0
                    total_cuda_time_ms = 0
                    total_cpu_time_ms = 0
                    
                    for event in prof.key_averages():
                        # Accumulate FLOPs
                        if hasattr(event, 'flops') and event.flops > 0:
                            total_flops += event.flops
                        
                        # Accumulate CUDA time (converted to ms)
                        if hasattr(event, 'cuda_time'):
                            total_cuda_time_ms += event.cuda_time * 1000  # Convert to ms
                        
                        # Accumulate CPU time (converted to ms)
                        if hasattr(event, 'cpu_time'):
                            total_cpu_time_ms += event.cpu_time * 1000  # Convert to ms
                    
                    # Calculate memory usage in MB
                    memory_usage_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
                    
                    # Log to CSV
                    with open(csv_path, 'a', newline='') as csvfile:
                        csv_writer = csv.writer(csvfile)
                        csv_writer.writerow([
                            current_step, 
                            total_flops, 
                            total_cuda_time_ms, 
                            total_cpu_time_ms,
                            memory_usage_mb
                        ])
                    
                    # Print to console
                    print(f"\nStep {current_step}:")
                    print(f"  Total FLOPs: {total_flops:,}")
                    print(f"  CUDA Time: {total_cuda_time_ms:.2f} ms")
                    print(f"  CPU Time: {total_cpu_time_ms:.2f} ms")
                    print(f"  Memory Usage: {memory_usage_mb:.2f} MB")
                    
                    # Reset peak memory stats for next iteration
                    torch.cuda.reset_peak_memory_stats()
                    
                    return result
                else:
                    # Run without profiling for other steps
                    return original_training_step(*step_args, **step_kwargs)
            
            # Replace the training_step method
            trainer.training_step = profiled_training_step
            
            try:
                # Execute the original training function
                result = training_func(*args, **kwargs)
                return result
            finally:
                # Restore original training_step method
                trainer.training_step = original_training_step
                print(f"\nFLOP measurements have been saved to {csv_path}")
        
        return wrapper
    return decorator

# Example usage with the Transformers Trainer:
@profile_training_flops_by_steps(csv_path="training_flops_profiler_v2.csv", log_every_n_steps=10)
def train_model(trainer):
    return trainer.train()

# Replace your existing trainer.train() call with:
train_model(trainer)

# Save the fine-tuned model (including LoRA adapters)
model.save_pretrained("./lora_finetuned_model")


  from .autonotebook import tqdm as notebook_tqdm


Model weights are stored in: /cs/student/projects2/aisd/2024/shekchu/snlp


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
  trainer = Trainer(
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,12.7763
20,12.2616
30,11.9921



Step 10:
  Total FLOPs: 363,207,474,996,226
  CUDA Time: 568083631.18 ms
  CPU Time: 574030368.80 ms
  Memory Usage: 16719.23 MB


  if hasattr(event, 'cuda_time'):
  total_cuda_time_ms += event.cuda_time * 1000  # Convert to ms



Step 20:
  Total FLOPs: 299,007,716,303,378
  CUDA Time: 465780431.31 ms
  CPU Time: 479812599.04 ms
  Memory Usage: 16719.23 MB

Step 30:
  Total FLOPs: 271,341,521,949,554
  CUDA Time: 419381649.46 ms
  CPU Time: 430611644.32 ms
  Memory Usage: 16719.23 MB

FLOP measurements have been saved to training_flops_profiler_v2.csv
