In [1]:
# new d=for 1 epoch 
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset

# Define paths
model_name = "meta-llama/Llama-3.2-3B-Instruct"
cache_dir = "/scratch/gilbreth/pate2530"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Update model with the new token

# Load and preprocess the dataset
dataset = load_dataset("squad", cache_dir=cache_dir)

# Limit training to 1000 samples and validation to 200 samples
train_dataset = dataset["train"].select(range(1000))
validation_dataset = dataset["validation"].select(range(200))

def preprocess_function(examples):
    # Combine context and question
    inputs = [f"Context: {context} Question: {question}" for context, question in zip(examples["context"], examples["question"])]
    # Extract the first answer
    labels = [answer["text"][0] for answer in examples["answers"]]
    
    # Tokenize inputs and labels
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(labels, padding="max_length", truncation=True, max_length=512)
    
    # Replace padding tokens in labels with -100 for loss calculation
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]
    
    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_validation_dataset = validation_dataset.map(preprocess_function, batched=True, remove_columns=validation_dataset.column_names)

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"{cache_dir}/llama-finetuned",
    evaluation_strategy="steps",
    eval_steps=100,  # Evaluate every 100 steps
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Small batch size to avoid OOM
    gradient_accumulation_steps=16,  # Simulate a larger batch size
    num_train_epochs=1,  # Train for only 1 epoch
    fp16=True,  # Enable mixed-precision training
    save_steps=500,
    save_total_limit=2,
    logging_dir=f"{cache_dir}/logs",
    load_best_model_at_end=True,  # Required for EarlyStoppingCallback
    metric_for_best_model="eval_loss",  # Use evaluation loss as the metric for the best model
    greater_is_better=False,  # Smaller loss is better
)

# Initialize Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop if eval loss doesn't improve for 3 evaluations
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained(f"{cache_dir}/llama-finetuned")
tokenizer.save_pretrained(f"{cache_dir}/llama-finetuned")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https:/

Step,Training Loss,Validation Loss


('/scratch/gilbreth/pate2530/llama-finetuned/tokenizer_config.json',
 '/scratch/gilbreth/pate2530/llama-finetuned/special_tokens_map.json',
 '/scratch/gilbreth/pate2530/llama-finetuned/tokenizer.json')

In [3]:
import os
os.environ["HF_TOKEN"]="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"

In [5]:
import os
os.environ["HF_TOKEN"]= "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"

In [6]:
!huggingface-cli login

model.push_to_hub("Drashtip/llama-finetuned", check_pr=True)

tokenizer.push_to_hub("Drashtip/llama-finetuned",check_pr=True)

/bin/bash: huggingface-cli: command not found


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


model-00003-of-00003.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Drashtip/llama-finetuned/commit/76ab25b001f3aa2821e6746fa85129bcce7fd1dc', commit_message='Upload tokenizer', commit_description='', oid='76ab25b001f3aa2821e6746fa85129bcce7fd1dc', pr_url=None, pr_revision=None, pr_num=None)