In [1]:
# pip install -q transformers[torch] datasets
# pip install -q bitsandbytes trl peft
# pip install flash-attn --no-build-isolation

In [1]:
from datasets import DatasetDict
import json

In [2]:
filename = "llama-3-finetuning-data.jsonl"

In [3]:
data = []
with open(filename, 'r', encoding='utf-8') as f:
  for line in f:
    data.append(json.loads(line.strip()))


In [4]:
from datasets import Dataset

dataset = Dataset.from_list(data)

train_test_data = dataset.train_test_split(test_size=63, seed=42)
train_val_data = train_test_data["train"].train_test_split(test_size=63, seed=42)

# Access the train, validation, and test datasets
train_dataset = train_val_data["train"]
eval_dataset = train_val_data["test"]
test_dataset = train_test_data["test"]

# Print the number of examples in each set
print(f"Train dataset: {len(train_dataset)} examples")
print(f"Validation dataset: {len(eval_dataset)} examples")
print(f"Test dataset: {len(test_dataset)} examples")



Train dataset: 577 examples
Validation dataset: 63 examples
Test dataset: 63 examples


In [6]:
for item in test_dataset:
    print(item)

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are an intelligent assistant dedicated to extracting management levels and job titles from user queries. Before doing so, you must understand what a functional area is.<|eot_id|>\n    <|start_header_id|>user<|end_header_id|>\nDefinition of a Functional Area:\n- A functional area is a department or group of personnel tasked with a specific organizational function. These include departments like finance, marketing, engineering, etc.\n\nDefinition of Management Level:\n- A management level refers to a hierarchical position within an organization without a specific functional area. It encompasses broader titles that may include roles across different functional areas.\n- Management levels include: "Board of Directors," "CSuite and President," "Executive and Sr. VP," "General Manager," "VP," "Director," "Manager," "Senior (Individual Contributor)," "Mid (Individual Contributor)," and "Junior."\n\nDefinition of a Job Ti

In [5]:
dataset_dict = {"train": train_dataset,
                "test": eval_dataset}

In [6]:
raw_datasets = DatasetDict(dataset_dict)

In [7]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 577
    })
    test: Dataset({
        features: ['text'],
        num_rows: 63
    })
})

In [8]:
from transformers import AutoTokenizer

model_name = "/home/raza/Downloads/meta-llama-3-2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# set pad_token_id equal to the eos_token_id if not set
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

# Set reasonable default for models without max length
if tokenizer.model_max_length > 100_000:
  tokenizer.model_max_length = 2048


In [9]:
from transformers import BitsAndBytesConfig
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, TaskType, PeftModel
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, set_seed
from transformers import AutoProcessor, DataCollatorForSeq2Seq, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers import AutoConfig, GenerationConfig

# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

model_kwargs = dict(
    attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
    torch_dtype="auto",
    use_cache=False, # set to False as we're going to use gradient checkpointing
    device_map=device_map,
    quantization_config=quantization_config,
)

processor = AutoTokenizer.from_pretrained(model_name)
processor.padding_side = 'right'

# Set the padding token to be the same as eos_token if it's not defined
if processor.pad_token is None:
    processor.pad_token = processor.eos_token  # or you can set a custom token if needed




model = AutoModelForCausalLM.from_pretrained(
          model_name,
          trust_remote_code=True,
          attn_implementation="flash_attention_2", # set this to True if your GPU supports it (Flash Attention drastically speeds up model computations)
          torch_dtype="auto",
          use_cache=False, # set to False as we're going to use gradient checkpointing
          device_map=device_map,
          quantization_config=quantization_config,
)





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
def inference_function(input_text: str, max_length: int = 50):
    """
    Simple inference function to generate text based on an input prompt.

    Args:
    - input_text (str): The input text to be processed by the model.
    - max_length (int): The maximum length of the generated text.

    Returns:
    - str: The generated text.
    """
    # Process the input text
    inputs = processor(input_text, return_tensors="pt", padding=True).to(model.device)

    # Generate output from the model
    output = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,  # Enable sampling for more diverse outputs
        temperature=1.0,
        top_p=0.9
    )

    # Decode the output tokens into text
    generated_text = processor.decode(output[0], skip_special_tokens=True)  # Get the first sequence

    return generated_text

# Example usage
input_text = "Once upon a time"
generated_output = inference_function(input_text)
print(f"Generated Output: {generated_output}")


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Output: Once upon a time, in a bustling city, there was a beautiful and vibrant public park. The park was a hub of activity, with people from all walks of life coming to walk, jog, play games, or simply enjoy the scenery.


In [11]:
model = prepare_model_for_kbit_training(model)

In [12]:
total_epochs = 15

In [13]:
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

# path where the Trainer will save its checkpoints and logs
output_dir = './finetuned_llama3-2-3B_model'


# Adjusted TrainingArguments for 15 epochs and saving checkpoints every 2 epochs
training_args = TrainingArguments(
    fp16=True,  # Use mixed precision
    bf16=False,  # BF16 is generally for specific hardware; here, fp16 is better suited
    do_eval=True,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    evaluation_strategy="epoch",  # Ensure evaluation happens at every epoch
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate a larger batch size
    gradient_checkpointing=True,  # Save memory during training
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Optional, helps with stability
    learning_rate=2e-5,  # Low learning rate for fine-tuning
    log_level="info",
    logging_steps=10,  # Log every 10 steps
    logging_strategy="steps",
    lr_scheduler_type="cosine",  # Use a cosine decay scheduler
    max_steps=-1,  # Train for the full number of epochs
    num_train_epochs=total_epochs,  # Fine-tune for 15 epochs
    output_dir=output_dir,  # Path where the model is saved
    overwrite_output_dir=True,  # Overwrite any existing model
    per_device_train_batch_size=4,  # Batch size of 4 (adjust if necessary)
    per_device_eval_batch_size=4,  # Eval batch size (use 2-4 depending on memory)
    push_to_hub=False,
    save_strategy="epoch",  # Save model after each epoch
    save_steps=None,  # Disable saving based on steps (use epochs instead)
    save_total_limit=None,  # Limit the number of saved checkpoints (optional)
    seed=42,
    logging_dir="./logs",  # Directory to store logs (TensorBoard)
    load_best_model_at_end=True,  # Load the best model based on validation loss
)

# based on config
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

trainer = SFTTrainer(
    model=model_name,
    args=training_args,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["test"],
    tokenizer=processor,
    peft_config=peft_config,
)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = SFTTrainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/577 [00:00<?, ? examples/s]

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Using auto half precision backend


In [13]:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# chat = [
#   {"role": "system", "content": "You are a sentiment analysis agent"},
#   {"role": "user", "content": "Classify the following statement as Positive or Negative:\nStatement: I am happy"},
#   {"role": "assistant", "content": "Positive"},
# ]

# tokenizer.apply_chat_template(chat, tokenize=False)

In [14]:
import torch
torch.cuda.empty_cache()

In [15]:
trainer_result = trainer.train()

***** Running training *****
  Num examples = 577
  Num Epochs = 15
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 540
  Number of trainable parameters = 36,700,160


  0%|          | 0/540 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


{'loss': 2.4912, 'grad_norm': 0.28775086998939514, 'learning_rate': 1.9983081582712684e-05, 'epoch': 0.28}
{'loss': 2.3672, 'grad_norm': 0.31076881289482117, 'learning_rate': 1.9932383577419432e-05, 'epoch': 0.55}
{'loss': 2.2098, 'grad_norm': 0.3079773187637329, 'learning_rate': 1.9848077530122083e-05, 'epoch': 0.83}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-37
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": 

{'eval_loss': 2.0058774948120117, 'eval_runtime': 5.159, 'eval_samples_per_second': 12.212, 'eval_steps_per_second': 3.101, 'epoch': 1.0}
{'loss': 2.0477, 'grad_norm': 0.30620676279067993, 'learning_rate': 1.973044870579824e-05, 'epoch': 1.08}
{'loss': 1.8807, 'grad_norm': 0.287406861782074, 'learning_rate': 1.957989512315489e-05, 'epoch': 1.36}
{'loss': 1.7174, 'grad_norm': 0.3243946135044098, 'learning_rate': 1.9396926207859085e-05, 'epoch': 1.63}
{'loss': 1.5325, 'grad_norm': 0.35606786608695984, 'learning_rate': 1.9182161068802742e-05, 'epoch': 1.91}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-74
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": 

{'eval_loss': 1.3275282382965088, 'eval_runtime': 5.2654, 'eval_samples_per_second': 11.965, 'eval_steps_per_second': 3.039, 'epoch': 2.0}
{'loss': 1.3094, 'grad_norm': 0.4378772974014282, 'learning_rate': 1.8936326403234125e-05, 'epoch': 2.17}
{'loss': 1.048, 'grad_norm': 0.48989933729171753, 'learning_rate': 1.866025403784439e-05, 'epoch': 2.44}
{'loss': 0.7709, 'grad_norm': 0.4631698429584503, 'learning_rate': 1.8354878114129368e-05, 'epoch': 2.72}
{'loss': 0.5312, 'grad_norm': 0.370974063873291, 'learning_rate': 1.802123192755044e-05, 'epoch': 2.99}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-111
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.4123690724372864, 'eval_runtime': 5.1761, 'eval_samples_per_second': 12.171, 'eval_steps_per_second': 3.091, 'epoch': 3.0}
{'loss': 0.368, 'grad_norm': 0.23368079960346222, 'learning_rate': 1.766044443118978e-05, 'epoch': 3.25}
{'loss': 0.2941, 'grad_norm': 0.17825289070606232, 'learning_rate': 1.7273736415730488e-05, 'epoch': 3.52}
{'loss': 0.2473, 'grad_norm': 0.13374724984169006, 'learning_rate': 1.686241637868734e-05, 'epoch': 3.8}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-148
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.20817050337791443, 'eval_runtime': 5.1649, 'eval_samples_per_second': 12.198, 'eval_steps_per_second': 3.098, 'epoch': 4.0}
{'loss': 0.2139, 'grad_norm': 0.10989093035459518, 'learning_rate': 1.6427876096865394e-05, 'epoch': 4.06}
{'loss': 0.1953, 'grad_norm': 0.1177702322602272, 'learning_rate': 1.5971585917027864e-05, 'epoch': 4.33}
{'loss': 0.1811, 'grad_norm': 0.10799044370651245, 'learning_rate': 1.5495089780708062e-05, 'epoch': 4.61}
{'loss': 0.1553, 'grad_norm': 0.1283911019563675, 'learning_rate': 1.5000000000000002e-05, 'epoch': 4.88}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-185
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.13099165260791779, 'eval_runtime': 4.7212, 'eval_samples_per_second': 13.344, 'eval_steps_per_second': 3.389, 'epoch': 5.0}
{'loss': 0.1311, 'grad_norm': 0.099880151450634, 'learning_rate': 1.4487991802004625e-05, 'epoch': 5.14}
{'loss': 0.119, 'grad_norm': 0.0628439262509346, 'learning_rate': 1.396079766039157e-05, 'epoch': 5.41}
{'loss': 0.1131, 'grad_norm': 0.054415322840213776, 'learning_rate': 1.342020143325669e-05, 'epoch': 5.69}
{'loss': 0.1082, 'grad_norm': 0.049315936863422394, 'learning_rate': 1.2868032327110904e-05, 'epoch': 5.97}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-222
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.10942086577415466, 'eval_runtime': 4.7179, 'eval_samples_per_second': 13.353, 'eval_steps_per_second': 3.391, 'epoch': 6.0}
{'loss': 0.109, 'grad_norm': 0.07325206696987152, 'learning_rate': 1.2306158707424402e-05, 'epoch': 6.22}
{'loss': 0.1026, 'grad_norm': 0.050754375755786896, 'learning_rate': 1.1736481776669307e-05, 'epoch': 6.5}
{'loss': 0.1075, 'grad_norm': 0.07395963370800018, 'learning_rate': 1.1160929141252303e-05, 'epoch': 6.77}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-259
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.1034022867679596, 'eval_runtime': 4.6993, 'eval_samples_per_second': 13.406, 'eval_steps_per_second': 3.405, 'epoch': 7.0}
{'loss': 0.112, 'grad_norm': 0.05523417890071869, 'learning_rate': 1.0581448289104759e-05, 'epoch': 7.03}
{'loss': 0.097, 'grad_norm': 0.052267350256443024, 'learning_rate': 1e-05, 'epoch': 7.3}
{'loss': 0.1038, 'grad_norm': 0.05608953908085823, 'learning_rate': 9.418551710895243e-06, 'epoch': 7.58}
{'loss': 0.1042, 'grad_norm': 0.04649007320404053, 'learning_rate': 8.839070858747697e-06, 'epoch': 7.86}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-296
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.10087685286998749, 'eval_runtime': 4.7009, 'eval_samples_per_second': 13.402, 'eval_steps_per_second': 3.404, 'epoch': 8.0}
{'loss': 0.1031, 'grad_norm': 0.04357268288731575, 'learning_rate': 8.263518223330698e-06, 'epoch': 8.11}
{'loss': 0.0963, 'grad_norm': 0.048526618629693985, 'learning_rate': 7.6938412925756e-06, 'epoch': 8.39}
{'loss': 0.0996, 'grad_norm': 0.0437864288687706, 'learning_rate': 7.131967672889101e-06, 'epoch': 8.66}
{'loss': 0.1033, 'grad_norm': 0.05747843533754349, 'learning_rate': 6.579798566743314e-06, 'epoch': 8.94}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-333
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.09970598667860031, 'eval_runtime': 4.7015, 'eval_samples_per_second': 13.4, 'eval_steps_per_second': 3.403, 'epoch': 9.0}
{'loss': 0.0988, 'grad_norm': 0.052400071173906326, 'learning_rate': 6.039202339608432e-06, 'epoch': 9.19}
{'loss': 0.0972, 'grad_norm': 0.049836497753858566, 'learning_rate': 5.512008197995379e-06, 'epoch': 9.47}
{'loss': 0.0977, 'grad_norm': 0.048214539885520935, 'learning_rate': 5.000000000000003e-06, 'epoch': 9.74}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


{'loss': 0.0997, 'grad_norm': 0.12826083600521088, 'learning_rate': 4.504910219291941e-06, 'epoch': 10.0}


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-370
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.09909076243638992, 'eval_runtime': 4.6973, 'eval_samples_per_second': 13.412, 'eval_steps_per_second': 3.406, 'epoch': 10.0}
{'loss': 0.0978, 'grad_norm': 0.046093929558992386, 'learning_rate': 4.028414082972141e-06, 'epoch': 10.28}
{'loss': 0.0953, 'grad_norm': 0.05652180314064026, 'learning_rate': 3.5721239031346067e-06, 'epoch': 10.55}
{'loss': 0.1011, 'grad_norm': 0.04933388531208038, 'learning_rate': 3.1375836213126653e-06, 'epoch': 10.83}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-407
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.09870133548974991, 'eval_runtime': 4.6989, 'eval_samples_per_second': 13.408, 'eval_steps_per_second': 3.405, 'epoch': 11.0}
{'loss': 0.0985, 'grad_norm': 0.04898371174931526, 'learning_rate': 2.726263584269513e-06, 'epoch': 11.08}
{'loss': 0.0975, 'grad_norm': 0.049494918435811996, 'learning_rate': 2.339555568810221e-06, 'epoch': 11.36}
{'loss': 0.0956, 'grad_norm': 0.04310060292482376, 'learning_rate': 1.9787680724495617e-06, 'epoch': 11.63}
{'loss': 0.0979, 'grad_norm': 0.06262964010238647, 'learning_rate': 1.6451218858706374e-06, 'epoch': 11.91}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-444
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.0984833613038063, 'eval_runtime': 4.6983, 'eval_samples_per_second': 13.409, 'eval_steps_per_second': 3.406, 'epoch': 12.0}
{'loss': 0.1004, 'grad_norm': 0.048556938767433167, 'learning_rate': 1.339745962155613e-06, 'epoch': 12.17}
{'loss': 0.0996, 'grad_norm': 0.05056541785597801, 'learning_rate': 1.0636735967658785e-06, 'epoch': 12.44}
{'loss': 0.0999, 'grad_norm': 0.04356081783771515, 'learning_rate': 8.178389311972612e-07, 'epoch': 12.72}
{'loss': 0.0939, 'grad_norm': 0.05319838970899582, 'learning_rate': 6.030737921409169e-07, 'epoch': 12.99}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-481
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.09837163239717484, 'eval_runtime': 4.6993, 'eval_samples_per_second': 13.406, 'eval_steps_per_second': 3.405, 'epoch': 13.0}
{'loss': 0.1024, 'grad_norm': 0.07585842162370682, 'learning_rate': 4.2010487684511105e-07, 'epoch': 13.25}
{'loss': 0.0966, 'grad_norm': 0.046450987458229065, 'learning_rate': 2.6955129420176193e-07, 'epoch': 13.52}
{'loss': 0.0966, 'grad_norm': 0.05601801723241806, 'learning_rate': 1.519224698779198e-07, 'epoch': 13.8}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-518
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.09833750128746033, 'eval_runtime': 4.7011, 'eval_samples_per_second': 13.401, 'eval_steps_per_second': 3.403, 'epoch': 14.0}
{'loss': 0.1006, 'grad_norm': 0.050159674137830734, 'learning_rate': 6.761642258056977e-08, 'epoch': 14.06}
{'loss': 0.0961, 'grad_norm': 0.04582798480987549, 'learning_rate': 1.6918417287318245e-08, 'epoch': 14.33}


Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-540
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'loss': 0.0961, 'grad_norm': 0.04443378746509552, 'learning_rate': 0.0, 'epoch': 14.61}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 4


  0%|          | 0/16 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_llama3-2-3B_model/checkpoint-540
loading configuration file /home/raza/Downloads/meta-llama-3-2-3B-Instruct/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version":

{'eval_loss': 0.09833656251430511, 'eval_runtime': 4.6998, 'eval_samples_per_second': 13.405, 'eval_steps_per_second': 3.404, 'epoch': 14.61}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./finetuned_llama3-2-3B_model/checkpoint-540 (score: 0.09833656251430511).


{'train_runtime': 2147.4783, 'train_samples_per_second': 4.03, 'train_steps_per_second': 0.251, 'train_loss': 0.43018975059191383, 'epoch': 14.61}


In [15]:
train_result = trainer.train()

***** Running training *****
  Num examples = 577
  Num Epochs = 50
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 2
  Total optimization steps = 7,200
  Number of trainable parameters = 12,582,912


  0%|          | 0/7200 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 1.4911, 'grad_norm': 0.04986097291111946, 'learning_rate': 1.999997620177352e-05, 'epoch': 0.03}
{'loss': 1.4963, 'grad_norm': 0.05934108793735504, 'learning_rate': 1.9999904807207348e-05, 'epoch': 0.07}
{'loss': 1.4869, 'grad_norm': 0.0683630183339119, 'learning_rate': 1.9999785816641293e-05, 'epoch': 0.1}
{'loss': 1.4842, 'grad_norm': 0.07281941920518875, 'learning_rate': 1.9999619230641714e-05, 'epoch': 0.14}
{'loss': 1.465, 'grad_norm': 0.06812652945518494, 'learning_rate': 1.99994050500015e-05, 'epoch': 0.17}
{'loss': 1.4492, 'grad_norm': 0.06429293751716614, 'learning_rate': 1.999914327574007e-05, 'epoch': 0.21}
{'loss': 1.4394, 'grad_norm': 0.06659349054098129, 'learning_rate': 1.9998833909103385e-05, 'epoch': 0.24}
{'loss': 1.4123, 'grad_norm': 0.07142454385757446, 'learning_rate': 1.9998476951563914e-05, 'epoch': 0.28}
{'loss': 1.3984, 'grad_norm': 0.07648936659097672, 'learning_rate': 1.9998072404820648e-05, 'epoch': 0.31}
{'loss': 1.3829, 'grad_norm': 0.081494316458


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.4594, 'grad_norm': 0.2873406708240509, 'learning_rate': 1.99799923589142e-05, 'epoch': 1.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-145
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dty

{'eval_loss': 0.4201500117778778, 'eval_runtime': 6.8803, 'eval_samples_per_second': 9.157, 'eval_steps_per_second': 4.651, 'epoch': 1.0}
{'loss': 0.3924, 'grad_norm': 0.24495331943035126, 'learning_rate': 1.9978589232386036e-05, 'epoch': 1.03}
{'loss': 0.3342, 'grad_norm': 0.21487076580524445, 'learning_rate': 1.997713861131257e-05, 'epoch': 1.07}
{'loss': 0.2734, 'grad_norm': 0.20786333084106445, 'learning_rate': 1.9975640502598243e-05, 'epoch': 1.1}
{'loss': 0.228, 'grad_norm': 0.17612580955028534, 'learning_rate': 1.997409491337352e-05, 'epoch': 1.14}
{'loss': 0.1804, 'grad_norm': 0.13294285535812378, 'learning_rate': 1.9972501850994857e-05, 'epoch': 1.17}
{'loss': 0.1594, 'grad_norm': 0.1360795646905899, 'learning_rate': 1.9970861323044667e-05, 'epoch': 1.21}
{'loss': 0.1501, 'grad_norm': 0.13306614756584167, 'learning_rate': 1.9969173337331283e-05, 'epoch': 1.24}
{'loss': 0.1238, 'grad_norm': 0.1311856061220169, 'learning_rate': 1.9967437901888914e-05, 'epoch': 1.28}
{'loss': 0.0


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0447, 'grad_norm': 0.08023426681756973, 'learning_rate': 1.9920049496797153e-05, 'epoch': 2.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-290
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dty

{'eval_loss': 0.04589034616947174, 'eval_runtime': 6.8828, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 2.0}
{'loss': 0.0353, 'grad_norm': 0.03214845806360245, 'learning_rate': 1.9917272656617704e-05, 'epoch': 2.03}
{'loss': 0.0482, 'grad_norm': 0.046078380197286606, 'learning_rate': 1.9914448613738107e-05, 'epoch': 2.07}
{'loss': 0.0422, 'grad_norm': 0.030392561107873917, 'learning_rate': 1.99115773815998e-05, 'epoch': 2.1}
{'loss': 0.0443, 'grad_norm': 0.05540081113576889, 'learning_rate': 1.9908658973868823e-05, 'epoch': 2.14}
{'loss': 0.0386, 'grad_norm': 0.05533044785261154, 'learning_rate': 1.990569340443577e-05, 'epoch': 2.17}
{'loss': 0.0422, 'grad_norm': 0.042744383215904236, 'learning_rate': 1.9902680687415704e-05, 'epoch': 2.21}
{'loss': 0.0544, 'grad_norm': 0.04437527060508728, 'learning_rate': 1.989962083714808e-05, 'epoch': 2.24}
{'loss': 0.0411, 'grad_norm': 0.04528980702161789, 'learning_rate': 1.9896513868196706e-05, 'epoch': 2.28}
{'loss'


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0367, 'grad_norm': 0.06601044535636902, 'learning_rate': 1.982041127670304e-05, 'epoch': 3.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-435
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dty

{'eval_loss': 0.042672719806432724, 'eval_runtime': 6.8808, 'eval_samples_per_second': 9.156, 'eval_steps_per_second': 4.651, 'epoch': 3.0}
{'loss': 0.0451, 'grad_norm': 0.04042164236307144, 'learning_rate': 1.9816271834476642e-05, 'epoch': 3.03}
{'loss': 0.0465, 'grad_norm': 0.03576890379190445, 'learning_rate': 1.981208567027818e-05, 'epoch': 3.07}
{'loss': 0.0401, 'grad_norm': 0.04064302518963814, 'learning_rate': 1.9807852804032306e-05, 'epoch': 3.1}
{'loss': 0.0398, 'grad_norm': 0.037023577839136124, 'learning_rate': 1.9803573255885967e-05, 'epoch': 3.14}
{'loss': 0.0429, 'grad_norm': 0.038088008761405945, 'learning_rate': 1.9799247046208297e-05, 'epoch': 3.17}
{'loss': 0.0349, 'grad_norm': 0.057312220335006714, 'learning_rate': 1.9794874195590514e-05, 'epoch': 3.21}
{'loss': 0.0406, 'grad_norm': 0.02943326346576214, 'learning_rate': 1.979045472484584e-05, 'epoch': 3.24}
{'loss': 0.0491, 'grad_norm': 0.04820864275097847, 'learning_rate': 1.9785988655009386e-05, 'epoch': 3.28}
{'lo


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0394, 'grad_norm': 0.04243633896112442, 'learning_rate': 1.968147640378108e-05, 'epoch': 4.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-580
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dty

{'eval_loss': 0.041130781173706055, 'eval_runtime': 6.8819, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.65, 'epoch': 4.0}
{'loss': 0.0438, 'grad_norm': 0.03963303938508034, 'learning_rate': 1.96759909236026e-05, 'epoch': 4.03}
{'loss': 0.0377, 'grad_norm': 0.05913958698511124, 'learning_rate': 1.9670459389139433e-05, 'epoch': 4.07}
{'loss': 0.0456, 'grad_norm': 0.04234561696648598, 'learning_rate': 1.966488182671972e-05, 'epoch': 4.1}
{'loss': 0.0422, 'grad_norm': 0.05398566648364067, 'learning_rate': 1.9659258262890683e-05, 'epoch': 4.14}
{'loss': 0.0376, 'grad_norm': 0.036745693534612656, 'learning_rate': 1.9653588724418492e-05, 'epoch': 4.17}
{'loss': 0.0456, 'grad_norm': 0.054030586034059525, 'learning_rate': 1.964787323828813e-05, 'epoch': 4.21}
{'loss': 0.0385, 'grad_norm': 0.032193899154663086, 'learning_rate': 1.9642111831703294e-05, 'epoch': 4.24}
{'loss': 0.036, 'grad_norm': 0.04467589035630226, 'learning_rate': 1.963630453208623e-05, 'epoch': 4.28}
{'loss': 


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0348, 'grad_norm': 0.060963552445173264, 'learning_rate': 1.9503800829845613e-05, 'epoch': 5.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-725
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dty

{'eval_loss': 0.03999890387058258, 'eval_runtime': 6.8816, 'eval_samples_per_second': 9.155, 'eval_steps_per_second': 4.65, 'epoch': 5.0}
{'loss': 0.0523, 'grad_norm': 0.05152033641934395, 'learning_rate': 1.949699126201877e-05, 'epoch': 5.03}
{'loss': 0.037, 'grad_norm': 0.03514130041003227, 'learning_rate': 1.9490136491882143e-05, 'epoch': 5.07}
{'loss': 0.0382, 'grad_norm': 0.030937066301703453, 'learning_rate': 1.9483236552061996e-05, 'epoch': 5.1}
{'loss': 0.0371, 'grad_norm': 0.052663929760456085, 'learning_rate': 1.94762914753996e-05, 'epoch': 5.14}
{'loss': 0.0379, 'grad_norm': 0.03430204465985298, 'learning_rate': 1.946930129495106e-05, 'epoch': 5.17}
{'loss': 0.0404, 'grad_norm': 0.05705816298723221, 'learning_rate': 1.9462266043987148e-05, 'epoch': 5.21}
{'loss': 0.0362, 'grad_norm': 0.04576617479324341, 'learning_rate': 1.945518575599317e-05, 'epoch': 5.24}
{'loss': 0.0427, 'grad_norm': 0.04918767511844635, 'learning_rate': 1.944806046466878e-05, 'epoch': 5.28}
{'loss': 0.0


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0506, 'grad_norm': 0.08018342405557632, 'learning_rate': 1.9288095528719245e-05, 'epoch': 6.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-870
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dty

{'eval_loss': 0.03906916826963425, 'eval_runtime': 6.8824, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.65, 'epoch': 6.0}
{'loss': 0.0419, 'grad_norm': 0.04151894152164459, 'learning_rate': 1.9279989121921846e-05, 'epoch': 6.03}
{'loss': 0.0337, 'grad_norm': 0.04131153225898743, 'learning_rate': 1.9271838545667876e-05, 'epoch': 6.07}
{'loss': 0.0345, 'grad_norm': 0.049131013453006744, 'learning_rate': 1.926364383875118e-05, 'epoch': 6.1}
{'loss': 0.0518, 'grad_norm': 0.08803063631057739, 'learning_rate': 1.9255405040175666e-05, 'epoch': 6.14}
{'loss': 0.0351, 'grad_norm': 0.05369000509381294, 'learning_rate': 1.9247122189155082e-05, 'epoch': 6.17}
{'loss': 0.0367, 'grad_norm': 0.07562945783138275, 'learning_rate': 1.9238795325112867e-05, 'epoch': 6.21}
{'loss': 0.0319, 'grad_norm': 0.04933538660407066, 'learning_rate': 1.9230424487681944e-05, 'epoch': 6.24}
{'loss': 0.0369, 'grad_norm': 0.0429980605840683, 'learning_rate': 1.922200971670452e-05, 'epoch': 6.28}
{'loss': 


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0465, 'grad_norm': 0.22474613785743713, 'learning_rate': 1.903522365125102e-05, 'epoch': 7.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-1015
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03870159760117531, 'eval_runtime': 6.8818, 'eval_samples_per_second': 9.155, 'eval_steps_per_second': 4.65, 'epoch': 7.0}
{'loss': 0.0412, 'grad_norm': 0.04155987873673439, 'learning_rate': 1.902585284349861e-05, 'epoch': 7.03}
{'loss': 0.0322, 'grad_norm': 0.03155370056629181, 'learning_rate': 1.901643907588816e-05, 'epoch': 7.07}
{'loss': 0.0305, 'grad_norm': 0.029276954010128975, 'learning_rate': 1.9006982393225878e-05, 'epoch': 7.1}
{'loss': 0.0465, 'grad_norm': 0.04299108684062958, 'learning_rate': 1.8997482840522218e-05, 'epoch': 7.14}
{'loss': 0.027, 'grad_norm': 0.056572530418634415, 'learning_rate': 1.8987940462991673e-05, 'epoch': 7.17}
{'loss': 0.0298, 'grad_norm': 0.03572305664420128, 'learning_rate': 1.897835530605258e-05, 'epoch': 7.21}
{'loss': 0.0324, 'grad_norm': 0.045558080077171326, 'learning_rate': 1.8968727415326885e-05, 'epoch': 7.24}
{'loss': 0.0334, 'grad_norm': 0.03876899182796478, 'learning_rate': 1.8959056836639937e-05, 'epoch': 7.28}
{'loss':


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0307, 'grad_norm': 0.11450470238924026, 'learning_rate': 1.874619707139396e-05, 'epoch': 8.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-1160
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.037897415459156036, 'eval_runtime': 6.8824, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.65, 'epoch': 8.0}
{'loss': 0.0306, 'grad_norm': 0.052198171615600586, 'learning_rate': 1.873559936023817e-05, 'epoch': 8.03}
{'loss': 0.0465, 'grad_norm': 0.05193769186735153, 'learning_rate': 1.8724960070727974e-05, 'epoch': 8.07}
{'loss': 0.0418, 'grad_norm': 0.051916033029556274, 'learning_rate': 1.8714279253502616e-05, 'epoch': 8.1}
{'loss': 0.0412, 'grad_norm': 0.032911621034145355, 'learning_rate': 1.8703556959398998e-05, 'epoch': 8.14}
{'loss': 0.0298, 'grad_norm': 0.04272962361574173, 'learning_rate': 1.869279323945144e-05, 'epoch': 8.17}
{'loss': 0.0291, 'grad_norm': 0.05734889954328537, 'learning_rate': 1.8681988144891425e-05, 'epoch': 8.21}
{'loss': 0.0328, 'grad_norm': 0.04651014506816864, 'learning_rate': 1.8671141727147374e-05, 'epoch': 8.24}
{'loss': 0.0326, 'grad_norm': 0.0498405396938324, 'learning_rate': 1.866025403784439e-05, 'epoch': 8.28}
{'loss'


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.031, 'grad_norm': 0.04714678227901459, 'learning_rate': 1.8422172337162865e-05, 'epoch': 9.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-1305
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03765296936035156, 'eval_runtime': 6.8911, 'eval_samples_per_second': 9.142, 'eval_steps_per_second': 4.644, 'epoch': 9.0}
{'loss': 0.0308, 'grad_norm': 0.03979823738336563, 'learning_rate': 1.8410390129643927e-05, 'epoch': 9.03}
{'loss': 0.0414, 'grad_norm': 0.04313907027244568, 'learning_rate': 1.8398567891651163e-05, 'epoch': 9.07}
{'loss': 0.0419, 'grad_norm': 0.06284420937299728, 'learning_rate': 1.8386705679454243e-05, 'epoch': 9.1}
{'loss': 0.0391, 'grad_norm': 0.05195574462413788, 'learning_rate': 1.837480354951308e-05, 'epoch': 9.14}
{'loss': 0.0348, 'grad_norm': 0.045733775943517685, 'learning_rate': 1.8362861558477597e-05, 'epoch': 9.17}
{'loss': 0.0311, 'grad_norm': 0.07874564081430435, 'learning_rate': 1.8350879763187433e-05, 'epoch': 9.21}
{'loss': 0.0351, 'grad_norm': 0.06498535722494125, 'learning_rate': 1.8338858220671683e-05, 'epoch': 9.24}
{'loss': 0.0398, 'grad_norm': 0.046574387699365616, 'learning_rate': 1.8326796988148627e-05, 'epoch': 9.28}
{'los


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0353, 'grad_norm': 0.04772978276014328, 'learning_rate': 1.806444604267483e-05, 'epoch': 10.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-1450
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.037245750427246094, 'eval_runtime': 6.8862, 'eval_samples_per_second': 9.149, 'eval_steps_per_second': 4.647, 'epoch': 10.0}
{'loss': 0.0335, 'grad_norm': 0.05922059342265129, 'learning_rate': 1.8051526485628582e-05, 'epoch': 10.03}
{'loss': 0.0349, 'grad_norm': 0.06657275557518005, 'learning_rate': 1.8038568606172172e-05, 'epoch': 10.07}
{'loss': 0.0359, 'grad_norm': 0.047869786620140076, 'learning_rate': 1.802557246598051e-05, 'epoch': 10.1}
{'loss': 0.0318, 'grad_norm': 0.03756537288427353, 'learning_rate': 1.801253812691061e-05, 'epoch': 10.14}
{'loss': 0.0272, 'grad_norm': 0.04345676675438881, 'learning_rate': 1.7999465651001297e-05, 'epoch': 10.17}
{'loss': 0.0303, 'grad_norm': 0.08344458043575287, 'learning_rate': 1.798635510047293e-05, 'epoch': 10.21}
{'loss': 0.0376, 'grad_norm': 0.06222066283226013, 'learning_rate': 1.797320653772707e-05, 'epoch': 10.24}
{'loss': 0.0337, 'grad_norm': 0.04209303855895996, 'learning_rate': 1.796002002534622e-05, 'epoch': 10.28}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0259, 'grad_norm': 0.054174575954675674, 'learning_rate': 1.7674449639791255e-05, 'epoch': 11.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-1595
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03734889253973961, 'eval_runtime': 6.8824, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.65, 'epoch': 11.0}
{'loss': 0.0348, 'grad_norm': 0.056006599217653275, 'learning_rate': 1.766044443118978e-05, 'epoch': 11.03}
{'loss': 0.0385, 'grad_norm': 0.0732097327709198, 'learning_rate': 1.7646402761590006e-05, 'epoch': 11.07}
{'loss': 0.0348, 'grad_norm': 0.056362785398960114, 'learning_rate': 1.7632324697825288e-05, 'epoch': 11.1}
{'loss': 0.0353, 'grad_norm': 0.04466785863041878, 'learning_rate': 1.7618210306902227e-05, 'epoch': 11.14}
{'loss': 0.0474, 'grad_norm': 0.05593980848789215, 'learning_rate': 1.7604059656000313e-05, 'epoch': 11.17}
{'loss': 0.0376, 'grad_norm': 0.06101289018988609, 'learning_rate': 1.758987281247162e-05, 'epoch': 11.21}
{'loss': 0.036, 'grad_norm': 0.06856410950422287, 'learning_rate': 1.75756498438405e-05, 'epoch': 11.24}
{'loss': 0.028, 'grad_norm': 0.05705433338880539, 'learning_rate': 1.7561390817803226e-05, 'epoch': 11.28}
{'l


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0444, 'grad_norm': 0.08301598578691483, 'learning_rate': 1.7253743710122877e-05, 'epoch': 12.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-1740
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03691443055868149, 'eval_runtime': 6.883, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 12.0}
{'loss': 0.0343, 'grad_norm': 0.060254428535699844, 'learning_rate': 1.723870889220358e-05, 'epoch': 12.03}
{'loss': 0.0265, 'grad_norm': 0.048848509788513184, 'learning_rate': 1.7223639620597556e-05, 'epoch': 12.07}
{'loss': 0.0394, 'grad_norm': 0.07562282681465149, 'learning_rate': 1.720853596702919e-05, 'epoch': 12.1}
{'loss': 0.0408, 'grad_norm': 0.04173414781689644, 'learning_rate': 1.7193398003386514e-05, 'epoch': 12.14}
{'loss': 0.0352, 'grad_norm': 0.055548179894685745, 'learning_rate': 1.7178225801720865e-05, 'epoch': 12.17}
{'loss': 0.0464, 'grad_norm': 0.05498165264725685, 'learning_rate': 1.7163019434246545e-05, 'epoch': 12.21}
{'loss': 0.0334, 'grad_norm': 0.052336279302835464, 'learning_rate': 1.7147778973340466e-05, 'epoch': 12.24}
{'loss': 0.0255, 'grad_norm': 0.058311499655246735, 'learning_rate': 1.713250449154182e-05, 'epoch': 12.


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0398, 'grad_norm': 0.13863010704517365, 'learning_rate': 1.6804011720318394e-05, 'epoch': 13.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-1885
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.037013519555330276, 'eval_runtime': 6.8836, 'eval_samples_per_second': 9.152, 'eval_steps_per_second': 4.649, 'epoch': 13.0}
{'loss': 0.0297, 'grad_norm': 0.04555593058466911, 'learning_rate': 1.678800745532942e-05, 'epoch': 13.03}
{'loss': 0.0335, 'grad_norm': 0.06297636777162552, 'learning_rate': 1.677197088183269e-05, 'epoch': 13.07}
{'loss': 0.0321, 'grad_norm': 0.04566813260316849, 'learning_rate': 1.6755902076156606e-05, 'epoch': 13.1}
{'loss': 0.0387, 'grad_norm': 0.056620895862579346, 'learning_rate': 1.673980111478298e-05, 'epoch': 13.14}
{'loss': 0.0304, 'grad_norm': 0.06986584514379501, 'learning_rate': 1.672366807434668e-05, 'epoch': 13.17}
{'loss': 0.028, 'grad_norm': 0.061541054397821426, 'learning_rate': 1.6707503031635258e-05, 'epoch': 13.21}
{'loss': 0.0472, 'grad_norm': 0.08779741078615189, 'learning_rate': 1.6691306063588583e-05, 'epoch': 13.24}
{'loss': 0.0305, 'grad_norm': 0.04593755304813385, 'learning_rate': 1.6675077247298475e-05, 'epoch': 13.28}


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0333, 'grad_norm': 0.1010560616850853, 'learning_rate': 1.6327053285625164e-05, 'epoch': 14.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-2030
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03675984963774681, 'eval_runtime': 6.8835, 'eval_samples_per_second': 9.152, 'eval_steps_per_second': 4.649, 'epoch': 14.0}
{'loss': 0.0429, 'grad_norm': 0.07168961316347122, 'learning_rate': 1.631014361508446e-05, 'epoch': 14.03}
{'loss': 0.0307, 'grad_norm': 0.03159947693347931, 'learning_rate': 1.6293203910498375e-05, 'epoch': 14.07}
{'loss': 0.0306, 'grad_norm': 0.05173318460583687, 'learning_rate': 1.6276234252493903e-05, 'epoch': 14.1}
{'loss': 0.0324, 'grad_norm': 0.043427225202322006, 'learning_rate': 1.6259234721840595e-05, 'epoch': 14.14}
{'loss': 0.0283, 'grad_norm': 0.04613537713885307, 'learning_rate': 1.624220539945018e-05, 'epoch': 14.17}
{'loss': 0.0406, 'grad_norm': 0.05572952702641487, 'learning_rate': 1.6225146366376198e-05, 'epoch': 14.21}
{'loss': 0.0287, 'grad_norm': 0.05731779709458351, 'learning_rate': 1.6208057703813595e-05, 'epoch': 14.24}
{'loss': 0.034, 'grad_norm': 0.04489654675126076, 'learning_rate': 1.6190939493098344e-05, 'epoch': 14.28}


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0359, 'grad_norm': 0.09685233980417252, 'learning_rate': 1.5824776968678024e-05, 'epoch': 15.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-2175
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03635180741548538, 'eval_runtime': 6.8842, 'eval_samples_per_second': 9.151, 'eval_steps_per_second': 4.648, 'epoch': 15.0}
{'loss': 0.0238, 'grad_norm': 0.054951753467321396, 'learning_rate': 1.5807029557109398e-05, 'epoch': 15.03}
{'loss': 0.0425, 'grad_norm': 0.07482010871171951, 'learning_rate': 1.578925450613986e-05, 'epoch': 15.07}
{'loss': 0.0325, 'grad_norm': 0.07376702129840851, 'learning_rate': 1.577145190037234e-05, 'epoch': 15.1}
{'loss': 0.0346, 'grad_norm': 0.04637366160750389, 'learning_rate': 1.5753621824540924e-05, 'epoch': 15.14}
{'loss': 0.032, 'grad_norm': 0.06512972712516785, 'learning_rate': 1.573576436351046e-05, 'epoch': 15.17}
{'loss': 0.028, 'grad_norm': 0.04253850877285004, 'learning_rate': 1.5717879602276123e-05, 'epoch': 15.21}
{'loss': 0.0257, 'grad_norm': 0.07758267968893051, 'learning_rate': 1.5699967625963032e-05, 'epoch': 15.24}
{'loss': 0.0353, 'grad_norm': 0.06861064583063126, 'learning_rate': 1.568202851982584e-05, 'epoch': 15.28}
{'


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0277, 'grad_norm': 0.07079000771045685, 'learning_rate': 1.529919264233205e-05, 'epoch': 16.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-2320
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.036526620388031006, 'eval_runtime': 6.8837, 'eval_samples_per_second': 9.152, 'eval_steps_per_second': 4.649, 'epoch': 16.0}
{'loss': 0.0392, 'grad_norm': 0.0987226590514183, 'learning_rate': 1.528067850650368e-05, 'epoch': 16.03}
{'loss': 0.0263, 'grad_norm': 0.05120842531323433, 'learning_rate': 1.5262139236518695e-05, 'epoch': 16.07}
{'loss': 0.0368, 'grad_norm': 0.06633025407791138, 'learning_rate': 1.5243574920617445e-05, 'epoch': 16.1}
{'loss': 0.0259, 'grad_norm': 0.05393323674798012, 'learning_rate': 1.5224985647159489e-05, 'epoch': 16.14}
{'loss': 0.0326, 'grad_norm': 0.034708861261606216, 'learning_rate': 1.5206371504623175e-05, 'epoch': 16.17}
{'loss': 0.0369, 'grad_norm': 0.06610293686389923, 'learning_rate': 1.5187732581605217e-05, 'epoch': 16.21}
{'loss': 0.0412, 'grad_norm': 0.07524990290403366, 'learning_rate': 1.5169068966820275e-05, 'epoch': 16.24}
{'loss': 0.0308, 'grad_norm': 0.04937271401286125, 'learning_rate': 1.5150380749100545e-05, 'epoch': 16.2


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0449, 'grad_norm': 0.11406335979700089, 'learning_rate': 1.4752403447099617e-05, 'epoch': 17.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-2465
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03681216761469841, 'eval_runtime': 6.8787, 'eval_samples_per_second': 9.159, 'eval_steps_per_second': 4.652, 'epoch': 17.0}
{'loss': 0.0295, 'grad_norm': 0.05704876407980919, 'learning_rate': 1.4733196671848435e-05, 'epoch': 17.03}
{'loss': 0.0295, 'grad_norm': 0.04998396337032318, 'learning_rate': 1.4713967368259981e-05, 'epoch': 17.07}
{'loss': 0.0331, 'grad_norm': 0.051984161138534546, 'learning_rate': 1.469471562785891e-05, 'epoch': 17.1}
{'loss': 0.0343, 'grad_norm': 0.09759477525949478, 'learning_rate': 1.4675441542276685e-05, 'epoch': 17.14}
{'loss': 0.024, 'grad_norm': 0.046006713062524796, 'learning_rate': 1.4656145203251116e-05, 'epoch': 17.17}
{'loss': 0.0287, 'grad_norm': 0.06949015706777573, 'learning_rate': 1.463682670262593e-05, 'epoch': 17.21}
{'loss': 0.0271, 'grad_norm': 0.049049653112888336, 'learning_rate': 1.4617486132350343e-05, 'epoch': 17.24}
{'loss': 0.03, 'grad_norm': 0.06276753544807434, 'learning_rate': 1.45981235844786e-05, 'epoch': 17.28}
{


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0337, 'grad_norm': 0.17334814369678497, 'learning_rate': 1.4186597375374283e-05, 'epoch': 18.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-2610
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03613695129752159, 'eval_runtime': 6.8796, 'eval_samples_per_second': 9.158, 'eval_steps_per_second': 4.651, 'epoch': 18.0}
{'loss': 0.0251, 'grad_norm': 0.056629788130521774, 'learning_rate': 1.416677481715342e-05, 'epoch': 18.03}
{'loss': 0.0264, 'grad_norm': 0.06882836669683456, 'learning_rate': 1.4146932426562391e-05, 'epoch': 18.07}
{'loss': 0.0274, 'grad_norm': 0.05286971479654312, 'learning_rate': 1.4127070298043949e-05, 'epoch': 18.1}
{'loss': 0.0337, 'grad_norm': 0.07517164945602417, 'learning_rate': 1.4107188526134774e-05, 'epoch': 18.14}
{'loss': 0.0348, 'grad_norm': 0.05001625046133995, 'learning_rate': 1.408728720546505e-05, 'epoch': 18.17}
{'loss': 0.0287, 'grad_norm': 0.10535594075918198, 'learning_rate': 1.4067366430758004e-05, 'epoch': 18.21}
{'loss': 0.0281, 'grad_norm': 0.05088945850729942, 'learning_rate': 1.4047426296829455e-05, 'epoch': 18.24}
{'loss': 0.0225, 'grad_norm': 0.05663256347179413, 'learning_rate': 1.4027466898587375e-05, 'epoch': 18.28


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0337, 'grad_norm': 0.16306646168231964, 'learning_rate': 1.36040385161175e-05, 'epoch': 19.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-2755
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.036287784576416016, 'eval_runtime': 6.8886, 'eval_samples_per_second': 9.146, 'eval_steps_per_second': 4.645, 'epoch': 19.0}
{'loss': 0.0303, 'grad_norm': 0.04589654877781868, 'learning_rate': 1.3583679495453e-05, 'epoch': 19.03}
{'loss': 0.0228, 'grad_norm': 0.08839868754148483, 'learning_rate': 1.3563303417745258e-05, 'epoch': 19.07}
{'loss': 0.0368, 'grad_norm': 0.09631358832120895, 'learning_rate': 1.3542910379977158e-05, 'epoch': 19.1}
{'loss': 0.0294, 'grad_norm': 0.0846937745809555, 'learning_rate': 1.3522500479212337e-05, 'epoch': 19.14}
{'loss': 0.0346, 'grad_norm': 0.057120513170957565, 'learning_rate': 1.3502073812594677e-05, 'epoch': 19.17}
{'loss': 0.0264, 'grad_norm': 0.07025638967752457, 'learning_rate': 1.3481630477347864e-05, 'epoch': 19.21}
{'loss': 0.0263, 'grad_norm': 0.05678292363882065, 'learning_rate': 1.346117057077493e-05, 'epoch': 19.24}
{'loss': 0.0336, 'grad_norm': 0.0666596069931984, 'learning_rate': 1.3440694190257768e-05, 'epoch': 19.28}
{


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0321, 'grad_norm': 0.0897749736905098, 'learning_rate': 1.300705799504273e-05, 'epoch': 20.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-2900
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03651963174343109, 'eval_runtime': 6.8832, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 20.0}
{'loss': 0.0245, 'grad_norm': 0.05447490140795708, 'learning_rate': 1.2986243979130277e-05, 'epoch': 20.03}
{'loss': 0.0238, 'grad_norm': 0.06545262038707733, 'learning_rate': 1.296541574975571e-05, 'epoch': 20.07}
{'loss': 0.0395, 'grad_norm': 0.09281449019908905, 'learning_rate': 1.2944573406054021e-05, 'epoch': 20.1}
{'loss': 0.0326, 'grad_norm': 0.058882568031549454, 'learning_rate': 1.2923717047227368e-05, 'epoch': 20.14}
{'loss': 0.031, 'grad_norm': 0.07338576018810272, 'learning_rate': 1.2902846772544625e-05, 'epoch': 20.17}
{'loss': 0.0256, 'grad_norm': 0.055072613060474396, 'learning_rate': 1.2881962681340894e-05, 'epoch': 20.21}
{'loss': 0.0269, 'grad_norm': 0.06100992485880852, 'learning_rate': 1.2861064873017044e-05, 'epoch': 20.24}
{'loss': 0.0329, 'grad_norm': 0.03943958878517151, 'learning_rate': 1.284015344703923e-05, 'epoch': 20.28


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0263, 'grad_norm': 0.15478652715682983, 'learning_rate': 1.2398044646550167e-05, 'epoch': 21.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-3045
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03689410164952278, 'eval_runtime': 6.8812, 'eval_samples_per_second': 9.155, 'eval_steps_per_second': 4.65, 'epoch': 21.0}
{'loss': 0.0318, 'grad_norm': 0.0811372920870781, 'learning_rate': 1.2376858923261732e-05, 'epoch': 21.03}
{'loss': 0.0337, 'grad_norm': 0.06318046152591705, 'learning_rate': 1.2355661886967904e-05, 'epoch': 21.07}
{'loss': 0.0233, 'grad_norm': 0.07216648757457733, 'learning_rate': 1.2334453638559057e-05, 'epoch': 21.1}
{'loss': 0.0288, 'grad_norm': 0.07515560835599899, 'learning_rate': 1.231323427897893e-05, 'epoch': 21.14}
{'loss': 0.0338, 'grad_norm': 0.05739833042025566, 'learning_rate': 1.2292003909224144e-05, 'epoch': 21.17}
{'loss': 0.0256, 'grad_norm': 0.06019795313477516, 'learning_rate': 1.2270762630343734e-05, 'epoch': 21.21}
{'loss': 0.0237, 'grad_norm': 0.06364704668521881, 'learning_rate': 1.2249510543438652e-05, 'epoch': 21.24}
{'loss': 0.0314, 'grad_norm': 0.10226250439882278, 'learning_rate': 1.2228247749661293e-05, 'epoch': 21.28}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0451, 'grad_norm': 0.14230026304721832, 'learning_rate': 1.177943545473842e-05, 'epoch': 22.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-3190
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03686317428946495, 'eval_runtime': 6.8791, 'eval_samples_per_second': 9.158, 'eval_steps_per_second': 4.652, 'epoch': 22.0}
{'loss': 0.0309, 'grad_norm': 0.04719972237944603, 'learning_rate': 1.1757962799343548e-05, 'epoch': 22.03}
{'loss': 0.0273, 'grad_norm': 0.07361526787281036, 'learning_rate': 1.1736481776669307e-05, 'epoch': 22.07}
{'loss': 0.0288, 'grad_norm': 0.06346013396978378, 'learning_rate': 1.1714992488957743e-05, 'epoch': 22.1}
{'loss': 0.0285, 'grad_norm': 0.10267509520053864, 'learning_rate': 1.1693495038490247e-05, 'epoch': 22.14}
{'loss': 0.0279, 'grad_norm': 0.05362101271748543, 'learning_rate': 1.1671989527587057e-05, 'epoch': 22.17}
{'loss': 0.0262, 'grad_norm': 0.07745955139398575, 'learning_rate': 1.1650476058606776e-05, 'epoch': 22.21}
{'loss': 0.0279, 'grad_norm': 0.05433015152812004, 'learning_rate': 1.162895473394589e-05, 'epoch': 22.24}
{'loss': 0.0352, 'grad_norm': 0.08607536554336548, 'learning_rate': 1.1607425656038263e-05, 'epoch': 22.28


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0296, 'grad_norm': 0.07165136933326721, 'learning_rate': 1.115370580174392e-05, 'epoch': 23.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-3335
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.036772679537534714, 'eval_runtime': 6.882, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.65, 'epoch': 23.0}
{'loss': 0.027, 'grad_norm': 0.05987037345767021, 'learning_rate': 1.113203213767907e-05, 'epoch': 23.03}
{'loss': 0.0342, 'grad_norm': 0.10208912193775177, 'learning_rate': 1.1110353085542778e-05, 'epoch': 23.07}
{'loss': 0.0314, 'grad_norm': 0.04447310045361519, 'learning_rate': 1.1088668748519646e-05, 'epoch': 23.1}
{'loss': 0.0271, 'grad_norm': 0.06204051896929741, 'learning_rate': 1.1066979229819427e-05, 'epoch': 23.14}
{'loss': 0.0319, 'grad_norm': 0.05547104775905609, 'learning_rate': 1.1045284632676535e-05, 'epoch': 23.17}
{'loss': 0.0265, 'grad_norm': 0.06184665113687515, 'learning_rate': 1.102358506034956e-05, 'epoch': 23.21}
{'loss': 0.0273, 'grad_norm': 0.08924055844545364, 'learning_rate': 1.1001880616120764e-05, 'epoch': 23.24}
{'loss': 0.0244, 'grad_norm': 0.11256001144647598, 'learning_rate': 1.098017140329561e-05, 'epoch': 23.28}
{'


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0219, 'grad_norm': 0.14267316460609436, 'learning_rate': 1.0523359562429441e-05, 'epoch': 24.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-3480
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.036675747483968735, 'eval_runtime': 6.8825, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.649, 'epoch': 24.0}
{'loss': 0.0263, 'grad_norm': 0.05836401879787445, 'learning_rate': 1.0501571617472934e-05, 'epoch': 24.03}
{'loss': 0.034, 'grad_norm': 0.09701675176620483, 'learning_rate': 1.047978128521344e-05, 'epoch': 24.07}
{'loss': 0.0339, 'grad_norm': 0.09909562021493912, 'learning_rate': 1.045798866936521e-05, 'epoch': 24.1}
{'loss': 0.0224, 'grad_norm': 0.05955342575907707, 'learning_rate': 1.0436193873653362e-05, 'epoch': 24.14}
{'loss': 0.0426, 'grad_norm': 0.0828840434551239, 'learning_rate': 1.0414397001813396e-05, 'epoch': 24.17}
{'loss': 0.0204, 'grad_norm': 0.053488776087760925, 'learning_rate': 1.0392598157590687e-05, 'epoch': 24.21}
{'loss': 0.0322, 'grad_norm': 0.07507229596376419, 'learning_rate': 1.0370797444740008e-05, 'epoch': 24.24}
{'loss': 0.0276, 'grad_norm': 0.0776604562997818, 'learning_rate': 1.0348994967025012e-05, 'epoch': 24.28}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0308, 'grad_norm': 0.0747794359922409, 'learning_rate': 9.890919085058179e-06, 'epoch': 25.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-3625
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.0372241847217083, 'eval_runtime': 6.8825, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.649, 'epoch': 25.0}
{'loss': 0.0271, 'grad_norm': 0.0794016644358635, 'learning_rate': 9.869104044286558e-06, 'epoch': 25.03}
{'loss': 0.0342, 'grad_norm': 0.0633072480559349, 'learning_rate': 9.847289626533257e-06, 'epoch': 25.07}
{'loss': 0.0305, 'grad_norm': 0.07754866033792496, 'learning_rate': 9.825475935627165e-06, 'epoch': 25.1}
{'loss': 0.0273, 'grad_norm': 0.08987481147050858, 'learning_rate': 9.80366307539372e-06, 'epoch': 25.14}
{'loss': 0.0238, 'grad_norm': 0.0560021847486496, 'learning_rate': 9.78185114965439e-06, 'epoch': 25.17}
{'loss': 0.0277, 'grad_norm': 0.052824635058641434, 'learning_rate': 9.760040262226214e-06, 'epoch': 25.21}
{'loss': 0.0392, 'grad_norm': 0.0658341720700264, 'learning_rate': 9.738230516921272e-06, 'epoch': 25.24}
{'loss': 0.0273, 'grad_norm': 0.06198159605264664, 'learning_rate': 9.716422017546219e-06, 'epoch': 25.28}
{'loss': 0.


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0299, 'grad_norm': 0.12294723838567734, 'learning_rate': 9.258915098046008e-06, 'epoch': 26.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-3770
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.037072669714689255, 'eval_runtime': 6.884, 'eval_samples_per_second': 9.152, 'eval_steps_per_second': 4.648, 'epoch': 26.0}
{'loss': 0.0235, 'grad_norm': 0.06891174614429474, 'learning_rate': 9.237160254960477e-06, 'epoch': 26.03}
{'loss': 0.0233, 'grad_norm': 0.05304296314716339, 'learning_rate': 9.215409042721553e-06, 'epoch': 26.07}
{'loss': 0.0227, 'grad_norm': 0.0881737768650055, 'learning_rate': 9.193661564857283e-06, 'epoch': 26.1}
{'loss': 0.0324, 'grad_norm': 0.10912938416004181, 'learning_rate': 9.17191792487796e-06, 'epoch': 26.14}
{'loss': 0.0265, 'grad_norm': 0.08438055962324142, 'learning_rate': 9.150178226275584e-06, 'epoch': 26.17}
{'loss': 0.032, 'grad_norm': 0.07056000828742981, 'learning_rate': 9.128442572523418e-06, 'epoch': 26.21}
{'loss': 0.0244, 'grad_norm': 0.08611530065536499, 'learning_rate': 9.106711067075464e-06, 'epoch': 26.24}
{'loss': 0.0287, 'grad_norm': 0.07677388936281204, 'learning_rate': 9.084983813365977e-06, 'epoch': 26.28}
{'loss':


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0316, 'grad_norm': 0.12536221742630005, 'learning_rate': 8.629876583180322e-06, 'epoch': 27.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-3915
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.0373259037733078, 'eval_runtime': 6.8826, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.649, 'epoch': 27.0}
{'loss': 0.0265, 'grad_norm': 0.05809590592980385, 'learning_rate': 8.60826899039935e-06, 'epoch': 27.03}
{'loss': 0.0272, 'grad_norm': 0.05742407962679863, 'learning_rate': 8.586668021764328e-06, 'epoch': 27.07}
{'loss': 0.0249, 'grad_norm': 0.06598392874002457, 'learning_rate': 8.56507378008821e-06, 'epoch': 27.1}
{'loss': 0.0302, 'grad_norm': 0.08548672497272491, 'learning_rate': 8.543486368151926e-06, 'epoch': 27.14}
{'loss': 0.0312, 'grad_norm': 0.08268078416585922, 'learning_rate': 8.521905888703894e-06, 'epoch': 27.17}
{'loss': 0.0275, 'grad_norm': 0.11927108466625214, 'learning_rate': 8.50033244445955e-06, 'epoch': 27.21}
{'loss': 0.0197, 'grad_norm': 0.06505869328975677, 'learning_rate': 8.478766138100834e-06, 'epoch': 27.24}
{'loss': 0.0266, 'grad_norm': 0.08993908762931824, 'learning_rate': 8.457207072275712e-06, 'epoch': 27.28}
{'loss': 


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0229, 'grad_norm': 0.07921142876148224, 'learning_rate': 8.00632065582803e-06, 'epoch': 28.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-4060
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.037367962300777435, 'eval_runtime': 6.8822, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.65, 'epoch': 28.0}
{'loss': 0.0312, 'grad_norm': 0.08048124611377716, 'learning_rate': 7.984946776743829e-06, 'epoch': 28.03}
{'loss': 0.023, 'grad_norm': 0.08054837584495544, 'learning_rate': 7.963582488598227e-06, 'epoch': 28.07}
{'loss': 0.0327, 'grad_norm': 0.0971958115696907, 'learning_rate': 7.942227893077652e-06, 'epoch': 28.1}
{'loss': 0.0233, 'grad_norm': 0.09293553978204727, 'learning_rate': 7.92088309182241e-06, 'epoch': 28.14}
{'loss': 0.0203, 'grad_norm': 0.12084190547466278, 'learning_rate': 7.899548186426177e-06, 'epoch': 28.17}
{'loss': 0.0331, 'grad_norm': 0.05095755308866501, 'learning_rate': 7.878223278435539e-06, 'epoch': 28.21}
{'loss': 0.0334, 'grad_norm': 0.08053430169820786, 'learning_rate': 7.856908469349495e-06, 'epoch': 28.24}
{'loss': 0.0236, 'grad_norm': 0.06441955268383026, 'learning_rate': 7.835603860618973e-06, 'epoch': 28.28}
{'loss':


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0315, 'grad_norm': 0.06944422423839569, 'learning_rate': 7.3907424926274115e-06, 'epoch': 29.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-4205
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03777090832591057, 'eval_runtime': 6.8834, 'eval_samples_per_second': 9.152, 'eval_steps_per_second': 4.649, 'epoch': 29.0}
{'loss': 0.0285, 'grad_norm': 0.062428951263427734, 'learning_rate': 7.3696878554202525e-06, 'epoch': 29.03}
{'loss': 0.0221, 'grad_norm': 0.06354544311761856, 'learning_rate': 7.348645737565919e-06, 'epoch': 29.07}
{'loss': 0.0209, 'grad_norm': 0.07725328207015991, 'learning_rate': 7.327616239217432e-06, 'epoch': 29.1}
{'loss': 0.0387, 'grad_norm': 0.08969204127788544, 'learning_rate': 7.306599460467741e-06, 'epoch': 29.14}
{'loss': 0.0256, 'grad_norm': 0.09614239633083344, 'learning_rate': 7.285595501349259e-06, 'epoch': 29.17}
{'loss': 0.0241, 'grad_norm': 0.0980062335729599, 'learning_rate': 7.26460446183338e-06, 'epoch': 29.21}
{'loss': 0.0314, 'grad_norm': 0.08231943100690842, 'learning_rate': 7.243626441830009e-06, 'epoch': 29.24}
{'loss': 0.0317, 'grad_norm': 0.07075371593236923, 'learning_rate': 7.2226615411870796e-06, 'epoch': 29.28}
{'lo


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0223, 'grad_norm': 0.18426059186458588, 'learning_rate': 6.785605346968387e-06, 'epoch': 30.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-4350
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.038366787135601044, 'eval_runtime': 6.8843, 'eval_samples_per_second': 9.151, 'eval_steps_per_second': 4.648, 'epoch': 30.0}
{'loss': 0.0257, 'grad_norm': 0.11836399137973785, 'learning_rate': 6.7649542023631545e-06, 'epoch': 30.03}
{'loss': 0.023, 'grad_norm': 0.13316339254379272, 'learning_rate': 6.744318455428436e-06, 'epoch': 30.07}
{'loss': 0.0281, 'grad_norm': 0.07308385521173477, 'learning_rate': 6.723698204383067e-06, 'epoch': 30.1}
{'loss': 0.0311, 'grad_norm': 0.058335743844509125, 'learning_rate': 6.70309354737213e-06, 'epoch': 30.14}
{'loss': 0.0233, 'grad_norm': 0.09873541444540024, 'learning_rate': 6.682504582466482e-06, 'epoch': 30.17}
{'loss': 0.0217, 'grad_norm': 0.05101218447089195, 'learning_rate': 6.661931407662292e-06, 'epoch': 30.21}
{'loss': 0.0346, 'grad_norm': 0.06977202743291855, 'learning_rate': 6.6413741208805795e-06, 'epoch': 30.24}
{'loss': 0.0349, 'grad_norm': 0.14045605063438416, 'learning_rate': 6.6208328199667305e-06, 'epoch': 30.28}
{'


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0261, 'grad_norm': 0.1751624047756195, 'learning_rate': 6.1933306922145556e-06, 'epoch': 31.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-4495
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03801579028367996, 'eval_runtime': 6.8828, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 31.0}
{'loss': 0.0328, 'grad_norm': 0.07143480330705643, 'learning_rate': 6.173165676349103e-06, 'epoch': 31.03}
{'loss': 0.0287, 'grad_norm': 0.05812644213438034, 'learning_rate': 6.153018874857639e-06, 'epoch': 31.07}
{'loss': 0.0268, 'grad_norm': 0.06656871736049652, 'learning_rate': 6.132890383631796e-06, 'epoch': 31.1}
{'loss': 0.0225, 'grad_norm': 0.05654679238796234, 'learning_rate': 6.112780298476044e-06, 'epoch': 31.14}
{'loss': 0.0202, 'grad_norm': 0.06989990174770355, 'learning_rate': 6.092688715107265e-06, 'epoch': 31.17}
{'loss': 0.0272, 'grad_norm': 0.09292610734701157, 'learning_rate': 6.072615729154261e-06, 'epoch': 31.21}
{'loss': 0.0229, 'grad_norm': 0.11813348531723022, 'learning_rate': 6.052561436157329e-06, 'epoch': 31.24}
{'loss': 0.0269, 'grad_norm': 0.08890306949615479, 'learning_rate': 6.0325259315677895e-06, 'epoch': 31.28}
{'lo


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0207, 'grad_norm': 0.13573183119297028, 'learning_rate': 5.616288532109225e-06, 'epoch': 32.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-4640
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.038618940860033035, 'eval_runtime': 6.8827, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 32.0}
{'loss': 0.0239, 'grad_norm': 0.10173540562391281, 'learning_rate': 5.596690335863542e-06, 'epoch': 32.03}
{'loss': 0.0222, 'grad_norm': 0.08784297108650208, 'learning_rate': 5.5771130978099896e-06, 'epoch': 32.07}
{'loss': 0.021, 'grad_norm': 0.062066901475191116, 'learning_rate': 5.5575569111292725e-06, 'epoch': 32.1}
{'loss': 0.0337, 'grad_norm': 0.07865682244300842, 'learning_rate': 5.5380218689019125e-06, 'epoch': 32.14}
{'loss': 0.028, 'grad_norm': 0.07559805363416672, 'learning_rate': 5.518508064107776e-06, 'epoch': 32.17}
{'loss': 0.0264, 'grad_norm': 0.08731958270072937, 'learning_rate': 5.499015589625649e-06, 'epoch': 32.21}
{'loss': 0.023, 'grad_norm': 0.05950476974248886, 'learning_rate': 5.479544538232804e-06, 'epoch': 32.24}
{'loss': 0.027, 'grad_norm': 0.06714548915624619, 'learning_rate': 5.460095002604533e-06, 'epoch': 32.28}
{'lo


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0255, 'grad_norm': 0.11431938409805298, 'learning_rate': 5.056787917138557e-06, 'epoch': 33.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-4785
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03875565901398659, 'eval_runtime': 6.883, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 33.0}
{'loss': 0.0253, 'grad_norm': 0.07996451109647751, 'learning_rate': 5.037834963247922e-06, 'epoch': 33.03}
{'loss': 0.0276, 'grad_norm': 0.09105274081230164, 'learning_rate': 5.0189056275027595e-06, 'epoch': 33.07}
{'loss': 0.0264, 'grad_norm': 0.09459497034549713, 'learning_rate': 5.000000000000003e-06, 'epoch': 33.1}
{'loss': 0.025, 'grad_norm': 0.07542718201875687, 'learning_rate': 4.981118170723726e-06, 'epoch': 33.14}
{'loss': 0.0284, 'grad_norm': 0.08119719475507736, 'learning_rate': 4.962260229544738e-06, 'epoch': 33.17}
{'loss': 0.0234, 'grad_norm': 0.09797924757003784, 'learning_rate': 4.943426266220156e-06, 'epoch': 33.21}
{'loss': 0.0217, 'grad_norm': 0.07478044927120209, 'learning_rate': 4.924616370392962e-06, 'epoch': 33.24}
{'loss': 0.0284, 'grad_norm': 0.08827169239521027, 'learning_rate': 4.9058306315915826e-06, 'epoch': 33.28}
{'los


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0297, 'grad_norm': 0.09263128787279129, 'learning_rate': 4.517067704800864e-06, 'epoch': 34.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-4930
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03889624401926994, 'eval_runtime': 6.8825, 'eval_samples_per_second': 9.154, 'eval_steps_per_second': 4.649, 'epoch': 34.0}
{'loss': 0.0245, 'grad_norm': 0.08507437258958817, 'learning_rate': 4.498835834045067e-06, 'epoch': 34.03}
{'loss': 0.0249, 'grad_norm': 0.07654853910207748, 'learning_rate': 4.480630146879419e-06, 'epoch': 34.07}
{'loss': 0.0243, 'grad_norm': 0.1466064751148224, 'learning_rate': 4.462450729956531e-06, 'epoch': 34.1}
{'loss': 0.0311, 'grad_norm': 0.10216330736875534, 'learning_rate': 4.444297669803981e-06, 'epoch': 34.14}
{'loss': 0.0217, 'grad_norm': 0.09037411957979202, 'learning_rate': 4.42617105282389e-06, 'epoch': 34.17}
{'loss': 0.0278, 'grad_norm': 0.10828156024217606, 'learning_rate': 4.408070965292534e-06, 'epoch': 34.21}
{'loss': 0.0308, 'grad_norm': 0.13355080783367157, 'learning_rate': 4.389997493359905e-06, 'epoch': 34.24}
{'loss': 0.0211, 'grad_norm': 0.07465239614248276, 'learning_rate': 4.371950723049314e-06, 'epoch': 34.28}
{'loss'


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0252, 'grad_norm': 0.14006060361862183, 'learning_rate': 3.999287600755192e-06, 'epoch': 35.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-5075
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03956887125968933, 'eval_runtime': 6.8826, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 35.0}
{'loss': 0.0252, 'grad_norm': 0.08424044400453568, 'learning_rate': 3.981849768479516e-06, 'epoch': 35.03}
{'loss': 0.0232, 'grad_norm': 0.07743138819932938, 'learning_rate': 3.964440580464286e-06, 'epoch': 35.07}
{'loss': 0.0213, 'grad_norm': 0.07195182144641876, 'learning_rate': 3.9470601195710575e-06, 'epoch': 35.1}
{'loss': 0.0348, 'grad_norm': 0.11507519334554672, 'learning_rate': 3.929708468524655e-06, 'epoch': 35.14}
{'loss': 0.0279, 'grad_norm': 0.09240017086267471, 'learning_rate': 3.912385709912794e-06, 'epoch': 35.17}
{'loss': 0.0224, 'grad_norm': 0.10958930850028992, 'learning_rate': 3.895091926185653e-06, 'epoch': 35.21}
{'loss': 0.029, 'grad_norm': 0.06571801751852036, 'learning_rate': 3.877827199655506e-06, 'epoch': 35.24}
{'loss': 0.0253, 'grad_norm': 0.08239578455686569, 'learning_rate': 3.860591612496335e-06, 'epoch': 35.28}
{'los


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0288, 'grad_norm': 0.1966600865125656, 'learning_rate': 3.505519516698165e-06, 'epoch': 36.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-5220
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.038995057344436646, 'eval_runtime': 6.8828, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 36.0}
{'loss': 0.0311, 'grad_norm': 0.07635992765426636, 'learning_rate': 3.4889455008805107e-06, 'epoch': 36.03}
{'loss': 0.0249, 'grad_norm': 0.08673406392335892, 'learning_rate': 3.472402475372778e-06, 'epoch': 36.07}
{'loss': 0.027, 'grad_norm': 0.095638707280159, 'learning_rate': 3.455890518913897e-06, 'epoch': 36.1}
{'loss': 0.0206, 'grad_norm': 0.07479849457740784, 'learning_rate': 3.4394097100949286e-06, 'epoch': 36.14}
{'loss': 0.029, 'grad_norm': 0.1109759658575058, 'learning_rate': 3.4229601273586757e-06, 'epoch': 36.17}
{'loss': 0.0194, 'grad_norm': 0.06763631105422974, 'learning_rate': 3.4065418489993118e-06, 'epoch': 36.21}
{'loss': 0.0239, 'grad_norm': 0.08367322385311127, 'learning_rate': 3.390154953162026e-06, 'epoch': 36.24}
{'loss': 0.0306, 'grad_norm': 0.07817166298627853, 'learning_rate': 3.3737995178426276e-06, 'epoch': 36.28}
{'lo


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0416, 'grad_norm': 0.30879873037338257, 'learning_rate': 3.0377392795508687e-06, 'epoch': 37.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-5365
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.0395057387650013, 'eval_runtime': 6.8849, 'eval_samples_per_second': 9.15, 'eval_steps_per_second': 4.648, 'epoch': 37.0}
{'loss': 0.0296, 'grad_norm': 0.10499446839094162, 'learning_rate': 3.0220954015832004e-06, 'epoch': 37.03}
{'loss': 0.0205, 'grad_norm': 0.11291871219873428, 'learning_rate': 3.0064847359663284e-06, 'epoch': 37.07}
{'loss': 0.022, 'grad_norm': 0.10649887472391129, 'learning_rate': 2.990907357001491e-06, 'epoch': 37.1}
{'loss': 0.0251, 'grad_norm': 0.10441746562719345, 'learning_rate': 2.975363338831484e-06, 'epoch': 37.14}
{'loss': 0.0194, 'grad_norm': 0.11416199803352356, 'learning_rate': 2.9598527554403187e-06, 'epoch': 37.17}
{'loss': 0.036, 'grad_norm': 0.10180635005235672, 'learning_rate': 2.944375680652869e-06, 'epoch': 37.21}
{'loss': 0.0266, 'grad_norm': 0.10282158851623535, 'learning_rate': 2.9289321881345257e-06, 'epoch': 37.24}
{'loss': 0.0242, 'grad_norm': 0.09098181128501892, 'learning_rate': 2.913522351390834e-06, 'epoch': 37.28}
{'los


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0233, 'grad_norm': 0.15586329996585846, 'learning_rate': 2.5978187251316823e-06, 'epoch': 38.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-5510
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.03939200937747955, 'eval_runtime': 6.8833, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 4.649, 'epoch': 38.0}
{'loss': 0.0233, 'grad_norm': 0.08370925486087799, 'learning_rate': 2.5831675844331094e-06, 'epoch': 38.03}
{'loss': 0.0337, 'grad_norm': 0.08594640344381332, 'learning_rate': 2.5685517452260566e-06, 'epoch': 38.07}
{'loss': 0.0273, 'grad_norm': 0.07532823830842972, 'learning_rate': 2.5539712770767377e-06, 'epoch': 38.1}
{'loss': 0.0297, 'grad_norm': 0.11896180361509323, 'learning_rate': 2.539426249383006e-06, 'epoch': 38.14}
{'loss': 0.0312, 'grad_norm': 0.08756357431411743, 'learning_rate': 2.5249167313740307e-06, 'epoch': 38.17}
{'loss': 0.023, 'grad_norm': 0.05668226629495621, 'learning_rate': 2.5104427921099783e-06, 'epoch': 38.21}
{'loss': 0.0274, 'grad_norm': 0.09591224789619446, 'learning_rate': 2.496004500481661e-06, 'epoch': 38.24}
{'loss': 0.0261, 'grad_norm': 0.09206698834896088, 'learning_rate': 2.4816019252102274e-06, 'epoch': 38.28}



***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0244, 'grad_norm': 0.2423626333475113, 'learning_rate': 2.1875182079524173e-06, 'epoch': 39.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-5655
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.04043509438633919, 'eval_runtime': 6.8802, 'eval_samples_per_second': 9.157, 'eval_steps_per_second': 4.651, 'epoch': 39.0}
{'loss': 0.0234, 'grad_norm': 0.13826708495616913, 'learning_rate': 2.173918431475861e-06, 'epoch': 39.03}
{'loss': 0.0306, 'grad_norm': 0.09875701367855072, 'learning_rate': 2.160355904371635e-06, 'epoch': 39.07}
{'loss': 0.0327, 'grad_norm': 0.10653316974639893, 'learning_rate': 2.146830691192553e-06, 'epoch': 39.1}
{'loss': 0.0269, 'grad_norm': 0.10948364436626434, 'learning_rate': 2.1333428563138304e-06, 'epoch': 39.14}
{'loss': 0.0229, 'grad_norm': 0.10778020322322845, 'learning_rate': 2.119892463932781e-06, 'epoch': 39.17}
{'loss': 0.0197, 'grad_norm': 0.13354970514774323, 'learning_rate': 2.106479578068501e-06, 'epoch': 39.21}
{'loss': 0.0238, 'grad_norm': 0.09070738404989243, 'learning_rate': 2.093104262561569e-06, 'epoch': 39.24}
{'loss': 0.0199, 'grad_norm': 0.10155506432056427, 'learning_rate': 2.0797665810737386e-06, 'epoch': 39.28}
{'l


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0243, 'grad_norm': 0.1276492178440094, 'learning_rate': 1.808479557110081e-06, 'epoch': 40.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-5800
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.040281616151332855, 'eval_runtime': 6.8812, 'eval_samples_per_second': 9.155, 'eval_steps_per_second': 4.65, 'epoch': 40.0}
{'loss': 0.0303, 'grad_norm': 0.15754945576190948, 'learning_rate': 1.7959855647448642e-06, 'epoch': 40.03}
{'loss': 0.0225, 'grad_norm': 0.08911509066820145, 'learning_rate': 1.7835306205783643e-06, 'epoch': 40.07}
{'loss': 0.0222, 'grad_norm': 0.08978171646595001, 'learning_rate': 1.7711147838916987e-06, 'epoch': 40.1}
{'loss': 0.0365, 'grad_norm': 0.1276584416627884, 'learning_rate': 1.7587381137798432e-06, 'epoch': 40.14}
{'loss': 0.0232, 'grad_norm': 0.11967837810516357, 'learning_rate': 1.7464006691513624e-06, 'epoch': 40.17}
{'loss': 0.0353, 'grad_norm': 0.10685992240905762, 'learning_rate': 1.7341025087281149e-06, 'epoch': 40.21}
{'loss': 0.0201, 'grad_norm': 0.10971888899803162, 'learning_rate': 1.7218436910449787e-06, 'epoch': 40.24}
{'loss': 0.0209, 'grad_norm': 0.097005195915699, 'learning_rate': 1.709624274449584e-06, 'epoch': 40.28}
{


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0223, 'grad_norm': 0.18088559806346893, 'learning_rate': 1.4622195064614241e-06, 'epoch': 41.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-5945
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.040360815823078156, 'eval_runtime': 6.8808, 'eval_samples_per_second': 9.156, 'eval_steps_per_second': 4.651, 'epoch': 41.0}
{'loss': 0.0211, 'grad_norm': 0.10150832682847977, 'learning_rate': 1.4508812932705364e-06, 'epoch': 41.03}
{'loss': 0.0324, 'grad_norm': 0.12289080768823624, 'learning_rate': 1.4395837708522864e-06, 'epoch': 41.07}
{'loss': 0.0228, 'grad_norm': 0.09013748914003372, 'learning_rate': 1.4283269929788779e-06, 'epoch': 41.1}
{'loss': 0.0285, 'grad_norm': 0.14449919760227203, 'learning_rate': 1.4171110132285771e-06, 'epoch': 41.14}
{'loss': 0.0272, 'grad_norm': 0.11975526809692383, 'learning_rate': 1.4059358849854732e-06, 'epoch': 41.17}
{'loss': 0.0268, 'grad_norm': 0.10516175627708435, 'learning_rate': 1.3948016614392113e-06, 'epoch': 41.21}
{'loss': 0.0194, 'grad_norm': 0.08354433625936508, 'learning_rate': 1.3837083955847418e-06, 'epoch': 41.24}
{'loss': 0.0224, 'grad_norm': 0.12986110150814056, 'learning_rate': 1.3726561402220818e-06, 'epoch': 41.


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0324, 'grad_norm': 0.27648141980171204, 'learning_rate': 1.1501236253695823e-06, 'epoch': 42.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-6090
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.04011122137308121, 'eval_runtime': 6.8817, 'eval_samples_per_second': 9.155, 'eval_steps_per_second': 4.65, 'epoch': 42.0}
{'loss': 0.0226, 'grad_norm': 0.12134375423192978, 'learning_rate': 1.1399865615330397e-06, 'epoch': 42.03}
{'loss': 0.0278, 'grad_norm': 0.10171687602996826, 'learning_rate': 1.129891668217783e-06, 'epoch': 42.07}
{'loss': 0.0231, 'grad_norm': 0.0855947807431221, 'learning_rate': 1.1198389934719277e-06, 'epoch': 42.1}
{'loss': 0.0208, 'grad_norm': 0.10351758450269699, 'learning_rate': 1.1098285851426372e-06, 'epoch': 42.14}
{'loss': 0.019, 'grad_norm': 0.07987688481807709, 'learning_rate': 1.0998604908759025e-06, 'epoch': 42.17}
{'loss': 0.0281, 'grad_norm': 0.10470163077116013, 'learning_rate': 1.0899347581163222e-06, 'epoch': 42.21}
{'loss': 0.0242, 'grad_norm': 0.11990683525800705, 'learning_rate': 1.0800514341068592e-06, 'epoch': 42.24}
{'loss': 0.0304, 'grad_norm': 0.09033652395009995, 'learning_rate': 1.0702105658886318e-06, 'epoch': 42.28}
{


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0219, 'grad_norm': 0.12824144959449768, 'learning_rate': 8.734407743092078e-07, 'epoch': 43.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-6235
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.040237557142972946, 'eval_runtime': 6.879, 'eval_samples_per_second': 9.158, 'eval_steps_per_second': 4.652, 'epoch': 43.0}
{'loss': 0.028, 'grad_norm': 0.12612122297286987, 'learning_rate': 8.645454235739903e-07, 'epoch': 43.03}
{'loss': 0.0219, 'grad_norm': 0.07042775303125381, 'learning_rate': 8.556935543621791e-07, 'epoch': 43.07}
{'loss': 0.0232, 'grad_norm': 0.14627523720264435, 'learning_rate': 8.468852088055291e-07, 'epoch': 43.1}
{'loss': 0.0241, 'grad_norm': 0.07376807183027267, 'learning_rate': 8.381204288286415e-07, 'epoch': 43.14}
{'loss': 0.0256, 'grad_norm': 0.08878016471862793, 'learning_rate': 8.293992561487596e-07, 'epoch': 43.17}
{'loss': 0.0225, 'grad_norm': 0.11156313121318817, 'learning_rate': 8.207217322755734e-07, 'epoch': 43.21}
{'loss': 0.0223, 'grad_norm': 0.10509289056062698, 'learning_rate': 8.120878985110181e-07, 'epoch': 43.24}
{'loss': 0.0277, 'grad_norm': 0.08706406503915787, 'learning_rate': 8.034977959490775e-07, 'epoch': 43.28}
{'loss


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0337, 'grad_norm': 0.1764463186264038, 'learning_rate': 6.332781075160244e-07, 'epoch': 44.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-6380
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.04036286845803261, 'eval_runtime': 6.8799, 'eval_samples_per_second': 9.157, 'eval_steps_per_second': 4.651, 'epoch': 44.0}
{'loss': 0.0262, 'grad_norm': 0.07811702787876129, 'learning_rate': 6.256600648791034e-07, 'epoch': 44.03}
{'loss': 0.0361, 'grad_norm': 0.07880964130163193, 'learning_rate': 6.180866407751595e-07, 'epoch': 44.07}
{'loss': 0.026, 'grad_norm': 0.10994181782007217, 'learning_rate': 6.105578712510074e-07, 'epoch': 44.1}
{'loss': 0.0261, 'grad_norm': 0.08312589675188065, 'learning_rate': 6.030737921409169e-07, 'epoch': 44.14}
{'loss': 0.0296, 'grad_norm': 0.10784425586462021, 'learning_rate': 5.956344390664525e-07, 'epoch': 44.17}
{'loss': 0.0224, 'grad_norm': 0.09466759115457535, 'learning_rate': 5.882398474362949e-07, 'epoch': 44.21}
{'loss': 0.0313, 'grad_norm': 0.10037282109260559, 'learning_rate': 5.80890052446077e-07, 'epoch': 44.24}
{'loss': 0.0284, 'grad_norm': 0.0830724909901619, 'learning_rate': 5.735850890782158e-07, 'epoch': 44.28}
{'loss':


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0253, 'grad_norm': 0.2950080633163452, 'learning_rate': 4.305966426779118e-07, 'epoch': 45.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-6525
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.04043063521385193, 'eval_runtime': 6.879, 'eval_samples_per_second': 9.158, 'eval_steps_per_second': 4.652, 'epoch': 45.0}
{'loss': 0.034, 'grad_norm': 0.08923224359750748, 'learning_rate': 4.2428639195185585e-07, 'epoch': 45.03}
{'loss': 0.0228, 'grad_norm': 0.08051814883947372, 'learning_rate': 4.180217182260338e-07, 'epoch': 45.07}
{'loss': 0.0204, 'grad_norm': 0.08723440766334534, 'learning_rate': 4.118026513180695e-07, 'epoch': 45.1}
{'loss': 0.0236, 'grad_norm': 0.10847582668066025, 'learning_rate': 4.056292208285162e-07, 'epoch': 45.14}
{'loss': 0.0238, 'grad_norm': 0.08576886355876923, 'learning_rate': 3.99501456140714e-07, 'epoch': 45.17}
{'loss': 0.0267, 'grad_norm': 0.07531101256608963, 'learning_rate': 3.9341938642064814e-07, 'epoch': 45.21}
{'loss': 0.0286, 'grad_norm': 0.09112495183944702, 'learning_rate': 3.8738304061681107e-07, 'epoch': 45.24}
{'loss': 0.021, 'grad_norm': 0.09075148403644562, 'learning_rate': 3.8139244746007276e-07, 'epoch': 45.28}
{'los


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0257, 'grad_norm': 0.23372216522693634, 'learning_rate': 2.662074153955152e-07, 'epoch': 46.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-6670
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.04046466946601868, 'eval_runtime': 6.881, 'eval_samples_per_second': 9.156, 'eval_steps_per_second': 4.65, 'epoch': 46.0}
{'loss': 0.0224, 'grad_norm': 0.14254069328308105, 'learning_rate': 2.612302072266637e-07, 'epoch': 46.03}
{'loss': 0.0323, 'grad_norm': 0.0793161392211914, 'learning_rate': 2.5629935214764866e-07, 'epoch': 46.07}
{'loss': 0.027, 'grad_norm': 0.12842874228954315, 'learning_rate': 2.51414873627589e-07, 'epoch': 46.1}
{'loss': 0.0234, 'grad_norm': 0.1036059632897377, 'learning_rate': 2.465767949148734e-07, 'epoch': 46.14}
{'loss': 0.0225, 'grad_norm': 0.09302526712417603, 'learning_rate': 2.4178513903703847e-07, 'epoch': 46.17}
{'loss': 0.0317, 'grad_norm': 0.1154857650399208, 'learning_rate': 2.370399288006664e-07, 'epoch': 46.21}
{'loss': 0.0221, 'grad_norm': 0.11320561915636063, 'learning_rate': 2.3234118679127615e-07, 'epoch': 46.24}
{'loss': 0.0251, 'grad_norm': 0.09055893868207932, 'learning_rate': 2.2768893537321145e-07, 'epoch': 46.28}
{'loss':


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0185, 'grad_norm': 0.13084648549556732, 'learning_rate': 1.407682338004046e-07, 'epoch': 47.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-6815
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.04047980159521103, 'eval_runtime': 6.8744, 'eval_samples_per_second': 9.164, 'eval_steps_per_second': 4.655, 'epoch': 47.0}
{'loss': 0.0239, 'grad_norm': 0.12823744118213654, 'learning_rate': 1.3714398462768563e-07, 'epoch': 47.03}
{'loss': 0.0201, 'grad_norm': 0.06434327363967896, 'learning_rate': 1.3356667915121025e-07, 'epoch': 47.07}
{'loss': 0.0283, 'grad_norm': 0.12883372604846954, 'learning_rate': 1.3003633439768182e-07, 'epoch': 47.1}
{'loss': 0.0218, 'grad_norm': 0.08322731405496597, 'learning_rate': 1.2655296717028808e-07, 'epoch': 47.14}
{'loss': 0.0233, 'grad_norm': 0.07925700396299362, 'learning_rate': 1.231165940486234e-07, 'epoch': 47.17}
{'loss': 0.0237, 'grad_norm': 0.1715325117111206, 'learning_rate': 1.1972723138860333e-07, 'epoch': 47.21}
{'loss': 0.0226, 'grad_norm': 0.09570812433958054, 'learning_rate': 1.1638489532239339e-07, 'epoch': 47.24}
{'loss': 0.0365, 'grad_norm': 0.104829803109169, 'learning_rate': 1.1308960175832606e-07, 'epoch': 47.28}
{


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0279, 'grad_norm': 0.28229308128356934, 'learning_rate': 5.4781046317267103e-08, 'epoch': 48.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-6960
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.040399596095085144, 'eval_runtime': 6.8756, 'eval_samples_per_second': 9.163, 'eval_steps_per_second': 4.654, 'epoch': 48.0}
{'loss': 0.031, 'grad_norm': 0.07313777506351471, 'learning_rate': 5.252425867601329e-08, 'epoch': 48.03}
{'loss': 0.0205, 'grad_norm': 0.08075948804616928, 'learning_rate': 5.031481749088296e-08, 'epoch': 48.07}
{'loss': 0.0284, 'grad_norm': 0.10634560883045197, 'learning_rate': 4.815273327803183e-08, 'epoch': 48.1}
{'loss': 0.029, 'grad_norm': 0.13198979198932648, 'learning_rate': 4.603801632821148e-08, 'epoch': 48.14}
{'loss': 0.0248, 'grad_norm': 0.17081913352012634, 'learning_rate': 4.397067670672828e-08, 'epoch': 48.17}
{'loss': 0.0266, 'grad_norm': 0.09618455916643143, 'learning_rate': 4.195072425338342e-08, 'epoch': 48.21}
{'loss': 0.0304, 'grad_norm': 0.0720328614115715, 'learning_rate': 3.997816858243297e-08, 'epoch': 48.24}
{'loss': 0.0241, 'grad_norm': 0.07701475918292999, 'learning_rate': 3.805301908254455e-08, 'epoch': 48.28}
{'loss'


***** Running Evaluation *****
  Num examples = 63
  Batch size = 2


{'loss': 0.0249, 'grad_norm': 0.13185715675354004, 'learning_rate': 8.589933103132498e-09, 'epoch': 49.0}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-7105
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.040412537753582, 'eval_runtime': 6.8762, 'eval_samples_per_second': 9.162, 'eval_steps_per_second': 4.654, 'epoch': 49.0}
{'loss': 0.0202, 'grad_norm': 0.1499631106853485, 'learning_rate': 7.70963759277099e-09, 'epoch': 49.03}
{'loss': 0.032, 'grad_norm': 0.11476847529411316, 'learning_rate': 6.876901840231487e-09, 'epoch': 49.07}
{'loss': 0.0258, 'grad_norm': 0.05743570625782013, 'learning_rate': 6.091729809042379e-09, 'epoch': 49.1}
{'loss': 0.0183, 'grad_norm': 0.08499053865671158, 'learning_rate': 5.354125236343155e-09, 'epoch': 49.14}
{'loss': 0.0282, 'grad_norm': 0.09843312948942184, 'learning_rate': 4.6640916328710705e-09, 'epoch': 49.17}
{'loss': 0.0194, 'grad_norm': 0.12889589369297028, 'learning_rate': 4.021632282938947e-09, 'epoch': 49.21}
{'loss': 0.0234, 'grad_norm': 0.08438156545162201, 'learning_rate': 3.4267502444274013e-09, 'epoch': 49.24}
{'loss': 0.0299, 'grad_norm': 0.1093689575791359, 'learning_rate': 2.879448348762637e-09, 'epoch': 49.28}
{'loss': 

Saving model checkpoint to ./finetuned_model/checkpoint-7200
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'loss': 0.0219, 'grad_norm': 0.0829143151640892, 'learning_rate': 0.0, 'epoch': 49.66}


  0%|          | 0/32 [00:00<?, ?it/s]

Saving model checkpoint to ./finetuned_model/checkpoint-7200
loading configuration file /home/raza/Downloads/Phi-3-mini-4k-instruct/config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dt

{'eval_loss': 0.04038238525390625, 'eval_runtime': 6.875, 'eval_samples_per_second': 9.164, 'eval_steps_per_second': 4.655, 'epoch': 49.66}
{'train_runtime': 9774.2836, 'train_samples_per_second': 2.952, 'train_steps_per_second': 0.737, 'train_loss': 0.05435840782605939, 'epoch': 49.66}
