<a href="https://colab.research.google.com/github/Akhilvallala2023/Thesis/blob/main/Fine_Tuning_using_LoRa_135M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from huggingface_hub import login

# Prompt the user for their Hugging Face token
huggingface_token = input("Enter your Hugging Face token: ")

# Log in to Hugging Face
login(token=huggingface_token)

print("Successfully logged in to Hugging Face!")


KeyboardInterrupt: Interrupted by user

In [None]:
from datasets import load_dataset

# Load the dataset from Hugging Face Hub
dataset_name = "akhilfau/physics_decontaminated_2"
dataset = load_dataset(dataset_name, split="train")  # Adjust the split if needed (e.g., "test" or "validation")

# Print a sample record
print("Sample record from the dataset:")
print(dataset[0])  # Prints the first record from the dataset


Sample record from the dataset:
{'role_1': 'Physicist_RoleType.ASSISTANT', 'topic;': 'Quantum mechanics', 'sub_topic': 'The Schrödinger equation and its solutions', 'message_1': 'What is the probability of finding a particle with a given energy in a one-dimensional infinite square well potential when the potential width is 2 nm and the particle has a mass of 5x10^-26 kg? Use the Schrödinger equation to solve for the allowed energy states and their wave functions.', 'message_2': 'To find the probability of finding a particle with a given energy in a one-dimensional infinite square well potential, we first need to solve the Schrödinger equation for the allowed energy states and their wave functions.\n\nThe time-independent Schrödinger equation for a one-dimensional infinite square well potential is given by:\n\n- (ħ^2 / 2m) * (d^2ψ(x) / dx^2) = E * ψ(x)\n\nwhere ħ is the reduced Planck constant (1.0545718 × 10^-34 Js), m is the mass of the particle (5 × 10^-26 kg), E is the energy of the

In [None]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch

# Step 1: Load the Dataset
dataset = load_dataset("akhilfau/physics_decontaminated_2", split="train")

# Step 2: Load the Pretrained Model and Tokenizer
model_name = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the padding token if not already set
tokenizer.pad_token = tokenizer.eos_token or tokenizer.bos_token or "[PAD]"

# Step 3: Configure LoRA with PEFT
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type="CAUSAL_LM",  # Task type for causal language modeling
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters to confirm LoRA is applied
model.print_trainable_parameters()

# Step 4: Preprocess the Dataset
def preprocess_function(examples):
    # Concatenate the problem and solution for causal LM
    inputs = [f"Problem: {problem}\nSolution: {solution}" for problem, solution in zip(examples["message_1"], examples["message_2"])]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

    # Labels are the same as input_ids for causal LM
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Split into train and validation sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Step 5: Define Training Arguments

training_args = TrainingArguments(
    output_dir="./fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics",
    evaluation_strategy="epoch",
    logging_steps=100,
    save_steps=500,
    learning_rate=5e-4,  # Reduce the learning rate
    lr_scheduler_type="cosine",  # Use a more adaptive scheduler
    per_device_train_batch_size=4,  # Increase if memory allows
    per_device_eval_batch_size=4,
    num_train_epochs=8,  # Train for more epochs
    weight_decay=0.1,  # Regularization
    save_total_limit=2,
    logging_dir="./logs",
    push_to_hub=False,
    #gradient_checkpointing=True,  # Reduce memory usage if needed
    bf16=False,  # Disable BFloat16
    fp16=False,  # Disable FP16
)

# Step 6: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Step 7: Train the Model
trainer.train()



trainable params: 921,600 || all params: 135,436,608 || trainable%: 0.6805




Epoch,Training Loss,Validation Loss
1,1.0142,1.040344
2,1.0231,1.008247
3,0.9995,0.991766
4,0.9527,0.982188
5,0.9351,0.975225
6,0.9126,0.971895
7,0.9161,0.970611
8,0.9194,0.970476


TrainOutput(global_step=32000, training_loss=0.974360009431839, metrics={'train_runtime': 6891.5252, 'train_samples_per_second': 18.574, 'train_steps_per_second': 4.643, 'total_flos': 4.2123286020096e+16, 'train_loss': 0.974360009431839, 'epoch': 8.0})

In [None]:
#Testing locally

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Specify the path to the locally saved model
local_model_path = "./fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)

# Test with a sample input
input_text = "What is the Schrödinger equation?"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a response
output = model.generate(**inputs, max_length=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What is the Schrödinger equation?
The Schrödinger equation is a mathematical expression that describes the behavior of a quantum system, such as a particle in a box. It is a special form of the Schröding


In [None]:

# Push to Hugging Face Hub
trainer.push_to_hub(commit_message="Fine-tuned smolLM2-135M with LoRA on camel-ai/physics")

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics",
    repo_id="akhilfau/fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics"
)


training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.70M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/3.70M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/7.47M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/7.47M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/akhilfau/fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics/commit/18343dec31780151c0ff4afd9984e806f125dd07', commit_message='Upload folder using huggingface_hub', commit_description='', oid='18343dec31780151c0ff4afd9984e806f125dd07', pr_url=None, repo_url=RepoUrl('https://huggingface.co/akhilfau/fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics', endpoint='https://huggingface.co', repo_type='model', repo_id='akhilfau/fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics'), pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Specify the Hugging Face model path (update with the correct repository path)
model_path = "akhilfau/fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics"

# Load the model and tokenizer from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Test with a sample input
input_text = "What is the Schrödinger equation?"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a response
output = model.generate(**inputs, max_length=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))


adapter_config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.70M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What is the Schrödinger equation?

The Schrödinger equation is a mathematical expression for the energy levels of a quantum system, which is a mathematical model of the behavior of matter and energy. It is given by:


In [None]:
# import torch

# # Clear cache
# torch.cuda.empty_cache()

# # Reset memory allocations and free up GPU memory
# torch.cuda.memory_summary(device=None, abbreviated=False)


In [None]:
MODEL = "HuggingFaceTB/SmolLM2-135M"
OUTPUT_DIR = "/content/output"

!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
0it [00:00, ?it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
 14%|██████▍                                      | 1/7 [00:00<00:01,  4.03it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 16.57it/s][A
1it [00:00,  2.36it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 42.23it/s][A
2it [00:00,  3.68it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 43.24it/s][A
3it [00:00,  4.51it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:00<00

In [None]:
MODEL = "akhilfau/fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics"
OUTPUT_DIR = "/content/output"

!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
0it [00:00, ?it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
 14%|██████▍                                      | 1/7 [00:00<00:01,  4.08it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 16.21it/s][A
1it [00:00,  2.31it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 35.33it/s][A
2it [00:00,  3.39it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 35.50it/s][A
3it [00:00,  3.99it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 7/7 [00:00<00

In [None]:
MODEL = "HuggingFaceTB/SmolLM2-135M"
OUTPUT_DIR = "/content/output"
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:conceptual_physics|0|0" \
    --override_batch_size 16 \
    --output_dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
0it [00:00, ?it/s]
  0%|                                                    | 0/15 [00:00<?, ?it/s][A
  7%|██▉                                         | 1/15 [00:00<00:06,  2.03it/s][A
 40%|█████████████████▌                          | 6/15 [00:00<00:00, 12.27it/s][A
100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 18.15it/s][A
1it [00:00,  1.21it/s]
  0%|                                                    | 0/15 [00:00<?, ?it/s][A
 33%|██████████████▋                             | 5/15 [00:00<00:00, 44.19it/s][A
 67%|████████████████████████████▋              | 10/15 [00:00<00:00, 44.15it/s][A
100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 44.45it/s][A
2it [00:01,  1.85it/s]
  0%|                                                    | 0/15 [00:00<?, ?it/s][A
 33%

In [None]:
MODEL = "akhilfau/fine-tuned-smolLM2-135M-with-LoRA-on-camel-ai-physics"
OUTPUT_DIR = "/content/output"
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:conceptual_physics|0|0" \
    --override_batch_size 16 \
    --output_dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
0it [00:00, ?it/s]
  0%|                                                    | 0/15 [00:00<?, ?it/s][A
  7%|██▉                                         | 1/15 [00:00<00:03,  4.10it/s][A
 33%|██████████████▋                             | 5/15 [00:00<00:00, 16.54it/s][A
 60%|██████████████████████████▍                 | 9/15 [00:00<00:00, 23.15it/s][A
100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 23.27it/s][A
1it [00:00,  1.55it/s]
  0%|                                                    | 0/15 [00:00<?, ?it/s][A
 27%|███████████▋                                | 4/15 [00:00<00:00, 37.68it/s][A
 53%|███████████████████████▍                    | 8/15 [00:00<00:00, 37.21it/s][A
100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 25.41it/s][A
2it [00:01,  1.63it/s]
  0%