In [None]:
from huggingface_hub import login

# Prompt the user for their Hugging Face token
huggingface_token = input("Enter your Hugging Face token: ")

# Log in to Hugging Face
login(token=huggingface_token)

print("Successfully logged in to Hugging Face!")


: 

In [2]:
# import torch 
# for i in range(torch.cuda.device_count()):     
#     print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

In [15]:
from datasets import load_dataset
 
# Load dataset from CSV
dataset = load_dataset("csv", data_files="SF_train.csv",encoding="ISO-8859-1",split="train")
 
# Check available splits
#print("Available splits:", dataset.keys())
 
# Print a sample record from the training set
print("Sample record from the dataset:")
print(dataset[0])  # Accessing first sample from "train"

Sample record from the dataset:
{'Question_number': 1.1, 'Questions': '\\documentclass{article}\n\\usepackage{amsmath}\n\n\\begin{document}\n\nA car starts from rest at a constant acceleration of \\(2.0 \\, \\text{m/s}^2\\). At the same instant, a truck traveling with a constant speed of \\(10 \\, \\text{m/s}\\) overtakes and passes the car.\n\n\\begin{enumerate}\n    \\item[(a)] How far beyond the starting point will the car overtake the truck?\n    \\item[(b)] After what time will this happen?\n    \\item[(c)] At that instant, what will be the speed of the car?\n\\end{enumerate}\n\n\\end{document}', 'Sub-category': 'Motion in One Dimension', 'Category': 'Kinematics and statics', 'Solutions': '\\documentclass{article}\n\\usepackage{amsmath}\n\n\\begin{document}\n\n\\section*{Physics Problem: Motion of Truck and Car}\n\n\\subsection*{(a) Equations of Motion}\n\nThe equation of motion for the truck is given by:\n\\begin{equation}\n    s = ut \\tag{1}\n\\end{equation}\n\nThe equation of 

In [1]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch

# Step 1: Load the Dataset
#dataset = load_dataset("akhilfau/physics_decontaminated_2", split="train")

# Load dataset from CSV
dataset = load_dataset("csv", data_files="SF_train.csv",encoding="ISO-8859-1",split="train")

# Step 2: Load the Pretrained Model and Tokenizer
model_name = "HuggingFaceTB/SmolLM2-360M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the padding token if not already set
tokenizer.pad_token = tokenizer.eos_token or tokenizer.bos_token or "[PAD]"

# Step 3: Configure LoRA with PEFT
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type="CAUSAL_LM",  # Task type for causal language modeling
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters to confirm LoRA is applied
model.print_trainable_parameters()

# Step 4: Preprocess the Dataset
def preprocess_function(examples):
    # Concatenate the problem and solution for causal LM
    inputs = [f"Problem: {problem}\nSolution: {solution}" for problem, solution in zip(examples["Questions"], examples["Solutions"])]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=1024)
    #model_inputs = tokenizer(inputs, truncation=True, padding=True)
    
    # Labels are the same as input_ids for causal LM
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)



print("length of the data: ",tokenized_dataset.shape[0])



#Step5:
# Estimate total training steps
use_bf16 = torch.cuda.is_bf16_supported()
dataset_size = tokenized_dataset.shape[0] # Adjust based on dataset size
batch_size = 4  # Adjust based on available GPU memory
grad_accum_steps = 4  # Simulate larger batch without using more memory
total_steps = (dataset_size // (batch_size * grad_accum_steps)) * 3  # 3 epochs

# Training Arguments
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    warmup_steps=100,  # More warmup steps for stability
    max_steps=total_steps,  # Train across full dataset
    learning_rate=2e-4,
    fp16=not use_bf16,  # Use FP16 if bf16 is not available
    bf16=use_bf16,  # Use BF16 if supported
    logging_steps=100,  # Reduce logging frequency for better efficiency
    save_steps=1000,  # Save every 1000 steps
    save_total_limit=2,  # Keep last 2 checkpoints
    optim="adamw_8bit",  # Memory-efficient optimizer
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="wandb",  # Log to W&B
)
# Step 6: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
    # eval_dataset=eval_dataset,
)

# Step 7: Train the Model
trainer.train()



trainable params: 1,638,400 || all params: 363,459,520 || trainable%: 0.4508


Map:   0%|          | 0/951 [00:00<?, ? examples/s]

length of the data:  951


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33makhilvallala0115[0m ([33makhilvallala0115-florida-atlantic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
100,0.9986


TrainOutput(global_step=177, training_loss=0.6189043239011602, metrics={'train_runtime': 763.5005, 'train_samples_per_second': 14.837, 'train_steps_per_second': 0.232, 'total_flos': 2.18200243470336e+16, 'train_loss': 0.6189043239011602, 'epoch': 11.8})

In [2]:
# Step 8: Save the Model and Tokenizer
save_dir = "./fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2/tokenizer_config.json',
 './fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2/special_tokens_map.json',
 './fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2/vocab.json',
 './fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2/merges.txt',
 './fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2/added_tokens.json',
 './fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2/tokenizer.json')

In [3]:
#Testing locally

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Specify the path to the locally saved model
local_model_path = "./fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)

# Test with a sample input
input_text = "A car accelerates uniformly from rest to a speed of 30 m/s in 10 seconds. What is the acceleration of the car?Options:A) 3 m/s²B)2.5 m/s²C) 5 m/s²D) 4 m/s²"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a response
output = model.generate(**inputs,max_length=512)
print(tokenizer.decode(output[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


A car accelerates uniformly from rest to a speed of 30 m/s in 10 seconds. What is the acceleration of the car?Options:A) 3 m/s²B)2.5 m/s²C) 5 m/s²D) 4 m/s²

### A 1000 kg car accelerates uniformly from rest to a speed of 30 m/s in 10 seconds. What is the acceleration of the car? Options:A) 3 m/s²B)2.5 m/s²C) 5 m/s²D) 4 m/s²

A 1000 kg car accelerates uniformly from rest to a speed of 30 m/s in 10 seconds. What is the acceleration of the car? Options:A) 3 m/s²B)2.5 m/s²C) 5 m/s²D) 4 m/s²...


In [20]:
from huggingface_hub import create_repo, upload_folder, login
# Create the repository (if it doesn't exist)
repo_name="akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2"
create_repo(repo_id=repo_name, repo_type="model", exist_ok=True)

RepoUrl('https://huggingface.co/akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2', endpoint='https://huggingface.co', repo_type='model', repo_id='akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2')

In [21]:

# Push to Hugging Face Hub
trainer.push_to_hub(commit_message="Fine-tuned smolLM2-360M with LoRA on text-book-physics_2")

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2",
    repo_id="akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2"
)


adapter_model.safetensors:   0%|          | 0.00/6.57M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.57M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2/commit/9df07b5c62628bbf980e36ba66883376bf8733ec', commit_message='Upload folder using huggingface_hub', commit_description='', oid='9df07b5c62628bbf980e36ba66883376bf8733ec', pr_url=None, repo_url=RepoUrl('https://huggingface.co/akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2', endpoint='https://huggingface.co', repo_type='model', repo_id='akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2'), pr_revision=None, pr_num=None)

In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Specify the Hugging Face model path (update with the correct repository path)
model_path = "akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-text-book-physics_2"

# Load the model and tokenizer from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Test with a sample input
input_text = "A car accelerates uniformly from rest to a speed of 30 m/s in 10 seconds. What is the acceleration of the car?Options:A) 3 m/s²B) 2.5 m/s²C) 5 m/s²D) 4 m/s²"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a response
# output = model.generate(**inputs,max_length=50)
# print(tokenizer.decode(output[0], skip_special_tokens=True))

output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # Add attention mask
    max_length=50
)
 
print(tokenizer.decode(output[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


ValueError: Input length of input_ids is 66, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [23]:
# import torch

# # Clear cache
# torch.cuda.empty_cache()

# # Reset memory allocations and free up GPU memory
# torch.cuda.memory_summary(device=None, abbreviated=False)


In [24]:
MODEL = "HuggingFaceTB/SmolLM2-360M"
OUTPUT_DIR = "/content/output"
 
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/bin/bash: line 1: lighteval: command not found


In [25]:
MODEL = "akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-camel-ai-physics"
OUTPUT_DIR = "/content/output"
 
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:college_physics|0|0" \
    --override_batch_size 16 \
    --output_dir $OUTPUT_DIR

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/bin/bash: line 1: lighteval: command not found


In [None]:
MODEL = "HuggingFaceTB/SmolLM2-360M"
OUTPUT_DIR = "/content/output"
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:conceptual_physics|0|0" \
    --override_batch_size 16 \
    --output_dir "$OUTPUT_DIR"


In [None]:
MODEL = "akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-camel-ai-physics"
OUTPUT_DIR = "/content/output"
!lighteval accelerate \
    --model_args "pretrained=$MODEL" \
    --tasks "leaderboard|mmlu:conceptual_physics|0|0" \
    --override_batch_size 16 \
    --output_dir "$OUTPUT_DIR"
