In [8]:
# Install required libraries
!pip install transformers datasets trl torch accelerate bitsandbytes wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [9]:
# Import necessary modules
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, pipeline
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch
import os

# Detect device (GPU, MPS, or CPU)
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

***Quick Check For Resources - Runtime***

In [10]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Thu Mar 13 18:01:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.107.02             Driver Version: 550.107.02     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A5500               On  |   00000000:01:00.0 Off |                  Off |
| 49%   64C    P2             79W /  230W |   15260MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A5500               On  |   00

In [11]:
# Load the model and tokenizer
model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Set up chat format for the model and tokenizer
#model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Test the base model with a simple prompt
prompt = "Explain AGI ?"
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
print(pipe(prompt, max_new_tokens=200))

Device set to use cuda:0


[{'generated_text': 'Explain AGI ?\n\nArtificial General Intelligence (AGI) is a hypothetical AI system that possesses the ability to perform any intellectual task that a human can, with the same level of accuracy and effectiveness. This includes reasoning, problem-solving, learning, and understanding. AGI would be able to learn from experience, adapt to new situations, and apply its knowledge to a wide range of tasks, similar to humans.\n\nThe concept of AGI was first proposed by mathematician and computer scientist John McCarthy in 1956, and it has since become a topic of interest in the fields of artificial intelligence, cognitive science, and philosophy. AGI would be a significant breakthrough in the development of AI, as it would enable machines to perform tasks that were previously the exclusive domain of humans.\n\nHowever, the development of AGI is still a topic of debate and research, and there are many challenges and uncertainties associated with creating a truly intelligent 

In [14]:
# Load the dataset
ds = load_dataset("akhilfau/Instruction_fine_tuning_camel_ai_physics")

# Tokenization function with batch handling
def tokenize_function(examples):
    # Apply strip to each string in the batch
    prompts = [p.strip() for p in examples["prompt"]]
    responses = [r.strip() for r in examples["response"]]

    # Apply chat template for each prompt-response pair
    texts = [
        tokenizer.apply_chat_template(
            [{"role": "system", "content": "You are a helpful physics tutor."},{"role": "user", "content": p}, {"role": "assistant", "content": r}],
            tokenize=False
        )
        for p, r in zip(prompts, responses)
    ]

    # Tokenize the combined texts
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)

# Apply the tokenization function to the dataset
ds = ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [15]:
import torch
from transformers import TrainingArguments
from trl import SFTTrainer

# Check if bf16 is available
use_bf16 = torch.cuda.is_bf16_supported()

# Estimate total training steps
dataset_size = len(ds["train"])  # Adjust based on dataset size
batch_size = 4  # Adjust based on available GPU memory
grad_accum_steps = 4  # Simulate larger batch without using more memory
total_steps = (dataset_size // (batch_size * grad_accum_steps)) * 3  # 3 epochs

# Training Arguments
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    warmup_steps=100,  # More warmup steps for stability
    max_steps=total_steps,  # Train across full dataset
    learning_rate=2e-4,
    fp16=not use_bf16,  # Use FP16 if bf16 is not available
    bf16=use_bf16,  # Use BF16 if supported
    logging_steps=100,  # Reduce logging frequency for better efficiency
    save_steps=1000,  # Save every 1000 steps
    save_total_limit=2,  # Keep last 2 checkpoints
    optim="adamw_8bit",  # Memory-efficient optimizer
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="wandb",  # Log to W&B
)

# Initialize Trainer
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=ds["train"],
    args=training_args,
)

# Train
trainer.train()


Converting train dataset to ChatML:   0%|          | 0/20000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]



Step,Training Loss
100,0.2122
200,0.1786
300,0.1674
400,0.1431
500,0.1398
600,0.1383
700,0.1166
800,0.1109
900,0.1122
1000,0.0942




TrainOutput(global_step=3750, training_loss=0.06028011699517568, metrics={'train_runtime': 9349.373, 'train_samples_per_second': 25.67, 'train_steps_per_second': 0.401, 'total_flos': 2.316340113702912e+17, 'train_loss': 0.06028011699517568})

In [16]:
# Save model and tokenizer to a local directory
# save_directory = "/content/Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics"
# model.save_pretrained(save_directory)
# tokenizer.save_pretrained(save_directory)

# # Zip the directory
# import shutil
# shutil.make_archive(save_directory, 'zip', save_directory)

# # Download the zip file
# from google.colab import files
# files.download(f"{save_directory}.zip")

In [17]:
# Change to a directory in your home folder
save_directory = "./Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics"

# Save model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"✅ Model saved to {save_directory}")


✅ Model saved to ./Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics


In [None]:
from huggingface_hub import login

# Replace 'your_huggingface_token' with your actual token
hf_token = "hf_token"

# Log in to Hugging Face
login(token=hf_token)

print("✅ Successfully logged in to Hugging Face!")


✅ Successfully logged in to Hugging Face!


In [20]:
from huggingface_hub import create_repo, upload_folder, login
# Create the repository (if it doesn't exist)
# Define model repository name
repo_name = "akhilfau/Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics"
create_repo(repo_id=repo_name, repo_type="model", exist_ok=True)

RepoUrl('https://huggingface.co/akhilfau/Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics', endpoint='https://huggingface.co', repo_type='model', repo_id='akhilfau/Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics')

In [21]:
from huggingface_hub import upload_folder
from transformers import AutoModel, AutoTokenizer

# Define model repository name
repo_name = "akhilfau/Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics"

# Save model and tokenizer locally
save_directory = "./Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Upload to Hugging Face Hub
upload_folder(
    folder_path=save_directory,  # Path to local model directory
    repo_id=repo_name,           # Your Hugging Face repository name
    repo_type="model"            # Specify it as a model repository
)

print(f"✅ Model successfully uploaded: https://huggingface.co/{repo_name}")


model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

✅ Model successfully uploaded: https://huggingface.co/akhilfau/Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics


In [22]:
MODEL="akhilfau/Instruction_fine_tuned_on_smollm2_360M_Instruct_camel_ai_physics"  # Model name
OUTPUT_DIR="/content/360M_supervised"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:conceptual_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/bin/bash: line 1: lighteval: command not found


In [None]:
MODEL="HuggingFaceTB/SmolLM2-360M"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:conceptual_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


In [None]:
MODEL="akhilfau/Instruction_fine_tuned_on_camel_ai_physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:conceptual_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


In [None]:
#High School Physics

In [None]:
MODEL="HuggingFaceTB/SmolLM2-360M"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:high_school_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


In [None]:
MODEL="akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-camel-ai-physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:high_school_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


In [None]:
MODEL="akhilfau/Instruction_fine_tuned_on_camel_ai_physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:high_school_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


In [None]:
#College_physics

In [None]:
MODEL="HuggingFaceTB/SmolLM2-360M"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:college_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


In [None]:
MODEL="akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-camel-ai-physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:college_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


In [None]:
MODEL="akhilfau/Instruction_fine_tuned_on_camel_ai_physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:college_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"
