In [5]:
# Install required libraries
!pip install transformers datasets trl torch accelerate bitsandbytes wandb



In [2]:
# Import necessary modules
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, pipeline
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch
import os

# Detect device (GPU, MPS, or CPU)
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

***Quick Check For Resources - Runtime***

In [3]:
!nvidia-smi

Tue Mar  4 14:03:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.107.02             Driver Version: 550.107.02     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A5500               On  |   00000000:01:00.0 Off |                  Off |
| 30%   52C    P2             71W /  230W |     284MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A5500               On  |   00

In [4]:
# Load the model and tokenizer
model_name = "HuggingFaceTB/SmolLM2-360M"
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Set up chat format for the model and tokenizer
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Test the base model with a simple prompt
prompt = "Explain AGI ?"
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
print(pipe(prompt, max_new_tokens=200))

Device set to use cuda:0


[{'generated_text': 'Explain AGI ?\n\nA. AGI is a measure of the ability of a machine to perform a task.\n\nB. AGI is a measure of the ability of a machine to perform a task.\n\nC. AGI is a measure of the ability of a machine to perform a task.\n\nD. AGI is a measure of the ability of a machine to perform a task.\n\nAnswer: B\n\nQuestion 10. Which of the following is not a type of AGI?\n\nA. Machine learning\n\nB. Natural language processing\n\nC. Computer vision\n\nD. Speech recognition\n\nAnswer: B\n\nQuestion 11. Which of the following is not a type of AGI?\n\nA. Machine learning\n\nB. Natural language processing\n\nC. Computer vision\n\nD. Speech recognition\n\nAnswer: C\n\nQuestion 12. Which of the following is not a type of A'}]


In [5]:
# Load the dataset
ds = load_dataset("akhilfau/Instruction_fine_tuning_camel_ai_physics")

# Tokenization function with batch handling
def tokenize_function(examples):
    # Apply strip to each string in the batch
    prompts = [p.strip() for p in examples["prompt"]]
    responses = [r.strip() for r in examples["response"]]

    # Apply chat template for each prompt-response pair
    texts = [
        tokenizer.apply_chat_template(
            [{"role": "system", "content": "You are a helpful physics tutor."},{"role": "user", "content": p}, {"role": "assistant", "content": r}],
            tokenize=False
        )
        for p, r in zip(prompts, responses)
    ]

    # Tokenize the combined texts
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)

# Apply the tokenization function to the dataset
ds = ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [6]:
import torch
from transformers import TrainingArguments
from trl import SFTTrainer

# Check if bf16 is available
use_bf16 = torch.cuda.is_bf16_supported()

# Estimate total training steps
dataset_size = len(ds["train"])  # Adjust based on dataset size
batch_size = 4  # Adjust based on available GPU memory
grad_accum_steps = 4  # Simulate larger batch without using more memory
total_steps = (dataset_size // (batch_size * grad_accum_steps)) * 3  # 3 epochs

# Training Arguments
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    warmup_steps=100,  # More warmup steps for stability
    max_steps=total_steps,  # Train across full dataset
    learning_rate=2e-4,
    fp16=not use_bf16,  # Use FP16 if bf16 is not available
    bf16=use_bf16,  # Use BF16 if supported
    logging_steps=100,  # Reduce logging frequency for better efficiency
    save_steps=1000,  # Save every 1000 steps
    save_total_limit=2,  # Keep last 2 checkpoints
    optim="adamw_8bit",  # Memory-efficient optimizer
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="wandb",  # Log to W&B
)

# Initialize Trainer
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=ds["train"],
    args=training_args,
)

# Train
trainer.train()


Converting train dataset to ChatML:   0%|          | 0/20000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33makhilvallala0115[0m ([33makhilvallala0115-florida-atlantic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
100,0.2265
200,0.1871
300,0.1754
400,0.1494
500,0.1459
600,0.1444
700,0.1219
800,0.116
900,0.1175
1000,0.0989




TrainOutput(global_step=3750, training_loss=0.06364278049468994, metrics={'train_runtime': 9350.6445, 'train_samples_per_second': 25.667, 'train_steps_per_second': 0.401, 'total_flos': 2.316340113702912e+17, 'train_loss': 0.06364278049468994})

In [8]:
# Save model and tokenizer to a local directory
save_directory = "/content/Instruction_fine_tuned_on_camel_ai_physics"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# # Zip the directory
# import shutil
# shutil.make_archive(save_directory, 'zip', save_directory)

# # Download the zip file
# from google.colab import files
# files.download(f"{save_directory}.zip")

PermissionError: [Errno 13] Permission denied: '/content'

In [9]:
# Change to a directory in your home folder
save_directory = "./Instruction_fine_tuned_on_camel_ai_physics"

# Save model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"✅ Model saved to {save_directory}")


✅ Model saved to ./Instruction_fine_tuned_on_camel_ai_physics


In [None]:
from huggingface_hub import login

# Replace 'your_huggingface_token' with your actual token
hf_token = "hf_token"

# Log in to Hugging Face
login(token=hf_token)

print("✅ Successfully logged in to Hugging Face!")


✅ Successfully logged in to Hugging Face!


In [18]:
from huggingface_hub import upload_folder
from transformers import AutoModel, AutoTokenizer

# Define model repository name
repo_name = "akhilfau/Instruction_fine_tuned_on_camel_ai_physics"

# Save model and tokenizer locally
save_directory = "./Instruction_fine_tuned_on_camel_ai_physics"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Upload to Hugging Face Hub
upload_folder(
    folder_path=save_directory,  # Path to local model directory
    repo_id=repo_name,           # Your Hugging Face repository name
    repo_type="model"            # Specify it as a model repository
)

print(f"✅ Model successfully uploaded: https://huggingface.co/{repo_name}")


model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

✅ Model successfully uploaded: https://huggingface.co/akhilfau/Instruction_fine_tuned_on_camel_ai_physics


In [17]:
from huggingface_hub import create_repo, upload_folder, login
# Create the repository (if it doesn't exist)
create_repo(repo_id=repo_name, repo_type="model", exist_ok=True)

RepoUrl('https://huggingface.co/akhilfau/Instruction_fine_tuned_on_camel_ai_physics', endpoint='https://huggingface.co', repo_type='model', repo_id='akhilfau/Instruction_fine_tuned_on_camel_ai_physics')

In [29]:
MODEL="akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-camel-ai-physics"  # Model name
OUTPUT_DIR="/content/360M_supervised"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:conceptual_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 11:54:00,402] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 11:54:02,442] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 11:54:02,517] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 11:54:02,517] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 11:54:02,799] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 11:54:03,050] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 11:54:03,050] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 11:54:04,150] [[32m    INFO

In [28]:
MODEL="HuggingFaceTB/SmolLM2-360M"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:conceptual_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 11:51:27,587] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 11:51:29,550] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 11:51:29,623] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 11:51:29,623] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 11:51:29,774] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 11:51:30,032] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 11:51:30,033] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 11:51:30,576] [[32m    INFO

In [30]:
MODEL="akhilfau/Instruction_fine_tuned_on_camel_ai_physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:conceptual_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 12:11:30,981] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 12:11:32,963] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 12:11:33,038] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 12:11:33,038] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 12:11:33,270] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 12:11:33,511] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 12:11:33,512] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 12:11:34,153] [[32m    INFO

In [None]:
#High School Physics

In [33]:
MODEL="HuggingFaceTB/SmolLM2-360M"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:high_school_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 12:15:22,666] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 12:15:24,705] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 12:15:24,778] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 12:15:24,779] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 12:15:24,923] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 12:15:25,173] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 12:15:25,173] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 12:15:25,700] [[32m    INFO

In [34]:
MODEL="akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-camel-ai-physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:high_school_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 12:17:18,134] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 12:17:20,121] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 12:17:20,195] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 12:17:20,195] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 12:17:20,422] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 12:17:20,668] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 12:17:20,668] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 12:17:21,804] [[32m    INFO

In [35]:
MODEL="akhilfau/Instruction_fine_tuned_on_camel_ai_physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:high_school_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 12:18:58,686] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 12:19:01,387] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 12:19:01,461] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 12:19:01,461] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 12:19:01,764] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 12:19:02,015] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 12:19:02,016] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 12:19:02,657] [[32m    INFO

In [36]:
#College_physics

In [37]:
MODEL="HuggingFaceTB/SmolLM2-360M"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:college_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 12:23:24,852] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 12:23:26,794] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 12:23:26,873] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 12:23:26,873] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 12:23:27,043] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 12:23:27,287] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 12:23:27,288] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 12:23:27,820] [[32m    INFO

In [38]:
MODEL="akhilfau/fine-tuned-smolLM2-360M-with-LoRA-on-camel-ai-physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:college_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 12:25:19,887] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 12:25:21,960] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 12:25:22,038] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 12:25:22,038] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 12:25:22,310] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 12:25:22,557] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 12:25:22,558] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 12:25:23,807] [[32m    INFO

In [39]:
MODEL="akhilfau/Instruction_fine_tuned_on_camel_ai_physics"  # Model name
OUTPUT_DIR="/content/360M_output"  # Change this if needed

!lighteval accelerate \
    "pretrained=$MODEL" \
    "leaderboard|mmlu:college_physics|0|0" \
    --override-batch-size 16 \
    --output-dir "$OUTPUT_DIR"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-03-05 12:26:29,913] [[32m    INFO[0m]: PyTorch version 2.4.1 available. (config.py:54)[0m
[2025-03-05 12:26:31,890] [[32m    INFO[0m]: Test gather tensor (parallelism.py:133)[0m
[2025-03-05 12:26:31,965] [[32m    INFO[0m]: gathered_tensor tensor([0], device='cuda:0'), should be [0] (parallelism.py:136)[0m
[2025-03-05 12:26:31,965] [[32m    INFO[0m]: --- LOADING MODEL --- (pipeline.py:168)[0m
[2025-03-05 12:26:32,162] [[32m    INFO[0m]: Tokenizer truncation and padding size set to the left side. (transformers_model.py:539)[0m
[2025-03-05 12:26:32,421] [[32m    INFO[0m]: Setting model parallel to True since the number of local processes is 1 and the number of GPUs is 4 (transformers_model.py:402)[0m
[2025-03-05 12:26:32,421] [[32m    INFO[0m]: Model parallel was set to True, setting max memory per GPU to {0: 10169221120, 1: 15557066752, 2: 15557066752, 3: 15573843968} and device map to auto (transformers_model.py:417)[0m
[2025-03-05 12:26:33,062] [[32m    INFO