In [1]:
import sys
import os

# Add the parent directory (Finetuning/) to sys.path to access utils_ft.
# The path is resolved to an absolute one so that imports performed later keep
# working even if the working directory changes, and the membership check
# keeps sys.path free of duplicates when this cell is re-run.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from utils_ft import (
    get_model_path, 
    get_output_dir, 
    get_processed_data_path, 
    get_adapters_path, 
    to_relative,
    setup_env
)
from scripts.train_mlx import run_mlx_training, fuse_model

# 1. Setup Environment (load .env variables if needed)
setup_env()

# 2. Config: base model family and fine-tuning framework for this run
MODEL_TYPE = "qwen"
FRAMEWORK = "mlx"

# 3. Define paths using the utils_ft helpers
# This guarantees they match the structure used by the data converter
base_model = get_model_path(MODEL_TYPE)
data_folder = get_processed_data_path()                   # .../Finetuning/data
adapter_path = get_adapters_path(MODEL_TYPE, FRAMEWORK)   # .../Finetuning/adapters/qwen_mlx
fused_output = get_output_dir(MODEL_TYPE, FRAMEWORK)      # .../local_models/ft_qwen_mlx

# 4. Verify paths (optional; local paths are printed in relative form for clarity,
#    base_model is printed as returned by get_model_path)
print("--- Configuration ---")
print(f"Base Model:   {base_model}")
print(f"Data Folder:  {to_relative(data_folder)}")
print(f"Adapter Path: {to_relative(adapter_path)}")
print(f"Output Path:  {to_relative(fused_output)}")

--- Configuration ---
Base Model:   Qwen/Qwen2.5-7B-Instruct
Data Folder:  Finetuning/data
Adapter Path: Finetuning/adapters/qwen_mlx
Output Path:  local_models/ft_qwen_mlx


In [2]:
# Launch MLX fine-tuning for the model configured above. Per the run log, the
# wrapper generates train_config.yaml under adapter_path, executes the MLX CLI,
# and saves adapter checkpoints there every 100 iterations.
run_mlx_training(
    base_model,
    data_folder,
    adapter_path,
    iters=900,       # total training iterations (log: "iters: 900")
    num_layers=16,   # presumably the number of layers that receive adapters — confirm in scripts.train_mlx
)

--- Initializing MLX Fine-tuning (CLI Mode) ---
--- Model: ./Qwen/Qwen2.5-7B-Instruct ---
--- Data Folder: ./../data ---
Generating configuration file at: ./../adapters/qwen_mlx/train_config.yaml
Executing MLX CLI training...
Loading configuration file ../adapters/qwen_mlx/train_config.yaml
Loading pretrained model


Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19401.74it/s]


Loading datasets
Training
Trainable parameters: 0.151% (11.534M/7615.617M)
Starting training..., iters: 900


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 1: Val loss 1.130, Val took 166.700s
Iter 20: Train loss 1.611, Learning Rate 9.992e-06, It/sec 0.526, Tokens/sec 21.229, Trained Tokens 807, Peak mem 17.595 GB
Iter 40: Train loss 0.469, Learning Rate 9.961e-06, It/sec 0.579, Tokens/sec 21.672, Trained Tokens 1555, Peak mem 17.595 GB
Iter 60: Train loss 0.515, Learning Rate 9.905e-06, It/sec 0.517, Tokens/sec 25.451, Trained Tokens 2539, Peak mem 17.733 GB
Iter 80: Train loss 0.346, Learning Rate 9.825e-06, It/sec 0.478, Tokens/sec 21.686, Trained Tokens 3446, Peak mem 17.804 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 100: Val loss 0.300, Val took 166.448s
Iter 100: Train loss 0.353, Learning Rate 9.722e-06, It/sec 0.487, Tokens/sec 28.983, Trained Tokens 4636, Peak mem 17.804 GB
Iter 100: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000100_adapters.safetensors.
Iter 120: Train loss 0.288, Learning Rate 9.596e-06, It/sec 0.440, Tokens/sec 28.851, Trained Tokens 5946, Peak mem 17.804 GB
Iter 140: Train loss 0.222, Learning Rate 9.447e-06, It/sec 0.574, Tokens/sec 20.024, Trained Tokens 6644, Peak mem 17.804 GB
Iter 160: Train loss 0.185, Learning Rate 9.277e-06, It/sec 0.484, Tokens/sec 23.022, Trained Tokens 7595, Peak mem 17.804 GB
Iter 180: Train loss 0.441, Learning Rate 9.086e-06, It/sec 0.535, Tokens/sec 19.619, Trained Tokens 8329, Peak mem 17.804 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 200: Val loss 0.255, Val took 166.484s
Iter 200: Train loss 0.370, Learning Rate 8.875e-06, It/sec 0.471, Tokens/sec 24.506, Trained Tokens 9369, Peak mem 17.804 GB
Iter 200: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000200_adapters.safetensors.
Iter 220: Train loss 0.247, Learning Rate 8.645e-06, It/sec 0.576, Tokens/sec 18.039, Trained Tokens 9995, Peak mem 17.804 GB
Iter 240: Train loss 0.400, Learning Rate 8.397e-06, It/sec 0.482, Tokens/sec 22.208, Trained Tokens 10916, Peak mem 17.804 GB
Iter 260: Train loss 0.274, Learning Rate 8.133e-06, It/sec 0.443, Tokens/sec 36.710, Trained Tokens 12574, Peak mem 19.105 GB
Iter 280: Train loss 0.295, Learning Rate 7.854e-06, It/sec 0.467, Tokens/sec 31.435, Trained Tokens 13921, Peak mem 19.105 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 300: Val loss 0.250, Val took 166.537s
Iter 300: Train loss 0.388, Learning Rate 7.560e-06, It/sec 0.526, Tokens/sec 17.554, Trained Tokens 14589, Peak mem 19.105 GB
Iter 300: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000300_adapters.safetensors.
Iter 320: Train loss 0.351, Learning Rate 7.254e-06, It/sec 0.472, Tokens/sec 19.412, Trained Tokens 15412, Peak mem 19.105 GB
Iter 340: Train loss 0.361, Learning Rate 6.938e-06, It/sec 0.480, Tokens/sec 17.126, Trained Tokens 16125, Peak mem 19.105 GB
Iter 360: Train loss 0.360, Learning Rate 6.611e-06, It/sec 0.546, Tokens/sec 34.664, Trained Tokens 17395, Peak mem 19.105 GB
Iter 380: Train loss 0.308, Learning Rate 6.277e-06, It/sec 0.511, Tokens/sec 19.056, Trained Tokens 18141, Peak mem 19.105 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 400: Val loss 0.242, Val took 166.618s
Iter 400: Train loss 0.312, Learning Rate 5.937e-06, It/sec 0.547, Tokens/sec 25.250, Trained Tokens 19064, Peak mem 19.105 GB
Iter 400: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000400_adapters.safetensors.
Iter 420: Train loss 0.205, Learning Rate 5.592e-06, It/sec 0.555, Tokens/sec 19.201, Trained Tokens 19756, Peak mem 19.105 GB
Iter 440: Train loss 0.350, Learning Rate 5.244e-06, It/sec 0.452, Tokens/sec 14.636, Trained Tokens 20404, Peak mem 19.105 GB
Iter 460: Train loss 0.286, Learning Rate 4.895e-06, It/sec 0.555, Tokens/sec 37.411, Trained Tokens 21751, Peak mem 19.105 GB
Iter 480: Train loss 0.109, Learning Rate 4.547e-06, It/sec 0.537, Tokens/sec 24.630, Trained Tokens 22668, Peak mem 19.105 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 500: Val loss 0.237, Val took 166.594s
Iter 500: Train loss 0.146, Learning Rate 4.201e-06, It/sec 0.494, Tokens/sec 14.879, Trained Tokens 23271, Peak mem 19.105 GB
Iter 500: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000500_adapters.safetensors.
Iter 520: Train loss 0.216, Learning Rate 3.858e-06, It/sec 0.527, Tokens/sec 39.174, Trained Tokens 24759, Peak mem 19.105 GB
Iter 540: Train loss 0.139, Learning Rate 3.521e-06, It/sec 0.518, Tokens/sec 19.786, Trained Tokens 25523, Peak mem 19.105 GB
Iter 560: Train loss 0.257, Learning Rate 3.192e-06, It/sec 0.554, Tokens/sec 31.490, Trained Tokens 26660, Peak mem 19.105 GB
Iter 580: Train loss 0.074, Learning Rate 2.871e-06, It/sec 0.522, Tokens/sec 20.148, Trained Tokens 27432, Peak mem 19.105 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 600: Val loss 0.238, Val took 166.625s
Iter 600: Train loss 0.143, Learning Rate 2.561e-06, It/sec 0.546, Tokens/sec 19.848, Trained Tokens 28159, Peak mem 19.105 GB
Iter 600: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000600_adapters.safetensors.
Iter 620: Train loss 0.182, Learning Rate 2.262e-06, It/sec 0.450, Tokens/sec 13.572, Trained Tokens 28762, Peak mem 19.105 GB
Iter 640: Train loss 0.144, Learning Rate 1.977e-06, It/sec 0.456, Tokens/sec 38.751, Trained Tokens 30461, Peak mem 19.105 GB
Iter 660: Train loss 0.160, Learning Rate 1.707e-06, It/sec 0.518, Tokens/sec 26.141, Trained Tokens 31470, Peak mem 19.105 GB
Iter 680: Train loss 0.205, Learning Rate 1.452e-06, It/sec 0.457, Tokens/sec 22.736, Trained Tokens 32465, Peak mem 19.105 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 700: Val loss 0.241, Val took 166.577s
Iter 700: Train loss 0.129, Learning Rate 1.215e-06, It/sec 0.518, Tokens/sec 16.599, Trained Tokens 33106, Peak mem 19.105 GB
Iter 700: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000700_adapters.safetensors.
Iter 720: Train loss 0.228, Learning Rate 9.963e-07, It/sec 0.615, Tokens/sec 25.063, Trained Tokens 33921, Peak mem 19.105 GB
Iter 740: Train loss 0.112, Learning Rate 7.972e-07, It/sec 0.541, Tokens/sec 20.807, Trained Tokens 34690, Peak mem 19.105 GB
Iter 760: Train loss 0.164, Learning Rate 6.185e-07, It/sec 0.477, Tokens/sec 22.879, Trained Tokens 35650, Peak mem 19.105 GB
Iter 780: Train loss 0.214, Learning Rate 4.611e-07, It/sec 0.416, Tokens/sec 35.838, Trained Tokens 37372, Peak mem 19.105 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 800: Val loss 0.240, Val took 166.791s
Iter 800: Train loss 0.151, Learning Rate 3.259e-07, It/sec 0.586, Tokens/sec 16.543, Trained Tokens 37937, Peak mem 19.105 GB
Iter 800: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000800_adapters.safetensors.
Iter 820: Train loss 0.150, Learning Rate 2.134e-07, It/sec 0.542, Tokens/sec 31.835, Trained Tokens 39111, Peak mem 19.105 GB
Iter 840: Train loss 0.143, Learning Rate 1.243e-07, It/sec 0.581, Tokens/sec 24.022, Trained Tokens 39938, Peak mem 19.105 GB
Iter 860: Train loss 0.147, Learning Rate 5.886e-08, It/sec 0.454, Tokens/sec 21.282, Trained Tokens 40876, Peak mem 19.105 GB
Iter 880: Train loss 0.179, Learning Rate 1.754e-08, It/sec 0.495, Tokens/sec 21.138, Trained Tokens 41730, Peak mem 19.105 GB


Calculating loss...: 166it [02:46,  1.00s/it]


Iter 900: Val loss 0.241, Val took 166.505s
Iter 900: Train loss 0.190, Learning Rate 4.870e-10, It/sec 0.461, Tokens/sec 22.013, Trained Tokens 42686, Peak mem 19.105 GB
Iter 900: Saved adapter weights to ../adapters/qwen_mlx/adapters.safetensors and ../adapters/qwen_mlx/0000900_adapters.safetensors.
Saved final weights to ../adapters/qwen_mlx/adapters.safetensors.
✓ Training completed.


In [3]:
# Fuse the trained adapters into the base model and write the result to
# fused_output. No step argument is passed; the run log reports
# "Step: Final" and stages adapters.safetensors.
fuse_model(
    base_model,
    adapter_path,
    fused_output,
)
# fuse_model prints its own success line; this one repeats the location in relative form.
print(f"Success! Model ready at ./{to_relative(fused_output)}")

Fusing LoRA adapters (Step: Final)...
--- Staging checkpoint from: adapters.safetensors ---
Loading pretrained model


Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 18057.67it/s]


Success! Fused model saved to: ./local_models/ft_qwen_mlx
Success! Model ready at ./local_models/ft_qwen_mlx


In [4]:
# Config for the second run: switch the base model family to Llama.
# Relies on the helpers imported (and setup_env() called) in the first cell.
MODEL_TYPE = "llama"
FRAMEWORK = "mlx"

# Re-derive all paths for the new MODEL_TYPE using the utils_ft helpers
# This guarantees they match the structure used by the data converter
base_model = get_model_path(MODEL_TYPE)
data_folder = get_processed_data_path()                   # .../Finetuning/data
adapter_path = get_adapters_path(MODEL_TYPE, FRAMEWORK)   # .../Finetuning/adapters/llama_mlx
fused_output = get_output_dir(MODEL_TYPE, FRAMEWORK)      # .../local_models/ft_llama_mlx

# Verify paths (optional; local paths are printed in relative form for clarity,
# base_model is printed as returned by get_model_path)
print("--- Configuration ---")
print(f"Base Model:   {base_model}")
print(f"Data Folder:  {to_relative(data_folder)}")
print(f"Adapter Path: {to_relative(adapter_path)}")
print(f"Output Path:  {to_relative(fused_output)}")

--- Configuration ---
Base Model:   meta-llama/Meta-Llama-3-8B-Instruct
Data Folder:  Finetuning/data
Adapter Path: Finetuning/adapters/llama_mlx
Output Path:  local_models/ft_llama_mlx


In [5]:
# Launch MLX fine-tuning for the model configured above, with the same
# settings as the previous training run. Per the run log, the wrapper generates
# train_config.yaml under adapter_path, executes the MLX CLI, and saves adapter
# checkpoints there every 100 iterations.
run_mlx_training(
    base_model,
    data_folder,
    adapter_path,
    iters=900,       # total training iterations (log: "iters: 900")
    num_layers=16,   # presumably the number of layers that receive adapters — confirm in scripts.train_mlx
)

--- Initializing MLX Fine-tuning (CLI Mode) ---
--- Model: ./meta-llama/Meta-Llama-3-8B-Instruct ---
--- Data Folder: ./../data ---
Generating configuration file at: ./../adapters/llama_mlx/train_config.yaml
Executing MLX CLI training...
Loading configuration file ../adapters/llama_mlx/train_config.yaml
Loading pretrained model


Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 12179.87it/s]


Loading datasets
Training
Trainable parameters: 0.131% (10.486M/8030.261M)
Starting training..., iters: 900


Calculating loss...: 166it [02:49,  1.02s/it]


Iter 1: Val loss 1.660, Val took 169.101s
Iter 20: Train loss 1.688, Learning Rate 9.992e-06, It/sec 0.530, Tokens/sec 20.553, Trained Tokens 775, Peak mem 18.094 GB
Iter 40: Train loss 0.574, Learning Rate 9.961e-06, It/sec 0.592, Tokens/sec 21.279, Trained Tokens 1494, Peak mem 18.094 GB
Iter 60: Train loss 0.590, Learning Rate 9.905e-06, It/sec 0.529, Tokens/sec 24.588, Trained Tokens 2423, Peak mem 18.233 GB
Iter 80: Train loss 0.353, Learning Rate 9.825e-06, It/sec 0.480, Tokens/sec 21.180, Trained Tokens 3305, Peak mem 18.341 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 100: Val loss 0.381, Val took 168.839s
Iter 100: Train loss 0.428, Learning Rate 9.722e-06, It/sec 0.491, Tokens/sec 28.747, Trained Tokens 4475, Peak mem 18.341 GB
Iter 100: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000100_adapters.safetensors.
Iter 120: Train loss 0.316, Learning Rate 9.596e-06, It/sec 0.454, Tokens/sec 28.555, Trained Tokens 5734, Peak mem 18.341 GB
Iter 140: Train loss 0.336, Learning Rate 9.447e-06, It/sec 0.593, Tokens/sec 19.907, Trained Tokens 6405, Peak mem 18.341 GB
Iter 160: Train loss 0.249, Learning Rate 9.277e-06, It/sec 0.497, Tokens/sec 22.827, Trained Tokens 7323, Peak mem 18.341 GB
Iter 180: Train loss 0.457, Learning Rate 9.086e-06, It/sec 0.539, Tokens/sec 19.222, Trained Tokens 8036, Peak mem 18.341 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 200: Val loss 0.348, Val took 168.877s
Iter 200: Train loss 0.436, Learning Rate 8.875e-06, It/sec 0.475, Tokens/sec 24.186, Trained Tokens 9055, Peak mem 18.341 GB
Iter 200: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000200_adapters.safetensors.
Iter 220: Train loss 0.353, Learning Rate 8.645e-06, It/sec 0.584, Tokens/sec 17.574, Trained Tokens 9657, Peak mem 18.341 GB
Iter 240: Train loss 0.463, Learning Rate 8.397e-06, It/sec 0.485, Tokens/sec 21.677, Trained Tokens 10551, Peak mem 18.341 GB
Iter 260: Train loss 0.294, Learning Rate 8.133e-06, It/sec 0.444, Tokens/sec 36.130, Trained Tokens 12180, Peak mem 19.506 GB
Iter 280: Train loss 0.421, Learning Rate 7.854e-06, It/sec 0.472, Tokens/sec 30.320, Trained Tokens 13465, Peak mem 19.506 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 300: Val loss 0.329, Val took 168.871s
Iter 300: Train loss 0.488, Learning Rate 7.560e-06, It/sec 0.537, Tokens/sec 17.338, Trained Tokens 14111, Peak mem 19.506 GB
Iter 300: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000300_adapters.safetensors.
Iter 320: Train loss 0.408, Learning Rate 7.254e-06, It/sec 0.495, Tokens/sec 19.581, Trained Tokens 14902, Peak mem 19.506 GB
Iter 340: Train loss 0.468, Learning Rate 6.938e-06, It/sec 0.486, Tokens/sec 16.520, Trained Tokens 15582, Peak mem 19.506 GB
Iter 360: Train loss 0.469, Learning Rate 6.611e-06, It/sec 0.556, Tokens/sec 34.535, Trained Tokens 16824, Peak mem 19.506 GB
Iter 380: Train loss 0.392, Learning Rate 6.277e-06, It/sec 0.527, Tokens/sec 18.826, Trained Tokens 17538, Peak mem 19.506 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 400: Val loss 0.323, Val took 168.861s
Iter 400: Train loss 0.378, Learning Rate 5.937e-06, It/sec 0.560, Tokens/sec 24.956, Trained Tokens 18429, Peak mem 19.506 GB
Iter 400: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000400_adapters.safetensors.
Iter 420: Train loss 0.307, Learning Rate 5.592e-06, It/sec 0.560, Tokens/sec 18.476, Trained Tokens 19089, Peak mem 19.506 GB
Iter 440: Train loss 0.438, Learning Rate 5.244e-06, It/sec 0.455, Tokens/sec 14.256, Trained Tokens 19716, Peak mem 19.506 GB
Iter 460: Train loss 0.345, Learning Rate 4.895e-06, It/sec 0.567, Tokens/sec 37.430, Trained Tokens 21037, Peak mem 19.506 GB
Iter 480: Train loss 0.125, Learning Rate 4.547e-06, It/sec 0.553, Tokens/sec 24.619, Trained Tokens 21928, Peak mem 19.506 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 500: Val loss 0.317, Val took 168.895s
Iter 500: Train loss 0.156, Learning Rate 4.201e-06, It/sec 0.498, Tokens/sec 14.526, Trained Tokens 22511, Peak mem 19.506 GB
Iter 500: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000500_adapters.safetensors.
Iter 520: Train loss 0.274, Learning Rate 3.858e-06, It/sec 0.533, Tokens/sec 38.774, Trained Tokens 23967, Peak mem 19.506 GB
Iter 540: Train loss 0.211, Learning Rate 3.521e-06, It/sec 0.523, Tokens/sec 19.372, Trained Tokens 24708, Peak mem 19.506 GB
Iter 560: Train loss 0.316, Learning Rate 3.192e-06, It/sec 0.561, Tokens/sec 31.003, Trained Tokens 25814, Peak mem 19.506 GB
Iter 580: Train loss 0.108, Learning Rate 2.871e-06, It/sec 0.536, Tokens/sec 19.025, Trained Tokens 26524, Peak mem 19.506 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 600: Val loss 0.317, Val took 168.907s
Iter 600: Train loss 0.189, Learning Rate 2.561e-06, It/sec 0.566, Tokens/sec 19.683, Trained Tokens 27220, Peak mem 19.506 GB
Iter 600: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000600_adapters.safetensors.
Iter 620: Train loss 0.258, Learning Rate 2.262e-06, It/sec 0.457, Tokens/sec 13.217, Trained Tokens 27799, Peak mem 19.506 GB
Iter 640: Train loss 0.243, Learning Rate 1.977e-06, It/sec 0.458, Tokens/sec 38.320, Trained Tokens 29471, Peak mem 19.506 GB
Iter 660: Train loss 0.247, Learning Rate 1.707e-06, It/sec 0.526, Tokens/sec 25.902, Trained Tokens 30455, Peak mem 19.506 GB
Iter 680: Train loss 0.240, Learning Rate 1.452e-06, It/sec 0.460, Tokens/sec 22.003, Trained Tokens 31412, Peak mem 19.506 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 700: Val loss 0.318, Val took 168.868s
Iter 700: Train loss 0.232, Learning Rate 1.215e-06, It/sec 0.531, Tokens/sec 16.308, Trained Tokens 32026, Peak mem 19.506 GB
Iter 700: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000700_adapters.safetensors.
Iter 720: Train loss 0.282, Learning Rate 9.963e-07, It/sec 0.625, Tokens/sec 24.781, Trained Tokens 32819, Peak mem 19.506 GB
Iter 740: Train loss 0.165, Learning Rate 7.972e-07, It/sec 0.556, Tokens/sec 20.514, Trained Tokens 33557, Peak mem 19.506 GB
Iter 760: Train loss 0.189, Learning Rate 6.185e-07, It/sec 0.489, Tokens/sec 22.657, Trained Tokens 34484, Peak mem 19.506 GB
Iter 780: Train loss 0.298, Learning Rate 4.611e-07, It/sec 0.419, Tokens/sec 35.351, Trained Tokens 36173, Peak mem 19.506 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 800: Val loss 0.318, Val took 168.891s
Iter 800: Train loss 0.216, Learning Rate 3.259e-07, It/sec 0.597, Tokens/sec 16.186, Trained Tokens 36715, Peak mem 19.506 GB
Iter 800: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000800_adapters.safetensors.
Iter 820: Train loss 0.208, Learning Rate 2.134e-07, It/sec 0.552, Tokens/sec 30.560, Trained Tokens 37822, Peak mem 19.506 GB
Iter 840: Train loss 0.146, Learning Rate 1.243e-07, It/sec 0.584, Tokens/sec 23.104, Trained Tokens 38613, Peak mem 19.506 GB
Iter 860: Train loss 0.185, Learning Rate 5.886e-08, It/sec 0.461, Tokens/sec 20.916, Trained Tokens 39521, Peak mem 19.506 GB
Iter 880: Train loss 0.212, Learning Rate 1.754e-08, It/sec 0.500, Tokens/sec 20.799, Trained Tokens 40353, Peak mem 19.506 GB


Calculating loss...: 166it [02:48,  1.02s/it]


Iter 900: Val loss 0.318, Val took 168.943s
Iter 900: Train loss 0.290, Learning Rate 4.870e-10, It/sec 0.466, Tokens/sec 21.722, Trained Tokens 41286, Peak mem 19.506 GB
Iter 900: Saved adapter weights to ../adapters/llama_mlx/adapters.safetensors and ../adapters/llama_mlx/0000900_adapters.safetensors.
Saved final weights to ../adapters/llama_mlx/adapters.safetensors.
✓ Training completed.


In [6]:
# Fuse a specific intermediate checkpoint rather than the final adapters.
# With this value the run log reports "Step: 300" and stages
# 0000300_adapters.safetensors (checkpoints are saved every 100 iterations).
# NOTE(review): the Llama training log shows val loss 0.329 at iter 300 versus
# 0.317 at iters 500-600 — confirm that 300 is the intended checkpoint.
FUSE_CHECKPOINT_STEP = 300

fuse_model(base_model, adapter_path, fused_output, FUSE_CHECKPOINT_STEP)
print(f"Success! Model ready at ./{to_relative(fused_output)}")

Fusing LoRA adapters (Step: 300)...
--- Staging checkpoint from: 0000300_adapters.safetensors ---
Loading pretrained model


Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19574.61it/s]


Success! Fused model saved to: ./local_models/ft_llama_mlx
Success! Model ready at ./local_models/ft_llama_mlx
