In [1]:
import ray

In [2]:
if ray.is_initialized():
    ray.shutdown()
ray.init()

2024-11-23 20:51:03,694	INFO worker.py:1819 -- Started a local Ray instance.


0,1
Python version:,3.10.11
Ray version:,2.39.0


[36m(RayTrainWorker pid=22136)[0m Setting up process group for: env:// [rank=0, world_size=6]
[36m(TorchTrainer pid=22132)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=22132)[0m - (node_id=af02e1790808eb0e49800dbeb469a93ef6b0d8663d2dfe6dfa0a108a, ip=127.0.0.1, pid=22136) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=22132)[0m - (node_id=af02e1790808eb0e49800dbeb469a93ef6b0d8663d2dfe6dfa0a108a, ip=127.0.0.1, pid=22138) world_rank=1, local_rank=1, node_rank=0
[36m(TorchTrainer pid=22132)[0m - (node_id=af02e1790808eb0e49800dbeb469a93ef6b0d8663d2dfe6dfa0a108a, ip=127.0.0.1, pid=22137) world_rank=2, local_rank=2, node_rank=0
[36m(TorchTrainer pid=22132)[0m - (node_id=af02e1790808eb0e49800dbeb469a93ef6b0d8663d2dfe6dfa0a108a, ip=127.0.0.1, pid=22135) world_rank=3, local_rank=3, node_rank=0
[36m(TorchTrainer pid=22132)[0m - (node_id=af02e1790808eb0e49800dbeb469a93ef6b0d8663d2dfe6dfa0a108a, ip=127.0.0.1, pid=22139) world_rank=4, local_rank=4, nod

In [3]:
ray.cluster_resources()

{'CPU': 10.0,
 'object_store_memory': 2147483648.0,
 'node:127.0.0.1': 1.0,
 'node:__internal_head__': 1.0,
 'memory': 7435726029.0}

In [4]:
def read_data():
    with open('/Users/majid/Projects/nlp/GPT/tinyshakespeare.txt', 'r', encoding='utf-8') as f:
        allshakespeare = f.read()

    vocab = sorted(list(set(allshakespeare)))
    ctoi = {c: i for i, c in enumerate(vocab)}
    itoc = {i: c for i, c in enumerate(vocab)}
    encode = lambda s: [ctoi[x] for x in s]
    decode = lambda l: ''.join([itoc[x] for x in l])
    n = int(len(allshakespeare)*0.9)
    train_data = torch.tensor(encode(allshakespeare[:n]), dtype=torch.long)
    val_data = torch.tensor(encode(allshakespeare[n:]), dtype=torch.long)
    return train_data, val_data, encode, decode, len(vocab)

In [5]:
ray.data.DatasetContext.get_current().execution_options.preserve_order = True  # deterministic

In [6]:
import os
import random
from ray.data.preprocessor import Preprocessor
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel

## build the model

In [7]:
# hyper params
context_size = 8
batch_size = 64
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embed = 32
n_head = 4
n_blocks = 3
dropout = 0.2
torch.manual_seed(1337)

<torch._C.Generator at 0x13949a0b0>

In [8]:
class AttentionHead(nn.Module):
    def __init__(self, head_size, n_embed=32, dropout=0.2) -> None:
        super().__init__()
        self.head_size = head_size
        self.WQ = nn.Linear(n_embed, head_size, bias=False)
        self.WK = nn.Linear(n_embed, head_size, bias=False)
        self.WV = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        x input of size (B, T, embedding_size)
        """
        B, T, embedding_size = x.shape
        q = self.WQ(x)  # q of shape (B, T, head_size)
        k = self.WK(x)
        v = self.WV(x)

        scores = q@k.transpose(-2, -1) * self.head_size**-0.5  # scores of shape (B, T, T)
        scores = scores.masked_fill(self.tril[:T, :T]==0, value=-torch.inf)
        attentions = F.softmax(scores, dim=-1)
        attentions = self.dropout(attentions)
        output = attentions@v  # output of size (B, T, head_size)

        return output


class FeedFroward(nn.Module):
    def __init__(self, n_embed=32, dropout=0.2):
        super().__init__()
        self.feedforward = nn.Sequential(
            nn.Linear(n_embed, n_embed*4),
            nn.ReLU(),
            nn.Linear(n_embed*4, n_embed),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.feedforward(x)


class DecoderBlock(nn.Module):
    def __init__(self, n_heads, n_embed=32) -> None:
        super().__init__()
        head_size = n_embed // n_heads
        self.MultiHeadAttention = nn.ModuleList([AttentionHead(head_size) for _ in range(n_heads)])
        self.project = nn.Linear(n_embed, n_embed)
        self.layer_norm1 = nn.LayerNorm(n_embed)
        self.feedForward = FeedFroward()
        self.layer_norm2 = nn.LayerNorm(n_embed)
    
    def forward(self, x):
        """
        x is of shape (B, T, n_embed)
        """
        x = self.layer_norm1(x)
        attentioned = torch.concat([attention(x) for attention in self.MultiHeadAttention], dim=-1)
        x = x + self.project(attentioned)
        x = self.layer_norm2(x)
        x = x + self.feedForward(x)
        return x


class GPT(nn.Module):
    def __init__(self, n_heads, n_blocks, vocab_size, n_embed=32):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, n_embed)
        self.pos_embed = nn.Embedding(context_size, n_embed)
        self.decoders = nn.Sequential(*[DecoderBlock(n_heads) for _ in range(n_blocks)], nn.LayerNorm(n_embed))
        # self.layer_norm = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, x, targets=None):
        """
        x: shape (B, T)
        targets: (B, T)
        """
        B, T = x.shape
        embeddings = self.embedding(x) 
        pos_embeds = self.pos_embed(torch.arange(T, device=device))  # T * n_embed
        x = embeddings + pos_embeds
        x = self.decoders(x)
        # x = self.layer_norm(x)
        logits = self.lm_head(x)
        B, T, C = logits.shape  # logits shape (B, T, C)
        
        if targets is not None:
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
            return logits, loss
        else:
            return logits, None
    
    @torch.inference_mode()
    def generate(self, x, max_size=100):
        """
        x of shape (B, T)
        """
        self.eval()
        for step in range(max_size):
            logits, loss = self(x[:, -context_size:])
            new_token_logits = logits[:, -1, :]
            probs = F.softmax(new_token_logits, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1)  
            x = torch.concat((x, next_tokens), dim=1)
        return x


## train the model

In [9]:
def get_batch(data, split='train'):
    # data = train_data if split == 'train' else val_data
    idx = torch.randint(high=len(data)-context_size, size=(batch_size,))
    x = torch.stack([data[i:i+context_size] for i in idx])
    y = torch.stack([data[i+1:i+context_size+1] for i in idx])
    return x, y

In [10]:
def train_step(model, train_data, batch_size, optimizer, n_steps):
    model.train()
    total_loss = 0
    for step in range(n_steps):
        xb, yb = get_batch(train_data, split='train')
        xb.to(device)
        yb.to(device)
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        total_loss += (loss.detach().item() - total_loss) / (step + 1)
    return total_loss

In [11]:
@torch.no_grad()
def eval_step(model, val_data, batch_size, n_steps):
    model.eval()
    total_loss = 0
    with torch.inference_mode():
        for step in range(n_steps):
            xb, yb = get_batch(val_data, split='eval')
            xb.to(device)
            yb.to(device)
            logits, loss = model(xb, yb)
            total_loss += (loss.item() - total_loss) / (step + 1)
    return total_loss

In [12]:
import ray.train as raytrain
from ray.train import Checkpoint, session
from ray.train.torch import TorchCheckpoint, TorchTrainer

In [13]:
# Training loop
def train_loop_per_worker(config):
    # Hyperparameters
    dropout_p = config["dropout_p"]
    lr = config["lr"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    n_train_steps = config["n_train_steps"]
    n_eval_steps = config["n_eval_steps"]
    n_heads = config["n_heads"]
    n_blocks = config["n_blocks"]

    # Get datasets
    torch.manual_seed(1337) #set seed
    train_data, val_data, encode, decode, vocab_size = read_data()
   

    # Model 
    gpt = GPT(n_heads, n_blocks, vocab_size)
    gpt = raytrain.torch.prepare_model(gpt)

    # Training components
    optimizer = torch.optim.AdamW(gpt.parameters(), lr=lr)

    # Training
    batch_size_per_worker = batch_size // raytrain.get_context().get_world_size()
    for epoch in range(num_epochs):
        # Step
        train_loss = train_step(gpt, train_data, batch_size_per_worker, optimizer, n_train_steps)
        val_loss = eval_step(gpt, val_data, batch_size_per_worker, n_eval_steps)

        # Checkpoint
        base_model = gpt.module if isinstance(gpt, DistributedDataParallel) else gpt
        checkpoint_dir = tempfile.mkdtemp()
        torch.save(
            {"model_state_dict": base_model.state_dict()},
            os.path.join(checkpoint_dir, "model.pt"),
        )
        checkpoint = Checkpoint.from_directory(checkpoint_dir)

        # Report metrics and checkpoint.
        raytrain.report({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss}, checkpoint=checkpoint)

In [14]:
train_config = {"dropout_p": 0.2,
          "lr": 3e-4,
          "num_epochs": 3,
          "batch_size": 64,
          "n_train_steps": 2000,
          "n_eval_steps": 100,
          "n_heads": 4,
          "n_blocks": 3
          }

In [15]:
from ray.air.config import CheckpointConfig, DatasetConfig, RunConfig, ScalingConfig

In [16]:
scaling_config = ScalingConfig(
    num_workers=6,
)

In [142]:
checkpoint_config = CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="val_loss", checkpoint_score_order="min")

In [49]:
import os
import tempfile
run_config = RunConfig(name="gpt", checkpoint_config=checkpoint_config, storage_path=os.path.abspath("./ray_results"))

In [143]:
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_config,
    scaling_config=scaling_config,
    run_config=run_config,
)

In [144]:
results = trainer.fit()

2024-11-21 18:49:05,597	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2024-11-21 18:49:05 (running for 00:00:00.11)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-21_18-49-05/gpt/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2024-11-21 18:49:10 (running for 00:00:05.20)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-21_18-49-05/gpt/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-21 18:49:15 (running for 00:00:10.30)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-21_18-49-05/gpt/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-21 18:49:20 (running for 00:00:15.40)
Using FIFO schedul

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


== Status ==
Current time: 2024-11-21 18:50:47 (running for 00:01:41.85)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-21_18-49-05/gpt/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-21 18:50:52 (running for 00:01:46.93)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-21_18-49-05/gpt/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-21 18:50:57 (running for 00:01:52.02)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-21_18-49-05/gpt/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-21 18:51:02 (running for 00:01:57.02)
Using FIFO schedul

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
2024-11-21 18:51:29,647	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/majid/Projects/nlp/GPT/ray_results/gpt' in 0.0038s.
2024-11-21 18:51:29,651	INFO tune.py:1041 -- Total run time: 144.05 seconds (144.04 seconds for the tuning loop).


== Status ==
Current time: 2024-11-21 18:51:29 (running for 00:02:24.05)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-21_18-49-05/gpt/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)




In [148]:
results.best_checkpoints

[(Checkpoint(filesystem=local, path=/Users/majid/Projects/nlp/GPT/ray_results/gpt/TorchTrainer_31698_00000_0_2024-11-21_18-49-05/checkpoint_000002),
  {'epoch': 2,
   'train_loss': 2.2041960158348086,
   'val_loss': 2.1340403580665583,
   'timestamp': 1732233088,
   'checkpoint_dir_name': 'checkpoint_000002',
   'should_checkpoint': True,
   'done': False,
   'training_iteration': 3,
   'trial_id': '31698_00000',
   'date': '2024-11-21_18-51-28',
   'time_this_iter_s': 43.60825204849243,
   'time_total_s': 140.24046921730042,
   'pid': 87751,
   'hostname': 'Majids-MacBook-Pro.local',
   'node_ip': '127.0.0.1',
   'config': {'train_loop_config': {'dropout_p': 0.2,
     'lr': 0.0003,
     'num_epochs': 3,
     'batch_size': 64,
     'n_train_steps': 2000,
     'n_eval_steps': 100,
     'n_heads': 4,
     'n_blocks': 3}},
   'time_since_restore': 140.24046921730042,
   'iterations_since_restore': 3})]

## test generation

In [159]:
checkpoint = results.checkpoint
checkpoint

Checkpoint(filesystem=local, path=/Users/majid/Projects/nlp/GPT/ray_results/gpt/TorchTrainer_31698_00000_0_2024-11-21_18-49-05/checkpoint_000002)

In [168]:
os.path.isfile(checkpoint.path+"/model.pt")

True

In [67]:
def load_model_from_checkpoint(checkpoint):
    model = GPT(train_config['n_heads'], train_config['n_blocks'], 65, n_embed=32)
    model_state = torch.load(checkpoint.path+"/model.pt", map_location='cpu', weights_only=True)
    model.load_state_dict(model_state['model_state_dict'])
    return model

In [193]:
generation = model.generate(torch.zeros((1, 1), dtype=torch.long), max_size=300)
print(decode(generation[0].tolist()))


For intence that uns to grep'ss olonss dexno wane gray;
And arrive blese! I',
and I frock to wenle ve will hand. Thy lode kightors of in hia Sarce or cow pown the fleat heor proight Ceath,
O, I vorcon tling thim, pour.

FO:
Halat's that not gais ary?
Ho bort, ruan
Your decouk, wor ou for not day.

H


## Experiment tracking with MLflow

In [17]:
import mlflow
from pathlib import Path
from ray.air.integrations.mlflow import MLflowLoggerCallback
import time

In [18]:
MODEL_REGISTRY = Path("./tmp/mlflow")
Path(MODEL_REGISTRY).mkdir(parents=True, exist_ok=True)
MLFLOW_TRACKING_URI = "file://" + str(MODEL_REGISTRY.absolute())
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print (mlflow.get_tracking_uri())

file:///Users/majid/Projects/nlp/GPT/tmp/mlflow


In [197]:
experiment_name = f"gpt-{int(time.time())}"
mlflow_callback = MLflowLoggerCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    experiment_name=experiment_name,
    save_artifact=True)

In [201]:
# Run configuration with MLflow callback
run_config = RunConfig(
    callbacks=[mlflow_callback],
    checkpoint_config=checkpoint_config,
    storage_path=os.path.abspath("./ray_results/mlflow")
)

In [202]:
# Trainer
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_config,
    scaling_config=scaling_config,
    run_config=run_config,  # uses RunConfig with MLflow callback
)

# Train
results = trainer.fit()

2024-11-23 12:24:55,567	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2024-11-23 12:24:55 (running for 00:00:00.12)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2024-11-23 12:25:00 (running for 00:00:05.15)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-23 12:25:05 (running for 00:00:10.18)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== S

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


== Status ==
Current time: 2024-11-23 12:25:51 (running for 00:00:55.92)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-23 12:25:56 (running for 00:01:00.92)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-23 12:26:01 (running for 00:01:06.00)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== S

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


== Status ==
Current time: 2024-11-23 12:26:37 (running for 00:01:41.54)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-23 12:26:42 (running for 00:01:46.64)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-11-23 12:26:47 (running for 00:01:51.73)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== S

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
2024-11-23 12:27:17,031	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/majid/Projects/nlp/GPT/ray_results/mlflow/TorchTrainer_2024-11-23_12-24-55' in 0.0027s.
2024-11-23 12:27:17,034	INFO tune.py:1041 -- Total run time: 141.47 seconds (141.45 seconds for the tuning loop).


== Status ==
Current time: 2024-11-23 12:27:17 (running for 00:02:21.45)
Using FIFO scheduling algorithm.
Logical resource usage: 7.0/10 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-11-21_17-12-48_899457_5746/artifacts/2024-11-23_12-24-55/TorchTrainer_2024-11-23_12-24-55/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)




In [203]:
results

Result(
  metrics={'epoch': 2, 'train_loss': 2.2041960158348086, 'val_loss': 2.1340403580665583},
  path='/Users/majid/Projects/nlp/GPT/ray_results/mlflow/TorchTrainer_2024-11-23_12-24-55/TorchTrainer_db59a_00000_0_2024-11-23_12-24-55',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/Users/majid/Projects/nlp/GPT/ray_results/mlflow/TorchTrainer_2024-11-23_12-24-55/TorchTrainer_db59a_00000_0_2024-11-23_12-24-55/checkpoint_000002)
)

In [204]:
sorted_runs = mlflow.search_runs(experiment_names=[experiment_name], order_by=["metrics.val_loss ASC"])
sorted_runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.config/train_loop_config/lr,metrics.epoch,metrics.train_loss,metrics.iterations_since_restore,...,params.train_loop_config/num_epochs,params.train_loop_config/n_blocks,params.train_loop_config/n_heads,params.train_loop_config/n_eval_steps,params.train_loop_config/batch_size,params.train_loop_config/n_train_steps,tags..ipynb_checkpoints/trial_name-checkpoint,tags.trial_name,tags..ipynb_checkpoints/mlflow-checkpoint.runName,tags.mlflow.runName
0,607c8c79136444ce84d603c0b0ae77c7,742657146908851070,FINISHED,file:///Users/majid/Projects/nlp/GPT/tmp/mlflo...,2024-11-23 17:24:58.443000+00:00,2024-11-23 17:27:17.026000+00:00,0.0003,2.0,2.204196,3.0,...,3,3,4,100,64,2000,TorchTrainer_db59a_00000,TorchTrainer_db59a_00000,TorchTrainer_db59a_00000,TorchTrainer_db59a_00000
1,bf15a2979fbc4bf3886801903106d71c,742657146908851070,FINISHED,file:///Users/majid/Projects/nlp/GPT/tmp/mlflo...,2024-11-23 17:07:08.769000+00:00,2024-11-23 17:09:27.885000+00:00,0.0003,2.0,2.204196,3.0,...,3,3,4,100,64,2000,TorchTrainer_5db4b_00000,TorchTrainer_5db4b_00000,TorchTrainer_5db4b_00000,TorchTrainer_5db4b_00000


In [229]:
from urllib.parse import urlparse
mlflow_run = mlflow.get_run('607c8c79136444ce84d603c0b0ae77c7')
artifact_dir = urlparse(mlflow_run.info.artifact_uri).path
artifact_dir

'/Users/majid/Projects/nlp/GPT/tmp/mlflow/742657146908851070/607c8c79136444ce84d603c0b0ae77c7/artifacts'

In [234]:
mlflow_run.info

<RunInfo: artifact_uri='file:///Users/majid/Projects/nlp/GPT/tmp/mlflow/742657146908851070/607c8c79136444ce84d603c0b0ae77c7/artifacts', end_time=1732382837026, experiment_id='742657146908851070', lifecycle_stage='active', run_id='607c8c79136444ce84d603c0b0ae77c7', run_name='TorchTrainer_db59a_00000', run_uuid='607c8c79136444ce84d603c0b0ae77c7', start_time=1732382698443, status='FINISHED', user_id='unknown'>

In [227]:
from ray.train import Result
results = Result.from_path(artifact_dir)
results

Result(
  metrics={'epoch': 2, 'train_loss': 2.2041960158348086, 'val_loss': 2.1340403580665583},
  path='/Users/majid/Projects/nlp/GPT/tmp/mlflow/742657146908851070/607c8c79136444ce84d603c0b0ae77c7/artifacts',
  filesystem='local',
  checkpoint=None
)

In [228]:
results.checkpoint

## Hyperparameter tuning

In [51]:
from ray import tune
from ray.tune import Tuner
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.search import ConcurrencyLimiter
from ray.tune.search.hyperopt import HyperOptSearch

In [52]:
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_config,
    scaling_config=scaling_config,
)

In [53]:
mlflow_callback = MLflowLoggerCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    experiment_name='tuning_trial_1',
    save_artifact=True)

In [54]:
checkpoint_config = CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="val_loss", checkpoint_score_order="min")
run_config = RunConfig(
    callbacks=[mlflow_callback],
    checkpoint_config=checkpoint_config,
    storage_path=os.path.abspath("./ray_results/mlflow")
)

In [55]:
initial_params = [{'train_loop_config': {"dropout_p": 0.2, "lr": 3e-4, "n_train_steps": 2000}}]
search_alg = HyperOptSearch(points_to_evaluate=initial_params)
search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2)

In [56]:
param_space = {
    "train_loop_config": {
        "dropout_p": tune.uniform(0.1, 0.8),
        "lr": tune.loguniform(1e-5, 5e-4),
        "n_train_steps": tune.randint(1000, 4000),
    }
}


In [57]:
scheduler = AsyncHyperBandScheduler(
    max_t=train_config["num_epochs"],
    grace_period=1,
)

In [58]:
tune_config = tune.TuneConfig(
    metric="val_loss",
    mode="min",
    search_alg=search_alg,
    scheduler=scheduler,
    num_samples=2,
)

In [59]:
tuner = Tuner(
    trainable=trainer,
    run_config=run_config,
    param_space=param_space,
    tune_config=tune_config,
)

In [60]:
results = tuner.fit()

0,1
Current time:,2024-11-23 21:36:56
Running for:,00:03:35.50
Memory:,12.4/16.0 GiB

Trial name,status,loc,train_loop_config/dr opout_p,train_loop_config/lr,train_loop_config/n_ train_steps,iter,total time (s),epoch,train_loss,val_loss
TorchTrainer_117a19f8,TERMINATED,127.0.0.1:22688,0.2,0.0003,2000,3,136.179,2,2.2042,2.13404
TorchTrainer_114daa31,TERMINATED,127.0.0.1:22833,0.353292,0.000137316,2986,1,73.2199,0,2.73005,2.38613


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
2024-11-23 21:36:56,660	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/majid/Projects/nlp/GPT/ray_results/mlflow/TorchTrainer_2024-11-23_21-33-18' in 0.0073s.
2024-11-23 21:36:56,666	INFO tune.py:1041 -- Total run time: 215.53 seconds (215.49 seconds for the tuning loop).


In [62]:
best_trial = results.get_best_result(metric="val_loss", mode="min")
best_trial

Result(
  metrics={'epoch': 2, 'train_loss': 2.2041960158348086, 'val_loss': 2.1340403580665583},
  path='/Users/majid/Projects/nlp/GPT/ray_results/mlflow/TorchTrainer_2024-11-23_21-33-18/TorchTrainer_117a19f8_1_dropout_p=0.2000,lr=0.0003,n_train_steps=2000_2024-11-23_21-33-21',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/Users/majid/Projects/nlp/GPT/ray_results/mlflow/TorchTrainer_2024-11-23_21-33-18/TorchTrainer_117a19f8_1_dropout_p=0.2000,lr=0.0003,n_train_steps=2000_2024-11-23_21-33-21/checkpoint_000002)
)

In [66]:
model = load_model_from_checkpoint(from_checkpoint(best_trial.checkpoint))


AttributeError: 'Checkpoint' object has no attribute 'get_model'