In [2]:
!pip install transformers datasets accelerate ray[tune] --quiet

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/igraph-0.11.8-py3.12-linux-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/texttable-1.7.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/looseversion-1.3.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/o

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import ray
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

def get_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained("distilgpt2")
    model.resize_token_embeddings(len(tokenizer))
    return model.to(device), tokenizer

def load_tokenized_dataset(tokenizer, block_size=64):
    raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

    def tokenize(example):
        tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=block_size)
        tokens["labels"] = tokens["input_ids"].copy() 
        return tokens

    tokenized = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])
    tokenized.set_format("torch")
    return tokenized

def train_with_tune(config):
    print(f"🔧 Starting trial with config: {config}")
    try:
        model, tokenizer = get_model_and_tokenizer()
        dataset = load_tokenized_dataset(tokenizer)

        train_size = int(0.8 * len(dataset))
        train_dataset = dataset.select(range(train_size))
        eval_dataset = dataset.select(range(train_size, len(dataset)))

        print(f"📊 Train size: {len(train_dataset)}, Eval size: {len(eval_dataset)}")

        training_args = TrainingArguments(
            output_dir="./output",
            per_device_train_batch_size=config["batch_size"],
            learning_rate=config["lr"],
            num_train_epochs=config["epochs"],
            logging_steps=5,
            save_strategy="no",
            report_to="none",
            fp16=torch.cuda.is_available(),
        )

        # Use a data collator that supports masked LM tasks
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=False  # GPT-style LM = causal, not masked
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        trainer.train()

        result = trainer.evaluate()
        print("📉 Eval results:", result)

        eval_loss = result.get("eval_loss", None)
        if eval_loss is None:
            print("⚠️ Warning: eval_loss missing. Reporting dummy loss = 9999.")
            eval_loss = 9999.0

        #tune.report(loss=eval_loss)
        train.report({"loss": eval_loss})

    except Exception as e:
        print(f"❌ Trial crashed: {e}")
        train.report({"loss": 9999.0})

# Define search space
search_space = {
    "batch_size": tune.choice([1, 2]),
    "lr": tune.loguniform(1e-5, 1e-4),
    "epochs": tune.choice([1, 2]),
}

scheduler = ASHAScheduler()

ray.shutdown()
ray.init(ignore_reinit_error=True, num_cpus=2)

# Run tuning
analysis = tune.run(
    train_with_tune,
    config=search_space,
    num_samples=2,
    scheduler=scheduler,
    metric="loss",
    mode="min",
    resources_per_trial={"cpu": 1, "gpu": 0.5 if torch.cuda.is_available() else 0},
    raise_on_failed_trial=False,
)

# Final result
if analysis.best_config:
    print("🎯 Best hyperparameters found:", analysis.best_config)
else:
    print("⚠️ No successful trials. But now your model is ready to return loss correctly.")


Using device: cuda


2025-07-11 05:28:39,776	INFO worker.py:1917 -- Started a local Ray instance.
2025-07-11 05:28:40,383	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-07-11 05:29:09
Running for:,00:00:28.89
Memory:,66.0/1007.7 GiB

Trial name,status,loc,batch_size,epochs,lr,iter,total time (s),loss
train_with_tune_e7281_00000,TERMINATED,10.0.62.11:175877,2,1,1.26073e-05,1,19.1564,4.39772
train_with_tune_e7281_00001,TERMINATED,10.0.62.11:175878,1,1,9.26354e-05,1,23.7124,4.02048




[36m(train_with_tune pid=175877)[0m 🔧 Starting trial with config: {'batch_size': 2, 'lr': 1.2607329607942673e-05, 'epochs': 1}
[36m(train_with_tune pid=175877)[0m 📊 Train size: 293, Eval size: 74
[36m(train_with_tune pid=175878)[0m 🔧 Starting trial with config: {'batch_size': 1, 'lr': 9.263536712014789e-05, 'epochs': 1}


  0%|          | 0/147 [00:00<?, ?it/s]
  1%|          | 1/147 [00:00<00:56,  2.58it/s]
  3%|▎         | 5/147 [00:00<00:12, 11.39it/s]


[36m(train_with_tune pid=175877)[0m {'loss': 5.5797, 'grad_norm': 20.786624908447266, 'learning_rate': 1.2521565460949866e-05, 'epoch': 0.03}
[36m(train_with_tune pid=175877)[0m {'loss': 5.7283, 'grad_norm': 20.491172790527344, 'learning_rate': 1.209274472598583e-05, 'epoch': 0.07}


  7%|▋         | 10/147 [00:00<00:07, 17.33it/s]


[36m(train_with_tune pid=175877)[0m {'loss': 5.9763, 'grad_norm': 24.713289260864258, 'learning_rate': 1.17496881380146e-05, 'epoch': 0.1}
[36m(train_with_tune pid=175878)[0m {'loss': 4.4338, 'grad_norm': inf, 'learning_rate': 9.231920545762179e-05, 'epoch': 0.02}


 11%|█         | 16/147 [00:00<00:05, 23.23it/s]


[36m(train_with_tune pid=175877)[0m {'loss': 5.5181, 'grad_norm': 40.40081787109375, 'learning_rate': 1.1320867403050564e-05, 'epoch': 0.14}


 15%|█▍        | 22/147 [00:01<00:05, 22.51it/s]


[36m(train_with_tune pid=175878)[0m 📊 Train size: 293, Eval size: 74


  0%|          | 0/293 [00:00<?, ?it/s]
 33%|███▎      | 98/293 [00:04<00:09, 21.00it/s][32m [repeated 51x across cluster][0m
 36%|███▌      | 105/293 [00:05<00:09, 20.78it/s][32m [repeated 27x across cluster][0m


[36m(train_with_tune pid=175877)[0m {'loss': 5.0977, 'grad_norm': 18.07731056213379, 'learning_rate': 3.17327343873387e-06, 'epoch': 0.78}[32m [repeated 38x across cluster][0m


 93%|█████████▎| 136/147 [00:06<00:00, 21.17it/s]
 95%|█████████▌| 140/147 [00:06<00:00, 21.15it/s]
 97%|█████████▋| 142/147 [00:06<00:00, 21.26it/s]


[36m(train_with_tune pid=175877)[0m {'train_runtime': 7.1124, 'train_samples_per_second': 41.196, 'train_steps_per_second': 20.668, 'train_loss': 4.929324870206872, 'epoch': 1.0}


 99%|█████████▊| 145/147 [00:07<00:00, 21.11it/s]
100%|██████████| 147/147 [00:07<00:00, 20.67it/s]
  0%|          | 0/10 [00:00<?, ?it/s]


Trial name,loss
train_with_tune_e7281_00000,4.39772
train_with_tune_e7281_00001,4.02048


[36m(train_with_tune pid=175877)[0m 📉 Eval results: {'eval_loss': 4.397721290588379, 'eval_runtime': 0.1617, 'eval_samples_per_second': 457.516, 'eval_steps_per_second': 61.827, 'epoch': 1.0}


 85%|████████▍ | 248/293 [00:10<00:01, 34.55it/s][32m [repeated 28x across cluster][0m
 89%|████████▊ | 260/293 [00:10<00:00, 34.80it/s][32m [repeated 29x across cluster][0m


[36m(train_with_tune pid=175878)[0m {'loss': 2.978, 'grad_norm': 0.0, 'learning_rate': 1.233030483851798e-05, 'epoch': 0.89}[32m [repeated 38x across cluster][0m


 92%|█████████▏| 270/293 [00:10<00:00, 34.32it/s]
 93%|█████████▎| 272/293 [00:10<00:00, 34.24it/s]
 94%|█████████▍| 276/293 [00:10<00:00, 34.46it/s]
 96%|█████████▌| 280/293 [00:10<00:00, 34.56it/s]
 97%|█████████▋| 284/293 [00:11<00:00, 34.38it/s]
 98%|█████████▊| 288/293 [00:11<00:00, 34.18it/s]
 99%|█████████▉| 290/293 [00:11<00:00, 34.18it/s]
100%|██████████| 293/293 [00:11<00:00, 25.83it/s]
2025-07-11 05:29:09,287	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_with_tune_2025-07-11_05-28-40' in 0.0040s.
2025-07-11 05:29:09,291	INFO tune.py:1041 -- Total run time: 28.91 seconds (28.89 seconds for the tuning loop).


🎯 Best hyperparameters found: {'batch_size': 1, 'lr': 9.263536712014789e-05, 'epochs': 1}


100%|██████████| 10/10 [00:00<00:00, 116.93it/s]


[36m(train_with_tune pid=175878)[0m 📉 Eval results: {'eval_loss': 4.020482540130615, 'eval_runtime': 0.0979, 'eval_samples_per_second': 755.745, 'eval_steps_per_second': 102.128, 'epoch': 1.0}
