# description: train a Hugging Face transformer using DeepSpeed
#
# In this example we fine-tune Hugging Face transformers on GLUE tasks using
# DeepSpeed and Hugging Face's transformers library. The default configuration
# trains distilbert-base-uncased (66M parameters); the commented-out options
# below scale up to the 1.6B-parameter gpt2-xl.
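
# Prerequisites (assumed setup, not shown in this file): an Azure ML workspace
# config.json discoverable by Workspace.from_config(), and compute targets in
# the workspace named as in TARGET_GPU_COUNT below. Usage: python job.py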
from dataclasses import dataclass, asdict
from pathlib import Path

from azureml.core import Workspace, ScriptRunConfig, Environment, Experiment
from azureml.core.runconfig import PyTorchConfiguration

# Number of GPUs per node for each Azure ML compute target used in this example.
TARGET_GPU_COUNT = {"gpu-V100-1": 1, "gpu-V100-2": 2, "gpu-V100-4": 4}


@dataclass
class JobArguments:
    """Arguments controlling job submission to Azure ML."""

    target_name: str
    model_checkpoint: str = "distilbert-base-uncased"
    task: str = "cola"
    node_count: int = 1
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 16
    per_device_eval_batch_size: int = 16


def submit_azureml_run(args: JobArguments):
    """Submit GLUE experiment to Azure ML."""
    ws = Workspace.from_config()

    # the training code lives in the src/ directory next to this script
    prefix = Path(__file__).parent
    source_directory = str(prefix.joinpath("src"))
    target = ws.compute_targets[args.target_name]

    env = Environment.from_dockerfile(
        "deepspeed-transformers", Path(__file__).parent.joinpath("../dockerfile")
    )

    distributed_job_config = get_distributed_job_config(args)

    cmd = f"""ds_report && python finetune_glue.py
        --output_dir outputs
        --model_checkpoint {args.model_checkpoint}
        --task {args.task}
        --num_train_epochs {args.num_train_epochs}
        --per_device_train_batch_size {args.per_device_train_batch_size}
        --per_device_eval_batch_size {args.per_device_eval_batch_size}
        --disable_tqdm 1
        --local_rank $LOCAL_RANK
        --deepspeed ds_config.json
        --learning_rate 3e-05
        --adam_beta1 0.8
        --adam_beta2 0.999
        --weight_decay 3e-07
        --warmup_steps 500
        --fp16 true
    """.split()

    config = ScriptRunConfig(
        source_directory=source_directory,
        command=cmd,
        environment=env,
        compute_target=target,
        distributed_job_config=distributed_job_config,
    )

    run = Experiment(ws, "deepspeed-transformers-example").submit(config)
    print(run.get_portal_url())  # link to ml.azure.com
    run.set_tags(asdict(args))
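
    # Optionally block until the run finishes and stream its logs (assumption:
    # synchronous behavior is wanted; by default submission returns immediately):
    # run.wait_for_completion(show_output=True)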


def get_distributed_job_config(args: JobArguments):
    """Build the PyTorch distributed configuration: one process per GPU."""
    n_proc_per_node = TARGET_GPU_COUNT[args.target_name]
    process_count = n_proc_per_node * args.node_count
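    # e.g. (hypothetical): target "gpu-V100-4" with node_count=2 gives
    # process_count = 4 * 2 = 8 worker processes.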

    distributed_job_config = PyTorchConfiguration(
        process_count=process_count, node_count=args.node_count
    )

    return distributed_job_config
if __name__ == "__main__":
target_names = [
# "gpu-V100-1", # single GPU
# "gpu-V100-2", # two GPUs
"gpu-V100-4" # four GPUs
]
# https://huggingface.co/transformers/pretrained_models.html
model_checkpoints = [
"distilbert-base-uncased", # 66M
# "bert-base-uncased", # 110M
# "bert-large-uncased", # 336M
# "gpt2", # 117M
# "gpt2-medium", # 345M
# "gpt2-large", # 774M
# "gpt2-xl", # 1558M
]
# https://openreview.net/pdf?id=rJ4km2R5t7
tasks = [
# "wnli", # 634, inference
# "rte", # 2.5k, inference
# "mrpc", # 3.7k, paraphrase
# "stsb", # 7k, sentence similarity
"cola", # 8.5k, single-sentence
# "sst2", # 67k, single-sentence
# "qnli", # 105k, inference
# "mnli", # 393k, inference
# "qqp", # 364k, paraphrase
]
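
    # With every entry in the three lists above uncommented, the nested loops
    # below would submit the full grid: 3 targets * 7 models * 9 tasks = 189 runs.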
    for target_name in target_names:
        for model_checkpoint in model_checkpoints:
            for task in tasks:
                args = JobArguments(
                    target_name=target_name,
                    model_checkpoint=model_checkpoint,
                    task=task,
                )
                submit_azureml_run(args)