import argparse
import os
import math
import ruamel.yaml as yaml
import time
import datetime
import json
from functools import partial
import torch
import torch.backends.cudnn as cudnn
import lightning as L
from models import XVLMCaptioning, BLIPCaptioning
import utils
from utils.misc import millions, num_params
from utils.loggers import init_wandb_logger
from utils.functions import get_unprunable_parameters
from utils.prune_utils import make_prunable, stats, named_masked_parameters
from utils.optim import create_optimizer, create_scheduler
from datasets.coco_karpathy_dataset import coco_karpathy_train_collate_fn
from datasets import create_dataset, create_sampler, create_loader
from evaltools import coco_caption_eval
def train(model, data_loader, optimizer, scheduler, epoch, fabric: L.Fabric, config, debug_mode=False):
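    """Run one training epoch with gradient accumulation, logging, and LR scheduling under Fabric."""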
model.train()
optimizer.zero_grad()
steps = 0
steps_per_epoch = math.ceil(len(data_loader.dataset) / (config['batch_size_target']))
for i, (image, caption, ids) in enumerate(data_loader):
# forward and backward under appropriate context manager
is_accumulating = not (((i+1) % config['grad_acc_steps'] == 0) or ((i+1) == len(data_loader)))
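        # while accumulating, skip the DDP gradient all-reduce: gradients are
        # synchronized only on the last micro-batch of each accumulation window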
with fabric.no_backward_sync(model, enabled=is_accumulating):
loss = model(image, caption, already_tokenized=True)
loss /= config['grad_acc_steps']
fabric.backward(loss)
# optimization step
if not is_accumulating:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
# log on the web
loss_scalar = loss.item()
lr_value = optimizer.param_groups[0]["lr"]
if (steps+1) % config['log_freq'] == 0:
fabric.log_dict({"loss": loss_scalar, "lr": lr_value})
# log locally
print(f"[Epoch {epoch+1}] Step = {steps+1} / {steps_per_epoch}\t loss = {loss_scalar}")
steps += 1
# if --debug is passed to the main script
if debug_mode and steps == 16: break
@torch.no_grad()
def evaluation(result_dir, model, data_loader, epoch, fabric: L.Fabric, config, split):
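    """Generate captions for `split` on this rank and dump them to a per-rank JSON file."""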
assert split in ("val", "test")
model.eval()
result = []
for batch_idx, (image, image_id) in enumerate(data_loader):
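        # decode with beam search (sample=False) and the length bounds from the config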
captions = model.generate(
image,
sample=False,
num_beams=config['num_beams'],
max_length=config['max_length'],
min_length=config['min_length']
)
for caption, img_id in zip(captions, image_id):
result.append({"image_id": img_id.item(), "caption": caption})
if ((batch_idx + 1) % config['log_freq'] == 0) or (batch_idx == len(data_loader)-1):
progress = (batch_idx + 1) / len(data_loader) * 100
print(f"[({split}) Caption Generation] Batch = {batch_idx+1} / {len(data_loader)} ({progress:.2f}%)")
    # dump to disk the JSON file containing the captions generated on this rank
captions_path = os.path.join(result_dir, f"captioning_{split}_result_epoch{epoch}_rank{fabric.global_rank}.json")
with open(captions_path, "w") as f:
json.dump(result, f)
return result
def merge_eval_from_ranks(result_dir, epoch, fabric, split):
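    """Concatenate the per-rank JSON results into a single file and return its path."""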
assert split in ("val", "test")
result = []
for r in range(fabric.world_size):
path_for_this_rank = os.path.join(result_dir, f"captioning_{split}_result_epoch{epoch}_rank{r}.json")
with open(path_for_this_rank, "r") as f:
r_result = json.load(f)
result += r_result
result_path = os.path.join(result_dir, f"captioning_{split}_result_epoch{epoch}.json")
with open(result_path, "w") as f:
json.dump(result, f)
return result_path
def cleanup_eval(result_dir, epoch, fabric, split):
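    """Remove the per-rank JSON files once they have been merged."""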
for r in range(fabric.world_size):
path_for_this_rank = os.path.join(result_dir, f"captioning_{split}_result_epoch{epoch}_rank{r}.json")
os.remove(path_for_this_rank)
def main(args, config):
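    # relax float32 matmul precision for speed on supported GPUs (TF32 / bf16 internal math)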
if "16" in args.precision:
torch.set_float32_matmul_precision(precision="medium")
elif "32" in args.precision:
torch.set_float32_matmul_precision(precision="high")
    # set up the loggers before launching the distributed processes
loggers = []
if args.wandb:
loggers.append(init_wandb_logger(config))
# initialize distributed mode
fabric = L.Fabric(
accelerator="cuda",
devices=args.devices,
strategy="ddp",
precision=args.precision,
loggers=loggers
)
fabric.launch()
utils.setup_for_distributed(is_master=fabric.is_global_zero)
# setup useful variables
world_size = fabric.world_size
global_rank = fabric.global_rank
    # derive the gradient accumulation steps so that the effective (global) batch size matches batch_size_target
E = config['batch_size_target']
B = config['batch_size_train']
config['grad_acc_steps'] = (E // world_size) // B
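    # worked example (hypothetical values): batch_size_target=1024, 4 GPUs,
    # batch_size_train=32 -> grad_acc_steps = (1024 // 4) // 32 = 8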
# reproducibility settings
L.seed_everything(args.seed)
cudnn.benchmark = False
torch.use_deterministic_algorithms(True)
    # model initialization (for this script, the model must be initialized before the dataloader,
    # since the train dataloader uses a collate_fn built as a partial over the model's tokenizer)
print("Creating model...")
if args.model == "xvlm":
model = XVLMCaptioning(config=config)
setattr(model, "name", "xvlm")
elif args.model == "blip":
model = BLIPCaptioning(
image_size=config['image_res'],
vit=config['vit'],
vit_grad_ckpt=config['vit_grad_ckpt'],
vit_ckpt_layer=config['vit_ckpt_layer'],
prompt=config['prompt']
)
setattr(model, "name", "blip")
else:
raise NotImplementedError(f"Model {args.model} not implemented!")
if not args.dense:
make_prunable(model, pattern_lock=True, mask_on_the_fly=True)
        # when loading a pretrained checkpoint produced after the 1st-stage finetuning on the 4M data,
        # the state dict is assumed to already contain the pruning masks (XVLM only)
if args.model == 'xvlm':
masks_for_loading = args.mask if not args.load_capt_pretrain else None
elif args.model == 'blip':
masks_for_loading = args.mask
model.load_from_pruned_pretrained(args.pretraining_weights, masks_for_loading, config, args.load_capt_pretrain)
else:
model.load_pretrained(args.pretraining_weights, config, load_capt_pretrain=args.load_capt_pretrain)
    # dummy op, but it lets the lines below verify the model's sparsity (0% for the dense model)
make_prunable(model, pattern_lock=False, mask_on_the_fly=False)
# log some cute stuff
print("Total Params: ", millions(num_params(model)))
masked_params = named_masked_parameters(model, exclude=get_unprunable_parameters(model.name))
remaining_params, prunable_params = stats(masked_params)
print(f"Remaining Params = {millions(remaining_params, decimals=2)} / {millions(prunable_params, decimals=2)}", end=" ")
print(f"({100*remaining_params/prunable_params:.2f}%)")
# datasets initialization
print("Creating COCO Caption dataset...")
train_dataset, val_dataset, test_dataset = create_dataset('captioning', config)
# (distributed) samplers initialization
datasets = [train_dataset, val_dataset, test_dataset]
samplers = create_sampler(
datasets,
shuffles=[True, False, False],
num_replicas=world_size,
global_rank=global_rank,
is_eval=[False, True, True] # this enables DistributedEvalSampler for val and test
)
# dataloaders initialization
# NOTE: the collate function already embeds the tokenization process
train_collate_fn = partial(coco_karpathy_train_collate_fn, tokenizer=model.tokenizer, max_tokens=config['max_tokens'])
train_loader, val_loader, test_loader = create_loader(
datasets,
samplers,
batch_size=[config['batch_size_train'], config['batch_size_test'], config['batch_size_test']],
num_workers=[8, 8, 8],
is_trains=[True, False, False],
collate_fns=[train_collate_fn, None, None]
)
# everything about data management is ready, so set it up with fabric
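    # NOTE: use_distributed_sampler=False, since the samplers created above already shard the data across ranks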
train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader, use_distributed_sampler=False)
# optimizer initialization
arg_opt = utils.AttrDict(config['optimizer'])
optimizer = create_optimizer(arg_opt, model)
# setup model and optimizer with fabric
model, optimizer = fabric.setup(model, optimizer)
    # resume from a snapshot, if one exists
    # NOTE: the code for resuming from a snapshot is provided, but it is untested;
    # it was never used during the experiments of the main paper
start_epoch = 0
start_time = time.time()
sched_state_dict = None
if os.path.exists(args.snapshot):
restored_state = fabric.load(args.snapshot)
model.load_state_dict(restored_state['model'])
optimizer.load_state_dict(restored_state['optimizer'])
sched_state_dict = restored_state['scheduler']
start_epoch = restored_state['last_epoch'] + 1
start_time = time.time() - restored_state['elapsed_time']
print(f"Loaded state, resuming from epoch = {start_epoch}")
print(
"IMPORTANT: You are resuming training from a snapshot.\n"
"As per the README.md, note that while the code for resuming is given, it has not been tested.\n"
"The authors are not responsible for any issues that may arise from resuming training.\n\n"
)
# learning rate scheduler initialization
arg_sche = utils.AttrDict(config['scheduler'])
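    # the scheduler steps once per optimizer step (see train()), so the step count
    # is derived from the effective batch size (batch_size_target), not the per-GPU one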
steps_per_epoch = math.ceil(len(train_dataset) / config['batch_size_target'])
num_training_steps = steps_per_epoch * arg_sche['epochs']
num_warmup_steps = int(num_training_steps * arg_sche['num_warmup_steps'])
lr_scheduler = create_scheduler(
mode=arg_sche['sched'],
optimizer=optimizer,
num_warmup_steps=num_warmup_steps,
total_steps=num_training_steps,
last_epoch=-1 if sched_state_dict is None else sched_state_dict['last_epoch']-1
)
if sched_state_dict is not None:
lr_scheduler.load_state_dict(sched_state_dict)
# start training! :)
max_epoch = config['scheduler']['epochs']
for epoch in range(start_epoch, max_epoch):
# needed to reshuffle the distribution of data among ranks epoch-wise
train_loader.sampler.set_epoch(epoch)
val_loader.sampler.set_epoch(epoch)
test_loader.sampler.set_epoch(epoch)
# one training epoch
train(model, train_loader, optimizer, lr_scheduler, epoch, fabric, config, debug_mode=args.debug)
# save model checkpoint after training for an epoch
state = {'model': model, 'optimizer': optimizer, 'scheduler': lr_scheduler.state_dict(),
'last_epoch': epoch, 'elapsed_time': time.time() - start_time}
fabric.save(path=args.snapshot, state=state)
# evaluate the model
if epoch >= config['start_eval']:
# generate captions for the validation set and the test sets
evaluation(args.result_dir, model, val_loader, epoch, fabric, config, split="val")
evaluation(args.result_dir, model, test_loader, epoch, fabric, config, split="test")
# merge the results from all ranks and clean up the json files (only on rank 0 to avoid locking issues)
fabric.barrier()
if fabric.is_global_zero:
val_result_file = merge_eval_from_ranks(args.result_dir, epoch, fabric, split="val")
cleanup_eval(args.result_dir, epoch, fabric, split="val")
test_result_file = merge_eval_from_ranks(args.result_dir, epoch, fabric, split="test")
cleanup_eval(args.result_dir, epoch, fabric, split="test")
# compute scores using the PyCOCO APIs
# NOTE: if you pass --skip_eval to the main script, evaluation will not be performed
# This is useful if you want to evaluate the results on a different machine (my use case during development of the paper)
# You are responsible for evaluating these files offline, using the function `coco_caption_eval` from evaltools/ic/__init__.py
if not args.skip_eval:
coco_val = coco_caption_eval(config['val_gt_file'], val_result_file)
coco_test = coco_caption_eval(config['test_gt_file'], test_result_file)
log_stats = {**{f'val_{k}': round(v*100, 4) for k, v in coco_val.eval.items()},
**{f'test_{k}': round(v*100, 4) for k, v in coco_test.eval.items()},
'epoch': epoch}
fabric.log_dict(log_stats)
print(log_stats)
else:
print("Skipping evaluation...")
# synchronize all processes before the next epoch
fabric.barrier()
# final logging and exit
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Total Finetuning Time for Image Captioning {total_time_str}")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True, type=str, choices=['xvlm', 'blip'])
parser.add_argument('-pre', '--pretraining_weights', type=str, required=True)
parser.add_argument('-m', '--mask', type=str, required=False,
help="Path to the pruning mask to load. If not provided, ensure to pass the --dense flag.")
parser.add_argument('--dense', action="store_true", default=False,
help="If passed, the dense model will be trained (even if you pass a mask).")
parser.add_argument('--config', required=True, type=str,
help="Path to the .yaml file containing the configuration for the script. "
"For simplicity, you can use the provided .yaml files in the configs/ folder. "
"For example, you can use the configs/xvlm/captioning.yaml file for XVLM, and "
"the configs/blip/captioning.yaml file for BLIP.")
parser.add_argument('--skip_eval', action="store_true", default=False,
help="If passed, the evaluation step will be skipped. "
"And you fill find the files in the --output_dir folder for both val and test splits. "
"Have a look at the `coco_caption_eval` function in evaltools/ic/__init__.py to evaluate the results offline.")
parser.add_argument('--output_dir', default='experiments/captioning',
help="Path to the output directory where the results will be saved. "
"This includes a copy of the config and the json files with the generated captions.")
parser.add_argument('--snapshot', default="snapshots/captioning.pt",
help="Path to the snapshot file where the model will be saved after each epoch. "
"Set this flag to an existing checkpoint to resume training from that point. "
"If the file does not exist, the script will start training from scratch.")
parser.add_argument('--seed', default=42, type=int,
help="Seed for reproducibility. Default is 42.")
parser.add_argument('--load_capt_pretrain', action='store_true',
help="If passed, the model will load the weights from a 1epoch pretraining on the 4M dataset. "
"XVLM only. Please see the XVLM paper, appendix A.2 for more details.")
parser.add_argument('-wdb', '--wandb', action="store_true", default=False,
help="Log data onWeights & Biases. Make sure to log to your account first with `wandb login`.")
parser.add_argument('-exp', '--experiment_name', type=str, required=False,
help="Name of the experiment on Weights & Biases.")
parser.add_argument('--wdb_offline', action="store_true",
help="Locally cache W&B data instead of logging it to the cloud. You can then sync the run with `wandb sync`.")
parser.add_argument('--debug', action="store_true", default=False,
help="Enables debugging mode to ensure everything works fine. Only 16 grad steps per epoch are performed.")
parser.add_argument('--devices', type=int, default=1,
help="Number of devices (i.e., gpus) to use for DistributedDataParallel with Fabric. Default is 1.")
parser.add_argument('--precision', type=str, default='bf16-mixed', choices=['32-true', 'bf16-mixed', '16-mixed'],
help="Precision strategy to use for training. Default is bf16-mixed (used in the paper).")
args = parser.parse_args()
config = yaml.load(open(args.config, 'r'), Loader=yaml.Loader)
os.makedirs(args.output_dir, exist_ok=True)
yaml.dump(config, open(os.path.join(args.output_dir, 'config.yaml'), 'w'))
args.result_dir = os.path.join(args.output_dir, 'result')
os.makedirs(args.result_dir, exist_ok=True)
    # merge command-line arguments into the config
config.update(vars(args))
main(args, config)