# Preparation: imports, proxy, and configuration

In [2]:
# imports
import os
import argparse
import json
import sys
sys.path.append("..")

import torch
import pytorch_lightning as pl
import torchmetrics
import transformers

from utils import (
    PersonaDataset,
    GenerativeCollator,
    RetrievalCollator,
    aggregate_encoder_output,
    sim_func,
)
from models import GPT_GenerativeModel

# Seed the RNGs through the top-level Lightning entry point; the
# `pl.utilities.seed.seed_everything` path is deprecated in later 1.x releases.
# The call stays the last expression so the cell still displays the seed.
pl.seed_everything(42)

Global seed set to 42


42

In [3]:
# proxy
# Route HTTP, HTTPS and FTP traffic of this process through the same proxy.
PROXY_URL = "http://proxy.ad.speechpro.com:3128"
for proxy_var in ("http_proxy", "https_proxy", "ftp_proxy"):
    os.environ[proxy_var] = PROXY_URL

In [3]:
# gpt config
# Create an empty argparse namespace and fill it with the key/value pairs
# stored in the JSON config file.
parser = argparse.ArgumentParser()
gpt_args = parser.parse_args(args=[])
with open("configs/gpt_config.json", mode="r") as config_file:
    opt = json.loads(config_file.read())
gpt_args.__dict__.update(opt)

In [12]:
# config
# NOTE: `parser` and `gpt_args` are created anew here, so after this cell the
# namespace holds only the values from ../config.json plus the overrides below.
parser = argparse.ArgumentParser()
gpt_args = parser.parse_args(args=[])
with open("../config.json", mode="r") as config_file:
    opt = json.loads(config_file.read())
gpt_args.__dict__.update(opt)

# Run-specific settings; applied after the file values, so they win on any
# key present in both.
opt = dict(
    epochs=3,
    lr=5e-05,
    gradient_clip_val=1,
    batch_size=8,
    val_split=-1,
    max_len=128,
    num_warmup_steps=1500,
    project_name="gpt_answer",
    experiment_name="context+gks>answer(5e-05)",
    dataset_mod="get_examples_gpt",
    rnd_context=0,
)
gpt_args.__dict__.update(opt)

# pretrained model

In [9]:
# gpt tokenizer
# Truncation and padding sides come from the config namespace.
tokenizer_sides = {
    "truncation_side": gpt_args.truncation_side,
    "padding_side": gpt_args.padding_side,
}
gpt_tokenizer = transformers.AutoTokenizer.from_pretrained(gpt_args.gpt, **tokenizer_sides)
# Reuse the end-of-sequence token as the padding token.
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
# Kept as the last expression so the cell displays the call's return value.
gpt_tokenizer.add_special_tokens(gpt_args.special_tokens_dict)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


7

In [10]:
# gpt
# Load the pretrained LM-head model, then resize its token embeddings to the
# tokenizer length (the tokenizer cell adds special tokens before this runs).
gpt = transformers.GPT2LMHeadModel.from_pretrained(gpt_args.gpt)
tokenizer_length = len(gpt_tokenizer)
# Last expression: the cell displays the resized embedding module.
gpt.resize_token_embeddings(tokenizer_length)

Embedding(50265, 1280)

# data

In [16]:
# dataset
# Both splits are built with the same dataset options from the config.
dataset_options = dict(mod=gpt_args.dataset_mod, rnd_context=gpt_args.rnd_context)
train_dataset = PersonaDataset(gpt_args.train_data_path, **dataset_options)
# Validation data comes from the test file and is sliced with [:1000]
# (presumably the first 1000 examples; depends on PersonaDataset's slicing).
val_dataset = PersonaDataset(gpt_args.test_data_path, **dataset_options)[:1000]
train_size, val_size = len(train_dataset), len(val_dataset)
# Record the split sizes on the shared config namespace.
gpt_args.train_size = train_size
gpt_args.val_size = val_size
print(train_size, val_size)

143156 1000


In [17]:
# gpt collator
# Built from the tokenizer plus the padding mode and maximum length
# taken from the config.
gpt_callator = GenerativeCollator(
    gpt_tokenizer,
    max_length=gpt_args.max_len,
    padding=gpt_args.padding,
)

In [18]:
# dataloader
DataLoader = torch.utils.data.DataLoader

# Training: shuffled batches of the configured size, collated by the collator itself.
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=gpt_callator,
    shuffle=True,
    batch_size=gpt_args.batch_size,
)
# Validation: one example per batch, fixed order, collated by the collator's
# `test` method.
val_dataloader = DataLoader(
    val_dataset,
    collate_fn=gpt_callator.test,
    shuffle=False,
    batch_size=1,
)

In [19]:
# scheduler len
# Total number of training batches across all epochs.
batches_per_epoch = len(train_dataloader)
scheduler_len = gpt_args.epochs * batches_per_epoch

# PyTorch Lightning training loop

In [20]:
# pl model
# Gather the constructor arguments first: the network and tokenizer, the
# optimisation settings, the collator, and the full config namespace.
model_kwargs = dict(
    GPT=gpt,
    tokenizer=gpt_tokenizer,
    batch_size=gpt_args.batch_size,
    scheduler_len=scheduler_len,
    num_warmup_steps=gpt_args.num_warmup_steps,
    lr=gpt_args.lr,
    max_len=gpt_args.max_len,
    collator=gpt_callator,
    base_config=gpt_args,
)
model = GPT_GenerativeModel(**model_kwargs)

  rank_zero_warn(


In [21]:
# logger
# Comet experiment logger; credentials, output directory and names come from
# the config namespace.
logger = pl.loggers.comet.CometLogger(
    api_key=gpt_args.api_key,
    save_dir=gpt_args.save_dir,
    project_name=gpt_args.project_name,
    experiment_name=gpt_args.experiment_name,
)
# Log the run configuration, but never the API key: passing the whole
# namespace would upload the secret as a hyperparameter of the experiment.
logged_hparams = {
    name: value for name, value in vars(gpt_args).items() if name != "api_key"
}
logger.log_hyperparams(logged_hparams)

CometLogger will be initialized in online mode
COMET INFO: Experiment is live on comet.com https://www.comet.com/anpopaicoconat/gpt-answer/2ba72f4292404b8abf505cea8cd77991



In [22]:
# checkpoint callback
# Keep only the single best checkpoint, where "best" is the lowest monitored
# `val_loss`; files are written to the configured save directory.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=gpt_args.save_dir,
    filename="gpt-{epoch:02d}-{val_loss:.2f}",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)

In [23]:
# trainer
# trainer
# Single-GPU trainer with gradient clipping, Comet logging, the checkpoint
# callback, and one sanity-check validation step before training.
trainer_callbacks = [checkpoint_callback]
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=gpt_args.epochs,
    gradient_clip_val=gpt_args.gradient_clip_val,
    num_sanity_val_steps=1,
    logger=logger,
    callbacks=trainer_callbacks,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# fit
# Train on the training loader and validate on the validation loader.
trainer.fit(
    model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)