# prepare

In [1]:
# imports
import os
import argparse
import json

import torch
import pytorch_lightning as pl
import torchmetrics
import transformers

from utils import (
    PersonaDataset,
    GenerativeCollator,
    RetrievalCollator,
    aggregate_encoder_output,
    sim_func,
)
from models import RetrievalModel, GenerativeModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# proxy
# Point the HTTP, HTTPS and FTP proxy environment variables at the same
# proxy URL in a single update.
os.environ.update(
    dict.fromkeys(
        ("http_proxy", "https_proxy", "ftp_proxy"),
        "http://proxy.ad.speechpro.com:3128",
    )
)

In [3]:
# config
# Read the JSON settings, then expose every key as an attribute of an
# argparse namespace. The argument string is empty, so nothing is parsed
# from the command line; the parser only returns the pre-filled namespace.
with open("configs/gpt_config.json", "r") as config:
    opt = json.load(config)
parser = argparse.ArgumentParser()
gpt_args = parser.parse_args("", namespace=argparse.Namespace(**opt))

# pretrained model

In [4]:
# gpt tokenizer
# The extra special tokens live in a JSON file whose path comes from the config.
with open(gpt_args.special_tokens_dict, "r") as config:
    special_tokens_dict = json.loads(config.read())

# Padding/truncation sides are taken from the config as well.
gpt_tokenizer = transformers.AutoTokenizer.from_pretrained(
    gpt_args.pretrained_gpt,
    padding_side=gpt_args.padding_side,
    truncation_side=gpt_args.truncation_side,
)
# Reuse the end-of-sequence token as the padding token.
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
# Kept as the last expression so the cell displays the call's return value.
gpt_tokenizer.add_special_tokens(special_tokens_dict)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


7

In [5]:
# gpt
# Load the pretrained LM-head model named in the config.
gpt = transformers.GPT2LMHeadModel.from_pretrained(gpt_args.pretrained_gpt)
# Resize the token embeddings to the tokenizer's current length (special
# tokens were added to the tokenizer). Kept as the last expression so the
# resulting embedding module is displayed.
gpt.resize_token_embeddings(new_num_tokens=len(gpt_tokenizer))

Embedding(50265, 1280)

# data

In [6]:
# dataset
dataset = PersonaDataset(gpt_args.data_path, mod="get_examples_gpt")

# Hold out len(dataset) // val_split examples for validation; the remainder
# is the training split. Both sizes are stored on gpt_args.
val_size = len(dataset) // gpt_args.val_split
train_size = len(dataset) - val_size
vars(gpt_args).update({"train_size": train_size, "val_size": val_size})

# Seed the split so that restarting the kernel and re-running the notebook
# gives the same train/validation partition. An unseeded split changes on
# every run, so validation numbers are not comparable between runs.
# Uses `gpt_args.seed` when the config provides one, otherwise a fixed default.
split_seed = getattr(gpt_args, "seed", 42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

In [7]:
# gpt collator
# The variable keeps its `gpt_callator` spelling because the dataloader cell
# refers to it by that name.
gpt_callator = GenerativeCollator(
    gpt_tokenizer,
    max_length=gpt_args.max_len,
    padding=gpt_args.padding,
)

In [8]:
# dataloader
# Train on the training split only. Building this loader from the full
# `dataset` would also train on the examples held out in `val_dataset`,
# which makes the validation metrics meaningless.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=gpt_args.batch_size,
    shuffle=True,
    collate_fn=gpt_callator,
)
# Validation runs one example at a time, in a fixed order, and is collated
# with the collator's `test` method.
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=gpt_callator.test,
)

In [9]:
# scheduler len: number of training batches per epoch times the number of epochs
scheduler_len = gpt_args.epochs * len(train_dataloader)

# PyTorch Lightning training loop

In [10]:
# pl model
# Wrap the GPT model and tokenizer in the project's GenerativeModel.
# All arguments are positional, so their order must match the constructor
# signature defined in `models` (not visible in this notebook).
# NOTE(review): presumably (model, tokenizer, batch size, scheduler length,
# warmup steps, learning rate, max length) - confirm against models.py.
model = GenerativeModel(
    gpt,
    gpt_tokenizer,
    gpt_args.batch_size,
    scheduler_len,
    gpt_args.num_warmup_steps,
    gpt_args.lr,
    gpt_args.max_len,
)

  rank_zero_warn(


In [11]:
# logger
# Comet experiment logger; credentials and naming come from the config.
logger = pl.loggers.comet.CometLogger(
    api_key=gpt_args.api_key,
    save_dir=gpt_args.save_dir,
    project_name=gpt_args.project_name,
    experiment_name=gpt_args.experiment_name,
)
# Log the run configuration without the credential. Passing `gpt_args`
# directly would record `api_key` as a hyperparameter of the experiment,
# where anyone who can view the experiment can read it.
logged_hparams = {
    name: value for name, value in vars(gpt_args).items() if name != "api_key"
}
logger.log_hyperparams(logged_hparams)

CometLogger will be initialized in online mode
COMET INFO: Experiment is live on comet.com https://www.comet.com/anpopaicoconat/gpt-answer/17cfdbfd0d3c4f389712c2fa7fee685d



In [12]:
# trainer
# Single-GPU trainer. Epoch count and gradient clipping value come from the
# config; one validation step is run as a sanity check before training.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=gpt_args.epochs,
    gradient_clip_val=gpt_args.gradient_clip_val,
    num_sanity_val_steps=1,
    logger=logger,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# fit
# NOTE: the recorded output of this cell is a CUDA out-of-memory error that
# reports 0 bytes allocated by PyTorch and only ~200 MiB free on the device,
# i.e. the GPU memory was apparently already occupied when the run started.
trainer.fit(
    model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

RuntimeError: CUDA out of memory. Tried to allocate 246.00 MiB (GPU 0; 23.69 GiB total capacity; 0 bytes already allocated; 201.56 MiB free; 0 bytes reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF