In [23]:
# !pip install transformers datasets rouge_score huggingface_hub
# !sudo apt-get install git-lfs

In [24]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
import datetime
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch
from pathlib import Path

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)
from datasets import load_metric
from huggingface_hub import Repository, notebook_login, create_repo, HfApi
import pdb


from torch.utils.tensorboard import SummaryWriter

# Configs
# ROUGE metric object; computeRouge() calls its .compute() method.
rouge_score = load_metric("rouge")
# Module-level logger shared by the dataset, training and evaluation code.
logger = logging.getLogger(__name__)

In [25]:
# # This does not work with mimicbot
# notebook_login()

In [54]:
# TODO: delete this and add proper configs when transforming into script
# NOTE(review): data_path is a machine-specific absolute path; the notebook
# only runs unchanged on a machine where this directory exists.

# Root data directory; models and the session folder are resolved beneath it.
data_path = Path("C:/Users/1seba/AppData/Roaming/mimicbot/data")
models_path = data_path / "models"
session_path = data_path / "TutorialServer" / "2022-07-05-21-31"
# Identifier of the pretrained model to start from.
MODEL_FROM = "microsoft/DialoGPT-small"
# Identifier the fine-tuned model is saved/pushed under.
MODEL_TO = "SebastianS/test-final"
# MODEL_TO = "SebastianS/test-small"


In [55]:
# TODO: use config paths
# Training split for the selected session.
trn_df = pd.read_csv(str(session_path / "training_data" / "train.csv"))
# Validation split; note that it is read from the file named test.csv.
val_df = pd.read_csv(str(session_path / "training_data" / "test.csv"))
# Show the first row as a quick sanity check of the loaded columns.
trn_df.head(1)

Unnamed: 0,response,context1,context2
0,8...,7...,6...


In [56]:
# Args to allow for easy conversion of python script to notebook
# TODO: update save to and model_name
class Args():
  """Plain settings container standing in for command-line arguments.

  Every attribute is an ordinary instance attribute, so cells can tweak a
  value (e.g. ``args.num_train_epochs = 1``) before calling ``main``.
  """

  def __init__(self):
    # Paths and model identifiers (derived from the notebook-level constants).
    self.output_dir = str(models_path)
    # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = torch.device("cpu")
    self.model_type = 'gpt2'
    self.model_path = str(models_path / MODEL_TO)
    self.model_name = MODEL_FROM
    self.config_name = "microsoft/DialoGPT-small"
    self.tokenizer_name = MODEL_FROM
    self.save_to = MODEL_TO
    self.repo = None
    self.cache_dir = str(models_path / "cache")
    self.block_size = 512
    # Which phases main() runs.
    self.do_train = True
    self.do_eval = True
    self.evaluate_during_training = False
    # Optimisation hyper-parameters.
    self.per_gpu_train_batch_size = 4
    self.per_gpu_eval_batch_size = 4
    self.gradient_accumulation_steps = 1
    self.learning_rate = 5e-5
    self.weight_decay = 0.0
    self.adam_epsilon = 1e-8
    self.max_grad_norm = 1.0
    self.num_train_epochs = 3
    self.max_steps = -1
    self.warmup_steps = 0
    # Logging / checkpointing cadence (in optimisation steps).
    self.logging_steps = 1000
    self.save_steps = 3500
    self.save_total_limit = None
    self.eval_all_checkpoints = False
    # Runtime / environment switches.
    self.no_cuda = not torch.cuda.is_available()
    self.overwrite_output_dir = True
    self.overwrite_cache = True
    self.should_continue = False
    self.seed = 42
    self.local_rank = -1
    self.fp16 = False
    self.fp16_opt_level = 'O1'

  def __repr__(self):
    # Without this, logging the settings only shows "<__main__.Args object at 0x...>".
    fields = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
    return f"{type(self).__name__}({fields})"
args = Args()

In [49]:
def construct_conv(row, tokenizer, eos = True):
    """Encode every entry of ``row`` and join them into one flat list of token ids.

    Each entry is encoded with ``tokenizer.encode`` and followed by
    ``tokenizer.eos_token_id``; the encoded entries are concatenated in the
    reverse of their order in ``row``. The ``eos`` argument is accepted but
    not used.
    """
    encoded_turns = [tokenizer.encode(text) + [tokenizer.eos_token_id] for text in row]
    return [token_id for turn in reversed(encoded_turns) for token_id in turn]

class ConversationDataset(Dataset):
    """Dataset of token-id sequences, one per dataframe row, with an on-disk cache.

    Each row of ``df`` is converted with ``construct_conv``; the resulting
    lists are pickled to a file inside ``args.cache_dir`` and re-used on later
    constructions unless ``args.overwrite_cache`` is set.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        """Build the examples from ``df`` or load them from the cache file.

        Args:
            tokenizer: Tokenizer passed through to ``construct_conv``.
            args: Settings object; reads ``cache_dir``, ``model_type`` and
                ``overwrite_cache``.
            df: Dataframe whose rows are turned into examples.
            block_size: Only used to build the cache file name.
        """
        # Only affects the cache file name below; sequences are not truncated here.
        block_size = block_size - (tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        # The cache directory may not exist yet; without this the write below
        # fails with FileNotFoundError.
        os.makedirs(directory, exist_ok=True)
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )
        # NOTE(review): the file name does not depend on `df`, so training and
        # evaluation datasets share one cache file. With overwrite_cache=False
        # whichever split was cached first is returned for both.

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only point cache_dir at locations you trust.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = [construct_conv(row, tokenizer) for _, row in df.iterrows()]

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return example ``item`` as a ``torch.long`` tensor."""
        return torch.tensor(self.examples[item], dtype=torch.long)

In [50]:
def encodeWithoutResponse(row, tokenizer):
  """Encode ``row`` as a generation prompt with its last message removed.

  The row is flattened with ``construct_conv``, decoded back to text and split
  on the EOS token. The final two pieces (the last message and the empty string
  after the trailing EOS) are dropped, the remainder is re-joined with EOS plus
  one trailing EOS, and the result is encoded as a PyTorch tensor.
  """
  eosToken = tokenizer.eos_token
  decodedConversation = tokenizer.decode(construct_conv(row, tokenizer))
  contextMessages = decodedConversation.split(eosToken)[:-2]
  promptText = eosToken.join(contextMessages) + eosToken
  return tokenizer.encode(promptText, return_tensors="pt")

In [51]:
def decodeGeneratedOutput(input, output, tokenizer):
  """Decode only the tokens of ``output`` that come after the prompt.

  The prompt length is the last dimension of ``input``; everything beyond that
  position in the first row of ``output`` is decoded with special tokens
  skipped.
  """
  promptLength = input.shape[-1]
  generatedIds = output[:, promptLength:][0]
  return tokenizer.decode(generatedIds, skip_special_tokens=True)

# Run Benchmark

In [52]:
# Generation takes a while, so the benchmark only uses the first 30 validation rows
benchmarkDf = val_df.iloc[:30]

In [33]:
# Load the config, tokenizer and LM-head model named in `args` for the baseline
# benchmark; downloads are cached under args.cache_dir.
# use_auth_token=True in case your model is private
# NOTE(review): presumably this needs a stored Hugging Face token, and the
# notebook_login() cell above is commented out — confirm credentials exist.
config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir, use_auth_token=True)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir, use_auth_token=True)
model = AutoModelWithLMHead.from_pretrained(
    args.model_name,
    from_tf=False,
    config=config,
    cache_dir=args.cache_dir,
    use_auth_token=True
).to(args.device)  # move the weights to the device configured in args

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/335M [00:00<?, ?B/s]

In [34]:
def makePreds(df, model, tokenizer):
  """Generate one response per row of ``df`` and return them as a list of strings.

  For each row the prompt is built with ``encodeWithoutResponse``, moved to
  ``args.device`` (notebook-level ``args``) and passed to ``model.generate``;
  only the newly generated tokens are decoded. Sampling is enabled, so the
  predictions vary between calls unless the random state is fixed.

  The model is switched to evaluation mode while generating, so dropout is
  inactive even when this is called from the training loop. The previous
  mode is restored afterwards.
  """
  preds = []
  wasTraining = model.training
  model.eval()
  try:
    for _, row in df.iterrows():
      testInput = encodeWithoutResponse(row, tokenizer).to(args.device)
      chatHistoryIds = model.generate(
        testInput,
        max_length = 200,
        pad_token_id = tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8,
      )
      preds.append(decodeGeneratedOutput(testInput, chatHistoryIds, tokenizer))
  finally:
    # Put the model back into whatever mode the caller had it in.
    model.train(wasTraining)
  return preds

In [35]:
def computeRouge(df, model, tokenizer):
  """Score generated responses against the ``response`` column of ``df``.

  Returns a 3-tuple ``(scores, labels, preds)`` where ``scores`` maps each
  metric name to its mid F-measure as a percentage rounded to 4 decimals,
  ``labels`` are the reference responses and ``preds`` the generated ones.
  """
  labels = list(df["response"])
  preds = makePreds(df, model, tokenizer)
  scores = rouge_score.compute(predictions=preds, references=labels)
  percentages = {}
  for metricName, aggregate in scores.items():
    percentages[metricName] = np.round(aggregate.mid.fmeasure*100, 4)
  return (percentages, labels, preds)


In [36]:
benchmarkScore = computeRouge(benchmarkDf, model, tokenizer)
benchmarkScore[0]

{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}

In [37]:
list(zip(benchmarkScore[1], benchmarkScore[2]))[:60]

[('8...', ''),
 ('8...', '...'),
 ('8...', '...'),
 ('8...', ''),
 ('8...', ',..'),
 ('8...', ',.'),
 ('8...', ''),
 ('8...', ',,.'),
 ('8...', ''),
 ('8...', ',,,.'),
 ('8...', ''),
 ('8...', ',.'),
 ('8...', ''),
 ('8...', ',.'),
 ('8...', ',.'),
 ('7...', ''),
 ('7...', '..'),
 ('7...', ''),
 ('7...', ',.'),
 ('7...', ''),
 ('7...', '?'),
 ('7...', ',.'),
 ('7...', ''),
 ('7...', ''),
 ('7...', ',,.'),
 ('7...', ''),
 ('7...', ',.'),
 ('7...', ''),
 ('7...', ',,'),
 ('7...', ',,,,.')]

# Saving

In [57]:
# Create the Hub repository, upload the README and push the tokenizer.
# Best effort: a failure is reported but does not stop the notebook.
try:
  create_repo(args.save_to, private=False)
  hf_api = HfApi()
  hf_api.upload_file(
      path_or_fileobj=str(Path.cwd() / "huggingface/README.md"),
      path_in_repo="README.md",
      repo_id=args.save_to)
  tokenizer.push_to_hub(args.model_path, commit_message="init tokenizer")
except Exception as error:
  # Catch Exception rather than everything so Ctrl-C still works, and show the
  # real cause: a missing README or an auth problem is not "already exists".
  print(f"Repo already exists or initial upload failed: {error!r}")

To https://huggingface.co/SebastianS/test-final
   d062c3f..8fe4c07  main -> main

   d062c3f..8fe4c07  main -> main



'https://huggingface.co/SebastianS/test-final/commit/8fe4c07f55432e19cf609b9dd896e6f464240848'

In [39]:
def saveToRepo(args, model, tokenizer, message):
  """Push ``model`` to the Hub via ``args.model_path`` with a "model: ..." commit message.

  ``tokenizer`` is accepted but not pushed here.
  """
  commitMessage = f"model: {message}"
  model.push_to_hub(args.model_path, commit_message=commitMessage)
  # tokenizer.push_to_hub(args.model_path, commit_message=f"tokenizer: {message}")

# Train

In [40]:
# Caching and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Build a ``ConversationDataset`` from ``df_val`` when ``evaluate`` is truthy, else from ``df_trn``."""
    if evaluate:
        selected_df = df_val
    else:
        selected_df = df_trn
    return ConversationDataset(tokenizer, args, selected_df)


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch with ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoints so that at most ``args.save_total_limit`` remain.

    Does nothing when the limit is unset, zero or negative, or when the number
    of checkpoints found by ``_sorted_checkpoints`` does not exceed it.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    # Oldest first, so the surplus to remove is at the front of the list.
    ordered = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(ordered) - limit
    if surplus <= 0:
        return

    for stale_checkpoint in ordered[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(stale_checkpoint))
        shutil.rmtree(stale_checkpoint)

In [41]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, saveToHub, df_trn=None, df_val=None) -> Tuple[int, float]:
    """Fine-tune ``model`` on ``train_dataset``.

    After every completed epoch the ROUGE benchmark is printed (using the
    notebook-level ``benchmarkDf``) and, when ``saveToHub`` is truthy, the
    model is pushed with ``saveToRepo``.

    Args:
        args: Settings object. Reads the optimisation, logging, checkpointing
            and distributed fields; writes ``args.train_batch_size`` (and
            ``args.num_train_epochs`` when ``args.max_steps > 0``).
        train_dataset: Dataset whose items are tensors; a batch is padded
            together with ``pad_sequence``.
        model: Model to optimise, called as ``model(inputs, labels=labels)``.
        tokenizer: Supplies the padding id and is saved with each checkpoint.
        saveToHub: When truthy, push the model to the Hub after each epoch.
        df_trn: Optional training dataframe forwarded to ``evaluate``.
        df_val: Optional validation dataframe forwarded to ``evaluate``. Both
            are only needed when ``args.evaluate_during_training`` is set.

    Returns:
        ``(global_step, average_loss)``; the average is ``nan`` when no
        optimisation step was taken.
    """
    if args.local_rank in [-1, 0]:
        # args.output_dir is a plain string, so wrap it in Path *before* joining.
        tb_log_dir = Path(args.output_dir) / "SummaryWriter-log"
        os.makedirs(tb_log_dir, exist_ok=True)
        tb_writer = SummaryWriter(log_dir=str(tb_log_dir))
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Pad with zeros when the tokenizer defines no pad token.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_path
        and os.path.isfile(os.path.join(args.model_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_path and os.path.exists(args.model_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The model name does not end in a step number: not a checkpoint.
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for epoch in train_iterator:
        now = datetime.datetime.now()
        print(f"({now.hour}:{now.minute})Iteration #", epoch + 1)
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            # Skip batches whose padded length exceeds 1024 tokens.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        if df_trn is None or df_val is None:
                            # evaluate() needs the dataframes; without them the call would raise.
                            logger.warning(
                                "evaluate_during_training is set but df_trn/df_val were not passed to train(); "
                                "skipping evaluation."
                            )
                        else:
                            results = evaluate(args, model, tokenizer, df_trn, df_val)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() reports the rate set by the most recent scheduler.step().
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        # run benchmark assessment
        # Use the unwrapped model: the parallel wrappers expose neither
        # generate() nor push_to_hub().
        benchmark_model = model.module if hasattr(model, "module") else model
        # Generate without dropout; model.train() is called again on the next step.
        benchmark_model.eval()
        print(computeRouge(benchmarkDf, benchmark_model, tokenizer)[0])
        # save
        if saveToHub:
          saveToRepo(args, benchmark_model, tokenizer, f"Epoch #{epoch+1}")
    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # No optimisation step (e.g. an empty dataloader) means no average to report.
    average_loss = tr_loss / global_step if global_step > 0 else float("nan")
    return global_step, average_loss

# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    """Compute the perplexity of ``model`` on the validation dataframe.

    Args:
        args: Settings object. Reads ``output_dir``, ``per_gpu_eval_batch_size``,
            ``n_gpu`` and ``device``; writes ``args.eval_batch_size``.
        model: Model called as ``model(inputs, labels=labels)``.
        tokenizer: Used to build the dataset and to pick the padding id.
        df_trn: Training dataframe (passed through to ``load_and_cache_examples``).
        df_val: Validation dataframe the dataset is built from.
        prefix: Sub-directory of ``args.output_dir`` for ``eval_results.txt``;
            also shown in the log messages.

    Returns:
        ``{"perplexity": tensor}``, the exponential of the mean batch loss.

    Raises:
        ValueError: If the evaluation dataloader yields no batches.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        # Pad with zeros when the tokenizer defines no pad token.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate (do not wrap a model that is already wrapped)
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        # drop_last=True discards an incomplete final batch, so a dataset smaller
        # than the batch size produces no batches at all.
        raise ValueError(
            "No evaluation batches were produced: the evaluation data is empty or "
            "smaller than the evaluation batch size."
        )

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # A non-empty prefix points into a sub-directory that may not exist yet.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [42]:
# Main runner

def main(df_trn, df_val, args, saveToHub = False):
    """Run the configured training and/or evaluation and return the eval results.

    Args:
        df_trn: Dataframe used to build the training dataset.
        df_val: Dataframe used to build the evaluation dataset.
        args: Settings object. ``device`` and ``n_gpu`` are set here, and
            ``model_name`` is replaced by the latest checkpoint when
            ``should_continue`` is set.
        saveToHub: Forwarded to ``train``; when truthy the model is pushed to
            the Hub after every epoch.

    Returns:
        Dict mapping ``"<metric>_<global_step>"`` to the metric value for each
        evaluated checkpoint (empty when evaluation is skipped).

    Raises:
        ValueError: If ``should_continue`` is set but no checkpoint exists, or
            if the output directory is non-empty and may not be overwritten.
    """

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training
    device = torch.device("cpu" if args.no_cuda else "cuda")
    # Honour no_cuda: report zero GPUs when training is forced onto the CPU, so
    # batch-size scaling and DataParallel wrapping stay consistent with `device`.
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, saveToHub)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            # os.path.basename understands the platform's separators; splitting
            # on "/" returns the whole path for Windows-style checkpoint paths.
            prefix = os.path.basename(checkpoint) if "checkpoint" in checkpoint else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

In [43]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [44]:
# Short run: one epoch with batch size 1 for both training and evaluation.
args.num_train_epochs = 1
args.per_gpu_train_batch_size = 1
args.per_gpu_eval_batch_size = 1
# The final True is saveToHub, so the model is pushed after the epoch.
main(trn_df, val_df, args, True)

07/09/2022 18:23:20 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x0000021F814A6D70>
07/09/2022 18:23:20 - INFO - __main__ -   Creating features from dataset file at C:\Users\1seba\AppData\Roaming\mimicbot\data\models\cache
07/09/2022 18:23:20 - INFO - __main__ -   Saving features into cached file C:\Users\1seba\AppData\Roaming\mimicbot\data\models\cache\gpt2_cached_lm_-512
07/09/2022 18:23:21 - INFO - __main__ -   ***** Running training *****
07/09/2022 18:23:21 - INFO - __main__ -     Num examples = 30
07/09/2022 18:23:21 - INFO - __main__ -     Num Epochs = 1
07/09/2022 18:23:21 - INFO - __main__ -     Instantaneous batch size per GPU = 1
07/09/2022 18:23:21 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 1
07/09/2022 18:23:21 - INFO - __main__ -     Gradient Accumulation steps = 1
07/09/2022 18:23:21 - INFO - __main__ -     Total optimization steps = 30


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

(18:23)Iteration # 1


Iteration:   0%|          | 0/30 [00:00<?, ?it/s]

{'rouge1': 50.0, 'rouge2': 0.0, 'rougeL': 50.0, 'rougeLsum': 50.0}


Cloning https://huggingface.co/SebastianS/test-final into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/487M [00:00<?, ?B/s]

To https://huggingface.co/SebastianS/test-final
   18cacf3..d062c3f  main -> main

   18cacf3..d062c3f  main -> main

07/09/2022 18:25:04 - INFO - __main__ -    global_step = 30, average loss = 2.6565797845522563
07/09/2022 18:25:04 - INFO - __main__ -   Saving model checkpoint to C:\Users\1seba\AppData\Roaming\mimicbot\data\models
07/09/2022 18:25:21 - INFO - __main__ -   Evaluate the following checkpoints: ['C:\\Users\\1seba\\AppData\\Roaming\\mimicbot\\data\\models']
07/09/2022 18:25:28 - INFO - __main__ -   Creating features from dataset file at C:\Users\1seba\AppData\Roaming\mimicbot\data\models\cache
07/09/2022 18:25:28 - INFO - __main__ -   Saving features into cached file C:\Users\1seba\AppData\Roaming\mimicbot\data\models\cache\gpt2_cached_lm_-512
07/09/2022 18:25:28 - INFO - __main__ -   ***** Running evaluation  *****
07/09/2022 18:25:28 - INFO - __main__ -     Num examples = 30
07/09/2022 18:25:28 - INFO - __main__ -     Batch size = 1


Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]

07/09/2022 18:25:29 - INFO - __main__ -   ***** Eval results  *****
07/09/2022 18:25:29 - INFO - __main__ -     perplexity = tensor(6.2098)


{'perplexity_': tensor(6.2098)}

In [None]:
# 63
%debug

> [1;32mc:\projects\mimicbot\mimicbotwrapper\env\lib\site-packages\huggingface_hub\repository.py[0m(1351)[0;36mis_repo_clean[1;34m()[0m
[1;32m   1349 [1;33m            ).stdout.strip()
[0m[1;32m   1350 [1;33m        [1;32mexcept[0m [0msubprocess[0m[1;33m.[0m[0mCalledProcessError[0m [1;32mas[0m [0mexc[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m-> 1351 [1;33m            [1;32mraise[0m [0mEnvironmentError[0m[1;33m([0m[0mexc[0m[1;33m.[0m[0mstderr[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m   1352 [1;33m[1;33m[0m[0m
[0m[1;32m   1353 [1;33m        [1;32mreturn[0m [0mlen[0m[1;33m([0m[0mgit_status[0m[1;33m)[0m [1;33m==[0m [1;36m0[0m[1;33m[0m[1;33m[0m[0m
[0m


In [58]:
# Reload the tokenizer and model by the args.save_to identifier and score them
# on the same benchmark rows used for the baseline.
tokenizer = AutoTokenizer.from_pretrained(args.save_to)
model = AutoModelWithLMHead.from_pretrained(args.save_to).to(args.device)
newBenchmarkScores = computeRouge(benchmarkDf, model, tokenizer)
print(newBenchmarkScores[0])

Downloading:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438 [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/487M [00:00<?, ?B/s]

{'rouge1': 56.6667, 'rouge2': 0.0, 'rougeL': 56.6667, 'rougeLsum': 56.6667}


# Overfit Test

In [None]:
# Overfit check: train for 10 epochs on the first 10 benchmark rows and
# evaluate on those same rows; saveToHub is False so nothing is pushed.
args.num_train_epochs = 10
args.per_gpu_train_batch_size = 1
args.per_gpu_eval_batch_size = 1
main(benchmarkDf[:10], benchmarkDf[:10], args, False)

06/04/2022 14:21:43 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7fcaeb5c26d0>
06/04/2022 14:21:43 - INFO - __main__ -   Creating features from dataset file at cached
06/04/2022 14:21:43 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_-512
06/04/2022 14:21:43 - INFO - __main__ -   ***** Running training *****
06/04/2022 14:21:43 - INFO - __main__ -     Num examples = 10
06/04/2022 14:21:43 - INFO - __main__ -     Num Epochs = 10
06/04/2022 14:21:43 - INFO - __main__ -     Instantaneous batch size per GPU = 1
06/04/2022 14:21:43 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 1
06/04/2022 14:21:43 - INFO - __main__ -     Gradient Accumulation steps = 1
06/04/2022 14:21:43 - INFO - __main__ -     Total optimization steps = 100


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration # 1


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 4.9399, 'rouge2': 0.2778, 'rougeL': 4.7881, 'rougeLsum': 4.7419}
Iteration # 2


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 2.9831, 'rouge2': 0.101, 'rougeL': 2.7051, 'rougeLsum': 2.7258}
Iteration # 3


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 2.399, 'rouge2': 0.1282, 'rougeL': 2.4708, 'rougeLsum': 2.338}
Iteration # 4


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 2.7534, 'rouge2': 0.0, 'rougeL': 2.5685, 'rougeLsum': 2.5522}
Iteration # 5


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 5.9458, 'rouge2': 1.6667, 'rougeL': 5.5407, 'rougeLsum': 5.639}
Iteration # 6


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 5.9651, 'rouge2': 1.25, 'rougeL': 5.7726, 'rougeLsum': 5.8152}
Iteration # 7


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 8.3405, 'rouge2': 3.8704, 'rougeL': 8.3641, 'rougeLsum': 8.4126}
Iteration # 8


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 8.9175, 'rouge2': 6.6667, 'rougeL': 8.8414, 'rougeLsum': 8.7674}
Iteration # 9


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

{'rouge1': 9.9925, 'rouge2': 6.7816, 'rougeL': 9.7553, 'rougeLsum': 9.8607}
Iteration # 10


Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

06/04/2022 14:23:38 - INFO - __main__ -    global_step = 100, average loss = 2.149359789341688
06/04/2022 14:23:38 - INFO - __main__ -   Saving model checkpoint to output-small


{'rouge1': 9.1861, 'rouge2': 6.119, 'rougeL': 9.0294, 'rougeLsum': 9.0687}


06/04/2022 14:23:49 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
06/04/2022 14:23:53 - INFO - __main__ -   Creating features from dataset file at cached
06/04/2022 14:23:53 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_-512
06/04/2022 14:23:53 - INFO - __main__ -   ***** Running evaluation  *****
06/04/2022 14:23:53 - INFO - __main__ -     Num examples = 10
06/04/2022 14:23:53 - INFO - __main__ -     Batch size = 1


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

06/04/2022 14:23:54 - INFO - __main__ -   ***** Eval results  *****
06/04/2022 14:23:54 - INFO - __main__ -     perplexity = tensor(2.0856)


{'perplexity_': tensor(2.0856)}

In [None]:
# Tokenizer comes from args.tokenizer_name; the model weights are loaded from
# args.output_dir, where main() saved them.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
model = AutoModelWithLMHead.from_pretrained(args.output_dir).to(args.device)

# Score the same 10 rows the overfit run trained on.
overfitScores = computeRouge(benchmarkDf[:10], model, tokenizer)
print(overfitScores[0])



{'rouge1': 62.5, 'rouge2': 60.0, 'rougeL': 63.4524, 'rougeLsum': 63.4524}


In [None]:
list(zip(overfitScores[1], overfitScores[2]))

[('so im thinking i probably wont be able to stream the game to you but i might put a camera in front of my screen lol',
  'so im thinking i probably wont be able to stream the game to you but i might put a camera in front of my screen lol'),
 ('I got up for two minutes at 11am to get water and then went back to sleep',
  'I slept 8 hrs'),
 ('regardless of the outcome', 'regardless of the outcome'),
 ('the human race', 'that is not the case'),
 ("I'm left confused", 'Ok bye'),
 ('*psst* you guys wanna quake', ''),
 ('Started a call that lasted 0 minutes.',
  'Started a call that lasted 0 minutes.'),
 ('yeah give me a second', 'yeah give me a second'),
 ('just search for mick gordon', 'just search for mick gordon'),
 ('when do you want to play', 'when do you want to play')]