In [1]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.

import logging
import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import pandas as pd

import datasets
import evaluate
import torch
from datasets import load_dataset
from accelerate import Accelerator, DistributedType

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    is_torch_tpu_available,
    set_seed,
)
from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Will error if the minimal required version of `datasets` is not installed. Remove at your own risk.
# NOTE(review): `check_min_version` is imported above but not called in this cell, so no minimum
# Transformers version is enforced here — confirm whether that is intentional.


require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

# Module-level logger shared by the later cells of this notebook.
logger = logging.getLogger(__name__)


In [3]:
# NOTE: `send_example_telemetry` is imported at the top of this notebook but is not called in this
# cell, so no usage telemetry is sent from here.

# Setup logging: one stdout handler on the root logger with a timestamped format.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
# `basicConfig` without `level` leaves the root logger at WARNING, which silently drops this
# notebook's own `logger.info(...)` messages (checkpoint detection, "*** Evaluate ***").
# Raise only the notebook's logger to INFO so third-party libraries keep their own verbosity.
logging.getLogger(__name__).setLevel(logging.INFO)

In [None]:
# Output directory for the fine-tuned model and its checkpoints.
# Suggested layout: ./<model size, e.g. small>/<model type, e.g. gpt2>/<dataset size, e.g. 50000>/<epochs, e.g. 5>
# Edit the path below to match your run.
direction = "./gpt_2_test"

In [6]:
# Training hyper-parameters. `num_train_epochs` and the checkpointing settings are the values most
# likely to need tuning. With `save_strategy='epoch'` a checkpoint is written at the end of every
# epoch; switch to `save_strategy='steps'` together with `save_steps` to checkpoint every N steps.
# Each checkpoint takes disk space, so save only as often as needed.
training_args = TrainingArguments(
    output_dir=direction,
    logging_dir='./logs',
    logging_steps=5000,
    save_strategy='epoch',
    num_train_epochs=5,
    warmup_steps=100,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)

In [7]:
# Set seed before initializing model.
set_seed(1234)

# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
#
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
# 'text' is found. You can easily tweak this behavior (see below).
#
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
# download the dataset.

data_files = {}
dataset_args = {}

# Share of the training file (in percent) that is held out as the validation split when no
# dedicated validation file is configured. The first N% becomes validation, the rest training.
validation_split_percentage = 15

# Change the file name if needed.
data_files["train"] = 'train.csv'

raw_datasets = load_dataset('csv', data_files=data_files)

# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
    raw_datasets["validation"] = load_dataset(
        'csv',
        data_files=data_files,
        split=f"train[:{validation_split_percentage}%]",
    )
    raw_datasets["train"] = load_dataset(
        'csv',
        data_files=data_files,
        split=f"train[{validation_split_percentage}%:]",
    )

Downloading and preparing dataset csv/default to /home/anaseh_umass_edu/.cache/huggingface/datasets/csv/default-70b08638e51d0fe0/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1829.18it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 104.91it/s]
                            

Dataset csv downloaded and prepared to /home/anaseh_umass_edu/.cache/huggingface/datasets/csv/default-70b08638e51d0fe0/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 97.35it/s]




In [8]:
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

#config_kwargs = {
#    "cache_dir": model_args.cache_dir,
#    "revision": model_args.model_revision,
#    "use_auth_token": True if model_args.use_auth_token else None,
#}



# Checkpoint that provides both the configuration and the tokenizer. Keeping it in one variable
# prevents the two from drifting apart when the checkpoint is changed.
# The model-loading cell further down names its checkpoint separately; keep both in sync.
model_name_or_path = "gpt2-large"

# Change the config if needed.
config = AutoConfig.from_pretrained(model_name_or_path)


# Change the tokenizer if needed.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [9]:
# Load the pretrained causal-LM weights using the config created earlier.
# Replace the checkpoint name to fine-tune a different model.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2-large",
    config=config,
)


In [10]:
# Grow the embedding matrix only when the tokenizer holds more tokens than the model has embedding
# rows; this avoids index errors. When training from scratch on a small vocabulary and a smaller
# embedding is wanted, drop the size check.
embedding_size = model.get_input_embeddings().weight.shape[0]
tokenizer_vocab_size = len(tokenizer)
if embedding_size < tokenizer_vocab_size:
    model.resize_token_embeddings(tokenizer_vocab_size)

In [None]:
# Optional cell: run it only to fine-tune a subset of the model.
# Indices into `model.transformer.h` of the blocks that stay trainable.
layers = [5, 10, 15]

# Freeze every parameter of the model first ...
model.requires_grad_(False)

# ... then switch gradients back on for the selected blocks only.
for layer in layers:
    model.transformer.h[layer].requires_grad_(True)

In [None]:
# Sanity check for the freezing cell: print every parameter that is still trainable
# (anything that is not listed has `requires_grad` set to False).
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.requires_grad)

In [11]:

# Preprocessing the datasets.
# Tokenization comes first; the column layout is read from the training split.
column_names = list(raw_datasets["train"].features)

# Tokenize the "text" column when there is one, otherwise fall back to the first column.
if "text" in column_names:
    text_column_name = "text"
else:
    text_column_name = column_names[0]



def tokenize_function(examples):
    """Tokenize the text column of a batch of examples.

    Args:
        examples: Mapping from column name to the batch of values for that column; it is indexed
            with the module-level `text_column_name`.

    Returns:
        Whatever the module-level `tokenizer` returns for the selected column.
    """
    return tokenizer(examples[text_column_name])

In [12]:
# Tokenize every split in batches and drop the original columns so only tokenizer outputs remain.
# The work is wrapped in `main_process_first()`, as provided by the training arguments.
with training_args.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        desc="Running tokenizer on dataset",
        remove_columns=column_names,
        batched=True,
    )



Running tokenizer on dataset: 100%|██████████| 1/1 [00:00<00:00,  4.77ba/s]
Running tokenizer on dataset: 100%|██████████| 1/1 [00:00<00:00,  1.43ba/s]


In [13]:
# Chunk length used when grouping tokens: the tokenizer's `model_max_length`, capped at 1024.
block_size = min(tokenizer.model_max_length, 1024)
if tokenizer.model_max_length > 1024:
    # The tokenizer allows longer sequences than the cap applied above; tell the user.
    logger.warning(
        "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
        " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
        " override this default with `--block_size xxx`."
    )



In [14]:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into chunks of `block_size` tokens.

    Args:
        examples: Mapping from feature name (it must include "input_ids") to a list of per-example
            token lists, as produced by a batched tokenization step.

    Returns:
        A dict with the same keys, where every value is a list of chunks of exactly `block_size`
        items, plus a "labels" entry that is a copy of the "input_ids" chunks.

    Notes:
        The trailing remainder that does not fill a whole chunk is always dropped. This includes
        the case where the whole batch is shorter than `block_size`, which yields no chunk at all.
        Emitting a short chunk there would produce rows of unequal length that cannot be stacked
        into a batch without padding.
    """
    # Concatenate all texts, feature by feature.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of chunks. Padding could be added instead of dropping if the
    # model supported it; customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Causal LM: the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
# With `batched=True` and the default batch size, this map hands 1,000 texts at a time to
# `group_texts`, so a remainder is thrown away for each of those groups. The batch size can be
# adjusted here, but a higher value might be slower to preprocess.
#
# This call runs without `num_proc`; `Dataset.map` accepts that argument if preprocessing needs
# to be parallelised. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

with training_args.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        desc=f"Grouping texts in chunks of {block_size}",
        batched=True,
    )

Grouping texts in chunks of 1024: 100%|██████████| 1/1 [00:00<00:00, 13.88ba/s]
Grouping texts in chunks of 1024: 100%|██████████| 1/1 [00:00<00:00,  3.20ba/s]


In [16]:
# Training split: keep every grouped chunk. Lower `max_train_samples` to train on a subset.
max_train_samples = len(lm_datasets["train"])
train_dataset = lm_datasets["train"].select(range(max_train_samples))

# Validation split: same treatment. Lower `max_eval_samples` to evaluate on a subset.
max_eval_samples = len(lm_datasets["validation"])
eval_dataset = lm_datasets["validation"].select(range(max_eval_samples))

In [17]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce model outputs to predicted ids before they are handed to the metric.

    Args:
        logits: Either the logits object itself or a tuple whose first element is the logits
            (the remaining elements, such as past key values, are ignored).
        labels: Unused; present because the caller passes it.

    Returns:
        The result of `argmax(dim=-1)` on the logits.
    """
    primary = logits[0] if isinstance(logits, tuple) else logits
    return primary.argmax(dim=-1)

# Accuracy metric loaded through `evaluate`; `compute_metrics` calls `metric.compute(...)` on it.
metric = evaluate.load("accuracy")

In [18]:
def compute_metrics(eval_preds):
    """Score next-token predictions against the labels with the module-level `metric`.

    Args:
        eval_preds: Pair `(preds, labels)` of 2-D arrays; `preds` are the ids already produced by
            `preprocess_logits_for_metrics`, so both have the same shape.

    Returns:
        The value returned by `metric.compute`.
    """
    predicted_ids, label_ids = eval_preds
    # The prediction at position i targets the token at position i + 1, so drop the last
    # prediction and the first label before flattening both.
    shifted_predictions = predicted_ids[:, :-1].reshape(-1)
    shifted_references = label_ids[:, 1:].reshape(-1)
    return metric.compute(predictions=shifted_predictions, references=shifted_references)

In [19]:
# Look for a checkpoint left in the output directory by a previous run.
# NOTE(review): `do_train` is not passed to `TrainingArguments` in this notebook; if it is left at
# a falsy default the whole check below is skipped and `last_checkpoint` stays None — confirm.
last_checkpoint = None
should_scan_output_dir = (
    os.path.isdir(training_args.output_dir)
    and training_args.do_train
    and not training_args.overwrite_output_dir
)
if should_scan_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    if last_checkpoint is not None:
        # A checkpoint exists; announce the automatic resume unless one was requested explicitly.
        if training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    elif len(os.listdir(training_args.output_dir)) > 0:
        # No checkpoint, but the directory has content: refuse to continue.
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

In [20]:
# Build the Trainer from the model, the datasets and the metric hooks defined in earlier cells.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # The data collator would otherwise default to DataCollatorWithPadding, so set it explicitly.
    data_collator=default_data_collator,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
)

In [21]:
# Training
# An explicitly requested checkpoint takes precedence over one detected in the output directory;
# when neither is available `checkpoint` is None.
checkpoint = (
    training_args.resume_from_checkpoint
    if training_args.resume_from_checkpoint is not None
    else last_checkpoint
)
train_result = trainer.train(resume_from_checkpoint=checkpoint)
# Saves the tokenizer too for easy upload
trainer.save_model()

***** Running training *****
  Num examples = 41
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 105
  Number of trainable parameters = 1315575808


Step,Training Loss


Saving model checkpoint to ./gpt_2_test/checkpoint-100
Configuration saved in ./gpt_2_test/checkpoint-100/config.json
Configuration saved in ./gpt_2_test/checkpoint-100/generation_config.json
Model weights saved in ./gpt_2_test/checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./gpt_2_test/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./gpt_2_test/checkpoint-100/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./gpt_2_test
Configuration saved in ./gpt_2_test/config.json
Configuration saved in ./gpt_2_test/generation_config.json
Model weights saved in ./gpt_2_test/pytorch_model.bin
tokenizer config file saved in ./gpt_2_test/tokenizer_config.json
Special tokens file saved in ./gpt_2_test/special_tokens_map.json


In [23]:
# Attach the number of training chunks to the metrics of the training run, then log and persist
# the metrics together with the trainer state.
metrics = train_result.metrics
metrics.update(train_samples=len(train_dataset))

for report in (trainer.log_metrics, trainer.save_metrics):
    report("train", metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =        5.0
  total_flos               =  1417541GF
  train_loss               =     1.6904
  train_runtime            = 0:06:10.33
  train_samples            =         41
  train_samples_per_second =      0.554
  train_steps_per_second   =      0.284


In [24]:
logger.info("*** Evaluate ***")

# Evaluate on the validation chunks and record how many were used.
metrics = trainer.evaluate()
metrics.update(eval_samples=len(eval_dataset))

# Perplexity is exp(eval loss); an exponent too large for a float is reported as infinity.
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
metrics.update(perplexity=perplexity)

for report in (trainer.log_metrics, trainer.save_metrics):
    report("eval", metrics)

***** Running Evaluation *****
  Num examples = 219
  Batch size = 2


***** eval metrics *****
  epoch                   =        5.0
  eval_accuracy           =     0.4562
  eval_loss               =     3.0792
  eval_runtime            = 0:01:08.58
  eval_samples            =        219
  eval_samples_per_second =      3.193
  eval_steps_per_second   =      1.604
  perplexity              =    21.7405
