In [1]:
# Imports
from importlib import reload
import os
import shutil
from pathlib import Path
import numpy as np
import torch
from datasets import load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import OPTForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import evaluate

import utils.preprocess_data
reload(utils.preprocess_data)
from utils.preprocess_data import preprocess_orig_data

  from .autonotebook import tqdm as notebook_tqdm


#### Setup

In [8]:
# Set up DL framework and device
dl_framework = 'pt'

is_gpu = torch.cuda.is_available()
if is_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Set up file and dir paths
clean = True # Whether to clean the raw data directory
data_type = 'applications' # 'is_experimental' or 'applications'

path_to_galactica_folder = Path(r'../galactica')

if data_type == 'applications':
    path_to_raw = Path(r'./data/raw_applications.json')
elif data_type == 'is_experimental':
    path_to_raw = Path(r'./data/raw_is_experimental.json')
path_to_data = Path(path_to_galactica_folder, path_to_raw)

path_to_state_dict = "./state_dict/model_state_dict.pt"
path_to_state_dict = Path(path_to_state_dict)
dir_to_state_dict = path_to_state_dict.parent
shutil.rmtree(dir_to_state_dict, ignore_errors=True)
os.mkdir(dir_to_state_dict)

# Set up data
preprocess = False
subset_size = 1024

# Set up model instanciation
checkpoint = "facebook/galactica-125m"
num_hidden_layers = 1
print("\nSet-up completed")


Set-up completed


#### Preprocess the data to Datasets

In [9]:
# Preprocess the data to get raw data into /galactica/data
if preprocess == True:
    if data_type == 'applications':
        path_to_orig = Path(r'./data/orig_applications.json')
    elif data_type == 'is_experimental':
        path_to_orig = Path(r'./data/orig_is_experimental.json')

    preprocess_orig_data(str(path_to_galactica_folder), str(path_to_orig), clean=clean)

#### Load the data, model, tokenizer and tokenize

In [10]:
# Load the data
raw_dataset = load_from_disk(str(path_to_data))
print("\nDataset loaded: ", raw_dataset)

# Take a subset of raw_dataset
# TODO: implement the following method, much more elegant
# TODO: small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(200))
ds_dict = dict()
for split in raw_dataset:
    ds = raw_dataset[split]
    ds = Dataset.from_dict(ds[0:subset_size])
    ds = ds.cast(raw_dataset[split].features)
    ds_dict[split] = ds
raw_dataset=DatasetDict(ds_dict)

# Get the number of labels
num_labels = len(raw_dataset['train'].features['labels'].names)
print("\nNumber of labels: ", num_labels)

# Load the Model
#TODO: understand how to properly instantiate the model when device.cuda == 'cuda'
model = OPTForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels, num_hidden_layers=num_hidden_layers)
state_dict = model.state_dict()
torch.save(state_dict, str(path_to_state_dict))

if device.type == 'cuda':
    config = AutoConfig.from_pretrained(checkpoint, num_labels=num_labels, num_hidden_layers=num_hidden_layers)
    with init_empty_weights():
        model = OPTForSequenceClassification._from_config(config)
    model.tie_weights()
    no_split_module_classes = None #List of modules with any residual connection of some kind
    model = load_checkpoint_and_dispatch(model, str(path_to_state_dict), device_map="auto", no_split_module_classes=no_split_module_classes)
print("\nModel instantiated")

# Freeze the model but the last layer
# TODO: how to programmatically get the name of the last layer
for param in model.named_parameters():
    if param[0] != 'score.weight':
        param[1].requires_grad = False
print("\nParameters frozen")

# TODO: This is probably wrong, check what is the correct max_length
max_length = model.config.word_embed_proj_dim
print("\nMax length = ", max_length)

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True) # use_fast is recommended to be False, set to True for testing purposes
# use_fast argument; check https://huggingface.co/docs/transformers/model_doc/opt#overview
id2label = {i: label for label, i in tokenizer.vocab.items()}
pad_token_id = model.config.pad_token_id
tokenizer.add_special_tokens({'pad_token': id2label[pad_token_id]})
print("\nTokenizer instantiated")

def tokenize_function(sequences):
    return tokenizer(sequences['text'], max_length=max_length, truncation=True)

#TODO: save the tokenized datasets to disk
tokenized_datasets = raw_dataset.map(tokenize_function , batched=True)
print("\nTokenized datasets: ", tokenized_datasets)


Dataset loaded:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 96044
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 12005
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 12006
    })
})


                                                                    


Number of labels:  8


Some weights of the model checkpoint at facebook/galactica-125m were not used when initializing OPTForSequenceClassification: ['model.decoder.layers.5.self_attn_layer_norm.bias', 'model.decoder.layers.10.self_attn.q_proj.weight', 'model.decoder.layers.3.self_attn.v_proj.bias', 'model.decoder.layers.9.self_attn.k_proj.bias', 'model.decoder.layers.6.self_attn.v_proj.weight', 'model.decoder.layers.3.self_attn.k_proj.bias', 'model.decoder.layers.11.fc2.bias', 'model.decoder.layers.10.self_attn.out_proj.weight', 'model.decoder.layers.3.fc1.weight', 'model.decoder.layers.2.fc1.weight', 'model.decoder.layers.10.self_attn.k_proj.weight', 'model.decoder.layers.3.self_attn.out_proj.weight', 'model.decoder.layers.11.self_attn_layer_norm.weight', 'model.decoder.layers.11.fc2.weight', 'model.decoder.layers.4.self_attn_layer_norm.bias', 'model.decoder.layers.11.fc1.bias', 'model.decoder.layers.6.self_attn.q_proj.bias', 'model.decoder.layers.8.self_attn.v_proj.bias', 'model.decoder.layers.8.self_attn


Model instantiated

Parameters frozen

Max length =  768

Tokenizer instantiated


                                                                 


Tokenized datasets:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1024
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1024
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1024
    })
})




#### Define the training parameters

In [59]:
# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("\nDataCollator instantiated")

# Define the TrainingArguments and Trainer
# Default parameters in comments, first in the sequence of options
training_args = TrainingArguments(
    output_dir="./test-trainer",
    overwrite_output_dir=False, # False
    num_train_epochs=3, # 3
    per_device_train_batch_size=64, # 8
    per_device_eval_batch_size=64, # 8
    gradient_accumulation_steps=1, # 1
    learning_rate=5e-5, # 5e-5
    weight_decay=0, # 0
    warmup_steps=0, # 0
    evaluation_strategy='epoch', # 'no', 'epoch', 'steps'
    log_level='info', # 'passive''debug', 'info', 'warning', 'error' and 'critical' 
    log_level_replica='info', # same log_level
    log_on_each_node=True,
    #logging_dir // keep default output_dir/runs/CURRENT_DATETIME_HOSTNAME
    logging_first_step=True,
    logging_steps=16,
    logging_nan_inf_filter=True,
    logging_strategy = 'epoch', # 'steps', others: 'no', 'epoch'
    save_strategy='steps', # 'steps', others: 'no', 'epoch' # checkpoint save strategy
    save_steps=16,
    save_total_limit = 10, #max number of checkpoints
    run_name = 'test_run',
    )
print("\nTrainingArguments instantiated")

# Cleaning
import gc
gc.collect()

import torch
torch.cuda.empty_cache()
print("\nCleaning done")

# Define the evaluation metrics
def compute_metrics(eval_pred, average = 'macro', # average in [None, 'micro', 'macro', 'weighted']
                    # accuracy_metric = evaluate.load('accuracy'),
                    # f1_metric = evaluate.load('f1'),
                    # precision_metric = evaluate.load('precision'),
                    # recall_metric = evaluate.load('recall')
):

    f1_metric = evaluate.load('f1')
    # recall_metric = evaluate.load('recall')
    accuracy_metric = evaluate.load('accuracy')
    logits, labels = eval_pred
    preds = np.argmax(logits, axis = -1)
    results = {}
    results.update(accuracy_metric.compute(predictions=preds, references=labels))
    results.update(f1_metric.compute(predictions=preds, references=labels, average=average))
    # results.update(precision_metric.compute(predictions=preds, references=labels, average=average))
    # results.update(recall_metric.compute(predictions=preds, references=labels, average=average))
    # results = f1_metric.compute(predictions=preds, references=labels, average=average)
    return results 


import logging
import sys
import datasets, transformers
logger = logging.getLogger(__name__)
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
# set the main code and the modules it uses to the same log-level according to the node
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)




# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
print("\nTrainer instantiated")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).



DataCollator instantiated

TrainingArguments instantiated

Cleaning done

Trainer instantiated


#### Train

In [60]:
output = trainer.train()
output

The following columns in the training set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: id, token_type_ids, title, text, _labels. If id, token_type_ids, title, text, _labels are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1,024
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 48
  Number of trainable parameters = 6,144


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5738,1.605371,0.428711,0.180759
2,1.5842,1.603199,0.425781,0.177828


Saving model checkpoint to ./test-trainer/checkpoint-16
Configuration saved in ./test-trainer/checkpoint-16/config.json
Configuration saved in ./test-trainer/checkpoint-16/config.json
Model weights saved in ./test-trainer/checkpoint-16/pytorch_model.bin
tokenizer config file saved in ./test-trainer/checkpoint-16/tokenizer_config.json
Special tokens file saved in ./test-trainer/checkpoint-16/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `OPTForSequenceClassification.forward` and have been ignored: id, token_type_ids, title, text, _labels. If id, token_type_ids, title, text, _labels are not expected by `OPTForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1024
  Batch size = 64
Saving model checkpoint to ./test-trainer/checkpoint-32
Configuration saved in ./test-trainer/checkpoint-32/config.json
Saving model checkpoint to ./test-trainer/checkpoint-32
Configu

TrainOutput(global_step=48, training_loss=1.5783667042851448, metrics={'train_runtime': 35.8761, 'train_samples_per_second': 85.628, 'train_steps_per_second': 1.338, 'total_flos': 98306886205440.0, 'train_loss': 1.5783667042851448, 'epoch': 3.0})