In [1]:
# Imports
from importlib import reload
import os
import shutil
from pathlib import Path
import numpy as np
import torch
from datasets import load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import OPTForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import evaluate

import utils.preprocess_data
reload(utils.preprocess_data)
from utils.preprocess_data import preprocess_orig_data

  from .autonotebook import tqdm as notebook_tqdm


#### Setup

In [2]:
# Deep-learning framework identifier ('pt' = PyTorch) and compute device
dl_framework = 'pt'

# Use the GPU whenever CUDA is usable, otherwise fall back to the CPU
is_gpu = torch.cuda.is_available()
device = torch.device('cuda' if is_gpu else 'cpu')

# Set up file and dir paths
clean_raw_data = True # Whether to clean the raw data directory
data_type = 'applications' # 'is_experimental' or 'applications'

path_to_galactica_folder = Path(r'../galactica')

# Pick the raw data file that matches the selected task
if data_type == 'applications':
    path_to_raw_data = Path(r'./data/raw_applications.json')
elif data_type == 'is_experimental':
    path_to_raw_data = Path(r'./data/raw_is_experimental.json')
else:
    # Fail here with a clear message instead of a NameError on the next line
    raise ValueError(
        f"Unknown data_type {data_type!r}: expected 'applications' or 'is_experimental'"
    )
# Location of the raw data relative to the galactica folder
path_to_data = Path(path_to_galactica_folder, path_to_raw_data)

# State-dict checkpoint: start every run from an empty directory
path_to_state_dict = Path("./state_dict/model_state_dict.pt")
path_to_state_dict_dir = path_to_state_dict.parent
shutil.rmtree(path_to_state_dict_dir, ignore_errors=True)
# rmtree ignores errors, so the directory may still exist (or a parent may be
# missing); mkdir with parents/exist_ok succeeds in both situations
path_to_state_dict_dir.mkdir(parents=True, exist_ok=True)

# Trainer output directory; it is intentionally NOT wiped between runs
path_to_training_dir = Path("./test-trainer")

# TensorBoard logs are wiped so each run starts with a clean `runs` folder
path_to_tensorboard_dir = Path(path_to_training_dir, 'runs')
shutil.rmtree(path_to_tensorboard_dir, ignore_errors=True)

# Model instantiation options
checkpoint = "facebook/galactica-125m"
num_hidden_layers = 12 # 12

# Data options
preprocess = False # when True, the preprocessing cell regenerates the raw data
subset_ratio = 0.15 # fraction of every split that is kept (previously 0.10)
print("\nSet-up completed")


Set-up completed


#### Preprocess the data to Datasets

In [3]:
# Preprocess the data to get raw data into /galactica/data
if preprocess:
    # Pick the original file that matches the selected task
    if data_type == 'applications':
        path_to_orig = Path(r'./data/orig_applications.json')
    elif data_type == 'is_experimental':
        path_to_orig = Path(r'./data/orig_is_experimental.json')
    else:
        # Without this branch an unknown data_type surfaces as a NameError below
        raise ValueError(
            f"Unknown data_type {data_type!r}: expected 'applications' or 'is_experimental'"
        )

    preprocess_orig_data(str(path_to_galactica_folder), str(path_to_orig), clean=clean_raw_data)

# Check that the data the loading cell reads (path_to_data) is really there,
# so a missing file is reported here rather than as a loader error later on
if not path_to_data.exists():
    raise FileNotFoundError(
        f"Processed data not found at {path_to_data}; run this cell with preprocess = True"
    )
print(f"Processed data available in the file: {path_to_raw_data}")

Processed data available in the file: data/raw_applications.json


#### Load the data, model, tokenizer and tokenize

In [4]:
# Load the data
raw_dataset = load_from_disk(str(path_to_data))
print("\nDataset loaded: ", raw_dataset)

# Keep only the first `subset_ratio` fraction of every split.
# `select` keeps the original features (no re-cast needed) and avoids copying
# the rows into Python objects first.
# NOTE: this takes the leading rows, not a random sample; call
# `.shuffle(seed=...)` before `.select(...)` if a random subset is wanted.
raw_dataset = DatasetDict({
    split_name: split_ds.select(range(int(len(split_ds) * subset_ratio)))
    for split_name, split_ds in raw_dataset.items()
})

# Get the number of labels
num_labels = len(raw_dataset['train'].features['labels'].names)
print("\nNumber of labels: ", num_labels)

# Load the Model
#TODO: understand how to properly instantiate the model when device.cuda == 'cuda'
# Step 1: build the classifier from the pretrained checkpoint, overriding the
# number of labels and of hidden layers, and write its state dict to disk.
model = OPTForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels, num_hidden_layers=num_hidden_layers)
state_dict = model.state_dict()
torch.save(state_dict, str(path_to_state_dict))

# Step 2 (GPU only): rebuild the model from its config inside
# `init_empty_weights()` and let accelerate load the weights saved in step 1
# from `path_to_state_dict`, placing them according to device_map="auto".
if device.type == 'cuda':
    config = AutoConfig.from_pretrained(checkpoint, num_labels=num_labels, num_hidden_layers=num_hidden_layers)
    with init_empty_weights():
        # NOTE(review): `_from_config` is a private method; confirm whether the
        # public constructor can be used here instead.
        model = OPTForSequenceClassification._from_config(config)
    model.tie_weights()
    # NOTE(review): None means no module class is protected from being split
    # across devices; presumably the decoder layer class belongs here - confirm.
    no_split_module_classes = None #List of modules with any residual connection of some kind
    model = load_checkpoint_and_dispatch(model, str(path_to_state_dict), device_map="auto", no_split_module_classes=no_split_module_classes)
print("\nModel instantiated")

# Freeze every parameter except the weight of the classification head
# TODO: how to programmatically get the name of the last layer
for param_name, param_tensor in model.named_parameters():
    if param_name == 'score.weight':
        # the only parameter left trainable
        continue
    param_tensor.requires_grad = False
print("\nParameters frozen")

# Truncation length used when tokenizing.
# `word_embed_proj_dim` (used previously) is the width of the embedding vectors,
# not a sequence-length limit. The limit on sequence length is
# `max_position_embeddings`. The cap of 768 tokens used by earlier runs is kept
# as an explicit value so their memory footprint does not change, and it is
# bounded by the model limit so it can never exceed what the model accepts.
max_length = min(768, model.config.max_position_embeddings)
print("\nMax length = ", max_length)

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True) # use_fast is recommended to be False, set to True for testing purposes
# use_fast argument; check https://huggingface.co/docs/transformers/model_doc/opt#overview

# Give the tokenizer an explicit pad token: the token whose id the model config
# declares as `pad_token_id`. The id is resolved directly through the tokenizer
# instead of building a reversed copy of the whole vocabulary.
pad_token_id = model.config.pad_token_id
if pad_token_id is None:
    raise ValueError("model.config.pad_token_id is not set; cannot derive the tokenizer's pad token")
tokenizer.add_special_tokens({'pad_token': tokenizer.convert_ids_to_tokens(pad_token_id)})
print("\nTokenizer instantiated")

def tokenize_function(sequences):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        sequences: mapping with a 'text' entry holding the raw texts.

    Returns:
        The tokenizer output, truncated to the notebook-level `max_length`.
    """
    texts = sequences['text']
    return tokenizer(texts, truncation=True, max_length=max_length)

# Tokenize every split in batches
#TODO: save the tokenized datasets to disk
tokenized_datasets = raw_dataset.map(function=tokenize_function, batched=True)
print("\nTokenized datasets: ", tokenized_datasets)


Dataset loaded:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 96044
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 12005
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 12006
    })
})


                                                                     


Number of labels:  8


Some weights of the model checkpoint at facebook/galactica-125m were not used when initializing OPTForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing OPTForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing OPTForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model instantiated

Parameters frozen

Max length =  768

Tokenizer instantiated


                                                                   


Tokenized datasets:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14406
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1800
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1800
    })
})




#### Define the training parameters

In [5]:
# Collator handed to the Trainer to assemble batches with the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("\nDataCollator instantiated")

# Training configuration, grouped by theme.
# Where a trailing comment gives a bare value, it is the library default.
training_args = TrainingArguments(
    # --- output and reporting ---
    output_dir=str(path_to_training_dir),
    overwrite_output_dir=False, # False
    run_name = 'test_run',
    report_to="tensorboard", #wandb
    # --- optimisation ---
    num_train_epochs=3, # 3
    per_device_train_batch_size=64, # 8
    per_device_eval_batch_size=64, # 8
    gradient_accumulation_steps=1, # 1
    learning_rate=5e-5, # 5e-5
    weight_decay=0, # 0
    warmup_steps=0, # 0
    # --- evaluation ---
    evaluation_strategy='steps', # 'no', 'epoch', 'steps'
    eval_steps=32,
    # --- logging ---
    # logging_dir is left at its default: output_dir/runs/CURRENT_DATETIME_HOSTNAME
    log_level='warning', # 'passive', 'debug', 'info', 'warning', 'error' or 'critical'
    log_level_replica='warning', # same choices as log_level
    log_on_each_node=True,
    logging_strategy = 'steps', # 'steps', others: 'no', 'epoch'
    logging_steps=32,
    logging_first_step=False,
    logging_nan_inf_filter=True,
    # --- checkpointing ---
    save_strategy='steps', # 'steps', others: 'no', 'epoch'
    save_steps=64,
    save_total_limit = 10, # max number of checkpoints kept; the oldest is replaced afterwards
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    )
print("\nTrainingArguments instantiated")

# Cleaning: run the garbage collector, then release cached CUDA memory
# (torch is already imported in the first cell)
import gc

gc.collect()
torch.cuda.empty_cache()
print("\nCleaning done")

# Define the evaluation computation
# Metric objects are loaded on first use and then reused, so the (potentially
# slow) `evaluate.load` calls are not repeated at every evaluation step.
_METRIC_NAMES = ('accuracy', 'f1', 'precision', 'recall')
_METRIC_CACHE = {}


def compute_metrics(eval_pred, average = 'weighted'):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. Assumes logits have the class
            scores on the last axis - TODO confirm for other model heads.
        average: averaging mode forwarded to the F1, precision and recall
            metrics (accuracy takes no averaging argument).

    Returns:
        dict merging the results of the four metrics, in the order
        accuracy, f1, precision, recall.
    """
    for metric_name in _METRIC_NAMES:
        if metric_name not in _METRIC_CACHE:
            _METRIC_CACHE[metric_name] = evaluate.load(metric_name)

    logits, labels = eval_pred
    # Some models return a tuple of outputs; the logits are then the first item
    if isinstance(logits, tuple):
        logits = logits[0]

    # Predicted class = index of the highest score along the last axis
    preds = np.argmax(logits, axis = -1)

    results = {}
    results.update(_METRIC_CACHE['accuracy'].compute(predictions=preds, references=labels))
    for metric_name in ('f1', 'precision', 'recall'):
        results.update(
            _METRIC_CACHE[metric_name].compute(predictions=preds, references=labels, average=average)
        )

    return results

from transformers import TrainerCallback
from copy import deepcopy

class CustomCallback(TrainerCallback):
    """Callback that also evaluates on the training set after each evaluation.

    Every time the Trainer finishes an evaluation, this callback runs a second
    evaluation on ``trainer.train_dataset`` and reports it under the ``train``
    metric prefix, so training and evaluation metrics can be compared.

    Args:
        trainer: the Trainer this callback is attached to. It is used to
            launch the extra evaluation and to reach the training dataset.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer
        # True while the extra training-set evaluation is running
        self._evaluating_train_set = False

    def on_evaluate(self, args, state, control, **kwargs):
        """Run the training-set evaluation once per regular evaluation.

        The previous guard tested ``control.should_evaluate``, but that flag is
        cleared before ``on_evaluate`` is dispatched, so the body never ran.
        ``Trainer.evaluate`` fires ``on_evaluate`` again when it finishes, so a
        re-entrancy flag is used to stop the nested call from recursing.

        Returns:
            A copy of ``control`` taken before the nested evaluation, so the
            flags it may have changed are restored. None for the nested call.
        """
        if self._evaluating_train_set:
            return None

        self._evaluating_train_set = True
        try:
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy
        finally:
            self._evaluating_train_set = False

# Assemble the Trainer from the model, arguments, data and metrics defined earlier
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Optional: also report metrics on the training set (currently disabled)
#trainer.add_callback(CustomCallback(trainer)) 

print("\nTrainer instantiated")


DataCollator instantiated

TrainingArguments instantiated

Cleaning done

Trainer instantiated


#### Train

In [6]:
# Run the training loop configured on `trainer` and keep its return value
output = trainer.train()
# Bare expression so the notebook displays the returned value
output

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
32,2.1293,1.844972,0.418889,0.359458,0.376328,0.418889
64,1.6835,1.465634,0.52,0.435244,0.559023,0.52
96,1.38,1.213397,0.61,0.532705,0.659494,0.61
128,1.168,1.020252,0.711667,0.651281,0.743595,0.711667
160,1.0037,0.871285,0.771667,0.731042,0.800273,0.771667


In [None]:
# tensorboard dev upload --logdir galactica/test-trainer/runs