In [1]:
# Imports
from importlib import reload
import os
import shutil
from pathlib import Path
import numpy as np
import torch
from datasets import load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import OPTForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import evaluate

import utils.preprocess_data
reload(utils.preprocess_data)
from utils.preprocess_data import preprocess_orig_data

  from .autonotebook import tqdm as notebook_tqdm


#### Setup

In [2]:
# Set up DL framework and device
dl_framework = 'pt'

is_gpu = torch.cuda.is_available()
if is_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Set up file and dir paths
clean = True # Whether to clean the raw data directory
data_type = 'applications' # 'is_experimental' or 'applications'

path_to_galactica_folder = Path(r'../galactica')

if data_type == 'applications':
    path_to_raw = Path(r'./data/raw_applications.json')
elif data_type == 'is_experimental':
    path_to_raw = Path(r'./data/raw_is_experimental.json')
path_to_data = Path(path_to_galactica_folder, path_to_raw)

path_to_state_dict = "./state_dict/model_state_dict.pt"
path_to_state_dict = Path(path_to_state_dict)
dir_to_state_dict = path_to_state_dict.parent
shutil.rmtree(dir_to_state_dict, ignore_errors=True)
os.mkdir(dir_to_state_dict)

# Set up data
preprocess = False
subset_size = 512

# Set up model instanciation
checkpoint = "facebook/galactica-125m"
num_hidden_layers = 1
print("\nSet-up completed")


Set-up completed


#### Preprocess the data to Datasets

In [3]:
# Preprocess the data to get raw data into /galactica/data
if preprocess == True:
    if data_type == 'applications':
        path_to_orig = Path(r'./data/orig_applications.json')
    elif data_type == 'is_experimental':
        path_to_orig = Path(r'./data/orig_is_experimental.json')

    preprocess_orig_data(str(path_to_galactica_folder), str(path_to_orig), clean=clean)

Found cached dataset json (/home/cedric.dietzi/.cache/huggingface/datasets/json/default-14e53905d1432555/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 301.73it/s]
                                                                                                 

#### Load the data, model, tokenizer and tokenize

In [5]:
# Load the data
raw_dataset = load_from_disk(str(path_to_data))
print("\nDataset loaded: ", raw_dataset)

# Take a subset of raw_dataset
# TODO: implement the following method, much more elegant
# TODO: small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(200))
ds_dict = dict()
for split in raw_dataset:
    ds = raw_dataset[split]
    ds = Dataset.from_dict(ds[0:subset_size])
    ds = ds.cast(raw_dataset[split].features)
    ds_dict[split] = ds
raw_dataset=DatasetDict(ds_dict)

# Get the number of labels
num_labels = len(raw_dataset['train'].features['labels'].names)
print("\nNumber of labels: ", num_labels)

# Load the Model
#TODO: understand how to properly instantiate the model when device.cuda == 'cuda'
model = OPTForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels, num_hidden_layers=num_hidden_layers)
state_dict = model.state_dict()
torch.save(state_dict, str(path_to_state_dict))

if device.type == 'cuda':
    config = AutoConfig.from_pretrained(checkpoint, num_labels=num_labels, num_hidden_layers=num_hidden_layers)
    with init_empty_weights():
        model = OPTForSequenceClassification._from_config(config)
    model.tie_weights()
    no_split_module_classes = None #List of modules with any residual connection of some kind
    model = load_checkpoint_and_dispatch(model, str(path_to_state_dict), device_map="auto", no_split_module_classes=no_split_module_classes)
print("\nModel instantiated")

# Freeze the model but the last layer
# TODO: how to programmatically get the name of the last layer
for param in model.named_parameters():
    if param[0] != 'score.weight':
        param[1].requires_grad = False
print("\nParameters frozen")

# TODO: This is probably wrong, check what is the correct max_length
max_length = model.config.word_embed_proj_dim
print("\nMax length = ", max_length)

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True) # use_fast is recommended to be False, set to True for testing purposes
# use_fast argument; check https://huggingface.co/docs/transformers/model_doc/opt#overview
id2label = {i: label for label, i in tokenizer.vocab.items()}
pad_token_id = model.config.pad_token_id
tokenizer.add_special_tokens({'pad_token': id2label[pad_token_id]})
print("\nTokenizer instantiated")

def tokenize_function(sequences):
    return tokenizer(sequences['text'], max_length=max_length, truncation=True)

#TODO: save the tokenized datasets to disk
tokenized_datasets = raw_dataset.map(tokenize_function , batched=True)
print("\nTokenized datasets: ", tokenized_datasets)


Dataset loaded:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 96044
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 12005
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 12006
    })
})


                                                                   


Number of labels:  8


Some weights of the model checkpoint at facebook/galactica-125m were not used when initializing OPTForSequenceClassification: ['model.decoder.layers.9.self_attn.q_proj.weight', 'model.decoder.layers.5.fc1.weight', 'model.decoder.layers.5.self_attn.out_proj.weight', 'model.decoder.layers.10.self_attn.q_proj.weight', 'model.decoder.layers.8.self_attn.out_proj.weight', 'model.decoder.layers.8.self_attn.q_proj.bias', 'model.decoder.layers.1.self_attn.q_proj.weight', 'model.decoder.layers.8.self_attn.k_proj.bias', 'model.decoder.layers.11.self_attn.k_proj.weight', 'model.decoder.layers.6.self_attn.out_proj.weight', 'model.decoder.layers.7.fc1.bias', 'model.decoder.layers.11.final_layer_norm.weight', 'model.decoder.layers.10.self_attn_layer_norm.weight', 'model.decoder.layers.1.fc2.bias', 'model.decoder.layers.3.self_attn.q_proj.bias', 'model.decoder.layers.8.self_attn.k_proj.weight', 'model.decoder.layers.9.self_attn_layer_norm.bias', 'model.decoder.layers.11.self_attn.out_proj.bias', 'mode


Model instantiated

Parameters frozen

Max length =  768

Tokenizer instantiated


                                                               


Tokenized datasets:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 512
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 512
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 512
    })
})




#### Define the training parameters

In [39]:
eval_pred = ([[0.5, 0.2, 0.6], [0.2, 0.9, 0.2], [0.1,0.0,0.0]], [1,1,2])
logits, labels = eval_pred
preds = np.argmax(logits, axis = -1)
accuracy_metric = evaluate.load('accuracy')
accuracy_metric.compute(predictions=preds, references = labels)

{'accuracy': 0.3333333333333333}

In [None]:
accuracy_metric = evaluate.load('accuracy'), 
f1_metric = evaluate.load('f1', average='micro'), 
precision_metric = evaluate.load('precision', average='micro'), 
recall_metric = evaluate.load('recall', average='micro')

accuracy_metric = evaluate.load('accuracy'), 
f1_metric = evaluate.load('f1'), 
precision_metric = evaluate.load('precision'), 
recall_metric = evaluate.load('recall')

def compute_metrics(eval_pred):

    results = {}
    eval_pred = ([[0.5, 0.2, 0.6], [0.2, 0.9, 0.2], [0.1,0.0,0.0]], [1,1,2])
    logits, labels = eval_pred
    preds = np.argmax(logits, axis = -1)
    r = accuracy_metric.compute(predictions=preds, references = labels)
    results.append(r)
    # results = {}
    # results.update(accuracy_metric.compute(predictions=preds, references = labels))
    # results.update(f1_metric.compute(predictions=preds, references = labels, average="micro"))
    # results.update(precision_metric.compute(predictions=preds, references = labels, average="micro"))
    # results.update(recall_metric.compute(predictions=preds, references = labels, average="micro"))
    return results

eval_pred = ([[0.5, 0.2, 0.6], [0.2, 0.9, 0.2], [0.1,0.0,0.0]], [1,1,2])

compute_metrics(eval_pred)


In [19]:
# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("\nDataCollator instantiated")

# Define the TrainingArguments and Trainer
# Default parameters in comments
training_args = TrainingArguments(
    output_dir = "test-trainer",
    overwrite_output_dir = False, # False
    do_train = True,
    do_eval = True,
    num_train_epochs = 3, # 3
    per_device_train_batch_size = 64, # 8
    per_device_eval_batch_size = 64, # 8
    gradient_accumulation_steps = 1, # 1
    learning_rate = 5e-5, # 5e-5
    weight_decay = 0, # 0
    warmup_steps = 0, # 0
    evaluation_strategy = 'epoch', # 'no' by default, others: 'epoch', 'steps'
    )
print("\nTrainingArguments instantiated")

# Cleaning
import gc
gc.collect()

import torch
torch.cuda.empty_cache()
print("\nCleaning done")

# Define the evaluation metrics
def compute_metrics(eval_preds):
    metrics = evaluate.load("accuracy")
    metrics=evaluate.combine(["accuracy", "recall", "precision", "f1"], av)
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metrics.compute(predictions=predictions, references=labels, average="weighted") # average in [None, 'micro', 'macro', 'weighted']

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
print("\nTrainer instantiated")


DataCollator instantiated

TrainingArguments instantiated

Cleaning done

Trainer instantiated


#### Train

In [20]:
output = trainer.train()
output



Epoch,Training Loss,Validation Loss


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].