In [1]:
# Imports
from importlib import reload
import os
import time
import shutil
from pathlib import Path
import numpy as np
import torch
from datasets import load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import evaluate

import src.setup
reload(src.setup)
from src.setup import path_to_orig_data, path_to_raw_data, path_to_tokenized_data
from src.setup import path_to_state_dict, path_to_training_dir
from src.setup import device
from src.setup import clear_raw_data
from src.setup import do_preprocess_orig_data, subset_ratio, do_tokenize_data
from src.setup import test_size, valid_size
from src.setup import ModelClass, checkpoint, num_hidden_layers
from src.setup import transformer_head_name, transformer_max_seq_length
from src.setup import training_arguments_kw
import src.preprocess_data
reload(src.preprocess_data)
from src.preprocess_data import preprocess_orig_data

  from .autonotebook import tqdm as notebook_tqdm


#### Preprocess the data to Datasets

In [2]:
# Regenerate the raw datasets from the original data only when the setup flag asks for it.
# The flag is tested by truthiness, like `do_tokenize_data` further down.
if do_preprocess_orig_data:
    preprocess_orig_data(
        str(path_to_orig_data),
        str(path_to_raw_data),
        test_size=test_size,
        valid_size=valid_size,
        clear=clear_raw_data,
    )

# Printed in both cases: the next cell loads whatever is stored at this path.
print(f"Processed data available in the file: {path_to_raw_data}")

Processed data available in the file: /home/cedric.dietzi/projects/galactica/data/raw_applications.json


#### Load the data, model, tokenizer and tokenize

In [3]:
# Read the raw splits back from disk
print('\n' + '*' * 50, "Loading the datasets", sep='\n')
raw_datasets = load_from_disk(str(path_to_raw_data))
print("\nDatasets loaded: ", raw_datasets)

# Keep, for every split, the fraction configured in subset_ratio
# (shuffled with a fixed seed so the same rows are selected on each run)
print('\n' + '*' * 50, "Sampling the datasets", sep='\n')
sampled_splits = {}
for split_name, full_split in raw_datasets.items():
    rows_to_keep = int(len(full_split) * subset_ratio[split_name])
    sampled_splits[split_name] = full_split.shuffle(seed=42).select(range(rows_to_keep))
raw_datasets = DatasetDict(sampled_splits)
print("\nDatasets sampled: ", raw_datasets)

# Count the classes declared by the 'labels' feature of the train split
# (assumes that feature exposes a `names` list - TODO confirm it is a ClassLabel)
label_feature = raw_datasets['train'].features['labels']
num_labels = len(label_feature.names)
print('\n' + '*' * 50, "Number of labels: ", sep='\n', end=' ')
print(num_labels)

# Load the Model
# The checkpoint is loaded with two overrides: num_labels (computed from the train split)
# and num_hidden_layers (taken from src.setup).
print('\n' + '*'*50)
print("Instantiating the model")
model = ModelClass.from_pretrained(checkpoint, num_labels=num_labels, num_hidden_layers=num_hidden_layers)
if device.type == 'cuda':
    #TODO: understand how to simplify this instantiation
    # CUDA-only path, in order:
    #   1. write the state dict of the model loaded above to path_to_state_dict;
    #   2. rebuild the same architecture under init_empty_weights();
    #   3. tie the weights of that new instance;
    #   4. let accelerate load the saved state dict into it with device_map="auto".
    # The order matters: step 4 reads the file written in step 1.
    state_dict = model.state_dict()
    torch.save(state_dict, str(path_to_state_dict))
    # Same overrides as in from_pretrained above, so the rebuilt model matches the saved weights.
    config = AutoConfig.from_pretrained(checkpoint, num_labels=num_labels, num_hidden_layers=num_hidden_layers)
    with init_empty_weights():
        # NOTE(review): `_from_config` is an underscore-prefixed API - confirm it is stable
        # across the transformers versions this notebook targets.
        model = ModelClass._from_config(config)
    model.tie_weights()
    no_split_module_classes = None # Module classes to keep on a single device (e.g. blocks with a residual connection); None passes no such restriction
    model = load_checkpoint_and_dispatch(model, str(path_to_state_dict), device_map="auto", no_split_module_classes=no_split_module_classes)
print("\nModel instantiated")

# Turn off gradients for every parameter except the one named transformer_head_name
print('\n' + '*' * 50, "Freezing the parameters", sep='\n')
for param_name, param_tensor in model.named_parameters():
    if param_name == transformer_head_name:
        # The head keeps whatever requires_grad value it already has.
        continue
    param_tensor.requires_grad = False
print("\nParameters frozen")

# Load the Tokenizer
print('\n' + '*'*50)
print("Instantiating the tokenizer")
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

# Register as padding token the token whose id the model config declares as pad_token_id,
# so that tokenizer and model agree on padding. The id -> token lookup goes through
# the tokenizer API instead of inverting the whole vocabulary by hand.
pad_token_id = model.config.pad_token_id
pad_token = None if pad_token_id is None else tokenizer.convert_ids_to_tokens(pad_token_id)
if pad_token is None:
    raise ValueError(
        f"Cannot resolve a padding token for checkpoint {checkpoint!r}: "
        f"model.config.pad_token_id={pad_token_id!r}"
    )
tokenizer.add_special_tokens({'pad_token': pad_token})
print("\nTokenizer instantiated")

if do_tokenize_data:
    # Tokenizing the Dataset
    print('\n' + '*'*50)
    print("Tokenizing the datasets")

    def tokenize_function(sequences):
        """Tokenize the 'text' column of a batch, truncated to transformer_max_seq_length tokens."""
        return tokenizer(sequences['text'], max_length=transformer_max_seq_length, truncation=True)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    print("\nDatasets tokenized: ", tokenized_datasets)

    # Save the datasets to disk
    print('\n' + '*'*50)
    print("Saving the datasets")
    tokenized_datasets.save_to_disk(str(path_to_tokenized_data))

    # The tokenized copy is reloaded from disk by the next cell; release this one.
    # It is only bound on this branch, so it must be deleted here: referencing it
    # outside the branch raises NameError on a fresh kernel when tokenization is skipped.
    del tokenized_datasets

print(f"\nDatasets available in :{path_to_tokenized_data}")

# The raw splits are always loaded earlier in this cell and are not needed afterwards.
del raw_datasets

Loading cached shuffled indices for dataset at /home/cedric.dietzi/projects/galactica/data/raw_applications.json/train/cache-3565d5e63d291982.arrow
Loading cached shuffled indices for dataset at /home/cedric.dietzi/projects/galactica/data/raw_applications.json/test/cache-94751cb814f39085.arrow
Loading cached shuffled indices for dataset at /home/cedric.dietzi/projects/galactica/data/raw_applications.json/validation/cache-8a27f18eb0a49e1c.arrow



**************************************************
Loading the datasets

Datasets loaded:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 96044
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 12005
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 12006
    })
})

**************************************************
Sampling the datasets

Datasets sampled:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 28813
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels'],
        num_rows: 1200
    })
})

**************************************************
Number of labels:  8

****

Some weights of the model checkpoint at facebook/galactica-125m were not used when initializing OPTForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing OPTForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing OPTForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model instantiated

**************************************************
Freezing the parameters

Parameters frozen

**************************************************
Instantiating the tokenizer


Loading cached processed dataset at /home/cedric.dietzi/projects/galactica/data/raw_applications.json/train/cache-eec5480444058ac5.arrow
Loading cached processed dataset at /home/cedric.dietzi/projects/galactica/data/raw_applications.json/test/cache-f23d63a1565388ba.arrow
Loading cached processed dataset at /home/cedric.dietzi/projects/galactica/data/raw_applications.json/validation/cache-58f8e73ee0a5b4da.arrow



Tokenizer instantiated

**************************************************
Tokenizing the datasets

Datasets tokenized:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 28813
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1200
    })
})

**************************************************
Saving the datasets


                                                                                                 


Datasets available in :/home/cedric.dietzi/projects/galactica/data/tokenized_applications.json




#### Define the training parameters

In [4]:
# Reload the tokenized splits from disk
print('\n' + '*' * 50, "Loading the datasets", sep='\n')
tokenized_datasets = load_from_disk(str(path_to_tokenized_data))
print("\nDatasets loaded: ", tokenized_datasets)

# Build the collator that pads batches with the tokenizer configured above
print('\n' + '*' * 50, "Instantiating the DataCollator", sep='\n')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("\nDataCollator instantiated")

# Define the TrainingArguments
print('\n' + '*'*50)
print("Instantiating TrainingArguments")
# One output directory per run, named after the launch time.
# %M is the minute field; %m would insert the month a second time.
run_timestamp = time.strftime("%Y-%m-%d_%H:%M:%S")
output_dir = Path(path_to_training_dir, run_timestamp)
training_args = TrainingArguments(
    output_dir=str(output_dir),
    **training_arguments_kw)
print("\nTrainingArguments instantiated")

# Cleaning: run a garbage-collection pass, then ask torch to empty its CUDA cache
print('\n' + '*' * 50, "Cleaning memory", sep='\n')
import gc

gc.collect()
torch.cuda.empty_cache()  # torch is already imported by the first cell
print("\nCleaning done")

# Define the evaluation computation
print('\n' + '*'*50)
print("Define the evaluation metrics")

# Metrics are loaded once here rather than inside compute_metrics, which would
# repeat the four evaluate.load calls on every evaluation.
_EVAL_METRICS = {
    metric_name: evaluate.load(metric_name)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall')
}


def compute_metrics(eval_pred, average = 'weighted'):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; unpacked as a 2-tuple.
        average: averaging mode forwarded to the F1, precision and recall metrics.

    Returns:
        dict merging the outputs of the four metrics, in the order
        accuracy, f1, precision, recall.
    """
    logits, labels = eval_pred

    # Predictions may arrive as a tuple of arrays; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    # The predicted class is the index of the largest logit on the last axis.
    preds = np.argmax(logits, axis=-1)

    results = {}
    results.update(_EVAL_METRICS['accuracy'].compute(predictions=preds, references=labels))
    for metric_name in ('f1', 'precision', 'recall'):
        results.update(
            _EVAL_METRICS[metric_name].compute(predictions=preds, references=labels, average=average)
        )

    return results
print("\nEvaluation metrics defined")

# Wire model, data and metrics into the Trainer
print('\n' + '*' * 50, "Instantiate the Trainer", sep='\n')
trainer = Trainer(
    # what is trained and how
    model=model,
    args=training_args,
    # data: train split for fitting, validation split for evaluation
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # batching and tokenization
    tokenizer=tokenizer,
    data_collator=data_collator,
    # metrics reported at evaluation time
    compute_metrics=compute_metrics,
)
print("\nTrainer instantiated")


**************************************************
Loading the datasets

Datasets loaded:  DatasetDict({
    train: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 28813
    })
    test: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['_labels', 'id', 'title', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1200
    })
})

**************************************************
Instantiating the DataCollator

DataCollator instantiated

**************************************************
Instantiating TrainingArguments

TrainingArguments instantiated

**************************************************
Cleaning memory

Cleaning done

**************************************************
Define the evaluation metrics

Ev

#### Train

In [None]:
# Launch training with the Trainer built in the previous cell and keep the
# returned value in `output`.
output = trainer.train()
# Bare last expression: shown as the cell's result.
output

In [None]:
# tensorboard dev upload --logdir galactica/test-trainer/.....