In [1]:
# Imports
from importlib import reload
from pathlib import Path
import numpy as np
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import OPTForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer

import utils.preprocess_data
reload(utils.preprocess_data)
from utils.preprocess_data import preprocess_orig_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preprocess
data_type = 'applications' # 'is_experimental'
clean = True

path_to_galactica_folder = Path(r'../galactica')
if data_type == 'applications':
    path_to_orig = Path(r'./data/orig_applications.json')
elif data_type == 'is_experimental':
    path_to_orig = Path(r'./data/orig_is_experimental.json')

preprocess_orig_data(str(path_to_galactica_folder), str(path_to_orig), clean=clean)

Found cached dataset json (/home/cedric.dietzi/.cache/huggingface/datasets/json/default-14e53905d1432555/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 275.54it/s]
                                                                                                 

In [3]:
# Set DL framework and device
dl_framework = 'pt'

is_gpu = torch.cuda.is_available()
if is_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Set-up
if data_type == 'applications':
    path_to_raw = Path(r'./data/raw_applications.json')
elif data_type == 'is_experimental':
    path_to_raw = Path(r'./data/raw_is_experimental.json')
path_to_data = Path(path_to_galactica_folder, path_to_raw)
checkpoint = "facebook/galactica-125m"
path_to_state_dict = "./model/model_state_dict.pt"

# Import the DatasetDict
raw_dataset = load_from_disk(str(path_to_data))
print("\nDataset loaded: ", raw_dataset)

# Get number of labels
num_labels = len(raw_dataset['train'].features['label'].names)
print("\nNumber of labels: ", num_labels)

#Load the Model
#TODO: understand how to properly instantiate the model when device.cuda == 'cuda'
model = OPTForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
state_dict = model.state_dict()
torch.save(state_dict, path_to_state_dict)

if device.type == 'cuda':
    from accelerate import init_empty_weights, load_checkpoint_and_dispatch
    config = AutoConfig.from_pretrained(checkpoint, num_labels=num_labels)
    with init_empty_weights():
        model = OPTForSequenceClassification._from_config(config)
    model.tie_weights()
    no_split_module_classes = None #List of modules with any residual connection of some kind
    model = load_checkpoint_and_dispatch(model, "./model/model_state_dict.pt", device_map="auto", no_split_module_classes=no_split_module_classes)

print("\nModel instantiated")

#TODO: THIS IS WRONG, CHECK THE CORRECT MAX LENGTH
max_length = model.config.word_embed_proj_dim
print("\nMax length = ", max_length)

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True) # use_fast is recommended to be False, set to True for testing purposes
# use_fast argument; check https://huggingface.co/docs/transformers/model_doc/opt#overview
id2label = {i: label for label, i in tokenizer.vocab.items()}
pad_token_id = model.config.pad_token_id
tokenizer.add_special_tokens({'pad_token': id2label[pad_token_id]})
print("\nTokenizer instantiated")


def tokenize_function(sequences):
    return tokenizer(sequences['text'], max_length=max_length, truncation=True)

tokenized_datasets = raw_dataset.map(tokenize_function , batched=True)
print("\nTokenized datasets: ", tokenized_datasets)


Dataset loaded:  DatasetDict({
    train: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label'],
        num_rows: 96044
    })
    test: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label'],
        num_rows: 12005
    })
    validation: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label'],
        num_rows: 12006
    })
})

Number of labels:  8


Some weights of the model checkpoint at facebook/galactica-125m were not used when initializing OPTForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing OPTForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing OPTForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model instantiated

Max length =  768


Downloading (…)okenizer_config.json: 100%|██████████| 166/166 [00:00<00:00, 950kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.14M/2.14M [00:00<00:00, 9.42MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 3.00/3.00 [00:00<00:00, 22.3kB/s]



Tokenizer instantiated


                                                                   


Tokenized datasets:  DatasetDict({
    train: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 96044
    })
    test: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12005
    })
    validation: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12006
    })
})




In [10]:
# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("\nDataCollator instantiated")

# Define the TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir = "test-trainer",
    overwrite_output_dir = True, # False
    num_train_epochs = 3, # 3
    per_device_train_batch_size = 1, # 8
    per_device_eval_batch_size = 1, # 8
    gradient_accumulation_steps = 1, # 1
    learning_rate = 5e-5, # 5e-5
    weight_decay = 0, # 0
    warmup_steps = 0, # 0
    evaluation_strategy = 'no', # 'no'
    )
print("\nTrainingArguments instantiated")

# Cleaning
import gc
gc.collect()

import torch
torch.cuda.empty_cache()
print("\nCleaning done")

# Define the Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
print("\nTrainer instantiated")


DataCollator instantiated

TrainingArguments instantiated

Cleaning done

Trainer instantiated


In [11]:
# Quick training
print("\nStarting training")
trainer.train()


Starting training


Step,Training Loss


KeyboardInterrupt: 