In [1]:
# Imports
from importlib import reload
from pathlib import Path
import numpy as np
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer
from transformers import OPTForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer

import utils.preprocess_data
reload(utils.preprocess_data)
from utils.preprocess_data import preprocess_orig_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preprocess
data_type = 'applications' # 'is_experimental'
clean = True

path_to_galactica_folder = Path(r'../galactica')
if data_type == 'applications':
    path_to_orig = Path(r'./data/orig_applications.json')
elif data_type == 'is_experimental':
    path_to_orig = Path(r'./data/orig_is_experimental.json')

preprocess_orig_data(str(path_to_galactica_folder), str(path_to_orig), clean=clean)

Found cached dataset json (D:/Users/cdiet/hf/datasets/json/default-ead8f05166997550/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 28.21it/s]
                                                                                                 

In [3]:
# Set DL framework and device
dl_framework = 'pt'

is_gpu = torch.cuda.is_available()
if is_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Set-up
if data_type == 'applications':
    path_to_raw = Path(r'./data/raw_applications.json')
elif data_type == 'is_experimental':
    path_to_raw = Path(r'./data/raw_is_experimental.json')
path_to_data = Path(path_to_galactica_folder, path_to_raw)
checkpoint = "facebook/galactica-125m"

# Import the DatasetDict
raw_dataset = load_from_disk(str(path_to_data))
print("\nDataset loaded: ", raw_dataset)

# Get number of labels
num_labels = len(raw_dataset['train'].features['label'].names)
print("\nNumber of labels: ", num_labels)

#Load the Model
if device.type == 'cuda':
    model = OPTForSequenceClassification.from_pretrained(checkpoint ,device_map=device, num_labels=num_labels)
else:
    model = OPTForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
print("\nModel instantiated")

#TODO: THIS IS WRONG, CHECK THE CORRECT MAX LENGTH
max_length = model.config.word_embed_proj_dim
print("\nMax length = ", max_length)

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
id2label = {i: label for label, i in tokenizer.vocab.items()}
pad_token_id = model.config.pad_token_id
tokenizer.add_special_tokens({'pad_token': id2label[pad_token_id]})
print("\nTokenizer instantiated")


def tokenize_function(sequences):
    return tokenizer(sequences['text'], max_length=max_length, truncation=True)

tokenized_datasets = raw_dataset.map(tokenize_function , batched=True)
print("\nTokenized datasets: ", tokenized_datasets)


Dataset loaded:  DatasetDict({
    train: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label'],
        num_rows: 96044
    })
    test: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label'],
        num_rows: 12005
    })
    validation: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label'],
        num_rows: 12006
    })
})

Number of labels:  8


Some weights of the model checkpoint at facebook/galactica-125m were not used when initializing OPTForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing OPTForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing OPTForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/galactica-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model instantiated

Max length =  768

Tokenizer instantiated


                                                                   


Tokenized datasets:  DatasetDict({
    train: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 96044
    })
    test: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12005
    })
    validation: Dataset({
        features: ['_label', 'id', 'title', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12006
    })
})




In [4]:
# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the TrainingArguments and Trainer
training_args = TrainingArguments("test-trainer")

# Define the Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Quick training
trainer.train()

  0%|          | 0/36018 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 