In [26]:
import tensorflow as tf
from transformers import AutoTokenizer
import pandas as pd
import re
import evaluate
from transformers import BartForConditionalGeneration
from tqdm import tqdm
from datasets import load_dataset
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
import numpy as np
from indobenchmark import IndoNLGTokenizer

In [2]:
# Hub id of the pretrained checkpoint; used to load both the tokenizer and the base model.
MODEL_CHECKPOINT = "indobenchmark/indobart-v2"
# Directory the inference section loads the model from.
SAVE_PATH = 'models/pt-indobart'
# Dataset column whose text is tokenized as the model input.
INPUT_TEXT_COL = 'clean_tweet'
# Dataset column whose text is tokenized as the generation target (labels).
QUADRUPLET_COL = 'quadruplet'

In [28]:
# Load the IndoNLG tokenizer and the pretrained seq2seq model from the same checkpoint.
tokenizer = IndoNLGTokenizer.from_pretrained(MODEL_CHECKPOINT)
# NOTE(review): the load warning printed by this cell reports that the checkpoint is
# of model type `mbart` and that its encoder/decoder `layer_norm` weights are not used
# by BartForConditionalGeneration. Confirm whether MBartForConditionalGeneration is the
# intended class; if it is switched, the inference cell that reloads the saved model
# must be switched to the same class at the same time.
model = BartForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

You are using a model of type mbart to instantiate a model of type bart. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at indobenchmark/indobart-v2 were not used when initializing BartForConditionalGeneration: ['model.encoder.layer_norm.bias', 'model.decoder.layer_norm.weight', 'model.decoder.layer_norm.bias', 'model.encoder.layer_norm.weight']
- This IS expected if you are initializing BartForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Data

In [4]:
# Upper bound (in tokens) applied when tokenizing; also reused as a generation limit later on.
max_length = 128


def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: A batch from the dataset, indexable by column name; it must
            provide the ``INPUT_TEXT_COL`` and ``QUADRUPLET_COL`` columns.

    Returns:
        The tokenizer output for the batch. The input column is encoded as the
        model input and the quadruplet column is passed as ``text_target``;
        both are truncated to ``max_length`` tokens.
    """
    return tokenizer(
        examples[INPUT_TEXT_COL],
        text_target=examples[QUADRUPLET_COL],
        max_length=max_length,
        truncation=True,
    )

In [8]:
# Read the CSV as a single `train` split, then hold out 10% of it (fixed seed) as `test`.
raw_dataset = load_dataset(
    'csv',
    data_files='data/quadruplet_only.csv',
    split='train',
)
splitted_dataset = raw_dataset.train_test_split(seed=42, test_size=0.1)

# Tokenize both splits in batches. The original CSV columns are removed so only the
# tokenizer outputs remain, and the map cache is bypassed (load_from_cache_file=False).
preprocessed_dataset = splitted_dataset.map(
    preprocess_function,
    batched=True,
    load_from_cache_file=False,
    remove_columns=raw_dataset.column_names,
)

Downloading and preparing dataset csv/default to C:/Users/danendra/.cache/huggingface/datasets/csv/default-5ee63b2ab57cbf46/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 499.92it/s]
                                                        

Dataset csv downloaded and prepared to C:/Users/danendra/.cache/huggingface/datasets/csv/default-5ee63b2ab57cbf46/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


                                                   

In [10]:
# Batch collator handed to the trainer below; it is built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Model Training

In [33]:
# Display the tokenized train/test splits (feature names and row counts).
preprocessed_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 346
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 39
    })
})

In [27]:
# Training configuration for the Seq2SeqTrainer run.
#
# `resume_from_checkpoint` is deliberately not set here: on TrainingArguments it is a
# stored field that expects a checkpoint *path* and is not acted on by Trainer itself,
# so passing `True` had no effect. To actually resume from a checkpoint, pass
# `resume_from_checkpoint=...` to `trainer.train(...)` instead.
training_args = Seq2SeqTrainingArguments(
    'models/test-seq2seqtrainer',  # output directory for checkpoints
    evaluation_strategy="epoch",   # evaluate at the end of every epoch
    save_strategy="epoch",         # checkpoint at the end of every epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    save_total_limit=2,            # cap on the number of checkpoints kept on disk
)

In [29]:
# Wire the model, the training configuration, both tokenized splits and the collator
# into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=preprocessed_dataset["train"],
    eval_dataset=preprocessed_dataset["test"],
)

In [31]:
# Run fine-tuning with the settings in `training_args` (evaluation and checkpointing happen once per epoch).
trainer.train()

 33%|███▎      | 44/132 [00:13<00:27,  3.17it/s]
100%|██████████| 5/5 [00:00<00:00, 44.76it/s]
                                                

{'eval_loss': 1.3629390001296997, 'eval_runtime': 0.1452, 'eval_samples_per_second': 268.523, 'eval_steps_per_second': 34.426, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]
                                                

{'eval_loss': 1.2995836734771729, 'eval_runtime': 0.1278, 'eval_samples_per_second': 305.16, 'eval_steps_per_second': 39.123, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]
                                                 

{'eval_loss': 1.2995836734771729, 'eval_runtime': 0.1233, 'eval_samples_per_second': 316.325, 'eval_steps_per_second': 40.554, 'epoch': 3.0}



100%|██████████| 132/132 [01:04<00:00,  2.03it/s]

{'train_runtime': 64.9425, 'train_samples_per_second': 15.983, 'train_steps_per_second': 2.033, 'train_loss': 1.5145768830270478, 'epoch': 3.0}





TrainOutput(global_step=132, training_loss=1.5145768830270478, metrics={'train_runtime': 64.9425, 'train_samples_per_second': 15.983, 'train_steps_per_second': 2.033, 'train_loss': 1.5145768830270478, 'epoch': 3.0})

In [32]:
# Save the trained model's weights and config.
# NOTE(review): this writes to the same literal directory that is passed to
# Seq2SeqTrainingArguments, whereas the inference section loads from SAVE_PATH
# ('models/pt-indobart'). Confirm which directory is intended; as written, the model
# saved here is not the one loaded for inference.
model.save_pretrained('models/test-seq2seqtrainer')

# Model Inference

In [43]:
# Reload the tokenizer from the base checkpoint and the model stored at SAVE_PATH,
# then move the model to the GPU for generation.
tokenizer = IndoNLGTokenizer.from_pretrained(MODEL_CHECKPOINT)
loaded_model = BartForConditionalGeneration.from_pretrained(SAVE_PATH)
loaded_model = loaded_model.to('cuda')

Kalau memakai dataset Hugging Face hasil split (mulai dari awal):

In [44]:
# Tokenize the held-out split as one padded batch and move it to the device the
# loaded model lives on (instead of assuming a device name here).
tokenized_text = tokenizer(
    splitted_dataset['test'][INPUT_TEXT_COL],
    padding=True,
    truncation=True,
    return_tensors="pt",
).to(loaded_model.device)

# The batch is padded, so hand generate() the attention mask explicitly rather than
# relying on it being inferred from the pad token id.
generated_text = loaded_model.generate(
    tokenized_text['input_ids'],
    attention_mask=tokenized_text['attention_mask'],
    max_length=100,
)

# Turn generated token ids back into plain strings, dropping special tokens.
pred_text = tokenizer.batch_decode(generated_text, skip_special_tokens=True)

In [None]:
# Convert the held-out split to a DataFrame and attach the generated predictions
# as a new column in a single step.
test_df = splitted_dataset['test'].to_pandas().assign(
    pred_quadruplet_pt_bart=pred_text
)

Kalau sudah ada dataset prediksi sebelumnya:

In [None]:
# Load the test-set predictions file (the same path this notebook writes at the end)
# and preview its first rows.
test_df = pd.read_csv('data/quadruplet_test_pred.csv')
test_df.head()

In [None]:
# Tokenize the tweets from the predictions file as one padded batch, placed on the
# same device as the model that will consume it.
tokenized_text = tokenizer(
    test_df[INPUT_TEXT_COL].values.tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
).to(loaded_model.device)

# Generate with `loaded_model` (the model reloaded from SAVE_PATH), matching the
# other inference cell. The previous code called the in-training `model` object with
# tensors left on the CPU, which mismatches devices once the trainer has moved that
# model to the GPU. The attention mask is passed because the batch is padded.
generated_text = loaded_model.generate(
    tokenized_text['input_ids'],
    attention_mask=tokenized_text['attention_mask'],
    max_length=max_length,
)

# Turn generated token ids back into plain strings, dropping special tokens.
pred_text = tokenizer.batch_decode(generated_text, skip_special_tokens=True)

In [48]:
# Store the decoded generations as the prediction column (overwrites it if already present).
test_df['pred_quadruplet_pt_bart'] = pred_text

In [None]:
# Write the predictions table to disk without the DataFrame index; this is the same
# path the "existing predictions" cell reads from.
test_df.to_csv('data/quadruplet_test_pred.csv', index=False)