# install `transformers`, `datasets`, `git-lfs`

In [None]:
!pip install transformers[sentencepiece]
!pip install datasets
!apt-get install git-lfs

# log in to `huggingface`

In [None]:
# Hugging Face access token; it is handed to the training arguments as
# `push_to_hub_token` further down. Left empty here -- do not commit a real
# token together with the notebook.
my_token = ""

In [None]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub. The call takes no arguments,
# so `my_token` is not passed to it here.
login()

# import

In [None]:
import transformers
from transformers import (AutoTokenizer, 
                          PreTrainedTokenizer,
                          AutoTokenizer,
                          AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments,
                          Seq2SeqTrainer
)
from datasets import load_dataset

from tokenizers import Tokenizer


# Hub id of the pretrained checkpoint used for both the tokenizer and the model
AraBART = "moussaKam/AraBART"
# Hub id of the dataset; its "arabic" configuration is loaded later on
data = "csebuetnlp/xlsum"
# Last expression of the cell: displays the installed transformers version
transformers.__version__

# load dataset from huggingface hub

In [None]:
# Load the "arabic" configuration of the dataset named by `data`.
dataset = load_dataset(path=data, name="arabic")
# Bare expression so the notebook displays what was loaded.
dataset

# load tokenizer for `AraBART` model

In [None]:
# Load the tokenizer that belongs to the AraBART checkpoint.
tokenizer = AutoTokenizer.from_pretrained( AraBART )

In [None]:
# Inspect one raw training example (index 1) before any tokenization.
dataset['train'][1]

In [None]:
# Token budgets used for truncation: articles (inputs) and summaries (targets).
max_input_length = 1024
max_target_length = 128


def preprocessing(rows):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        rows: Batch mapping with a "text" sequence (model inputs) and a
            "summary" sequence (targets), e.g. a slice of a dataset split or
            the batch passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the texts, extended with a "labels" entry
        that holds the token ids of the tokenized summaries.
    """
    model_inputs = tokenizer(
        list(rows["text"]), max_length=max_input_length, truncation=True
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager: it tokenizes the
    # summaries in target mode without switching tokenizer state.
    labels = tokenizer(
        text_target=rows["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Smoke-test the preprocessing on a one-example batch from the train split.
preprocessing(dataset["train"][:1])

In [None]:
# Run `preprocessing` over the loaded dataset; batched=True hands it batches
# of rows per call instead of single rows.
tokenized_dataset = dataset.map(preprocessing, batched=True)

In [None]:
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
traind_model = AutoModelForSeq2SeqLM.from_pretrained( AraBART )

In [None]:
# Per-device batch size shared by training and evaluation.
batch_size = 4
arguments = Seq2SeqTrainingArguments(
    "AraBART-summ",  # output directory for checkpoints
    # Takes a single strategy name ("no", "steps" or "epoch"), not a list.
    # Metrics such as ROUGE are not configured here; they need a
    # `compute_metrics` callback on the trainer.
    # NOTE(review): recent transformers releases call this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    push_to_hub=True,
    # NOTE(review): `my_token` is an empty string as written in this
    # notebook -- confirm a real token is filled in before running.
    push_to_hub_token=my_token,
    # Evaluate with generate() so predictions are decoded sequences.
    predict_with_generate=True,
)

In [None]:
# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=traind_model)

In [None]:
# Bundle the model, training arguments, tokenized splits, collator and
# tokenizer into one trainer; every argument is passed by keyword.
trainer = Seq2SeqTrainer(
    model=traind_model,
    args=arguments,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Upload the trained model to the Hub; the string is passed as the commit
# message of that upload (the first parameter of `push_to_hub`).
trainer.push_to_hub(commit_message="AraBART-summ")