# install `transformers`, `datasets`, `git-lfs`

In [1]:
!pip install transformers[sentencepiece]
!pip install datasets
!apt-get install git-lfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 98.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 86.8 MB/s 
[?25hCollecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 68.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers, sentencepiece
Successfully installed huggingface-hub-0.11.1 sentencepiece-0.1.97 tokenizer

# log in to the Hugging Face Hub

In [2]:
import os

# Read the Hub write token from the HF_TOKEN environment variable instead of
# pasting it into the notebook, so a saved or shared copy never contains the
# secret. Falls back to the original empty string when the variable is unset.
my_token = os.environ.get("HF_TOKEN", "")

In [3]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub (called with no arguments);
# the cell output reports where the token was stored.
login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


# imports and configuration

In [4]:
import transformers
from transformers import (AutoTokenizer, 
                          PreTrainedTokenizer,
                          AutoTokenizer,
                          AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments,
                          Seq2SeqTrainer
)
from datasets import load_dataset

from tokenizers import Tokenizer


# Hub id of the pretrained checkpoint; passed to the `from_pretrained` loaders below
AraBART = "moussaKam/AraBART"
# Hub id of the dataset; passed to `load_dataset`
data = "csebuetnlp/xlsum"
# Display the installed transformers version (last expression of the cell)
transformers.__version__

'4.25.1'

# load dataset from huggingface hub

In [5]:
# Load the "arabic" configuration of the dataset named by `data`, then display
# the resulting splits and their columns.
dataset = load_dataset( data , "arabic")
dataset

Downloading builder script:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

Downloading and preparing dataset xlsum/arabic to /root/.cache/huggingface/datasets/csebuetnlp___xlsum/arabic/2.0.0/518ab0af76048660bcc2240ca6e8692a977c80e384ffb18fdddebaca6daebdce...


Downloading data:   0%|          | 0.00/44.9M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset xlsum downloaded and prepared to /root/.cache/huggingface/datasets/csebuetnlp___xlsum/arabic/2.0.0/518ab0af76048660bcc2240ca6e8692a977c80e384ffb18fdddebaca6daebdce. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'summary', 'text'],
        num_rows: 37519
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'summary', 'text'],
        num_rows: 4689
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'summary', 'text'],
        num_rows: 4689
    })
})

# load tokenizer for `AraBART` model

In [6]:
# Load the tokenizer published under the AraBART checkpoint id.
tokenizer = AutoTokenizer.from_pretrained( AraBART )

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [7]:
# Peek at one training record (index 1) to see the available fields.
dataset['train'][1]

{'id': '130528_egypt_nile_dam',
 'url': 'https://www.bbc.com/arabic/middleeast/2013/05/130528_egypt_nile_dam',
 'title': 'هل يفرض سد النهضة الإثيوبي واقعا جديدا على مصر؟',
 'summary': '"هل سيتم تغيير العبارة الشهيرة للمؤرخ اليوناني هيرودوت أن "مصر هبة النيل"، تساؤل طرحه مدونون وناشطون مصريون على مواقع التواصل الإجتماعي بعد أن اعلنت الحكومة الإثيوبية بدء تحويل مجرى النيل الازرق أحد روافد نهر النيل تمهيدا لبناء سد "النهضة" الإثيوبي.',
 'text': 'بحلول عام 2050 ستحتاج مصر إلى 21 مليار متر مكعب فوق حصتها الحالية بحسب خبراء وأثار هذا الإعلان ردود فعل غاضبة في مصر وصل بعضها إلى التساؤل عن إمكانية تنفيذ عملية عسكرية ضد إثيوبيا لمنع بناء السد لكن محمد نصر الدين علام، وزير الموارد المائية والرى الأسبق استبعد هذا الخيار مؤكدا أن النزاع حول ملف "مياه النيل" لايمكن حله سوى بالطريق السلمي والمفاوضات. والنيل الأزرق هو أحد فرعي نهر النيل وهو الذي يمد مصر بنحو 60 في المئة من حصتها السنوية من مياه النيل التي تبلغ 55 مليار مليمتر مكعب سنويا. "استباق للأحداث" واعتبر خبير المياه نادر نورالدين التأثير على ح

In [8]:
# Truncation limits (in tokens) for the article text and the reference summary.
max_input_length = 1024
max_target_length = 128

def preprocessing(rows):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        rows: Batch mapping with a "text" entry (articles) and a "summary"
            entry (reference summaries), each a sequence of strings.

    Returns:
        The tokenizer output for the articles, truncated to
        ``max_input_length`` tokens, with an added "labels" entry holding the
        summaries' token ids truncated to ``max_target_length`` tokens.
    """
    model_inputs = tokenizer(list(rows["text"]), max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes the summaries in target mode. It replaces the
    # `as_target_tokenizer()` context manager, which is deprecated since
    # transformers 4.22 and scheduled for removal.
    labels = tokenizer(text_target=rows["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
# Smoke-test the preprocessing function on a one-row slice of the training split.
preprocessing(dataset["train"][:1])



{'input_ids': [[0, 193, 164, 26255, 11327, 4, 38570, 8401, 34042, 1473, 4, 78, 560, 16451, 130, 675, 24051, 6, 18399, 7, 38007, 622, 11509, 885, 822, 8, 2051, 6496, 28, 32849, 33, 5, 5, 26680, 10640, 4, 5, 4960, 2722, 6, 9637, 66, 2872, 782, 4, 160, 1051, 1681, 7309, 28, 32849, 496, 44, 1119, 473, 192, 14, 122, 1922, 495, 234, 561, 24051, 490, 1559, 19284, 192, 22153, 161, 1473, 72, 675, 1780, 30145, 2322, 37921, 1998, 8, 4807, 26255, 29, 5, 2051, 5, 26680, 10640, 10, 20845, 30, 38516, 6103, 39, 7, 103, 2287, 11, 8653, 1327, 24051, 8, 2587, 276, 279, 1697, 6170, 5, 34848, 7, 1745, 78, 35701, 497, 708, 164, 984, 6075, 3092, 66, 960, 26, 4043, 4642, 1644, 63, 8292, 56, 2062, 8, 11, 16, 16284, 10, 40638, 136, 356, 4, 2929, 14324, 4, 1459, 429, 297, 5979, 7572, 279, 189, 149, 19907, 622, 11509, 635, 8, 946, 5022, 845, 24051, 5, 18399, 23, 929, 3257, 3722, 17, 3829, 1804, 51, 9236, 25377, 995, 697, 145, 7, 5903, 5252, 122, 8324, 311, 29, 4, 4215, 473, 473, 192, 5, 18399, 4, 72, 675, 1780, 8

In [10]:
# Tokenize every split; batched=True hands `preprocessing` a batch of rows per call.
tokenized_dataset = dataset.map(preprocessing, batched=True)

  0%|          | 0/38 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [11]:
# Load the pretrained AraBART checkpoint as a seq2seq LM; this is the model handed to the trainer.
traind_model = AutoModelForSeq2SeqLM.from_pretrained( AraBART )

Downloading:   0%|          | 0.00/557M [00:00<?, ?B/s]

In [12]:
# Per-device batch size, shared by training and evaluation.
batch_size = 4

# Fine-tuning configuration. "arabartsummarization" is the output directory.
arguments = Seq2SeqTrainingArguments(
    output_dir="arabartsummarization",
    evaluation_strategy="epoch",  # evaluate on the validation split after each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    push_to_hub=True,
    # `hub_token` replaces the deprecated `push_to_hub_token`. An empty
    # `my_token` becomes None so the credentials cached by `login()` are used
    # instead of sending an empty token.
    hub_token=my_token or None,
    predict_with_generate=True,
)



In [13]:
# Seq2seq collator used to assemble batches of tokenized examples for the trainer;
# it receives the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=traind_model)

In [14]:
# Assemble the trainer from the model, the training arguments, the train and
# validation splits of the tokenized dataset, the seq2seq collator and the tokenizer.
trainer = Seq2SeqTrainer(
    model=traind_model,
    args=arguments,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/abdalrahmanshahrour/arabartsummarization into local empty directory.


In [15]:
# Run fine-tuning; the displayed TrainOutput carries the final step count, loss and runtime metrics.
trainer.train()

The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, url, text, title, id. If summary, url, text, title, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 37519
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 28140
  Number of trainable parameters = 139221504
You're using a BarthezTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.784,2.382031
2,2.4954,2.341753
3,2.2223,2.339382


Saving model checkpoint to arabartsummarization/checkpoint-500
Configuration saved in arabartsummarization/checkpoint-500/config.json
Model weights saved in arabartsummarization/checkpoint-500/pytorch_model.bin
tokenizer config file saved in arabartsummarization/checkpoint-500/tokenizer_config.json
Special tokens file saved in arabartsummarization/checkpoint-500/special_tokens_map.json
tokenizer config file saved in arabartsummarization/tokenizer_config.json
Special tokens file saved in arabartsummarization/special_tokens_map.json
Saving model checkpoint to arabartsummarization/checkpoint-1000
Configuration saved in arabartsummarization/checkpoint-1000/config.json
Model weights saved in arabartsummarization/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in arabartsummarization/checkpoint-1000/tokenizer_config.json
Special tokens file saved in arabartsummarization/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in arabartsummarization/tokenizer_config.

TrainOutput(global_step=28140, training_loss=2.557509839153493, metrics={'train_runtime': 4538.8659, 'train_samples_per_second': 24.798, 'train_steps_per_second': 6.2, 'total_flos': 5.414310804794573e+16, 'train_loss': 2.557509839153493, 'epoch': 3.0})

In [None]:
# The first positional parameter of `Trainer.push_to_hub` is the commit message,
# not the repository name, so pass a real message by keyword. The target
# repository is taken from the training arguments.
trainer.push_to_hub(commit_message="Fine-tune AraBART on XL-Sum Arabic summarization")