In [1]:
# !pip install shiba shiba-model evaluate datasets wandb arabert  accelerate -U nltk torchmetrics==0.3.2 transformers

In [2]:
from typing import Dict
import pandas as pd
import numpy as np
import torchmetrics
import torch
import transformers
from datasets import load_dataset, Dataset
from transformers import HfArgumentParser, Trainer, EvalPrediction

from shiba import ShibaForClassification, CodepointTokenizer
from training.helpers import DataArguments, get_base_shiba_state_dict,get_model_hyperparams, ShibaClassificationArgs, \
    ClassificationDataCollator

In [3]:
# #We run extensive experiments in order to fairly
# compare JABER11 with Arabic-BERT, AraBERT,
# CAMeLBERT, ARBERT and MARBERT on the
# ALUE tasks. For all these models, we use AdamW
# optimizer with learning rate with linear decay. We
# search12 the learning rate from {7e-6, 2e-5, 5e-
# 5}, batch size from {8, 16, 32, 64, 128}, hid-
# den dropout from {0.1, 0.2, 0.3, 0.4}, and fixed
# the epoch number to 30. The aforementioned HP
# search strategy is applied to all models, and the
# best hyper-parameters are listed in Table 7 in Ap-
# pendix B.

<h1> Choose the model

In [4]:
model_path = '../checkpoint-611960.pt'
seg_enable = True
bert_model_name = "aubmindlab/bert-base-arabertv02"
apply_farasa=False
file_save = 'Submit10_16_02'

<h1> Read the files

In [5]:
transformers.logging.set_verbosity_info()
device = "cuda"
parser = HfArgumentParser((ShibaClassificationArgs, DataArguments))

prediction_label = 'offensive'

df_train = pd.read_csv("data/OSACT2020-sharedTask-train2.txt", sep="\t", quotechar='▁', header=None, names=["Feed", prediction_label, "hate"])[['Feed',prediction_label]]
df_dev = pd.read_csv("data/OSACT2020-sharedTask-dev.txt", sep="\t", quotechar='▁', header=None, names=["Feed", prediction_label, "hate"])[['Feed',prediction_label]]
df_testOrignal = pd.read_csv("data/tweets_v1.0.txt", sep="\t", quotechar='▁', header=None, names=["Feed"])

categories = {idx: cat_name for idx, cat_name in enumerate(set(df_train[prediction_label]))}
id_by_category = {val: key for key, val in categories.items()}

print("categories : ", categories)
print("id_by_category : ", id_by_category)
df_train = Dataset.from_pandas(df_train)
df_dev = Dataset.from_pandas(df_dev)
df_test = Dataset.from_pandas(df_testOrignal)

categories :  {0: 'OFF', 1: 'NOT_OFF'}
id_by_category :  {'OFF': 0, 'NOT_OFF': 1}


  df_train = pd.read_csv("data/OSACT2020-sharedTask-train2.txt", sep="\t", quotechar='▁', header=None, names=["Feed", prediction_label, "hate"])[['Feed',prediction_label]]
  df_dev = pd.read_csv("data/OSACT2020-sharedTask-dev.txt", sep="\t", quotechar='▁', header=None, names=["Feed", prediction_label, "hate"])[['Feed',prediction_label]]
  df_testOrignal = pd.read_csv("data/tweets_v1.0.txt", sep="\t", quotechar='▁', header=None, names=["Feed"])


<h1> Check files count

In [6]:
len(df_train), len(df_dev), len(df_test)

(7000, 1000, 1000)

# Pre-Process data ( if needed)

In [7]:
if seg_enable:
    from arabert.preprocess import ArabertPreprocessor

    arabert_prep = ArabertPreprocessor(model_name=bert_model_name,apply_farasa_segmentation=apply_farasa)
    # arabert_prep.preprocess()
    df_train = pd.DataFrame(df_train)
    df_dev = pd.DataFrame(df_dev)
    df_test = pd.DataFrame(df_test)
    df_train['Feed'] =  df_train['Feed'].apply(arabert_prep.preprocess)
    df_dev['Feed'] =  df_dev['Feed'].apply(arabert_prep.preprocess)
    df_test['Feed'] =  df_test['Feed'].apply(arabert_prep.preprocess)
    df_train = Dataset.from_pandas(df_train)
    df_dev = Dataset.from_pandas(df_dev)
    df_test = Dataset.from_pandas(df_test)


<h1> Load the model

In [8]:
tokenizer = CodepointTokenizer()
model_hyperparams = {'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}
print(model_hyperparams)
model = ShibaForClassification(vocab_size=len(categories), **model_hyperparams)
data_collator = ClassificationDataCollator()
print('Loading and using base shiba states from', model_path)
checkpoint_state_dict = torch.load(model_path)
model.shiba_model.load_state_dict(get_base_shiba_state_dict(checkpoint_state_dict))

{'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}




Loading and using base shiba states from ../checkpoint-611960.pt


<All keys matched successfully>

<h1>Input IDs Method

In [9]:
def process_example(example: Dict) -> Dict:
    return {
        'input_ids': tokenizer.encode(example['Feed'])['input_ids'][:model.config.max_length],
        'labels': id_by_category[example[prediction_label]]
    }
def process_exampleTemp(example: Dict) -> Dict:
    return {
        'input_ids': tokenizer.encode(example['Feed'])['input_ids'][:model.config.max_length],
        'labels': 0
    }

<h1> Compute Metrics

In [10]:
def compute_metrics(pred: EvalPrediction) -> Dict:
    try:
        # Convert predictions and labels to PyTorch tensors
        # label_probs = torch.tensor(pred.predictions)
        label_probs, embeddings = pred.predictions
        labels = torch.tensor(pred.label_ids)
        label_probs = torch.exp(torch.tensor(label_probs))  # undo the log in log softmax, get indices
        # # Compute accuracy
        # accuracy = torchmetrics.functional.accuracy(label_probs, labels, num_classes=len(categories))

        # Compute F1 score
        f1_score = torchmetrics.functional.f1(label_probs, labels, average='macro', num_classes=len(categories))

        # Compute recall
        recall = torchmetrics.functional.recall(label_probs, labels, average='macro', num_classes=len(categories))

        # Compute precision
        precision = torchmetrics.functional.precision(label_probs, labels, average='macro', num_classes=len(categories))

        # print("label_probs : ", label_probs, " labels : ", labels)

        metrics = {
            # 'accuracy': accuracy.item(),
            'f1_score': f1_score.item(),
            'recall': recall.item(),
            'precision': precision.item()
        }

        # print("metrics : ", metrics)
        # raise NotImplementedError

        return metrics
    except:
        print("pred : ", pred)
        print("pred.predictions : ", pred.predictions)
        print("label_probs : ", label_probs)
        print("label_probs.size : ", label_probs.size())
        print("labels : ", labels)
        print("labels.size() : ", labels.size())

        raise NotImplementedError

<h1> Fine-tune args

In [11]:
training_args = ShibaClassificationArgs(
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    data_seed=42,
    seed=42,
    learning_rate=5e-5,
    do_eval=True,
    do_predict=True,
    do_train=True,
    dropout=0.2,
    eval_accumulation_steps=None,
    eval_delay=0,
    eval_steps=100,
    evaluation_strategy='steps',
    gradient_accumulation_steps=1,
    num_train_epochs=10,
    output_dir="fine_result",
    prediction_loss_only=False,
    report_to=[],
    run_name="fine_result",
    save_strategy='no',
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


<h1> Setup the trainer

In [12]:
if seg_enable:
    compute_metrics = None

In [13]:
# print(all_data)
trainer = Trainer(model=model,
                args=training_args,
                data_collator=data_collator,
                train_dataset=df_train.map(process_example, remove_columns=list(df_train[0].keys())),
                eval_dataset=df_dev.map(process_example, remove_columns=list(df_dev[0].keys())),
                compute_metrics=compute_metrics,
                )

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

<h1> Train

In [14]:
training = trainer.train()

***** Running training *****
  Num examples = 7,000
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4,380
  Number of trainable parameters = 120,767,234


Step,Training Loss,Validation Loss
100,0.5161,0.498586
200,0.3606,0.5369
300,0.3406,0.284725
400,0.3309,0.391692
500,0.2413,0.270554
600,0.2122,0.279906
700,0.2255,0.266805
800,0.2282,0.282367
900,0.2253,0.364827
1000,0.1488,0.360421


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
***** 

<h1> Prediction

In [15]:
pred = trainer.predict(df_test.map(process_exampleTemp, remove_columns=list(df_test[0].keys())) )
df_testOrignal[prediction_label] = [categories[x] for x in np.argmax(pred.predictions[0], axis=1)]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


file_save<h1> Saving

In [16]:
file_save

'Submit10_16_02'

In [17]:
df_testOrignal.to_csv(file_save+'/offensive.tsv', index=False, header=False, sep="\t")
pd.read_csv(file_save+"/offensive.tsv", sep="\t").head(3)

Unnamed: 0,"أود أن أعلمكم أن التعليق المنشور هنا باسم نور لست مسؤول عنه ذلك أنني لم أكتبه باسمي المعتاد نور ,امل أن تتأكدو من البريد الالكتروني المصحوب دائما باسم نور شكرا",NOT_OFF
0,مافيه فرق بين احمد جبريل والعاهره المستأجره,OFF
1,اذا نطق السفية فلا تجبة لانة سفية وقليل الادب ...,OFF
2,اعتقد حضرتك تدعو لمؤتمر دولى للحوار للسلمي مع ...,OFF


In [18]:
print("done")

done
