In [1]:
# pip install shiba shiba-model evaluate datasets wandb arabert  accelerate -U nltk torchmetrics==0.3.2

In [4]:
from typing import Dict
import pandas as pd
import numpy as np
import torchmetrics
import torch
import transformers
from datasets import load_dataset, Dataset
from transformers import HfArgumentParser, Trainer, EvalPrediction

from shiba import ShibaForClassification, CodepointTokenizer
from training.helpers import DataArguments, get_base_shiba_state_dict,get_model_hyperparams, ShibaClassificationArgs, \
    ClassificationDataCollator

<h1> Choose the model

In [5]:
model_path = '../checkpoint-611960.pt'
seg_enable = True
bert_model_name = "aubmindlab/bert-base-arabertv02"
apply_farasa=False
file_save = 'SEC'

# Pre-Process data ( if needed)

<h1> Read the files

In [6]:
transformers.logging.set_verbosity_info()
device = "cuda"
parser = HfArgumentParser((ShibaClassificationArgs, DataArguments))

prediction_label = 'gold_label'

df_train = pd.read_csv("data/arabic_train.tsv", sep="\t")
df_testOrignal = pd.read_csv("data/arabic_dev.tsv", sep="\t")

categories = {idx: cat_name for idx, cat_name in enumerate(set(df_train[prediction_label]))}
id_by_category = {val: key for key, val in categories.items()}

print("categories : ", categories)
print("id_by_category : ", id_by_category)
df_train = Dataset.from_pandas(df_train)
df_test = Dataset.from_pandas(df_testOrignal)

categories :  {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
id_by_category :  {'entailment': 0, 'neutral': 1, 'contradiction': 2}


In [7]:
if seg_enable:
    from arabert.preprocess import ArabertPreprocessor

    arabert_prep = ArabertPreprocessor(model_name=bert_model_name,apply_farasa_segmentation=apply_farasa)
    # arabert_prep.preprocess()
    df_train = pd.DataFrame(df_train)
    df_test =  pd.DataFrame(df_test)

    df_train['sentence1'] =  df_train['sentence1'].apply(arabert_prep.preprocess)
    df_train['sentence2'] =  df_train['sentence2'].apply(arabert_prep.preprocess)
    df_test['sentence1'] =  df_test['sentence1'].apply(arabert_prep.preprocess)
    df_test['sentence2'] =  df_test['sentence2'].apply(arabert_prep.preprocess)

    df_train = Dataset.from_pandas(df_train)
    df_test = Dataset.from_pandas(df_test)

<h1> Check files count

In [8]:
len(df_train), len(df_test)

(5010, 2490)

<h1> Load the model

In [9]:
tokenizer = CodepointTokenizer()
model_hyperparams = {'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}
print(model_hyperparams)
model = ShibaForClassification(vocab_size=len(categories), **model_hyperparams)
data_collator = ClassificationDataCollator()
print('Loading and using base shiba states from', model_path)
checkpoint_state_dict = torch.load(model_path)
model.shiba_model.load_state_dict(get_base_shiba_state_dict(checkpoint_state_dict))

{'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}




Loading and using base shiba states from ../checkpoint-611960.pt


<All keys matched successfully>

<h1>Input IDs Method

In [10]:
def process_example(example: Dict) -> Dict:
    return {
        'input_ids': tokenizer.encode([example['sentence1'], example['sentence2']])['input_ids'][:model.config.max_length],
        'labels': id_by_category[example[prediction_label]]
    }
def process_exampleTemp(example: Dict) -> Dict:
    return {
        'input_ids': tokenizer.encode([example['sentence1'], example['sentence2']])['input_ids'][:model.config.max_length],
        'labels': 0
    }

<h1> Compute Metrics

In [11]:
def compute_metrics(pred: EvalPrediction) -> Dict:
    try:
        # Convert predictions and labels to PyTorch tensors
        # label_probs = torch.tensor(pred.predictions)
        label_probs, embeddings = pred.predictions
        labels = torch.tensor(pred.label_ids)
        label_probs = torch.exp(torch.tensor(label_probs))  # undo the log in log softmax, get indices
        # # Compute accuracy
        # accuracy = torchmetrics.functional.accuracy(label_probs, labels, num_classes=len(categories))

        # Compute F1 score
        f1_score = torchmetrics.functional.f1(label_probs, labels, average='macro', num_classes=len(categories))

        # Compute recall
        recall = torchmetrics.functional.recall(label_probs, labels, average='macro', num_classes=len(categories))

        # Compute precision
        precision = torchmetrics.functional.precision(label_probs, labels, average='macro', num_classes=len(categories))

        # print("label_probs : ", label_probs, " labels : ", labels)

        metrics = {
            # 'accuracy': accuracy.item(),
            'f1_score': f1_score.item(),
            'recall': recall.item(),
            'precision': precision.item()
        }

        # print("metrics : ", metrics)
        # raise NotImplementedError

        return metrics
    except:
        print("pred : ", pred)
        print("pred.predictions : ", pred.predictions)
        print("label_probs : ", label_probs)
        print("label_probs.size : ", label_probs.size())
        print("labels : ", labels)
        print("labels.size() : ", labels.size())

        raise NotImplementedError

<h1> Fine-tune args

In [12]:
training_args = ShibaClassificationArgs(
    per_device_train_batch_size=16,
    data_seed=42,
    seed=42,
    do_eval=True,
    do_predict=True,
    do_train=True,
    dropout=0.2,
    evaluation_strategy='no',
    gradient_accumulation_steps=1,
    num_train_epochs=10,
    output_dir="fine_result",
    prediction_loss_only=False,
    report_to=[],
    run_name="fine_result",
    save_strategy='no',
    learning_rate=2e-07
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


In [13]:
training_args



<h1> Setup the trainer

In [14]:
if seg_enable:
    compute_metrics=None

In [15]:
# print(all_data)
trainer = Trainer(model=model,
                args=training_args,
                data_collator=data_collator,
                train_dataset=df_train.map(process_example, remove_columns=list(df_train[0].keys())),
                eval_dataset=df_train.map(process_example, remove_columns=list(df_train[0].keys())),
                compute_metrics=compute_metrics,
                )

Map:   0%|          | 0/5010 [00:00<?, ? examples/s]

Map:   0%|          | 0/5010 [00:00<?, ? examples/s]

<h1> Train

In [16]:
training = trainer.train()

***** Running training *****
  Num examples = 5,010
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3,140
  Number of trainable parameters = 120,768,003


Step,Training Loss
1,1.1404
100,1.1011
200,1.1035
300,1.1017
400,1.0799
500,1.0503
600,1.0339
700,0.9512
800,0.9287
900,0.9438




Training completed. Do not forget to share your model on huggingface.co/models =)




<h1> Prediction

In [14]:
pred = trainer.predict(df_test.map(process_exampleTemp, remove_columns=list(df_test[0].keys())))
df_testOrignal['prediction'] = [categories[x] for x in np.argmax(pred.predictions[0], axis=1)]




Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

***** Running Prediction *****
  Num examples = 2490
  Batch size = 64


<h1> Saving

In [15]:
df_testOrignal[['pairID', 'prediction']].to_csv(file_save+'/xnli.tsv', index=False, sep="\t")
pd.read_csv(file_save+"/xnli.tsv", sep="\t").head(3)

Unnamed: 0,pairID,prediction
0,1,neutral
1,2,contradiction
2,3,neutral


In [16]:
diagnostic_data = pd.read_csv("data/diagnostic.tsv", sep="\t")


In [17]:
if seg_enable:
    diagnostic_data['sentence2'] =  diagnostic_data['sentence2'].apply(arabert_prep.preprocess)
    diagnostic_data['sentence1'] =  diagnostic_data['sentence1'].apply(arabert_prep.preprocess)


In [18]:
df_diagnostic = Dataset.from_pandas(diagnostic_data)
diagnostic_pred = trainer.predict(df_diagnostic.map(process_exampleTemp, remove_columns=list(df_diagnostic[0].keys())))
diagnostic_data['prediction'] = [categories[x] for x in np.argmax(diagnostic_pred.predictions[0], axis=1)]


Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

***** Running Prediction *****
  Num examples = 1147
  Batch size = 64


In [19]:
diagnostic_data[['pairID', 'prediction']].to_csv(file_save+'/diagnostic.tsv', index=False, sep="\t")
pd.read_csv(file_save+"/diagnostic.tsv", sep="\t").head(3)

Unnamed: 0,pairID,prediction
0,0,neutral
1,1,neutral
2,2,contradiction
