In [3]:
# pip install shiba shiba-model evaluate datasets wandb arabert  accelerate -U nltk torchmetrics==0.3.2

In [1]:
from typing import Dict
import pandas as pd
import numpy as np
import torchmetrics
import torch
import transformers
from datasets import load_dataset, Dataset
from transformers import HfArgumentParser, Trainer, EvalPrediction

from shiba import ShibaForClassification, CodepointTokenizer
from training.helpers import DataArguments, get_base_shiba_state_dict,get_model_hyperparams, ShibaClassificationArgs, \
    ClassificationDataCollator

<h1> Choose the model

In [3]:
model_path = '../checkpoint-611960.pt'
seg_enable = True
bert_model_name = "aubmindlab/bert-base-arabertv02"
apply_farasa=False
file_save = 'Submit_8_02'

# Pre-Process data ( if needed)

<h1> Read the files

In [4]:
transformers.logging.set_verbosity_info()
device = "cuda"
parser = HfArgumentParser((ShibaClassificationArgs, DataArguments))

prediction_label = 'label'

df_train = pd.read_csv("data/MADAR-Corpus-26-train.tsv",names=['Text', 'label'], sep="\t")
df_dev = pd.read_csv("data/MADAR-Corpus-26-dev.tsv",names=['Text', 'label'], sep="\t")
df_testOrignal = pd.read_csv("data/MADAR-Corpus-26-test.tsv",names=['Text', 'label'], sep="\t")

categories = {idx: cat_name for idx, cat_name in enumerate(set(df_train[prediction_label]))}
id_by_category = {val: key for key, val in categories.items()}

print("categories : ", categories)
print("id_by_category : ", id_by_category)
df_train = Dataset.from_pandas(df_train)
df_dev = Dataset.from_pandas(df_dev)
df_test = Dataset.from_pandas(df_testOrignal)

categories :  {0: 'SAL', 1: 'FES', 2: 'KHA', 3: 'BEI', 4: 'DAM', 5: 'ASW', 6: 'MOS', 7: 'TUN', 8: 'RIY', 9: 'BAG', 10: 'RAB', 11: 'ALG', 12: 'JER', 13: 'MUS', 14: 'AMM', 15: 'DOH', 16: 'SFX', 17: 'CAI', 18: 'BAS', 19: 'ALE', 20: 'BEN', 21: 'MSA', 22: 'TRI', 23: 'ALX', 24: 'SAN', 25: 'JED'}
id_by_category :  {'SAL': 0, 'FES': 1, 'KHA': 2, 'BEI': 3, 'DAM': 4, 'ASW': 5, 'MOS': 6, 'TUN': 7, 'RIY': 8, 'BAG': 9, 'RAB': 10, 'ALG': 11, 'JER': 12, 'MUS': 13, 'AMM': 14, 'DOH': 15, 'SFX': 16, 'CAI': 17, 'BAS': 18, 'ALE': 19, 'BEN': 20, 'MSA': 21, 'TRI': 22, 'ALX': 23, 'SAN': 24, 'JED': 25}


In [5]:
if seg_enable:
    from arabert.preprocess import ArabertPreprocessor

    arabert_prep = ArabertPreprocessor(model_name=bert_model_name,apply_farasa_segmentation=apply_farasa)
    # arabert_prep.preprocess()
    df_train = pd.DataFrame(df_train)
    df_test =  pd.DataFrame(df_test)
    df_dev = pd.DataFrame(df_dev)
    df_train['Text'] =  df_train['Text'].apply(arabert_prep.preprocess)
    df_test['Text'] =  df_test['Text'].apply(arabert_prep.preprocess)
    df_dev['Text']=df_dev['Text'].apply(arabert_prep.preprocess)
    df_train = Dataset.from_pandas(df_train)
    df_test = Dataset.from_pandas(df_test)
    df_dev = Dataset.from_pandas(df_dev)

<h1> Check files count

In [6]:
len(df_train), len(df_dev), len(df_test)

(41600, 5200, 5200)

<h1> Load the model

In [7]:
tokenizer = CodepointTokenizer()
model_hyperparams = {'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}
print(model_hyperparams)
model = ShibaForClassification(vocab_size=len(categories), **model_hyperparams)
data_collator = ClassificationDataCollator()
print('Loading and using base shiba states from', model_path)
checkpoint_state_dict = torch.load(model_path)
model.shiba_model.load_state_dict(get_base_shiba_state_dict(checkpoint_state_dict))

{'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}




Loading and using base shiba states from ../checkpoint-611960.pt


<All keys matched successfully>

<h1>Input IDs Method

In [8]:
def process_example(example: Dict) -> Dict:
    return {
        'input_ids': tokenizer.encode(example['Text'])['input_ids'][:model.config.max_length],
        'labels': id_by_category[example[prediction_label]]
    }
def process_exampleTemp(example: Dict) -> Dict:
    return {
        'input_ids': tokenizer.encode(example['Text'])['input_ids'][:model.config.max_length],
        'labels': 0
    }

<h1> Compute Metrics

In [9]:
def compute_metrics(pred: EvalPrediction) -> Dict:
    try:
        # Convert predictions and labels to PyTorch tensors
        # label_probs = torch.tensor(pred.predictions)
        label_probs, embeddings = pred.predictions
        labels = torch.tensor(pred.label_ids)
        label_probs = torch.exp(torch.tensor(label_probs))  # undo the log in log softmax, get indices
        # # Compute accuracy
        # accuracy = torchmetrics.functional.accuracy(label_probs, labels, num_classes=len(categories))

        # Compute F1 score
        f1_score = torchmetrics.functional.f1(label_probs, labels, average='macro', num_classes=len(categories))

        # Compute recall
        recall = torchmetrics.functional.recall(label_probs, labels, average='macro', num_classes=len(categories))

        # Compute precision
        precision = torchmetrics.functional.precision(label_probs, labels, average='macro', num_classes=len(categories))

        # print("label_probs : ", label_probs, " labels : ", labels)

        metrics = {
            # 'accuracy': accuracy.item(),
            'f1_score': f1_score.item(),
            'recall': recall.item(),
            'precision': precision.item()
        }

        # print("metrics : ", metrics)
        # raise NotImplementedError

        return metrics
    except:
        print("pred : ", pred)
        print("pred.predictions : ", pred.predictions)
        print("label_probs : ", label_probs)
        print("label_probs.size : ", label_probs.size())
        print("labels : ", labels)
        print("labels.size() : ", labels.size())

        raise NotImplementedError

<h1> Fine-tune args

In [10]:
training_args = ShibaClassificationArgs(
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    data_seed=42,
    seed=42,
    do_eval=True,
    do_predict=True,
    do_train=True,
    dropout=0.2,
    eval_accumulation_steps=None,
    eval_delay=0,
    eval_steps=100,
    evaluation_strategy='steps',
    gradient_accumulation_steps=1,
    num_train_epochs=10,
    output_dir="fine_result",
    prediction_loss_only=False,
    report_to=[],
    run_name="fine_result",
    save_strategy='no',
)

PyTorch: setting up devices


<h1> Setup the trainer

In [11]:
# print(all_data)
trainer = Trainer(model=model,
                args=training_args,
                data_collator=data_collator,
                train_dataset=df_train.map(process_example, remove_columns=list(df_train[0].keys())),
                eval_dataset=df_dev.map(process_example, remove_columns=list(df_dev[0].keys())),
                compute_metrics=compute_metrics,
                )

Map:   0%|          | 0/41600 [00:00<?, ? examples/s]

Map:   0%|          | 0/5200 [00:00<?, ? examples/s]

Using auto half precision backend


<h1> Train

In [None]:
training = trainer.train()

***** Running training *****
  Num examples = 41,600
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 52,000
  Number of trainable parameters = 120,785,690


Step,Training Loss,Validation Loss,F1 Score,Recall,Precision
100,3.2629,3.259497,0.023093,0.038462,0.034581


***** Running Evaluation *****
  Num examples = 5200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 5200
  Batch size = 8


<h1> Prediction

In [None]:
pred = trainer.predict(df_test.map(process_exampleTemp, remove_columns=list(df_test[0].keys())))
df_testOrignal[prediction_label] = [categories[x] for x in np.argmax(pred.predictions[0], axis=1)]

<h1> Saving

In [None]:
df_testOrignal[[prediction_label]].to_csv(file_save+'/madar.tsv', index=False, header=False, sep="\t")
pd.read_csv(file_save+"/madar.tsv", sep="\t").head(3)

In [None]:
print("done")