In [1]:
# !pip install shiba shiba-model evaluate datasets wandb arabert  accelerate -U nltk torchmetrics==0.3.2 transformers

In [2]:
import os
from typing import Dict

import numpy as np
import pandas as pd
import torch
import torchmetrics
import transformers
from datasets import load_dataset, Dataset
from transformers import HfArgumentParser, Trainer, EvalPrediction

from shiba import ShibaForClassification, CodepointTokenizer
from training.helpers import DataArguments, get_base_shiba_state_dict,get_model_hyperparams, ShibaClassificationArgs, \
    ClassificationDataCollator

<h1>Choose the model</h1>

In [3]:
# --- Run configuration ---
# Checkpoint file passed to torch.load when the base SHIBA weights are restored.
model_path = '../checkpoint-611960.pt'
# Directory prefix for the submission file (written as <file_save>/q2q.tsv).
file_save = 'Submit_64_05'

# AraBERT text preprocessing: the preprocessing cell only runs when seg_enable is True.
seg_enable = True
bert_model_name = "aubmindlab/bert-base-arabertv02"  # model_name given to ArabertPreprocessor
apply_farasa = False  # forwarded as apply_farasa_segmentation

<h1>Read the files</h1>

In [4]:
# Switch transformers logging to INFO so training/evaluation progress is printed.
transformers.logging.set_verbosity_info()
# NOTE(review): `device` and `parser` are not referenced by any other visible cell.
device = "cuda"
parser = HfArgumentParser((ShibaClassificationArgs, DataArguments))

# Name of the target column in the training TSV.
prediction_label = 'label'

df_train = pd.read_csv("data/q2q_similarity_workshop_v2.1.tsv", sep="\t")
df_testOrignal = pd.read_csv("data/q2q_no_labels_v1.0.tsv", sep="\t")

# Sort the distinct labels before numbering them: iterating a bare set has no
# guaranteed order, so without sorting the label <-> id mapping could differ
# between runs and silently mis-label predictions.
categories = dict(enumerate(sorted(set(df_train[prediction_label]))))
id_by_category = {val: key for key, val in categories.items()}

print("categories : ", categories)
print("id_by_category : ", id_by_category)
# Wrap both frames as `datasets.Dataset` objects; the raw test frame is kept in
# `df_testOrignal` because the predictions are attached to it later.
df_train = Dataset.from_pandas(df_train)
df_test = Dataset.from_pandas(df_testOrignal)

categories :  {0: 0, 1: 1}
id_by_category :  {0: 0, 1: 1}


In [5]:
# Display the test frame exactly as read from the TSV (no preprocessing applied yet).
df_testOrignal

Unnamed: 0,QuestionPairID,question1,question2
0,1,كم عدد حروف الفاتحة؟,كيف تكون فقيهاً؟
1,2,هل حلال أكل الضبع؟,هل أكل الضبع حلال أم حرام؟
2,3,كم عدد الركعات في كل صلاة؟,كم عدد ركعات الصلوات المفروضة؟
3,4,كيف أؤمن بالله؟,كيف أكون مؤمناً؟
4,5,لماذا سميت حواء بهذا الاسم؟,كيف عذب الله قوم ثمود؟
...,...,...,...
3995,3996,كيف تحصل على الرزق؟,لماذا خلق الله الإنسان؟
3996,3997,كيف يكون الجهاد باللسان؟,أين غرق فرعون؟
3997,3998,كيف تتعلم قراءة القرآن بطريقة صحيحة؟,كيف أتعلم تلاوة القرآن الكريم؟
3998,3999,كيف ازيد كثافة شعري؟,كيف تجعل شعرك كثيف؟


<h1>Check files count</h1>

In [6]:
# Row counts of the training and test datasets, shown as a (train, test) tuple.
len(df_train), len(df_test)

(11997, 4000)

# Pre-process data (if needed)

In [7]:
if seg_enable:
    from arabert.preprocess import ArabertPreprocessor

    # Build the AraBERT preprocessor from the configuration cell's settings.
    arabert_prep = ArabertPreprocessor(
        model_name=bert_model_name,
        apply_farasa_segmentation=apply_farasa,
    )

    # Work on pandas copies of both splits, then convert back to Datasets.
    train_frame = pd.DataFrame(df_train)
    test_frame = pd.DataFrame(df_test)

    # Same preprocessing for both question columns of both splits
    # (train first, then test; question1 before question2).
    for frame in (train_frame, test_frame):
        for column in ('question1', 'question2'):
            frame[column] = frame[column].apply(arabert_prep.preprocess)

    df_train = Dataset.from_pandas(train_frame)
    df_test = Dataset.from_pandas(test_frame)
    print("done")


done


In [8]:
# Show the test split as a DataFrame after the (optional) AraBERT preprocessing cell.
pd.DataFrame(df_test)

Unnamed: 0,QuestionPairID,question1,question2
0,1,كم عدد حروف الفاتحة ؟,كيف تكون فقيها ؟
1,2,هل حلال أكل الضبع ؟,هل أكل الضبع حلال أم حرام ؟
2,3,كم عدد الركعات في كل صلاة ؟,كم عدد ركعات الصلوات المفروضة ؟
3,4,كيف أؤمن بالله ؟,كيف أكون مؤمنا ؟
4,5,لماذا سميت حواء بهذا الاسم ؟,كيف عذب الله قوم ثمود ؟
...,...,...,...
3995,3996,كيف تحصل على الرزق ؟,لماذا خلق الله الإنسان ؟
3996,3997,كيف يكون الجهاد باللسان ؟,أين غرق فرعون ؟
3997,3998,كيف تتعلم قراءة القرآن بطريقة صحيحة ؟,كيف أتعلم تلاوة القرآن الكريم ؟
3998,3999,كيف ازيد كثافة شعري ؟,كيف تجعل شعرك كثيف ؟


<h1>Load the model</h1>

In [9]:
tokenizer = CodepointTokenizer()
model_hyperparams = {'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}
print(model_hyperparams)
# The classifier's vocab_size is set to the number of distinct training labels.
model = ShibaForClassification(vocab_size=len(categories), **model_hyperparams)
data_collator = ClassificationDataCollator()
print('Loading and using base shiba states from', model_path)
# map_location='cpu': the freshly built model lives on the CPU here, so load the
# checkpoint tensors onto the CPU as well. This avoids failing on machines
# without the device the checkpoint was saved from, and avoids holding a second
# copy of the weights in GPU memory.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint_state_dict = torch.load(model_path, map_location='cpu')
# Only the base SHIBA weights are restored into model.shiba_model.
model.shiba_model.load_state_dict(get_base_shiba_state_dict(checkpoint_state_dict))

{'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}




Loading and using base shiba states from ../checkpoint-611960.pt


<All keys matched successfully>

<h1>Input IDs Method</h1>

In [10]:
def process_example(example: Dict) -> Dict:
    """Turn one labelled question pair into model inputs.

    The two questions are encoded together with the module-level ``tokenizer``,
    the resulting ids are cut to ``model.config.max_length``, and the row's
    label (column ``prediction_label``) is mapped through ``id_by_category``.

    Args:
        example: A dataset row holding ``question1``, ``question2`` and the
            label column.

    Returns:
        A dict with ``input_ids`` and ``labels``.
    """
    question_pair = [example['question1'], example['question2']]
    encoded_ids = tokenizer.encode(question_pair)['input_ids']
    truncated_ids = encoded_ids[:model.config.max_length]
    label_id = id_by_category[example[prediction_label]]
    return {'input_ids': truncated_ids, 'labels': label_id}
def process_exampleTemp(example: Dict) -> Dict:
    """Turn one unlabelled question pair into model inputs.

    Encodes the pair the same way as the labelled variant (module-level
    ``tokenizer``, truncated to ``model.config.max_length``) but always emits
    the constant label ``0``, since the row carries no label of its own.

    Args:
        example: A dataset row holding ``question1`` and ``question2``.

    Returns:
        A dict with ``input_ids`` and a placeholder ``labels`` value of 0.
    """
    question_pair = [example['question1'], example['question2']]
    encoded_ids = tokenizer.encode(question_pair)['input_ids']
    truncated_ids = encoded_ids[:model.config.max_length]
    return {'input_ids': truncated_ids, 'labels': 0}

<h1>Compute Metrics</h1>

In [11]:
def compute_metrics(pred: EvalPrediction) -> Dict:
    """Compute macro-averaged F1, recall and precision for an evaluation pass.

    ``pred.predictions`` is unpacked as a two-element ``(label scores,
    embeddings)`` pair; only the label scores are used. They are exponentiated
    before scoring (the original author's note: "undo the log in log softmax").

    Args:
        pred: Evaluation output providing ``predictions`` and ``label_ids``.

    Returns:
        A dict with the float entries ``f1_score``, ``recall`` and ``precision``.

    Raises:
        NotImplementedError: If anything goes wrong while computing the
            metrics. Diagnostics are printed first and the original error is
            chained as ``__cause__``. (The exception type is kept for backward
            compatibility.)
    """
    # Pre-bind so the diagnostics below never hit an unbound local.
    label_probs = None
    labels = None
    try:
        label_log_probs, _embeddings = pred.predictions
        labels = torch.tensor(pred.label_ids)
        label_probs = torch.exp(torch.tensor(label_log_probs))  # undo the log in log softmax

        num_classes = len(categories)
        f1_score = torchmetrics.functional.f1(label_probs, labels, average='macro', num_classes=num_classes)
        recall = torchmetrics.functional.recall(label_probs, labels, average='macro', num_classes=num_classes)
        precision = torchmetrics.functional.precision(label_probs, labels, average='macro', num_classes=num_classes)

        return {
            'f1_score': f1_score.item(),
            'recall': recall.item(),
            'precision': precision.item()
        }
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate untouched.
        print("pred : ", pred)
        print("pred.predictions : ", pred.predictions)
        print("label_probs : ", label_probs)
        if label_probs is not None:
            print("label_probs.size : ", label_probs.size())
        print("labels : ", labels)
        if labels is not None:
            print("labels.size() : ", labels.size())

        raise NotImplementedError("compute_metrics failed; see the chained exception for the cause") from exc

<h1>Fine-tune args</h1>

In [12]:
training_args = ShibaClassificationArgs(
    # Output / bookkeeping
    output_dir="fine_result",
    run_name="fine_result",
    report_to=[],
    save_strategy='no',
    # Reproducibility
    seed=42,
    data_seed=42,
    # Phases to run
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Training
    num_train_epochs=10,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,
    # NOTE(review): the model itself is constructed with dropout 0.1 in
    # model_hyperparams; whether this value is consumed anywhere is not visible
    # here and should be confirmed.
    dropout=0.5,
    # Evaluation (step-based, every 100 steps)
    evaluation_strategy='steps',
    eval_steps=100,
    eval_delay=0,
    eval_accumulation_steps=None,
    per_device_eval_batch_size=64,
    prediction_loss_only=False,
)

PyTorch: setting up devices


<h1>Set up the trainer</h1>

In [13]:
# Tokenize the training split once and reuse the result for both roles below,
# rather than running the identical map twice. `column_names` lists the
# original columns to drop and, unlike indexing row 0, also works on an empty
# dataset.
train_encoded = df_train.map(process_example, remove_columns=df_train.column_names)

trainer = Trainer(model=model,
                args=training_args,
                data_collator=data_collator,
                train_dataset=train_encoded,
                # NOTE(review): evaluation runs on the training data itself, so
                # the reported "validation" loss/F1 measure fit to the training
                # set, not generalization. Use a held-out split for a real
                # validation signal.
                eval_dataset=train_encoded,
                compute_metrics=compute_metrics,
                )

Map:   0%|          | 0/11997 [00:00<?, ? examples/s]

Map:   0%|          | 0/11997 [00:00<?, ? examples/s]

Using auto half precision backend


<h1>Train</h1>

In [14]:
# Run fine-tuning; whatever trainer.train() returns is kept in `training`.
training = trainer.train()

***** Running training *****
  Num examples = 11,997
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1,880
  Number of trainable parameters = 120,767,234


Step,Training Loss,Validation Loss,F1 Score,Recall,Precision
100,0.6917,0.684693,0.526315,0.546226,0.56268
200,0.6717,0.707321,0.571254,0.59637,0.608647
300,0.6386,0.606477,0.670567,0.674761,0.673328
400,0.596,0.512078,0.734081,0.73266,0.737752
500,0.517,0.437938,0.79086,0.792494,0.790262
600,0.4267,0.303548,0.865009,0.865195,0.864835
700,0.3818,0.271692,0.885414,0.89015,0.886281
800,0.3197,0.178582,0.92768,0.92556,0.931016
900,0.246,0.130004,0.95058,0.949898,0.951373
1000,0.1988,0.103894,0.961594,0.962525,0.960856


***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch size = 64
***** Running Evaluation *****
  Num examples = 11997
  Batch si

<h1>Prediction</h1>

In [15]:
# Tokenize the unlabelled test pairs (placeholder labels) and run inference.
test_columns = list(df_test[0].keys())
test_encoded = df_test.map(process_exampleTemp, remove_columns=test_columns)
pred = trainer.predict(test_encoded)

# The first element of pred.predictions holds the per-class scores; take the
# highest-scoring class id per row and map it back to the original label.
predicted_ids = np.argmax(pred.predictions[0], axis=1)
df_testOrignal['prediction'] = [categories[x] for x in predicted_ids]



Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

***** Running Prediction *****
  Num examples = 4000
  Batch size = 64


<h1>Saving</h1>

In [16]:
# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(file_save, exist_ok=True)
submission_path = file_save + '/q2q.tsv'

# Write only the pair id and the predicted label, tab-separated, without the index.
df_testOrignal[['QuestionPairID', 'prediction']].to_csv(submission_path, index=False, sep="\t")
# Read the file back and show the first rows as a sanity check.
pd.read_csv(submission_path, sep="\t").head(3)

Unnamed: 0,QuestionPairID,prediction
0,1,0
1,2,1
2,3,1


In [17]:
# End-of-notebook marker: confirms every cell above ran to completion.
print("done")

done
