In [3]:
# pip install shiba shiba-model evaluate datasets wandb arabert  accelerate -U nltk torchmetrics==0.3.2

In [1]:
from typing import Dict
import pandas as pd
import numpy as np
import torchmetrics
import torch
import transformers
from datasets import load_dataset, Dataset
from transformers import HfArgumentParser, Trainer, EvalPrediction
from sklearn.metrics import jaccard_score

from shiba import ShibaForClassificationD, CodepointTokenizer
from training.helpers import DataArguments, get_base_shiba_state_dict,get_model_hyperparams, ShibaClassificationArgs, \
    ClassificationDataCollator

<h1> Choose the model

In [2]:
model_path = '../checkpoint-611960.pt'
seg_enable = True
bert_model_name = "aubmindlab/bert-base-arabertv02"
apply_farasa=False
file_save = 'SEC'
batch = 8
drop_it = 0.3
num_train_epochs = 10
conf = str(drop_it)+"_"+str(batch)

# Pre-Process data ( if needed)

<h1> Compute Metrics

In [3]:
compute_metrics = lambda p: {'jaccard_similarity': jaccard_score(p.predictions[0] > 0.5, p.label_ids, average="macro")}

In [4]:
transformers.logging.set_verbosity_info()
device = "cuda"
parser = HfArgumentParser((ShibaClassificationArgs, DataArguments))


df_testOrignal = pd.read_csv("data/emotion_no_labels_v1.0.tsv", sep="\t")

df_train = pd.read_csv("data/2018-E-c-Ar-train.txt", sep="\t")
df_dev = pd.read_csv("data/2018-E-c-Ar-dev.txt", sep="\t")
df_test = df_testOrignal.copy()



In [5]:
df_train[df_train.columns[2:]].iloc[0].values

array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0])

In [6]:
transformers.logging.set_verbosity_info()
device = "cuda"
parser = HfArgumentParser((ShibaClassificationArgs, DataArguments))
df_testOrignal = pd.read_csv("data/emotion_no_labels_v1.0.tsv", sep="\t")
df_train = pd.read_csv("data/2018-E-c-Ar-train.txt", sep="\t")
df_dev = pd.read_csv("data/2018-E-c-Ar-dev.txt", sep="\t")
df_test = df_testOrignal.copy()
if seg_enable:
    from arabert.preprocess import ArabertPreprocessor
    arabert_prep = ArabertPreprocessor(model_name=bert_model_name,apply_farasa_segmentation=apply_farasa)
    df_train['Tweet'] =  df_train['Tweet'].apply(arabert_prep.preprocess)
    df_dev['Tweet'] =  df_dev['Tweet'].apply(arabert_prep.preprocess)
    df_test['Tweet'] =  df_test['Tweet'].apply(arabert_prep.preprocess)
prediction_label = df_train.columns[2:]
print(prediction_label)
def process_example(example: Dict) -> Dict:
    return {
        'input_ids': tokenizer.encode(example['Tweet'])['input_ids'][:model.config.max_length],
        'labels': [float(example[label]) for label in prediction_label]
    }
def process_exampleTemp(example: Dict) -> Dict:
    return {
        'input_ids': tokenizer.encode(example['Tweet'])['input_ids'][:model.config.max_length],
        'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    }

df_train = Dataset.from_pandas(df_train)
df_dev = Dataset.from_pandas(df_dev)
df_test = Dataset.from_pandas(df_test)
tokenizer = CodepointTokenizer()
model_hyperparams = {'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}
print(model_hyperparams)
model = ShibaForClassificationD(vocab_size=len(prediction_label), **model_hyperparams)
data_collator = ClassificationDataCollator()
print('Loading and using base shiba states from', model_path)
checkpoint_state_dict = torch.load(model_path)
model.shiba_model.load_state_dict(get_base_shiba_state_dict(checkpoint_state_dict))
training_args = ShibaClassificationArgs(
    per_device_eval_batch_size=batch,
    per_device_train_batch_size=batch,
    data_seed=42,
    seed=42,
    do_eval=True,
    do_predict=True,
    do_train=True,
    dropout=drop_it,
    eval_accumulation_steps=None,
    eval_delay=0,
    eval_steps=100,
    evaluation_strategy='steps',
    gradient_accumulation_steps=1,
    num_train_epochs=num_train_epochs,
    output_dir="fine_result",
    prediction_loss_only=False,
    report_to=[],
    run_name="fine_result",
    save_strategy='no',
)

Index(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism',
       'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')
{'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}




Loading and using base shiba states from ../checkpoint-611960.pt


Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


In [7]:
prediction_label

Index(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism',
       'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

In [8]:
# print(all_data)
trainer = Trainer(model=model,
                args=training_args,
                data_collator=data_collator,
                train_dataset=df_train.map(process_example, remove_columns=list(df_train[0].keys())),
                eval_dataset=df_dev.map(process_example, remove_columns=list(df_dev[0].keys())),
                compute_metrics=compute_metrics,
                )


Map:   0%|          | 0/2278 [00:00<?, ? examples/s]

Map:   0%|          | 0/585 [00:00<?, ? examples/s]

In [9]:
training = trainer.train()

***** Running training *****
  Num examples = 2,278
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2,850
  Number of trainable parameters = 120,774,155


Step,Training Loss,Validation Loss,Jaccard Similarity
100,0.5364,0.476198,0.0
200,0.4493,0.420427,0.024945
300,0.3948,0.390726,0.188934
400,0.364,0.377598,0.207948
500,0.3394,0.362991,0.220805
600,0.3252,0.351737,0.255473
700,0.2906,0.358053,0.272933
800,0.2778,0.352903,0.275865
900,0.2773,0.348103,0.283563
1000,0.2437,0.347885,0.277567


***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  Num examples = 585
  Batch size = 8
***** Running Evaluation *****
  N

In [10]:
pred = trainer.predict(df_test.map(process_exampleTemp, remove_columns=list(df_test[0].keys())))




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
df_save = pd.DataFrame(data=pred.predictions[0] > 0.5, columns=prediction_label, index=df_testOrignal["ID"]).astype(int)
df_save.reset_index(inplace=True)
df_save

Unnamed: 0,ID,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,17439,0,0,0,0,0,0,0,0,1,0,0
1,10196,0,0,0,0,0,0,0,1,1,0,0
2,17470,1,0,0,0,0,0,0,1,1,0,0
3,16262,1,0,0,0,0,0,0,0,0,0,0
4,13597,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,6877,1,0,1,0,0,0,0,0,0,0,0
996,10056,0,0,0,1,0,0,0,0,0,0,0
997,16364,0,0,0,0,0,0,0,0,0,0,0
998,5406,1,0,0,0,0,0,0,0,0,0,0


<h1> Saving

In [12]:
df_save.to_csv(file_save+'/E_c.tsv'+conf, index=False, sep="\t")
print(file_save+'/E_c.tsv'+conf)
pd.read_csv(file_save+'/E_c.tsv'+conf, sep="\t").head(3)

SEC/E_c.tsv0.3_8


Unnamed: 0,ID,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,17439,0,0,0,0,0,0,0,0,1,0,0
1,10196,0,0,0,0,0,0,0,1,1,0,0
2,17470,1,0,0,0,0,0,0,1,1,0,0


In [None]:
# class ShibaForClassificationD(ShibaForTask):
#     def __init__(self, vocab_size: int, **kwargs):
#         super(ShibaForClassificationD, self).__init__(**kwargs)
#         self.vocab_size = vocab_size
#         self.config = self.shiba_model.config
#         self.config.vocab_size = self.vocab_size
#         self.label_layer = torch.nn.Linear(self.shiba_model.config.hidden_size, self.vocab_size)
#         self.dropout = torch.nn.Dropout(p=self.shiba_model.config.dropout)

#         self.loss = torch.nn.BCEWithLogitsLoss()

#     def forward(self, input_ids: torch.Tensor, labels: Optional[torch.Tensor],
#                 attention_mask: torch.Tensor) -> Tuple:
#         cls_embeddings = self.shiba_model(input_ids, attention_mask, None)['embeddings'][:, 0, :]
#         class_hidden_states = self.label_layer(self.dropout(cls_embeddings))

#         output = {
#             'cls_embeddings': cls_embeddings,
#             'class_probs': class_hidden_states  # Note: no log_softmax here for BCEWithLogitsLoss
#         }

#         if labels is not None:
#             output['loss'] = self.loss(class_hidden_states, labels)

#         return output.get('loss', None), output['class_probs'], output['cls_embeddings']
