In [1]:
# pip install shiba shiba-model evaluate datasets wandb arabert  accelerate -U nltk torchmetrics==0.3.2

In [2]:
import os
from typing import Dict

import numpy as np
import pandas as pd
import torch
import torchmetrics
import transformers
from datasets import load_dataset, Dataset
from transformers import HfArgumentParser, Trainer, EvalPrediction

from shiba import ShibaForRegression, CodepointTokenizer
from training.helpers import DataArguments, get_base_shiba_state_dict,get_model_hyperparams, ShibaClassificationArgs, \
    ClassificationDataCollator

<h1>Choose the model</h1>

In [16]:
# --- Run configuration: everything a reader may want to tune lives here ---

# Pretrained checkpoint whose base weights are loaded into the regression model.
model_path = '../checkpoint-611960.pt'

# Folder that receives the submission file written at the end of the notebook.
file_save = 'Submit_64_05'

# AraBERT preprocessing of the tweets: master switch, the model name handed to
# the preprocessor, and whether Farasa segmentation is applied.
seg_enable = True
bert_model_name = "aubmindlab/bert-base-arabertv02"
apply_farasa = False

<h1>Read the files</h1>

In [17]:
# Switch transformers logging to INFO verbosity for the rest of the session.
transformers.logging.set_verbosity_info()

# NOTE(review): `device` and `parser` are not referenced again in the visible
# notebook; they are kept so that any code relying on them keeps working.
device = "cuda"
parser = HfArgumentParser((ShibaClassificationArgs, DataArguments))

# Column that holds the regression target in the train/dev files.
prediction_label = 'Intensity Score'

# Train and dev splits: tab-separated text files wrapped as `datasets.Dataset`.
df_train = Dataset.from_pandas(pd.read_csv("data/2018-Valence-reg-Ar-train.txt", sep="\t"))
df_dev = Dataset.from_pandas(pd.read_csv("data/2018-Valence-reg-Ar-dev.txt", sep="\t"))

# The test split is kept twice: the pandas frame (`df_testOrignal`) later
# receives the predictions, while the Dataset copy is what gets tokenised.
df_testOrignal = pd.read_csv("data/vreg_no_labels_v1.0.tsv", sep="\t")
df_test = Dataset.from_pandas(df_testOrignal)

<h1>Check files count</h1>

In [18]:
# Number of rows in the (train, dev, test) splits, shown as the cell output.
tuple(len(split) for split in (df_train, df_dev, df_test))

(932, 138, 1000)

# Pre-process data (if needed)

In [19]:
if seg_enable:
    # Imported here because arabert is only needed when preprocessing is on.
    from arabert.preprocess import ArabertPreprocessor

    arabert_prep = ArabertPreprocessor(
        model_name=bert_model_name,
        apply_farasa_segmentation=apply_farasa,
    )

    # Same treatment for every split: Dataset -> DataFrame, run the AraBERT
    # preprocessor over the 'Tweet' column, then back to a Dataset.
    cleaned_splits = []
    for split in (df_train, df_dev, df_test):
        frame = pd.DataFrame(split)
        frame['Tweet'] = frame['Tweet'].apply(arabert_prep.preprocess)
        cleaned_splits.append(Dataset.from_pandas(frame))

    # NOTE(review): the splits are rebound in place, so re-running this cell
    # preprocesses text that has already been preprocessed.
    df_train, df_dev, df_test = cleaned_splits

<h1>Load the model</h1>

In [20]:
# Tokenizer used by the process_example helpers to turn tweets into input ids.
tokenizer = CodepointTokenizer()

# Hyperparameters the regression model is constructed with.
# NOTE(review): dropout here (0.1) differs from the `dropout=0.5` passed to
# ShibaClassificationArgs further down; only this dict is handed to the model
# constructor in this notebook — confirm which value is intended.
model_hyperparams = {'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}
print(model_hyperparams)
model = ShibaForRegression(**model_hyperparams)
data_collator = ClassificationDataCollator()

print('Loading and using base shiba states from', model_path)
# map_location='cpu': without it torch.load restores every tensor onto the
# device it was saved from, which fails on a machine lacking that device and
# otherwise keeps a second copy of the weights on the GPU. load_state_dict
# copies the values into the model's own parameters, so loading on CPU is
# sufficient.
# SECURITY: torch.load unpickles the file — only load checkpoints you trust.
checkpoint_state_dict = torch.load(model_path, map_location='cpu')
model.shiba_model.load_state_dict(get_base_shiba_state_dict(checkpoint_state_dict))

{'dropout': 0.1, 'deep_transformer_stack_layers': 12, 'local_attention_window': 128}




Loading and using base shiba states from ../checkpoint-611960.pt


<All keys matched successfully>

<h1>Input IDs Method</h1>

In [21]:
def process_example(example: Dict) -> Dict:
    """Convert one labelled row into the features the model is fed.

    The row's 'Tweet' text is encoded with the notebook-level ``tokenizer``
    and at most the first 2048 input ids are kept; the target is read from
    the column named by ``prediction_label``.

    Returns:
        A dict with the keys 'input_ids' and 'labels'.
    """
    encoded = tokenizer.encode(example['Tweet'])
    truncated_ids = encoded['input_ids'][:2048]
    target = example[prediction_label]
    return {'input_ids': truncated_ids, 'labels': target}
def process_exampleTemp(example: Dict) -> Dict:
    """Convert one unlabelled row into model features with a dummy label.

    Encodes the row's 'Tweet' text with the notebook-level ``tokenizer``,
    keeps at most the first 2048 input ids, and sets 'labels' to the
    constant 0 as a placeholder.

    Returns:
        A dict with the keys 'input_ids' and 'labels'.
    """
    encoded = tokenizer.encode(example['Tweet'])
    truncated_ids = encoded['input_ids'][:2048]
    return {'input_ids': truncated_ids, 'labels': 0}

<h1>Compute Metrics</h1>

In [22]:
def compute_metrics(pred: EvalPrediction) -> Dict:
    """Compute regression metrics for one evaluation pass.

    Args:
        pred: Evaluation output. ``pred.predictions`` is unpacked as a pair
            (regression outputs, embeddings); ``pred.label_ids`` holds the
            gold scores.

    Returns:
        A dict with 'mse' (mean squared error) and 'mee' (mean absolute
        error; the key name is kept as-is) as Python floats.

    Raises:
        NotImplementedError: if the metrics cannot be computed. Diagnostics
            are printed first and the original exception is chained as
            ``__cause__``.
    """
    # Pre-bind the names the error handler prints. If unpacking
    # `pred.predictions` fails they would otherwise be unbound, and the
    # handler itself would die with UnboundLocalError, hiding the real error.
    label_probs = None
    labels = None
    try:
        # Only the regression outputs are scored; the embeddings are ignored.
        label_probs, _embeddings = pred.predictions

        labels = torch.tensor(pred.label_ids)
        # Flatten the outputs to one value per example before comparing.
        label_probs = torch.tensor(label_probs).view(-1)
        mse = torchmetrics.functional.mean_squared_error(label_probs, labels)
        mee = torchmetrics.functional.mean_absolute_error(label_probs, labels)

        return {
            'mse': mse.item(),
            'mee': mee.item(),
        }
    except Exception as exc:  # not a bare except: let KeyboardInterrupt/SystemExit through
        print("pred : ", pred)
        print("pred.predictions : ", getattr(pred, "predictions", None))
        print("label_probs : ", label_probs)
        print("labels : ", labels)

        # Same exception type as before, now carrying the root cause.
        raise NotImplementedError("compute_metrics failed; see the chained exception") from exc

<h1>Fine-tune args</h1>

In [23]:
# Fine-tuning arguments, grouped by purpose. Keyword order does not matter.
training_args = ShibaClassificationArgs(
    # Output location / run labelling.
    output_dir="fine_result",
    run_name="fine_result",
    report_to=[],
    save_strategy='no',
    # Reproducibility.
    seed=42,
    data_seed=42,
    # Phases to run.
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Training schedule.
    num_train_epochs=10,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,
    # NOTE(review): the model in this notebook is constructed from a separate
    # `model_hyperparams` dict (dropout 0.1); whether this value reaches the
    # model is not visible here — confirm.
    dropout=0.5,
    # Evaluation cadence and behaviour.
    evaluation_strategy='steps',
    eval_steps=100,
    eval_delay=0,
    per_device_eval_batch_size=64,
    eval_accumulation_steps=None,
    prediction_loss_only=False,
)

PyTorch: setting up devices


<h1>Set up the trainer</h1>

In [24]:
# When segmentation/preprocessing is enabled the metrics callback is dropped,
# so the Trainer built in the next cell receives compute_metrics=None.
# NOTE(review): this rebinds the function's own name; after this cell runs,
# the cell defining compute_metrics must be re-executed to get the function
# back — confirm this is intentional.
if seg_enable:
    compute_metrics = None

In [25]:
# Tokenise the train and dev splits. The original columns are removed so only
# the keys returned by process_example remain in each mapped dataset.
train_features = df_train.map(process_example, remove_columns=list(df_train[0].keys()))
eval_features = df_dev.map(process_example, remove_columns=list(df_dev[0].keys()))

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_features,
    eval_dataset=eval_features,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/932 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Using auto half precision backend


<h1>Train</h1>

In [None]:
# Run fine-tuning; whatever trainer.train() returns is kept in `training`.
training = trainer.train()

***** Running training *****
  Num examples = 932
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 150
  Number of trainable parameters = 120,766,465


Step,Training Loss,Validation Loss


<h1>Prediction</h1>

In [None]:
# Tokenise the unlabelled test split (placeholder labels) and run inference.
test_features = df_test.map(process_exampleTemp, remove_columns=list(df_test[0].keys()))
pred = trainer.predict(test_features)

# The first element of pred.predictions is taken as the per-example outputs;
# the first entry of each row is stored as that example's prediction on the
# original pandas test frame.
regression_rows = pred.predictions[0]
df_testOrignal['prediction'] = [row[0] for row in regression_rows]

<h1>Saving</h1>

In [None]:
# Create the submission folder first: DataFrame.to_csv does not create missing
# parent directories, so on a fresh checkout the write below would fail.
os.makedirs(file_save, exist_ok=True)
submission_path = file_save+'/v_reg.tsv'

# Write only the ID and prediction columns as a tab-separated file.
df_testOrignal[['ID', 'prediction']].to_csv(submission_path, index=False, sep="\t")

# Read the file back and show the first rows as a sanity check.
pd.read_csv(submission_path, sep="\t").head(3)

In [None]:
#------------------------------
# Add this class to model.py
#------------------------------
# class ShibaForRegression(ShibaForTask):
#     def __init__(self, **kwargs):
#         super(ShibaForRegression, self).__init__(**kwargs)
#         self.regression_layer = torch.nn.Linear(self.shiba_model.config.hidden_size, 1)  # Output is a single continuous value
#         self.dropout = torch.nn.Dropout(p=self.shiba_model.config.dropout)
#         self.loss = torch.nn.MSELoss()  # Use Mean Squared Error (MSE) as the regression loss

#     def forward(self, input_ids: torch.Tensor, labels: Optional[torch.Tensor],
#                 attention_mask: torch.Tensor) -> Tuple:
#         embeddings = self.shiba_model(input_ids, attention_mask, None)['embeddings']

#         regression_input = self.dropout(embeddings[:, 0, :])  # Apply dropout to the input
#         regression_output = self.regression_layer(regression_input)  # Assuming you want to predict a single value for the entire sequence

#         output = {
#             'embeddings': embeddings,
#             'regression_output': regression_output
#         }

#         if labels is not None:
#             loss = self.loss(regression_output, labels.view(-1, 1))  # Use Mean Squared Error (MSE) as the regression loss
#             output['loss'] = loss

#         return output.get('loss', None), output['regression_output'], output['embeddings']
