In [1]:
import os
from typing import Dict
import pandas as pd
import torch
import torchmetrics

import transformers
from datasets import load_dataset, Dataset
from transformers import HfArgumentParser, Trainer, EvalPrediction, BertForSequenceClassification, AutoTokenizer, \
    DataCollatorWithPadding

from shiba.helpers import DataArguments, get_model_hyperparams, ShibaClassificationArgs, \
    ClassificationDataCollator, get_base_shiba_state_dict
from shiba import ShibaForClassification, CodepointTokenizer

from nltk.corpus import stopwords
import string
import re
import evaluate
import numpy as np
import pickle
metric = evaluate.load("f1")
from sklearn import model_selection

In [2]:
transformers.logging.set_verbosity_info()
# Use the GPU when one is present; otherwise fall back to CPU instead of
# failing on the first `.to(device)` call.
device = "cuda" if torch.cuda.is_available() else "cpu"
parser = HfArgumentParser((ShibaClassificationArgs, DataArguments))

# training_args, data_args = parser.parse_args_into_dataclasses()
# SECURITY: pickle.load can execute arbitrary code from the file; only open a
# training_args.pkl that you produced yourself.
with open('training_args.pkl', 'rb') as f:
    training_args = pickle.load(f)

print("training_args : ", training_args)
# Override the pickled schedule / batch settings for this run.
training_args.logging_dir = training_args.output_dir
training_args.save_steps = 50
training_args.logging_steps = 50
training_args.eval_steps = 50
training_args.num_train_epochs = 10
training_args.per_device_train_batch_size = 64
training_args.per_device_eval_batch_size = 64


def _read_osact_split(path: str) -> pd.DataFrame:
    """Read one tab-separated OSACT2020 file and keep the 'Feed' and 'Sentiment' columns."""
    frame = pd.read_csv(
        path,
        sep="\t",
        quotechar='▁',
        header=None,
        names=["Feed", "Sentiment", "offensive"],
        # The C parser does not support a multi-byte quotechar; pandas would
        # fall back to the python engine with a ParserWarning, so request it.
        engine="python",
    )
    return frame[['Feed', 'Sentiment']]


df_train = _read_osact_split("../data/OSACT2020-sharedTask-train.txt")
df_dev = _read_osact_split("../data/OSACT2020-sharedTask-dev.txt")
# df_test = pd.read_csv("../data/tweets_v1.0.txt", sep="\t", quotechar='▁', header=None, names=["Feed"])

# Sort the label set before numbering it: the iteration order of a set of
# strings can differ between interpreter runs, which would silently reassign
# class ids from one kernel session to the next.
categories = dict(enumerate(sorted(set(df_train['Sentiment']))))
id_by_category = {val: key for key, val in categories.items()}

print("categories : ", categories)
print("id_by_category : ", id_by_category)

tokenizer = CodepointTokenizer()
model_hyperparams = get_model_hyperparams(training_args)
model = ShibaForClassification(vocab_size=len(categories), **model_hyperparams).to(device)
data_collator = ClassificationDataCollator()

if training_args.resume_from_checkpoint:
    print('Loading and using base shiba states from', training_args.resume_from_checkpoint)
    # map_location keeps the load working when the checkpoint was saved on a
    # GPU but this session runs on CPU.
    checkpoint_state_dict = torch.load(training_args.resume_from_checkpoint, map_location=device)
    model.shiba_model.load_state_dict(get_base_shiba_state_dict(checkpoint_state_dict))

def process_example(example: Dict) -> Dict:
    """Convert one raw row into the fields fed to the model.

    The 'Feed' text is encoded with the module-level ``tokenizer`` and the
    resulting ids are cut to ``model.config.max_length``; the 'Sentiment'
    value is translated to its integer id through ``id_by_category``.
    """
    encoded = tokenizer.encode(example['Feed'])
    token_ids = encoded['input_ids'][:model.config.max_length]
    label_id = id_by_category[example['Sentiment']]
    return {'input_ids': token_ids, 'labels': label_id}
def process_exampleTemp(example: Dict) -> Dict:
    """Encode the 'Feed' text of one row, without attaching a label.

    Note: a later cell defines a function with the same name that also emits
    a 'labels' entry; whichever definition ran last is the one in effect.
    """
    encoded = tokenizer.encode(example['Feed'])
    token_ids = encoded['input_ids'][:model.config.max_length]
    return {'input_ids': token_ids}
def compute_metrics(pred: "EvalPrediction") -> Dict:
    """Compute macro-averaged F1, recall and precision for one evaluation pass.

    Args:
        pred: Evaluation output. ``pred.predictions`` must unpack into a
            ``(label scores, embeddings)`` pair and ``pred.label_ids`` holds
            the reference class ids. The scores are exponentiated before
            scoring (the original comment says they come from a log-softmax).

    Returns:
        Dict with the float entries ``f1_score``, ``recall`` and ``precision``.

    Raises:
        Exception: any error raised while computing the metrics is re-raised
            unchanged after diagnostic information has been printed.
    """
    # Pre-bind the names the diagnostic handler prints, so a failure on the
    # very first line cannot turn into an unrelated unbound-variable error.
    label_probs = labels = None
    try:
        # The second element (embeddings) is not needed for the metrics.
        label_probs, _embeddings = pred.predictions
        labels = torch.tensor(pred.label_ids)
        label_probs = torch.exp(torch.tensor(label_probs))  # undo the log in log softmax

        num_classes = len(categories)
        f1_score = torchmetrics.functional.f1(
            label_probs, labels, average='macro', num_classes=num_classes)
        recall = torchmetrics.functional.recall(
            label_probs, labels, average='macro', num_classes=num_classes)
        precision = torchmetrics.functional.precision(
            label_probs, labels, average='macro', num_classes=num_classes)

        return {
            'f1_score': f1_score.item(),
            'recall': recall.item(),
            'precision': precision.item(),
        }
    except Exception:
        # Dump whatever is available, using accessors that cannot themselves
        # fail (`.shape` exists on both NumPy arrays and tensors, unlike a
        # callable `.size()`), then let the original exception propagate.
        print("pred : ", pred)
        print("pred.predictions : ", getattr(pred, "predictions", None))
        print("label_probs : ", label_probs)
        print("label_probs.shape : ", getattr(label_probs, "shape", None))
        print("labels : ", labels)
        print("labels.shape : ", getattr(labels, "shape", None))
        raise


# Environment variable read by the Weights & Biases logging integration.
os.environ.update({'WANDB_PROJECT': 'shiba'})
# Rebind both pandas frames to Hugging Face Dataset objects (same names).
df_train, df_dev = map(Dataset.from_pandas, (df_train, df_dev))





training_args :  ShibaClassificationArgs(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deep_transformer_stack_layers=12,
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=True,
do_train=False,
dropout=0.1,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=300,
evaluation_strategy=steps,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_l

  df_train = pd.read_csv("../data/OSACT2020-sharedTask-train.txt", sep="\t", quotechar='▁', header=None, names=["Feed", "Sentiment", "offensive"])[['Feed','Sentiment']]
  df_dev = pd.read_csv("../data/OSACT2020-sharedTask-dev.txt", sep="\t", quotechar='▁', header=None, names=["Feed", "Sentiment", "offensive"])[['Feed','Sentiment']]


Loading and using base shiba states from ../checkpoint-63528.pt


In [3]:
# Row count of the training Dataset.
df_train.num_rows

7000

In [4]:
# First 6000 rows for training, the remainder held out. The slices are taken
# in file order (no shuffling) and are rebuilt into Dataset objects in a
# later cell via Dataset.from_dict.
df_train1, df_test = df_train[:6000], df_train[6000:]

In [5]:
# Sanity check: df_train was rebound from a pandas frame to a Dataset earlier.
type(df_train)

datasets.arrow_dataset.Dataset

In [6]:
from datasets import Dataset

# Wrap the sliced column data back into Dataset objects, reusing the names.
df_train1, df_test = (Dataset.from_dict(columns) for columns in (df_train1, df_test))

print(type(df_train1))

<class 'datasets.arrow_dataset.Dataset'>


In [10]:
# print(all_data)
# Build the Trainer on the tokenised train/dev splits.
# `column_names` lists the raw columns to drop without indexing row 0, so an
# empty split no longer raises IndexError before the Trainer is even created.
# NOTE(review): the recorded AttributeError ('deepspeed_plugin') originates in
# `training_args`, presumably because it was unpickled under a different
# transformers version than the one that created it — confirm and re-create
# the arguments if so.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=df_train1.map(process_example, remove_columns=df_train1.column_names),
    eval_dataset=df_dev.map(process_example, remove_columns=df_dev.column_names),
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

AttributeError: 'ShibaClassificationArgs' object has no attribute 'deepspeed_plugin'

In [9]:
# Run fine-tuning; requires the cell that constructs `trainer` to have
# succeeded first (the recorded NameError shows it had not).
training = trainer.train()

NameError: name 'trainer' is not defined

In [9]:
# NOTE(review): `from_pandas` expects a pandas DataFrame, but the cell marked
# In [6] rebinds `df_test` to a Dataset — confirm which `df_test` this cell is
# meant to receive (possibly the commented-out tweets_v1.0 frame).
df_testtmp = Dataset.from_pandas(df_test)

In [10]:

def process_exampleTemp(example: Dict) -> Dict:
    """Encode one unlabeled row for prediction.

    The 'Feed' text is encoded with the module-level ``tokenizer`` and cut to
    ``model.config.max_length``; 'labels' is always the constant 0.
    """
    encoded = tokenizer.encode(example['Feed'])
    token_ids = encoded['input_ids'][:model.config.max_length]
    return {'input_ids': token_ids, 'labels': 0}
# Tokenise the unlabeled rows and run inference. The raw columns are dropped
# via `column_names` rather than `df_testtmp[0].keys()`, which raises
# IndexError when the dataset is empty.
what = trainer.predict(df_testtmp.map(process_exampleTemp, remove_columns=df_testtmp.column_names))

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

***** Running Prediction *****
  Num examples = 1000
  Batch size = 64


In [None]:
# Display the dev split (this cell has no recorded execution).
df_dev

In [11]:
import numpy as np

# Take the arg-max class per row from the first prediction output and convert
# it to a label string.
# NOTE(review): this id-to-name mapping is hard-coded and independent of the
# `categories` dict built from the training data — confirm the two agree.
df_test['Offensive'] = list(
    map({0: 'HS', 1: 'NOT_HS'}.__getitem__, np.argmax(what.predictions[0], axis=1))
)



In [12]:
# Write the labeled rows, then read the file back as a quick visual check.
df_test.to_csv('tweets_v2.0_offensive.csv', sep="\t", quotechar='▁', index=False)
# The C parser does not support a multi-byte quotechar, so pandas would fall
# back to the python engine with a ParserWarning; request that engine directly.
pd.read_csv("tweets_v2.0_offensive.csv", sep="\t", quotechar='▁', engine="python")

  pd.read_csv("tweets_v2.0_offensive.csv", sep="\t", quotechar='▁')


Unnamed: 0,Feed,Offensive
0,أود أن أعلمكم أن التعليق المنشور هنا باسم نور ...,NOT_HS
1,مافيه فرق بين احمد جبريل والعاهره المستأجره,NOT_HS
2,اذا نطق السفية فلا تجبة لانة سفية وقليل الادب ...,HS
3,اعتقد حضرتك تدعو لمؤتمر دولى للحوار للسلمي مع ...,HS
4,يسرني في المركز الموريتاني لقياس الراي العام ا...,NOT_HS
...,...,...
995,رحت ع الجامعه لقيتها مسكرة -نمزيت,NOT_HS
996,تجميع وليس تصنيع، وإعادة تصدير وليس تصدير، اقت...,NOT_HS
997,احلى اشي لما تكون مقتنع بشغلك وعملك هيك ما بتح...,NOT_HS
998,لو عملت مقارنة بسيطة بين داعش والحشد الطائفي ل...,HS
