## run this on local environment

In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, Value, ClassLabel, Features


https://stackoverflow.com/questions/75510487/huggingface-trainer-k-fold-cross-validation

In [2]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

In [17]:
traindf = pd.read_excel('./data/Task-2/train.xlsx')
testdf=pd.read_excel('./data/Task-2/test.xlsx')
#clean data
traindf.drop_duplicates(subset='text',inplace=True)
traindf.shape,testdf.shape,traindf['label'].value_counts()

((4326, 2),
 (1000, 2),
 label
  1    2822
 -1    1504
 Name: count, dtype: int64)

In [18]:
from imblearn.over_sampling import RandomOverSampler
#class balancing
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(traindf['text']).reshape(-1, 1), np.array(traindf['label']).reshape(-1, 1))
traindf_balance = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text', 'label'])
traindf_balance['label'].value_counts()

label
 1    2822
-1    2822
Name: count, dtype: int64

In [19]:
from datasets import concatenate_datasets

#data preprocessing
#shuffle training dataset
traindf=traindf_balance.sample(frac=1)
traindf['label'].replace(-1, 0, inplace=True)

testdf=testdf[['text']]

testds_features=Features({'text': Value(dtype='string', id = None)})
testds=Dataset.from_dict(mapping={"text": testdf['text'].to_list()},features=testds_features)


trainds_features = Features({'text': Value(dtype='string', id = None), 'label': ClassLabel(num_classes=2 ,id=None)})
trainds = Dataset.from_dict(mapping={"text": traindf['text'].to_list(), 'label': traindf['label'].to_list()},
                            features=trainds_features)
#whole training dataset
trainds_org = trainds.shuffle(seed=42)


#split
cv_fold=5
slice_ds=[]
for i in range(cv_fold):
    slice_ds.append(trainds_org.shard(num_shards=cv_fold,index=i))

In [20]:
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [21]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## modify epochs in training_args for real training

In [22]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer",num_train_epochs=3,per_device_train_batch_size=8)

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [23]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_train_org_datasets=trainds_org.map(tokenize_function, batched=True)
tokenized_test_datasets=testds.map(tokenize_function, batched=True)
#cross_validation cv_fold=5
total_f1_score=0
for i in range(cv_fold):
    valds=slice_ds[i]
    if i==0:
        trainds=concatenate_datasets(slice_ds[i+1:cv_fold])
    elif i==cv_fold-1:
        trainds=concatenate_datasets(slice_ds[0:i])
    else:
        temp=slice_ds[0:i]
        for j in range(i+1,cv_fold):
            temp.append(slice_ds[j])
        trainds=concatenate_datasets(temp)


    tokenized_train_datasets = trainds.map(tokenize_function, batched=True)
    tokenized_eval_datasets=valds.map(tokenize_function, batched=True)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_train_datasets,
        eval_dataset=tokenized_eval_datasets,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    val_result=trainer.evaluate()
    total_f1_score+=val_result['eval_f1']
#average f1 score
f1_score_cv=total_f1_score/cv_fold
f1_score_cv

Map:   0%|          | 0/5644 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4515 [00:00<?, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.402, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.1959, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.1076, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 67.9512, 'train_samples_per_second': 199.334, 'train_steps_per_second': 24.944, 'train_loss': 0.21757998508689677, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

Map:   0%|          | 0/4515 [00:00<?, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

{'loss': 0.197, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.0732, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.0381, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 67.3219, 'train_samples_per_second': 201.198, 'train_steps_per_second': 25.178, 'train_loss': 0.09641638786743524, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

Map:   0%|          | 0/4515 [00:00<?, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

{'loss': 0.1127, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.0426, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.0207, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 67.0108, 'train_samples_per_second': 202.132, 'train_steps_per_second': 25.294, 'train_loss': 0.053460440874803034, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

Map:   0%|          | 0/4515 [00:00<?, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

{'loss': 0.0619, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.0269, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.0071, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 67.0649, 'train_samples_per_second': 201.969, 'train_steps_per_second': 25.274, 'train_loss': 0.02872102724767364, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

Map:   0%|          | 0/4516 [00:00<?, ? examples/s]

Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

{'loss': 0.0627, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.014, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.0033, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 71.6764, 'train_samples_per_second': 189.016, 'train_steps_per_second': 23.648, 'train_loss': 0.02408790261344572, 'epoch': 3.0}


  0%|          | 0/141 [00:00<?, ?it/s]

0.9748444756300263

In [24]:
#train on whole training datatest
trainer_final = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_org_datasets,
    # eval_dataset=tokenized_eval_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer_final.train()
#predict on test dataset
pred=trainer_final.predict(tokenized_test_datasets)
#change logits to probability
label_probability=torch.softmax(torch.tensor(pred[0]),1)
#get labels
labels=label_probability.argmax(axis=1).reshape(-1,1)
labels[:5]

  0%|          | 0/2118 [00:00<?, ?it/s]

{'loss': 0.0329, 'learning_rate': 3.819641170915959e-05, 'epoch': 0.71}
{'loss': 0.0456, 'learning_rate': 2.639282341831917e-05, 'epoch': 1.42}
{'loss': 0.0101, 'learning_rate': 1.4589235127478753e-05, 'epoch': 2.12}
{'loss': 0.007, 'learning_rate': 2.785646836638338e-06, 'epoch': 2.83}
{'train_runtime': 89.2388, 'train_samples_per_second': 189.738, 'train_steps_per_second': 23.734, 'train_loss': 0.022603803345378634, 'epoch': 3.0}


  0%|          | 0/125 [00:00<?, ?it/s]

tensor([[0],
        [1],
        [0],
        [0],
        [0]])

In [25]:
test_to_submit=pd.read_excel('./data/Task-2/test_to-submit.xlsx')
test_to_submit['label']=labels
test_to_submit.loc[test_to_submit['label']==0,'label']=-1
# test_to_submit
test_to_submit.to_excel('./data/Task-2/test_to-submit_answers1.xlsx')


In [26]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [13]:
trainer_final.save_model('./disroberta_model')

In [5]:
new_model = AutoModelForSequenceClassification.from_pretrained("./test-trainer/disroberta_model")
tokenizer = AutoTokenizer.from_pretrained("./test-trainer/disroberta_model")
# from transformers import pipeline
from transformers import TextClassificationPipeline

model = new_model
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
# outputs a list of dicts like [[{'label': 'NEGATIVE', 'score': 0.0001223755971295759},  {'label': 'POSITIVE', 'score': 0.9998776316642761}]]
pipe("I hate this movie!")



[[{'label': 'LABEL_0', 'score': 0.9997579455375671},
  {'label': 'LABEL_1', 'score': 0.00024203682551160455}]]

In [6]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.onnxruntime import ORTQuantizer

model_checkpoint = "./test-trainer/disroberta_model"
save_directory = "./test-trainer/onnx_disroberta/"
# Load a model from transformers and export it to ONNX
ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define the quantization methodology
qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
quantizer = ORTQuantizer.from_pretrained(ort_model)
# Apply dynamic quantization on the model
quantizer.quantize(save_dir=save_directory, quantization_config=qconfig)

# Save the onnx model and tokenizer
ort_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

Framework not specified. Using pt to export to ONNX.
Using framework PyTorch: 2.1.1+cu121
  mask, torch.tensor(torch.finfo(scores.dtype).min)
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: False)
Quantizing model...
Saving quantized model at: test-trainer\onnx_disroberta (external data format: False)
Configuration saved in test-trainer\onnx_disroberta\ort_config.json


('./test-trainer/onnx_disroberta/tokenizer_config.json',
 './test-trainer/onnx_disroberta/special_tokens_map.json',
 './test-trainer/onnx_disroberta/vocab.txt',
 './test-trainer/onnx_disroberta/added_tokens.json',
 './test-trainer/onnx_disroberta/tokenizer.json')

In [9]:
new_model = ORTModelForSequenceClassification.from_pretrained("./test-trainer/onnx_disroberta")
tokenizer = AutoTokenizer.from_pretrained("./test-trainer/onnx_disroberta")
# from transformers import pipeline
from transformers import TextClassificationPipeline

model = new_model
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
# outputs a list of dicts like [[{'label': 'NEGATIVE', 'score': 0.0001223755971295759},  {'label': 'POSITIVE', 'score': 0.9998776316642761}]]
pipe("I hate this movie!")


[[{'label': 'LABEL_0', 'score': 0.9996470212936401},
  {'label': 'LABEL_1', 'score': 0.00035300874151289463}]]