## run this on local environment

In [15]:
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, Value, ClassLabel, Features


https://stackoverflow.com/questions/75510487/huggingface-trainer-k-fold-cross-validation

In [16]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

In [3]:
traindf = pd.read_excel('./data/Task-2/train.xlsx')
testdf=pd.read_excel('./data/Task-2/test.xlsx')
#clean data
traindf.drop_duplicates(subset='text',inplace=True)
traindf.shape,testdf.shape,traindf['label'].value_counts()

((4326, 2),
 (1000, 2),
 label
  1    2822
 -1    1504
 Name: count, dtype: int64)

In [4]:
from imblearn.over_sampling import RandomOverSampler
#class balancing
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(traindf['text']).reshape(-1, 1), np.array(traindf['label']).reshape(-1, 1))
traindf_balance = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text', 'label'])
traindf_balance['label'].value_counts()

label
 1    2822
-1    2822
Name: count, dtype: int64

In [5]:
from datasets import concatenate_datasets

#data preprocessing
#shuffle training dataset
traindf=traindf_balance.sample(frac=1)
traindf['label'].replace(-1, 0, inplace=True)

testdf=testdf[['text']]

testds_features=Features({'text': Value(dtype='string', id = None)})
testds=Dataset.from_dict(mapping={"text": testdf['text'].to_list()},features=testds_features)


trainds_features = Features({'text': Value(dtype='string', id = None), 'label': ClassLabel(num_classes=2 ,id=None)})
trainds = Dataset.from_dict(mapping={"text": traindf['text'].to_list(), 'label': traindf['label'].to_list()},
                            features=trainds_features)
#whole training dataset
trainds_org = trainds.shuffle(seed=42)


#split
cv_fold=5
slice_ds=[]
for i in range(cv_fold):
    slice_ds.append(trainds_org.shard(num_shards=cv_fold,index=i))

In [6]:
from transformers import AutoTokenizer
checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## modify epochs in training_args for real training

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer",num_train_epochs=3,per_device_train_batch_size=8)

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_train_org_datasets=trainds_org.map(tokenize_function, batched=True)
tokenized_test_datasets=testds.map(tokenize_function, batched=True)
#cross_validation cv_fold=5
total_f1_score=0
for i in range(cv_fold):
    valds=slice_ds[i]
    if i==0:
        trainds=concatenate_datasets(slice_ds[i+1:cv_fold])
    elif i==cv_fold-1:
        trainds=concatenate_datasets(slice_ds[0:i])
    else:
        temp=slice_ds[0:i]
        for j in range(i+1,cv_fold):
            temp.append(slice_ds[j])
        trainds=concatenate_datasets(temp)


    tokenized_train_datasets = trainds.map(tokenize_function, batched=True)
    tokenized_eval_datasets=valds.map(tokenize_function, batched=True)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_train_datasets,
        eval_dataset=tokenized_eval_datasets,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    val_result=trainer.evaluate()
    total_f1_score+=val_result['eval_f1']
#average f1 score
f1_score_cv=total_f1_score/cv_fold
f1_score_cv

Map:   0%|          | 0/5644 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4515 [00:00<?, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.4329, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.2585, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.1688, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 78.4626, 'train_samples_per_second': 172.63, 'train_steps_per_second': 21.603, 'train_loss': 0.2694843551050597, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

Map:   0%|          | 0/4515 [00:00<?, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

{'loss': 0.25, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.1728, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.0813, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 78.0823, 'train_samples_per_second': 173.471, 'train_steps_per_second': 21.708, 'train_loss': 0.15746989461172997, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

Map:   0%|          | 0/4515 [00:00<?, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

{'loss': 0.1648, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.1229, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.0479, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 76.9315, 'train_samples_per_second': 176.066, 'train_steps_per_second': 22.033, 'train_loss': 0.10276127677399852, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

Map:   0%|          | 0/4515 [00:00<?, ? examples/s]

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

{'loss': 0.1209, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.0589, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.0386, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 77.0605, 'train_samples_per_second': 175.771, 'train_steps_per_second': 21.996, 'train_loss': 0.06645787402240225, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

Map:   0%|          | 0/4516 [00:00<?, ? examples/s]

Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

  0%|          | 0/1695 [00:00<?, ?it/s]

{'loss': 0.0813, 'learning_rate': 3.5250737463126844e-05, 'epoch': 0.88}
{'loss': 0.063, 'learning_rate': 2.0501474926253688e-05, 'epoch': 1.77}
{'loss': 0.0195, 'learning_rate': 5.752212389380531e-06, 'epoch': 2.65}
{'train_runtime': 77.9104, 'train_samples_per_second': 173.892, 'train_steps_per_second': 21.756, 'train_loss': 0.04831502378030864, 'epoch': 3.0}


  0%|          | 0/141 [00:00<?, ?it/s]

0.9757375536576518

In [10]:
#train on whole training datatest
trainer_final = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_org_datasets,
    # eval_dataset=tokenized_eval_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer_final.train()
#predict on test dataset
pred=trainer_final.predict(tokenized_test_datasets)
#change logits to probability
label_probability=torch.softmax(torch.tensor(pred[0]),1)
#get labels
labels=label_probability.argmax(axis=1).reshape(-1,1)
labels[:5]

  0%|          | 0/2118 [00:00<?, ?it/s]

{'loss': 0.0704, 'learning_rate': 3.819641170915959e-05, 'epoch': 0.71}
{'loss': 0.0559, 'learning_rate': 2.639282341831917e-05, 'epoch': 1.42}
{'loss': 0.0292, 'learning_rate': 1.4589235127478753e-05, 'epoch': 2.12}
{'loss': 0.0139, 'learning_rate': 2.785646836638338e-06, 'epoch': 2.83}
{'train_runtime': 98.0564, 'train_samples_per_second': 172.676, 'train_steps_per_second': 21.6, 'train_loss': 0.04069640661209006, 'epoch': 3.0}


  0%|          | 0/125 [00:00<?, ?it/s]

tensor([[0],
        [1],
        [0],
        [0],
        [0]])

In [None]:
test_to_submit=pd.read_excel('./data/Task-2/test_to-submit.xlsx')
test_to_submit['label']=labels
test_to_submit.loc[test_to_submit['label']==0,'label']=-1
# test_to_submit
test_to_submit.to_excel('./data/Task-2/test_to-submit_answers1.xlsx')


In [12]:
type(model)

transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification

In [31]:
trainer_final.save_model('./roberta_model')

In [34]:
new_model = AutoModelForSequenceClassification.from_pretrained("./roberta_model", )
# from transformers import pipeline
from transformers import TextClassificationPipeline

model = new_model
# tokenizer = tokenizer
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
# outputs a list of dicts like [[{'label': 'NEGATIVE', 'score': 0.0001223755971295759},  {'label': 'POSITIVE', 'score': 0.9998776316642761}]]
pipe("I hate this movie!")

[[{'label': 'LABEL_0', 'score': 0.9999197721481323},
  {'label': 'LABEL_1', 'score': 8.018474909476936e-05}]]