In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments, Trainer
import numpy as np
import evaluate
from sklearn.metrics import classification_report

# NOTE(review): nothing in the visible cells reads this name, so the assignment
# has no effect here; presumably it was meant to be passed as a keyword
# argument somewhere. Confirm the intent or remove it.
no_deprecation_warning=True

  from .autonotebook import tqdm as notebook_tqdm
2023-06-21 16:48:27.659172: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# The tokenizer and the classifier are both initialised from the same checkpoint.
base_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Load the "convai2_inferred" configuration of the "md_gender_bias" dataset.
dataset = load_dataset("md_gender_bias", "convai2_inferred")

# Sequence-classification model configured for three output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=3,
)

Found cached dataset md_gender_bias (/home/jupyter/.cache/huggingface/datasets/md_gender_bias/convai2_inferred/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05)
100% 3/3 [00:00<00:00, 382.85it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initial

In [3]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are padded with the "max_length" strategy and truncated.
    Only the tokenizer's output is returned; no label field is added here.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Tokenizing the datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Adding the 'labels' field to the tokenized_datasets (a copy of 'ternary_label').
# batched=True hands whole column slices to the lambda instead of invoking it
# once per row; the resulting 'labels' column is the same.
tokenized_datasets = tokenized_datasets.map(
    lambda examples: {'labels': examples['ternary_label']},
    batched=True,
)

# Fixed-seed shuffle, then keep the first 1000 rows of each split
# (presumably to keep the fine-tuning run short).
small_train_dataset = tokenized_datasets["train"].shuffle(seed=66).select(range(1000))
# NOTE(review): the evaluation subset is drawn from the "test" split; the dataset
# also exposes a "validation" split — confirm which one should drive
# per-epoch evaluation and best-checkpoint selection.
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=66).select(range(1000))

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/md_gender_bias/convai2_inferred/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05/cache-6059a3875d21c432.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/md_gender_bias/convai2_inferred/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05/cache-50a533ca345a11d2.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/md_gender_bias/convai2_inferred/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05/cache-bcf5dfbe1fd6caa3.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/md_gender_bias/convai2_inferred/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05/cache-fee0a9e8c0748d87.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/md_gender_bias/convai2_inferred/1.0.0/8ae77b51acf93383161cc954b146159291beca6c979b

In [4]:
# Per-device batch size; passed to TrainingArguments for both training and evaluation.
batch_size = 16

# metric = evaluate.load("f1")

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return metric.compute(predictions=predictions, references=labels)

from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Return the micro-averaged F1 score for one evaluation pass.

    `eval_pred` unpacks into (scores, labels). The predicted class of each
    row is the argmax over axis 1 of the scores.

    NOTE(review): with exactly one label per row, micro-averaged F1 should
    coincide with plain accuracy — confirm this is the intended metric.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    micro_f1 = f1_score(y_true=references, y_pred=predicted_ids, average='micro')
    return {"f1": micro_f1}

# Fine-tuning configuration: evaluate and save a checkpoint once per epoch,
# and reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=6,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, the two 1000-row subsets and the metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

# Train, then run one more evaluation pass and list every reported metric.
trainer.train()
eval_result = trainer.evaluate()
for key, value in eval_result.items():
    print(f"{key}: {value}")



Epoch,Training Loss,Validation Loss,F1
1,No log,1.049562,0.44
2,No log,1.002216,0.501
3,No log,0.942522,0.557
4,No log,0.903964,0.575
5,No log,0.890323,0.596
6,No log,0.882813,0.591


eval_loss: 0.8828128576278687
eval_f1: 0.591
eval_runtime: 36.724
eval_samples_per_second: 27.23
eval_steps_per_second: 1.716
epoch: 6.0


In [19]:
# Re-score the evaluation subset, reporting metrics under the "test_" prefix.
result = trainer.evaluate(eval_dataset=small_eval_dataset, metric_key_prefix="test")

# Despite its name, this holds the F1 value rounded to three decimals and
# rendered as text with "." replaced by "_".
rounded_f1 = np.around(result['test_f1'], 3)
test_accuracy = str(rounded_f1).replace('.', '_')

# Persist the fine-tuned model and its tokenizer side by side in ./model/.
trainer.save_model('./model/')
tokenizer.save_pretrained("./model/")

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')

In [20]:
from transformers import pipeline
import torch

# Directory written by the save_model / save_pretrained calls.
model_path = "./model"

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# (-1) so the cell still runs on machines without a GPU.
inference_device = 0 if torch.cuda.is_available() else -1

# Text-classification pipeline over the saved model; over-long inputs are truncated.
pipe = pipeline(task="text-classification",model = model_path,truncation=True, device=inference_device)

In [21]:
# Show the first row of the raw test split to inspect its fields.
dataset['test'][0]

{'text': 'hello what are doing today ?',
 'binary_label': 1,
 'binary_score': 0.5015000104904175,
 'ternary_label': 1,
 'ternary_score': 0.3422999978065491}

In [22]:
# List the distinct values of the "binary_label" column in each split.
# NOTE(review): the model is trained on "ternary_label"; confirm that
# inspecting "binary_label" here is intentional.
dataset.unique("binary_label")

{'train': [1, 0], 'validation': [1, 0], 'test': [1, 0]}

In [23]:
# Classify rows 2000-3999 of the raw test split with the saved pipeline and
# compare the predictions against the 'ternary_label' column.
# NOTE(review): small_eval_dataset is also sampled from the test split, so some
# of these rows may overlap with the rows used for per-epoch evaluation — confirm.

# Pipeline label string -> class id; any other label falls back to class 0.
label_ids = {"LABEL_1": 1, "LABEL_2": 2}

y_pred = []
y_true = []
for x in range(2000,4000):
    # Fetch the row once instead of re-indexing the dataset for every field.
    example = dataset['test'][x]
    y_true.append(example['ternary_label'])
    # Run the model a single time per example; for one input string the
    # pipeline returns a one-item list, hence the [0].
    predicted_label = pipe(example['text'])[0]['label']
    y_pred.append(label_ids.get(predicted_label, 0))


print(classification_report(y_true,y_pred))



              precision    recall  f1-score   support

           0       0.64      0.67      0.65       627
           1       0.52      0.23      0.32       487
           2       0.66      0.83      0.74       886

    accuracy                           0.64      2000
   macro avg       0.61      0.58      0.57      2000
weighted avg       0.62      0.64      0.61      2000



In [29]:
# Spot-check the pipeline on a single hand-written utterance.
pipe("do you have anything planned for today ? i think i am going to do some canning .")

[{'label': 'LABEL_2', 'score': 0.7097024917602539}]