In [2]:
from datasets import load_dataset
import numpy as np
from torch.utils.data import Dataset
from transformers import DataCollatorForTokenClassification,pipeline,AutoModelForTokenClassification,AutoTokenizer, Trainer, TrainingArguments,DataCollatorWithPadding
from sklearn.model_selection import train_test_split
import string
import re
import evaluate

from training import load_data,prepare_dataset,prepare_text

In [3]:
model_path = "distilbert-base-uncased"

In [4]:
intrasentence_dataset = load_data("intrasentence")

Found cached dataset stereoset (/Users/zekunwu/.cache/huggingface/datasets/stereoset/intrasentence/1.0.0/b188e395e95b37c7a095ebc2de352fbdb249d67d1beb2ff639bb4dc37dfbb090)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
print(np.shape(intrasentence_dataset["profession"]))
print(np.shape(intrasentence_dataset["race"]))
print(np.shape(intrasentence_dataset["gender"]))
print(np.shape(intrasentence_dataset["religion"]))

(2430,)
(2886,)
(765,)
(237,)


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
data = prepare_text(intrasentence_dataset["race"])
tokenized_data = prepare_dataset(tokenizer,data)
final_dataset = tokenized_data.train_test_split(0.2)
final_dataset

Map:   0%|          | 0/2886 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2308
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 578
    })
})

In [11]:
# print(data[17]["tokens"])
# tokenized_input = tokenizer(data[17]["tokens"], is_split_into_words=True)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# tokens

In [13]:
# Define data collator to handle padding
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

label_list = ["stereotype","anti-stereotype","unrelated"]
labels = [label_list[i] for i in data[0]["labels"]]


from sklearn.metrics import precision_recall_fscore_support,accuracy_score,balanced_accuracy_score

def compute_metrics_new(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Flatten the lists
    true_predictions = [pred for sublist in true_predictions for pred in sublist]
    true_labels = [label for sublist in true_labels for label in sublist]

    # Calculate precision, recall, f1_score, and support with "macro" average
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_predictions, average='macro')

    balanced_acc = balanced_accuracy_score(true_labels, true_predictions)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, true_predictions)
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "balanced accuracy": balanced_acc,
    }

id2label = {
    0: "stereotype",
    1: "anti-stereotype",
    2: "unrelated"
}
label2id = {
    "stereotype": 0,
    "anti-stereotype": 1,
    "unrelated": 2
}

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="specific_best_model/race",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_new,
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[codecarbon INFO @ 21:14:04] CPU Model on constant consumption mode: Apple M2
[codecarbon INFO @ 21:14:04] >>> Tracker's metadata:
[codecarbon INFO @ 21:14:04]   Platform system: macOS-13.0.1-arm64-arm-64bit
[codecarbon INFO @ 21:14:04]   Python version: 3.9.6
[codecarbon INFO @ 21:14:04]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 21:14:04]   Available RAM : 8.000 GB
[codecarbon INFO @ 21:14:04]   CPU count: 8
[codecarbon INFO @ 21:14:04]   CPU model: Apple M2
[codecarbon INFO @ 21:14:04]   GPU count: None
[codecarbon INFO @ 21:14:04]   GPU model: None
The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens.
***** Running training *****
  Num examples = 2308
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 870


Epoch,Training Loss,Validation Loss


[codecarbon INFO @ 21:14:22] Energy consumed for RAM : 0.000013 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 21:14:22] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 21:14:22] 0.000190 kWh of electricity used since the beginning.
[codecarbon INFO @ 21:14:37] Energy consumed for RAM : 0.000025 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 21:14:37] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 21:14:37] 0.000379 kWh of electricity used since the beginning.
[codecarbon INFO @ 21:14:52] Energy consumed for RAM : 0.000038 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 21:14:52] Energy consumed for all CPUs : 0.000531 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 21:14:52] 0.000569 kWh of electricity used since the beginning.
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens.
***** Running Evaluation *****
  Nu

TrainOutput(global_step=870, training_loss=0.06965746057444605, metrics={'train_runtime': 324.2798, 'train_samples_per_second': 42.704, 'train_steps_per_second': 2.683, 'total_flos': 66062668176000.0, 'train_loss': 0.06965746057444605, 'epoch': 6.0})

In [14]:
trainer.evaluate(final_dataset["test"])

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens.
***** Running Evaluation *****
  Num examples = 578
  Batch size = 16


{'eval_loss': 0.08452929556369781,
 'eval_precision': 0.7701255290430122,
 'eval_recall': 0.7953339229050389,
 'eval_f1': 0.7821439651040846,
 'eval_balanced accuracy': 0.7953339229050389,
 'eval_runtime': 2.5342,
 'eval_samples_per_second': 228.08,
 'eval_steps_per_second': 14.6,
 'epoch': 6.0}

In [26]:
# Load the trained model and the tokenizer
model = AutoModelForTokenClassification.from_pretrained("token_level_model/best_model/checkpoint-366")
tokenizer = AutoTokenizer.from_pretrained("token_level_model/best_model/checkpoint-366")

# Use the pipeline for Named Entity Recognition
ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer)

# Now you can use the pipeline to classify named entities
for x in range(20):
    sentence = intrasentence_dataset[][x]['text'].replace("===","")
    print(f"Text: {sentence}")
    results = ner_pipeline(sentence)

    # Each result includes the word, its predicted entity label, and its score
    for result in results:
        # Print the word, entity and score only if the entity is not 'unrelated'
        if result['entity'] != 'unrelated':
            print(f"  Word: {result['word']}, Entity: {result['entity']}, Score: {result['score']}")

404 Client Error: Not Found for url: https://huggingface.co/token_level_model/best_model/checkpoint-366/resolve/main/config.json


OSError: We couldn't connect to 'https://huggingface.co/' to load this model and it looks like token_level_model/best_model/checkpoint-366 is not the path to a directory conaining a config.json file.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.