# Exam
1. Load the dataset 
https://huggingface.co/datasets/poem_sentiment
2. Set 'bert-base-uncased' tokenizer and model for your task.
3. Tokenize data and set train/test/validation data
4. Create Trainer with the correct configuration
5. Use Trainer and fine-tune your model for 5 epochs.
6. Predict on test data and report accuracy.


In [1]:
# IMPORTANT
!pip install -q transformers==4.28.0
!pip install -q datasets

In [2]:
from datasets import load_dataset, DatasetDict

# YOUR CODE HERE - 1
dataset = load_dataset("poem_sentiment")
# Download the dataset



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 892
    })
    validation: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 105
    })
    test: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 104
    })
})

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# YOUR CODE HERE - 2
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
# Load the model and tokenizer

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
#['verse_text']
# YOUR CODE HERE - 3
# Set train/test/validation
def tokenize_function(examples):
    return tokenizer(examples["verse_text"], padding="max_length", truncation=True, return_tensors='pt').to("cuda:0")

train_data = dataset["train"].map(tokenize_function, batched=True)
val_data = dataset["validation"].map(tokenize_function, batched=True)
test_data = dataset['test'].map(tokenize_function, batched=True)

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [23]:
test_data

Dataset({
    features: ['id', 'verse_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 104
})

In [6]:
from transformers import TrainingArguments, Trainer
import numpy as np

# You can optionally use this function with the right parameter to compute results.
# You don't have to use this though. You can do the caculation anyway you like.
def compute_metrics(eval_pred):
    """Called at the end of validation. Gives accuracy"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # calculates the accuracy
    return {"accuracy": np.mean(predictions == labels)}

# YOUR CODE HERE - 4
# create trainer

arguments = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5)
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

In [7]:
# YOUR CODE HERE - 5
# Train
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.486664,0.838095
2,No log,0.540225,0.857143
3,No log,0.789502,0.847619
4,No log,0.786109,0.847619
5,0.376200,0.798364,0.838095


TrainOutput(global_step=560, training_loss=0.33790770726544517, metrics={'train_runtime': 421.4004, 'train_samples_per_second': 10.584, 'train_steps_per_second': 1.329, 'total_flos': 1173496379228160.0, 'train_loss': 0.33790770726544517, 'epoch': 5.0})

In [9]:
TOKEN = "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"
model.push_to_hub("poem_bert",  use_auth_token=TOKEN)
tokenizer.push_to_hub("poem_bert",  use_auth_token=TOKEN, commit_message="Upload Tokenizer")

CommitInfo(commit_url='https://huggingface.co/Babak-Behkamkia/poem_bert/commit/292f0240cc0c4effb4986371e8dae19f2e974378', commit_message='Upload Tokenizer', commit_description='', oid='292f0240cc0c4effb4986371e8dae19f2e974378', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
from transformers import pipeline
model_loaded = pipeline("text-classification", model="Babak-Behkamkia/poem_bert")

Downloading (…)lve/main/config.json:   0%|          | 0.00/919 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# YOUR CODE HERE - 6
# Evalute Test data and report results
labels = dataset['test']['label']
preds = model_loaded(test_data['verse_text'])
predictions = []
for item in preds:
  pred = item['label']
  if pred == "LABEL_0":
    predictions.append(0)
  elif pred == "LABEL_1":
    predictions.append(1)
  elif pred == "LABEL_2":
    predictions.append(2)
  elif pred == "LABEL_3":
    predictions.append(3)

In [32]:
s = 0
for i in range(len(predictions)):
  if predictions[i]==labels[i]:
    s+=1
print(s/len(predictions))

0.8076923076923077
