In [None]:
!pip install transformers datasets huggingface_hub tensorboard==2.11
!sudo apt-get install git-lfs --yes

!pip install pytesseract evaluate tqdm rouge-score accelerate nltk tensorboard jupyter-black py7zr --upgrade
!apt-get install git --yes

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorboard==2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard==2.11)
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Collecting tensorboard-da

In [None]:
import torch
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
    RobertaModel,
)
import json
import pandas
import jupyter_black
from datetime import timedelta
from datasets import Dataset
# Checkpoint name shared by the tokenizer, the config and the model loaded below.
model_id = "roberta-base"

In [None]:
# Load the fast RoBERTa tokenizer for the `model_id` checkpoint; `tokenize` uses it.
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
    """Encode the ``"text"`` column of a batch with the notebook-level tokenizer.

    Sequences are truncated to at most 256 tokens and padded to the longest
    sequence of the batch they are encoded with.

    Args:
        batch: Mapping whose ``"text"`` entry holds the strings to encode.

    Returns:
        Whatever the tokenizer call returns for those strings.
    """
    # NOTE(review): truncation cuts from the end of the text; confirm that
    # prompts longer than 256 tokens do not lose the part that matters.
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=256, padding=True)
    return encoded

In [None]:
# Fixed seed so Restart & Run All reproduces the same train/val/test rows.
SPLIT_SEED = 42

# Explicit encoding: JSON is UTF-8, independent of the platform default.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# create a dataframe
df = pandas.DataFrame(data)

# Hold out 10% of the rows as the test split.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Carve the validation split out of the remaining training rows, so that
# validation and test are disjoint (previously both pointed at the same rows).
train_val = dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

train_dataset = train_val["train"]
val_dataset = train_val["test"]
test_dataset = dataset["test"]

print("train_dataset")
print(train_dataset)
print("test_dataset")
print(test_dataset)
print("val_dataset")
print(val_dataset)

train_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score'],
    num_rows: 5
})
test_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score'],
    num_rows: 1
})
val_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score'],
    num_rows: 1
})


In [None]:
def adjust_dataset(example):
  """Add a grading prompt (``text``) and a target (``label``) to a batch.

  Despite the parameter name, this works on a *batch*: every value of
  ``example`` is indexed per row, so it must be used with ``batched=True``.

  Args:
    example: Column-oriented mapping with the keys ``id``, ``task``,
      ``question``, ``maximum_score``, ``scoring_guide`` (per row an iterable
      of entries with ``point`` and ``criteria``), ``answer`` and ``score``.

  Returns:
    The same mapping, modified in place: ``text`` holds one prompt per row
    and ``label`` is set to the ``score`` column.
  """
  # NOTE(review): `label` is the raw `score` column; presumably integer class
  # ids are expected downstream - confirm against the data.
  prompts = []
  for row in range(len(example['id'])):
    # One "<point> points - <criteria>" line per scoring-guide entry.
    guide_text = '\n'.join(
        str(entry['point']) + ' points - ' + entry['criteria']
        for entry in example['scoring_guide'][row]
    )
    # The indentation inside the literal is part of the prompt text.
    prompts.append(f"""
        {example['task'][row]}

        Question:
        {example['question'][row]}

        Maximum score:
        {example['maximum_score'][row]}

        Scoring guide:
        {guide_text}

        Answer:
        {example['answer'][row]}

        Score:
        <mask>
      """)

  example["text"] = prompts
  example["label"] = example["score"]
  return example

# Add the `text` and `label` columns to every split, in train/test/val order.
train_dataset, test_dataset, val_dataset = (
    split.map(adjust_dataset, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

# Show the resulting schema of each split.
for split_name, split in (
    ("train_dataset", train_dataset),
    ("test_dataset", test_dataset),
    ("val_dataset", val_dataset),
):
    print(split_name)
    print(split)



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

train_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label'],
    num_rows: 5
})
test_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label'],
    num_rows: 1
})
val_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label'],
    num_rows: 1
})


In [None]:
# Tokenize each split as one single batch (batch_size == number of rows), so
# the `padding=True` in `tokenize` pads all rows of a split to one length.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize, batched=True, batch_size=len(split))
    for split in (train_dataset, val_dataset, test_dataset)
)

# Show the resulting schema of each split.
for split_name, split in (
    ("train_dataset", train_dataset),
    ("test_dataset", test_dataset),
    ("val_dataset", val_dataset),
):
    print(split_name)
    print(split)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

train_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5
})
test_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1
})
val_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1
})


In [None]:
# Set dataset format
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

print("train_dataset")
print(train_dataset)
print("test_dataset")
print(test_dataset)
print("val_dataset")
print(val_dataset)

train_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5
})
test_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1
})
val_dataset
Dataset({
    features: ['id', 'task', 'question', 'maximum_score', 'scoring_guide', 'answer', 'score', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1
})


In [None]:
# We will need this to directly output the class names when using the pipeline without mapping the labels later.
# The classes are the possible scores 0..6; names are kept as strings and the
# number of labels is derived from them so the two can never disagree.
class_names = [str(score) for score in range(7)]
num_labels = len(class_names)
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create the id2label mapping and its inverse; both are stored on the config
# so that they stay consistent with each other.
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()}

# Load the checkpoint's configuration with the label information applied.
my_config = AutoConfig.from_pretrained(
    model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

number of labels: 7
the labels: ['0', '1', '2', '3', '4', '5', '6']


In [None]:
# Load RoBERTa with a sequence-classification head. The bare `RobertaModel`
# encoder has no head and returns no loss, so `Trainer` cannot optimise it;
# the classification model sizes its head from `my_config` and computes a
# loss whenever labels are supplied.
original_model = RobertaForSequenceClassification.from_pretrained(model_id, config=my_config)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install transformers[torch]
!pip install accelerate -U



In [None]:
# Directory passed to TrainingArguments as the output location.
output_dir = "./first-ever-training"

# Training hyperparameters.
# NOTE(review): a positive `max_steps` takes precedence over
# `num_train_epochs` in TrainingArguments, so as written this performs a
# single optimisation step - presumably a smoke test; confirm before a real run.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    max_steps=1,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_steps=1,
)

# Bundle the model, the arguments and the train/validation splits.
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Start fine-tuning with the `trainer` configured in the previous cell.
# NOTE(review): the recorded run of this cell ended in a ValueError; presumably
# the model passed to Trainer returned no loss - confirm it has a task head.
trainer.train()

ValueError: ignored