In [1]:
%pip install "torch==2.5.0" "torchvision==0.20.0"
%pip install "setuptools<71.0.0" scikit-learn

%pip install  --upgrade \
  "datasets==3.1.0" \
  "accelerate==1.2.1" \
  "hf-transfer==0.1.8"

%pip install "git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1" --upgrade

Collecting torch==2.5.0
  Downloading torch-2.5.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.20.0
  Downloading torchvision-0.20.0-cp312-cp312-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting filelock (from torch==2.5.0)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting typing-extensions>=4.8.0 (from torch==2.5.0)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting networkx (from torch==2.5.0)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch==2.5.0)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch==2.5.0)
  Downloading fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.0)
  Downloading n

In [1]:
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.iterable_dataset import IterableDataset

# Dataset id from huggingface.co/dataset
dataset_id = "wesley7137/question_complexity_classification"

# Load raw dataset
train_dataset = load_dataset(dataset_id, split='train')

split_dataset = train_dataset.train_test_split(test_size=0.1)
split_dataset['train'][5:7]

  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.


{'question': ["Complete the joke 'A Large Language Model enters a bar'",
  'Why is the sky blue?'],
 'rating': [0.2, 0.4]}

In [4]:
def get_bucket(rating):
    # 0 - easy, 1 - medium, 2 - hard
    if rating <= 0.3:
        return 0
    elif rating <= 0.6:
        return 1
    else:
        return 2

for i, (key, value) in enumerate(split_dataset['train']):
    if split_dataset['train'][i]['rating'] is None:
        continue
    split_dataset['train'][i]['rating'] = get_bucket(split_dataset['train'][i]['rating'])

# save dataset
split_dataset.save_to_disk('question_complexity_classification_dataset')

Saving the dataset (1/1 shards): 100%|██████████| 12643/12643 [00:00<00:00, 106934.25 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1405/1405 [00:00<00:00, 77732.74 examples/s]


In [None]:
split_dataset = DatasetDict.load_from_disk('question_complexity_classification_dataset')

In [2]:
from transformers import AutoTokenizer

# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize helper function
def tokenize(batch):
    return tokenizer(batch['question'], truncation=True,padding=True, return_tensors="pt")

# Tokenize dataset
if "rating" in split_dataset["train"].features.keys():
    split_dataset =  split_dataset.rename_column("rating", "labels") # to match Trainer
tokenized_dataset = split_dataset.map(tokenize, batched=True)

Map: 100%|██████████| 12643/12643 [00:13<00:00, 932.94 examples/s] 
Map: 100%|██████████| 1405/1405 [00:00<00:00, 2092.02 examples/s]


In [10]:
print(tokenized_dataset["train"].features)

{'question': Value(dtype='string', id=None), 'labels': Value(dtype='float64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [3]:
from transformers import AutoModelForSequenceClassification

# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Prepare model labels - useful for inference
labels = [0, 1, 2]  # 0 - easy, 1 - medium, 2 - hard
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model.save_pretrained("my_model/")
tokenizer.save_pretrained("my_model/")

('my_model/tokenizer_config.json',
 'my_model/special_tokens_map.json',
 'my_model/tokenizer.json')

In [5]:
tokenized_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 12643/12643 [00:00<00:00, 74168.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1405/1405 [00:00<00:00, 122086.58 examples/s]
