## Finetuned ModernBert


Please note that most of this code is inspired by the BERT finetuning guide from MA2

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [2]:
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from datasets import Dataset, DatasetDict
from sklearn.utils import resample

from transformers import AutoTokenizer, ModernBertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate

## Loading in the data

In [5]:
#specify CSV path
csv_path = '/content/data_preprocessed_general.csv'

#creating a df from the file
data = pd.read_csv(csv_path)

data.head(5)

Unnamed: 0,text,rating_overall
0,rooms are fine. service tries hard but does no...,3.0
1,best place to stay in nyc. want to go back mis...,5.0
2,it's a great place. i'll always check to see i...,5.0
3,this hotel has some of the biggest rooms in ma...,5.0
4,if you want to stay on the upper west side thi...,4.0


In [6]:
data['rating_overall'] = data['rating_overall'].replace(range(0, 3), 'Negative')
data['rating_overall'] = data['rating_overall'].replace(3, 'Neutral')
data['rating_overall'] = data['rating_overall'].replace(range(4, 6), 'Positive')

result = data.groupby('rating_overall').size()

result

Unnamed: 0_level_0,0
rating_overall,Unnamed: 1_level_1
Negative,3263
Neutral,3982
Positive,38291


In [7]:
#splitting the data into train, validation and test sets

(

    X_train,
    X_rem,
    y_train,
    y_rem

) = train_test_split(data["text"], data["rating_overall"], train_size=0.8, random_state=42)

(X_valid, X_test, y_valid, y_test) = train_test_split(X_rem,y_rem, test_size=0.5)

In [8]:
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

(36428,)
(4554,)
(4554,)


### Making a balanced training split

In [9]:
# balancing the training set, made with the help of AI

# Combine X_train and y_train into a single DataFrame for resampling
train_data = pd.DataFrame({'text': X_train, 'rating_overall': y_train})

# Separate the classes in the training set using the correct labels
positive_data = train_data[train_data['rating_overall'] == 'Positive']
neutral_data = train_data[train_data['rating_overall'] == 'Neutral']
negative_data = train_data[train_data['rating_overall'] == 'Negative']

# Downsample the majority class (Positive) to match the size of the minority class (Neutral or Negative, whichever is larger)
minority_class_size = max(len(neutral_data), len(negative_data))

positive_data_downsampled = resample(
    positive_data,
    replace=False,
    n_samples=minority_class_size,
    random_state=42
)

# Combine the downsampled majority class with the other classes
train_data_balanced = pd.concat([positive_data_downsampled, neutral_data, negative_data])

# Shuffle the balanced training data
train_data_balanced = train_data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate the balanced training data back into X_train and y_train
X_train_balanced = train_data_balanced['text']
y_train_balanced = train_data_balanced['rating_overall']

# Verify the class distribution in the balanced training set
print("Class distribution in the balanced training set:")
print(y_train_balanced.value_counts())

Class distribution in the balanced training set:
rating_overall
Positive    3169
Neutral     3169
Negative    2592
Name: count, dtype: int64


### Converting the pandas dataframe into a DatasetDict

In [10]:
# Combine the splits into pandas DataFrames
train_df = pd.DataFrame({"text": X_train, "label": y_train})
validation_df = pd.DataFrame({"text": X_valid, "label": y_valid})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset
})

# Verify the structure of the DatasetDict
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 36428
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4554
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4554
    })
})


### Making a version with a balanced training set

In [11]:
# Combine the splits into pandas DataFrames
train_df_balanced = pd.DataFrame({"text": X_train_balanced, "label": y_train_balanced})

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset_balanced = Dataset.from_pandas(train_df_balanced)

# Combine into a DatasetDict
dataset_dict_balanced = DatasetDict({
    "train": train_dataset_balanced,
    "validation": validation_dataset,
    "test": test_dataset
})

# Verify the structure of the DatasetDict
print(dataset_dict_balanced)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8930
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4554
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4554
    })
})


### Making the target variables into numerical values so it can be processed by the model

In [12]:
#map string labels to integers for imbalanced data
label_mapping = {"Positive": 0, "Neutral": 1, "Negative": 2}
dataset_dict = dataset_dict.map(lambda x: {"label": label_mapping[x["label"]]})

Map:   0%|          | 0/36428 [00:00<?, ? examples/s]

Map:   0%|          | 0/4554 [00:00<?, ? examples/s]

Map:   0%|          | 0/4554 [00:00<?, ? examples/s]

In [13]:
#map sting labels to integers for balanced data
dataset_dict_balanced = dataset_dict_balanced.map(lambda x: {"label": label_mapping[x["label"]]})

Map:   0%|          | 0/8930 [00:00<?, ? examples/s]

Map:   0%|          | 0/4554 [00:00<?, ? examples/s]

Map:   0%|          | 0/4554 [00:00<?, ? examples/s]

### Loading in the ModernBERT tokenizer and model form HugginFace


In [14]:
# Define the mappping from label names to label ids
id2label = {
    0: 'Positive',
    1: 'Neutral',
    2: 'Negative',
}

# Define the mapping from label ids to label names (the reverse of id2label)
label2id = {v: k for k, v in id2label.items()}


In [15]:

# load the model
model = ModernBertForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=3, id2label=id2label, label2id=label2id)

# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

### Tokenizing and encoding the data

In [16]:
#tokenizing the imbalanced data
def preprocess_function(examples):
    """ Tokenize the text column in the examples. """
    return tokenizer(examples["text"], truncation=True)

tokenized_data = dataset_dict.map(preprocess_function, batched=True, batch_size=4)

Map:   0%|          | 0/36428 [00:00<?, ? examples/s]

Map:   0%|          | 0/4554 [00:00<?, ? examples/s]

Map:   0%|          | 0/4554 [00:00<?, ? examples/s]

In [17]:
#tokenizing the balanced data

tokenized_data_balanced = dataset_dict_balanced.map(preprocess_function, batched=True, batch_size=4)

Map:   0%|          | 0/8930 [00:00<?, ? examples/s]

Map:   0%|          | 0/4554 [00:00<?, ? examples/s]

Map:   0%|          | 0/4554 [00:00<?, ? examples/s]

### Setting the evaluation metric

Setting the evaluation metric to overall accuracy, F1 for the different sentiment values and the overall weighted F1 score

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    # Calculate overall accuracy
    accuracy = accuracy_score(labels, predictions)

    # Generate a classification report for per-class metrics
    report = classification_report(
        labels,
        predictions,
        target_names=["Negative", "Neutral", "Positive"],
        output_dict=True
    )

    # Extract F1 scores for each class
    f1_negative = report["Negative"]["f1-score"]
    f1_neutral = report["Neutral"]["f1-score"]
    f1_positive = report["Positive"]["f1-score"]

    # Return metrics
    return {
        "accuracy": accuracy,
        "f1_negative": f1_negative,
        "f1_neutral": f1_neutral,
        "f1_positive": f1_positive,
        "f1_weighted": report["weighted avg"]["f1-score"],
    }


### Defining a datacollector

In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Training on the imbalanced data

First we train the model on the imbalanced data and validate it on the validation set

In [20]:
training_args = TrainingArguments(
    output_dir="/content/MYBERT",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfilip-rognerud[0m ([33mfilip-rognerud-copenhagen-business-school[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W0507 13:06:06.436000 840 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,Accuracy,F1 Negative,F1 Neutral,F1 Positive,F1 Weighted
1,0.2317,0.233697,0.914141,0.963002,0.522193,0.819277,0.911508
2,0.1607,0.272065,0.918094,0.9645,0.5171,0.830769,0.913217


TrainOutput(global_step=4554, training_loss=0.21463759488896106, metrics={'train_runtime': 1117.33, 'train_samples_per_second': 65.205, 'train_steps_per_second': 4.076, 'total_flos': 2297578367218248.0, 'train_loss': 0.21463759488896106, 'epoch': 2.0})

### Training and validating using the balanced dataset

Now we to the same with the balanced training data to see how well it performs

In [23]:
training_args = TrainingArguments(
    output_dir="/content/MYBERT_balanced",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

)

trainer_balanced = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data_balanced["train"],
    eval_dataset=tokenized_data_balanced["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_balanced.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Negative,F1 Neutral,F1 Positive,F1 Weighted
1,0.2025,0.555914,0.83531,0.912256,0.460073,0.788774,0.861369
2,0.0989,1.033529,0.858147,0.927568,0.480269,0.790055,0.875989


TrainOutput(global_step=1118, training_loss=0.14386762489359792, metrics={'train_runtime': 300.9656, 'train_samples_per_second': 59.342, 'train_steps_per_second': 3.715, 'total_flos': 572568995610036.0, 'train_loss': 0.14386762489359792, 'epoch': 2.0})

### Final test using the test data

As the model performed best when training on the imbalanced data, we use this model to perform the final test on the test data

In [24]:
# Evaluate on the test set
test_results = trainer.predict(tokenized_data["test"])
print("\nTest Results:")
print(test_results.metrics)




Test Results:
{'test_loss': 0.5640872120857239, 'test_accuracy': 0.8282828282828283, 'test_f1_negative': 0.9104830850619519, 'test_f1_neutral': 0.43311160384331115, 'test_f1_positive': 0.7307692307692307, 'test_f1_weighted': 0.8564911522952919, 'test_runtime': 17.5123, 'test_samples_per_second': 260.046, 'test_steps_per_second': 16.274}
