## Getting ModernBERT embeddings

Note that most of this code in this notebook is inspired by the BERT guide in MA2


In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import pipeline

In [2]:
#specify CSV path
csv_path = '/content/data_preprocessed_general.csv'

#creating a df from the file
data = pd.read_csv(csv_path)

data.head(5)

Unnamed: 0,text,rating_overall
0,rooms are fine. service tries hard but does no...,3.0
1,best place to stay in nyc. want to go back mis...,5.0
2,it's a great place. i'll always check to see i...,5.0
3,this hotel has some of the biggest rooms in ma...,5.0
4,if you want to stay on the upper west side thi...,4.0


In [3]:
data['rating_overall'] = data['rating_overall'].replace(range(0, 3), 'Negative')
data['rating_overall'] = data['rating_overall'].replace(3, 'Neutral')
data['rating_overall'] = data['rating_overall'].replace(range(4, 6), 'Positive')

result = data.groupby('rating_overall').size()

result

Unnamed: 0_level_0,0
rating_overall,Unnamed: 1_level_1
Negative,3263
Neutral,3982
Positive,38291


In [4]:
#splitting the data into train, validation and test sets

(

    X_train,
    X_rem,
    y_train,
    y_rem

) = train_test_split(data["text"], data["rating_overall"], train_size=0.9, random_state=42)

(X_valid, X_test, y_valid, y_test) = train_test_split(X_rem,y_rem, test_size=0.5)

In [5]:
# balancing the training set, made with the help of AI

from sklearn.utils import resample

# Combine X_train and y_train into a single DataFrame for resampling
train_data = pd.DataFrame({'text': X_train, 'rating_overall': y_train})

# Separate the classes in the training set using the correct labels
positive_data = train_data[train_data['rating_overall'] == 'Positive']
neutral_data = train_data[train_data['rating_overall'] == 'Neutral']
negative_data = train_data[train_data['rating_overall'] == 'Negative']

# Downsample the majority class (Positive) to match the size of the minority class (Neutral or Negative, whichever is larger)
minority_class_size = max(len(neutral_data), len(negative_data))  # Use max to find the larger minority class

positive_data_downsampled = resample(
    positive_data,
    replace=False,  # Do not sample with replacement
    n_samples=minority_class_size,  # Match the size of the larger minority class
    random_state=42  # For reproducibility
)

# Combine the downsampled majority class with the other classes
train_data_balanced = pd.concat([positive_data_downsampled, neutral_data, negative_data])

# Shuffle the balanced training data
train_data_balanced = train_data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate the balanced training data back into X_train and y_train
X_train = train_data_balanced['text']
y_train = train_data_balanced['rating_overall']

# Verify the class distribution in the balanced training set
print("Class distribution in the balanced training set:")
print(y_train.value_counts())

Class distribution in the balanced training set:
rating_overall
Positive    3560
Neutral     3560
Negative    2933
Name: count, dtype: int64


In [6]:
# Combine the splits into pandas DataFrames
train_df = pd.DataFrame({"text": X_train, "label": y_train})
validation_df = pd.DataFrame({"text": X_valid, "label": y_valid})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset
})

# Verify the structure of the DatasetDict
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10053
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2277
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2277
    })
})


In [7]:

embedder = pipeline(
    model="answerdotai/ModernBERT-base",      # model used for embedding
    tokenizer="answerdotai/ModernBERT-base",  # tokenizer used for embedding
    task="feature-extraction",                # feature extraction task (returns embeddings)
    device=0                                  # use GPU 0 if available
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [8]:

def get_embeddings(data):
    """ Extract the [CLS] embedding for each text. """
    embeddings = embedder(data["text"])  # Full token embeddings
    cls_embeddings = [e[0][0] for e in embeddings]  # Extract first token ([CLS])
    return {"embeddings": cls_embeddings}

dataset_dict = dataset_dict.map(get_embeddings, batched=True, batch_size=8)



Map:   0%|          | 0/10053 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Map:   0%|          | 0/2277 [00:00<?, ? examples/s]

Map:   0%|          | 0/2277 [00:00<?, ? examples/s]

In [9]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'embeddings'],
        num_rows: 10053
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__', 'embeddings'],
        num_rows: 2277
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'embeddings'],
        num_rows: 2277
    })
})

In [10]:
X_train = np.array(dataset_dict["train"]["embeddings"])  # Feature embeddings
y_train = np.array(dataset_dict["train"]["label"])

X_valid = np.array(dataset_dict["validation"]["embeddings"])  # Feature embeddings
y_valid = np.array(dataset_dict["validation"]["label"])

X_test = np.array(dataset_dict["test"]["embeddings"])
y_test = np.array(dataset_dict["test"]["label"])

# Check shapes
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_validation shape: {X_valid.shape}, y_validation shape: {y_valid.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")



X_train shape: (10053, 768), y_train shape: (10053,)
X_validation shape: (2277, 768), y_validation shape: (2277,)
X_test shape: (2277, 768), y_test shape: (2277,)


In [12]:
import pandas as pd

#saving the datasets as CSV files
pd.DataFrame(X_train).to_csv("X_train.csv", index=False)
pd.DataFrame(y_train).to_csv("y_train.csv", index=False)
pd.DataFrame(X_valid).to_csv("X_valid.csv", index=False)
pd.DataFrame(y_valid).to_csv("y_valid.csv", index=False)
pd.DataFrame(X_test).to_csv("X_test.csv", index=False)
pd.DataFrame(y_test).to_csv("y_test.csv", index=False)