In [1]:
from pathlib import Path

import pandas as pd
import transformers

In [2]:
# Checkpoint identifiers, named so the two sources are visible at a glance.
ENCODER_CHECKPOINT = "google/bert_uncased_L-4_H-256_A-4"
# NOTE(review): the tokenizer is loaded from a different checkpoint than the
# encoder; presumably both share the same uncased vocabulary -- confirm.
TOKENIZER_CHECKPOINT = "bert-base-uncased"

# Encoder used to turn text into embeddings, plus the tokenizer feeding it.
bert = transformers.AutoModel.from_pretrained(ENCODER_CHECKPOINT)
tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

Some weights of the model checkpoint at google/bert_uncased_L-4_H-256_A-4 were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Map each split name to its CSV file, then read every file into a DataFrame.
split_files = {
    "train": "../data/wine_cleaned_train.csv",
    "val": "../data/wine_cleaned_val.csv",
    "test": "../data/wine_cleaned_test.csv",
}
data = {name: pd.read_csv(csv_path) for name, csv_path in split_files.items()}

In [4]:
# Function to tokenize the data and run it through the model
import numpy as np
import torch
from tqdm import tqdm


def tokenize_and_predict(split, batch_size=256):
    """Embed every description of one data split with the BERT encoder.

    Reads ``data[split]["description"]`` in batches, tokenizes each batch with
    the module-level ``tokenizer`` (``padding=True``, ``truncation=True``),
    runs it through the module-level ``bert`` model without gradient tracking
    and stores the model's ``pooler_output`` for each row.

    Side effects: moves ``bert`` to the selected device, switches it to eval
    mode, and prints the device in use.

    Args:
        split: Key into the module-level ``data`` dict (the notebook uses
            "train", "val" and "test").
        batch_size: Number of descriptions per forward pass; must be >= 1.

    Returns:
        A float64 NumPy array of shape ``(len(data[split]), hidden_size)``
        where row ``i`` holds the pooled embedding of the ``i``-th description
        (by position) and ``hidden_size`` is taken from ``bert.config``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return zeros.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Auto detect if cuda is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    bert.to(device)
    # Make sure dropout is disabled for inference, whatever happened before.
    bert.eval()

    texts = data[split]["description"]
    n_rows = len(texts)
    # Take the width from the model config instead of hard-coding 256 so the
    # function keeps working if a different checkpoint is loaded.
    embeddings = np.zeros((n_rows, bert.config.hidden_size))

    # Process the data in batches with tqdm
    for start in tqdm(range(0, n_rows, batch_size)):
        stop = start + batch_size
        # Tokenize the batch; .iloc makes the positional slicing explicit.
        inputs = tokenizer(
            texts.iloc[start:stop].tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        # Move the data to the device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Forward pass only; no gradients are needed to extract embeddings.
        with torch.no_grad():
            outputs = bert(**inputs)
        # Bring the pooled output back to the CPU and write it into its rows.
        embeddings[start:stop] = outputs.pooler_output.cpu().numpy()
    return embeddings

In [5]:
# Embed every split and persist each result as a .npy file.
EMBEDDINGS_DIR = Path("../data/embeddings")
# np.save does not create missing parent directories; create the output folder
# up front so a fresh checkout does not fail after the costly forward pass.
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

for split in ["train", "val", "test"]:
    embeddings = tokenize_and_predict(split, batch_size=256)
    np.save(EMBEDDINGS_DIR / f"{split}_embeddings.npy", embeddings)

Using device: cuda


100%|██████████| 413/413 [00:30<00:00, 13.59it/s]


Using device: cuda


100%|██████████| 23/23 [00:01<00:00, 14.50it/s]


Using device: cuda


100%|██████████| 23/23 [00:01<00:00, 14.68it/s]
