<a href="https://colab.research.google.com/github/DLesmes/bert_embeddings_generator/blob/main/bert_embedding_generator_yahoo_answers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Requirements

In [28]:
!pip install datasets
# embeddings
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
from concurrent.futures import ThreadPoolExecutor
# data
from datasets import load_dataset
import pandas as pd
import numpy as np
# nn
import torch.nn as nn
from sklearn.metrics import accuracy_score
import seaborn as sns



# Embedding Model

In [29]:
# Name of the pre-trained BERT checkpoint. The tokenizer and the encoder below
# are both loaded from this same name.
model_name = 'bert-base-uncased'

# Load the tokenizer and the encoder used to produce the text embeddings.
tokenizer = BertTokenizer.from_pretrained(model_name)
embed_model = BertModel.from_pretrained(model_name)

# Data

In [30]:
# Load the "yahoo_answers_topics" dataset. The bare name on the last line makes
# the returned object the cell's displayed output.
dataset = load_dataset("yahoo_answers_topics")
dataset

Downloading data:   0%|          | 0.00/336M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/175M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1400000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/60000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 1400000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 60000
    })
})

In [31]:
%%time
def build_records(split):
  """Flatten one dataset split into a list of ``{"text", "result"}`` records.

  Args:
    split: iterable of mappings that each provide the keys
      'question_title', 'question_content', 'best_answer' and 'topic'.

  Returns:
    A list with one dict per input row: "text" is the title, content and
    best answer joined by single spaces, and "result" is the row's topic.
  """
  return [
      {
          "text": " ".join((
              question['question_title'],
              question['question_content'],
              question['best_answer'],
          )),
          "result": question['topic'],
      }
      for question in split
  ]


# Both splits go through the same helper so the record layout cannot drift
# between train and test.
data_train = build_records(dataset['train'])
data_test = build_records(dataset['test'])


CPU times: user 1min 12s, sys: 1.7 s, total: 1min 14s
Wall time: 1min 13s


In [32]:
# Number of training rows kept, and number of extra rows added to the test set.
train_rows = 150000
extra_test_rows = 15000

df_train_full = pd.DataFrame(data_train)
df_test = pd.DataFrame(data_test)
df_train = df_train_full.iloc[:train_rows].copy()

# The extra test rows are taken from training rows that come right AFTER the
# ones kept in df_train. Appending df_train's own last rows (as was done before)
# puts the same examples in both the training and the evaluation set, which
# leaks training data into the reported accuracy.
df_test = pd.concat(
    [df_test, df_train_full.iloc[train_rows:train_rows + extra_test_rows]]
)

# The full frame is no longer needed; release it before the embedding step.
del df_train_full

dfs = [df_train, df_test]
for df in dfs:
  print(df.shape)

(150000, 2)
(75000, 2)


In [33]:
df_train.head()

Unnamed: 0,text,result
0,why doesn't an optical mouse work on a glass t...,4
1,What is the best off-road motorcycle trail ? l...,5
2,What is Trans Fat? How to reduce that? I heard...,2
3,How many planes Fedex has? I heard that it is ...,6
4,"In the san francisco bay area, does it make se...",6


In [34]:
df_test.head()

Unnamed: 0,text,result
0,What makes friendship click? How does the spar...,8
1,Why does Zebras have stripes? What is the purp...,1
2,What did the itsy bitsy sipder climb up? wate...,3
3,What is the difference between a Bachelors and...,3
4,Why do women get PMS? Premenstrual syndrome (...,2


In [35]:
def embed(text: list, batch_size: int = 32):
  """Return the [CLS] embedding of every input text.

  Args:
    text: a list of strings, or a single string (treated as a one-item list).
    batch_size: number of texts tokenized and passed through the encoder at
      once. Mini-batching keeps memory bounded for long lists.

  Returns:
    A float tensor with one row per input text, holding the last hidden state
    of the first token ([CLS]). Rows are in the same order as the input.
  """
  # Accept a bare string so per-row callers keep working.
  if isinstance(text, str):
    text = [text]

  # Nothing to encode: return an empty matrix with the encoder's hidden width.
  if len(text) == 0:
    return torch.empty((0, embed_model.config.hidden_size))

  chunks = []
  for start in range(0, len(text), batch_size):
    # Tokenize and encode one mini-batch. padding=True is required here:
    # without it the tokenizer cannot build a rectangular 'pt' tensor from
    # texts of different lengths.
    inputs = tokenizer(
        text[start:start + batch_size],
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512
    )

    # Pass the input through the model (no fine-tuning needed)
    with torch.no_grad():
        outputs = embed_model(**inputs)

    # Extract the embeddings
    chunks.append(outputs.last_hidden_state[:, 0, :].float())   # [CLS] token embedding

  return torch.cat(chunks, dim=0)

def process_row(row):
    """Pair a row's text with the embedding computed for that text.

    Args:
        row: mapping-like object exposing a "text" entry.

    Returns:
        A dict with the original text under "text" and the result of
        ``embed`` for that text under "embed".
    """
    row_text = row["text"]
    embedding = embed(row_text)
    return {"text": row_text, "embed": embedding}

In [None]:
%%time
# Embed the texts of each frame and store one embedding per row in column 'X'.
for df in dfs:
  embed_list = embed(df['text'].tolist())
  # Iterating the result yields one entry per input text.
  df['X'] = list(embed_list)
  print(df.shape)

In [37]:
"""%%time
for df in dfs:
  with ThreadPoolExecutor() as executor:
    results = []
    futures = [
      executor.submit(process_row, row)
      for _, row
      in df.iterrows()
    ]
    results.extend(f.result() for f in futures)
  df_results = pd.DataFrame(results)
  df_results.index = df_results['text']
  dict_results = df_results['embed'].to_dict()
  df['X'] = df['text'].map(dict_results)"""

"%%time\nfor df in dfs:\n  with ThreadPoolExecutor() as executor:\n    results = []\n    futures = [\n      executor.submit(process_row, row)\n      for _, row\n      in df.iterrows()\n    ]\n    results.extend(f.result() for f in futures)\n  df_results = pd.DataFrame(results)\n  df_results.index = df_results['text']\n  dict_results = df_results['embed'].to_dict()\n  df['X'] = df['text'].map(dict_results)"

In [38]:
df_train.head()

Unnamed: 0,text,result
0,why doesn't an optical mouse work on a glass t...,4
1,What is the best off-road motorcycle trail ? l...,5
2,What is Trans Fat? How to reduce that? I heard...,2
3,How many planes Fedex has? I heard that it is ...,6
4,"In the san francisco bay area, does it make se...",6


In [39]:
df_test.head()

Unnamed: 0,text,result
0,What makes friendship click? How does the spar...,8
1,Why does Zebras have stripes? What is the purp...,1
2,What did the itsy bitsy sipder climb up? wate...,3
3,What is the difference between a Bachelors and...,3
4,Why do women get PMS? Premenstrual syndrome (...,2


In [None]:
len(df_test.X[0])

In [None]:
# Features: collect the per-row embeddings of each frame into one array.
X_train, X_test = (
    np.array(frame["X"].tolist()) for frame in (df_train, df_test)
)
# Labels: the 'result' column of each frame as an array.
y_train, y_test = (
    np.array(frame["result"]) for frame in (df_train, df_test)
)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

# NN - Model

In [None]:
y_train.shape[0]

In [None]:
class il_mmlu_data(Dataset):
  """Map-style dataset that pairs feature rows with integer class labels.

  Attributes:
    X: the feature container exactly as passed in (indexed by row).
    y: labels converted from a NumPy array to a ``torch.LongTensor``.
    len: number of samples, taken from the first dimension of ``X``.
  """

  def __init__(self, X_train, y_train) -> None:
    super().__init__()
    self.X = X_train
    # Class-index targets are stored as 64-bit integers.
    self.y = torch.from_numpy(y_train).type(torch.LongTensor)
    self.len = self.X.shape[0]

  def __len__(self):
    """Return the number of samples."""
    return self.len

  def __getitem__(self, index):
    """Return the ``(features, label)`` pair stored at ``index``."""
    features = self.X[index]
    label = self.y[index]
    return features, label



In [None]:
# %% dataloader
il_mmlu_dataset = il_mmlu_data(X_train=X_train, y_train=y_train)
# shuffle=True reshuffles the samples every epoch, so SGD does not see the
# mini-batches in the same fixed order on every pass over the data.
train_loader = DataLoader(dataset=il_mmlu_dataset, batch_size=32, shuffle=True)

In [None]:
class nn_text_classifier(nn.Module):
  """Three-layer fully connected classifier that outputs log-probabilities.

  The input goes through ``ln1`` and ``ln2``, each followed by a sigmoid, then
  through ``ln3`` and a log-softmax over dimension 1.

  Args:
    num_features: width of the input vectors.
    num_classes: number of output classes.
    first_hidden_features: width of the first hidden layer.
    second_hidden_features: width of the second hidden layer.
  """

  def __init__(
      self,
      num_features,
      num_classes,
      first_hidden_features,
      second_hidden_features
  ):
    super().__init__()
    # Layers are created in input-to-output order.
    self.ln1 = nn.Linear(num_features, first_hidden_features)
    self.ln2 = nn.Linear(first_hidden_features, second_hidden_features)
    self.ln3 = nn.Linear(second_hidden_features, num_classes)
    self.log_softmax = nn.LogSoftmax(dim=1)

  def forward(self, x):
    """Return log-probabilities with one row per input row."""
    hidden = torch.sigmoid(self.ln1(x))
    hidden = torch.sigmoid(self.ln2(hidden))
    return self.log_softmax(self.ln3(hidden))




In [None]:
# Width of the embedding vectors fed to the classifier.
num_features = 768
# Yahoo Answers Topics has 10 topic classes (labels 0-9). The frames displayed
# earlier in this notebook already contain labels such as 5, 6 and 8, so a
# 4-way output layer makes the loss fail with an out-of-range target.
num_classes = 10
first_hidden_features = 348
second_hidden_features = 192

# Fail early, with a readable message, if a label does not fit the output layer.
assert 0 <= int(y_train.min()) and int(y_train.max()) < num_classes, (
    "labels must lie in [0, num_classes)"
)

model = nn_text_classifier(
    num_features=num_features,
    num_classes=num_classes,
    first_hidden_features=first_hidden_features,
    second_hidden_features=second_hidden_features
)

# Train loop

In [None]:
# The classifier's forward pass already ends in LogSoftmax, so the matching
# criterion is the negative log-likelihood loss. CrossEntropyLoss expects raw
# logits and would apply log-softmax a second time.
criterion = nn.NLLLoss()
# Learning rate for plain SGD.
lr = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Number of full passes over the training loader.
epochs = 1000

In [None]:
%%time
# One entry per epoch: the mean training loss over all samples seen in it.
losses = []
model.train()
for epoch in range(epochs):
  epoch_loss_sum = 0.0
  samples_seen = 0
  for X, y in train_loader:
    optimizer.zero_grad()
    output = model(X)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    # The criterion returns a per-batch mean, so weight it by the batch size.
    # This keeps a smaller final batch from skewing the epoch average.
    batch_len = y.shape[0]
    epoch_loss_sum += loss.item() * batch_len
    samples_seen += batch_len
  # Record the epoch average instead of only the last mini-batch's loss.
  # max(..., 1) avoids a division by zero if the loader is empty.
  losses.append(epoch_loss_sum / max(samples_seen, 1))

In [None]:
sns.lineplot(x= range(len(losses)), y = losses)

In [None]:
# %% test the model
# Run the model on the test features without tracking gradients, then take the
# per-row maximum over dimension 1. The result exposes both `.values` and
# `.indices`.
with torch.no_grad():
    X_test_torch = torch.tensor(X_test, dtype=torch.float32)
    y_test_hat_softmax = model(X_test_torch)
    y_test_hat = y_test_hat_softmax.data.max(dim=1)

In [None]:
# %% Accuracy
accuracy_score(y_test, y_test_hat.indices)