# Feature Extraction with BERT (MAC)
Il presente Notebook mostra come effettuare l'estrazione delle Feature dal Training e dal Test Set mediante BERT. Attenzione! Il seguente codice risulta differenziato rispetto a quello dei classificatori poiché, per ragioni di efficienza, è stato eseguito su una GPU (Nvidia GeForce RTX 3060) mediante l'ausilio di CUDA.

In [1]:
import os
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from torch import cuda
from datetime import datetime
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Caricamento dei Dataset
training_set = pd.read_csv("./../../datasets/training_set.csv")
test_data = pd.read_csv("./../../datasets/test_set.csv")

test_data.dropna(inplace=True)

test_set = test_data[test_data['toxic'] != -1]
other_set = test_data[test_data['toxic'] == -1]

print("training_set.shape:", training_set.shape)
print("test_set.shape:", test_set.shape)
print("other_set.shape:", other_set.shape)

training_set.shape: (30577, 2)
test_set.shape: (63842, 2)
other_set.shape: (89004, 2)


In [3]:
import platform
print(platform.platform())

macOS-14.0-arm64-arm-64bit


In [4]:
CPU= False
device = "cpu" if CPU else torch.device("mps")
print("Device is :: {}".format(device))
torch.backends.mps.is_available()

Device is :: mps


True

In [5]:
# Carica il tokenizer e il modello preaddestrato di BERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to("mps")

# L'espressione "to('mps')" carica BertModel sulla GPU

In [6]:
def split_to_batches(dataset, batch_size):
    return [dataset[i:i + batch_size] for i in range(0, len(dataset), batch_size)]

In [7]:
def extract_features(strings):
    inputs = tokenizer(strings, return_tensors="pt", padding=True, truncation=True).to("mps")
    # return_tensors="pt": ritorna tensori PyTorch
    # padding=True: frasi di lunghezza inferiore alla massima vengono adattate ad essa mediante del Padding

    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    
    # Rappresentazione media e unidimensionale delle caratteristiche estratte
    features = torch.mean(last_hidden_states, dim=1).squeeze()
    to_return = features.cpu().numpy()

    del features
    del inputs
    del outputs
    torch.mps.empty_cache()
    return pd.DataFrame(to_return)

In [None]:
# Funzione per estrarre gli embeddings da un batch di testo (più indicata per l'hate detection)
def extract_embeddings(texts):
    # Tokenizza il testo
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    # Passa i token al modello
    with torch.no_grad():
        outputs = model(**inputs)
    # Estrai gli embeddings dall'ultimo layer del modello
    embeddings = outputs.last_hidden_state[:, 0, :]  # Utilizza l'embedding corrispondente al token CLS
    return embeddings

In [11]:
torch.mps.empty_cache()
data = training_set['comment_text'][0:32]
extraction = extract_features(data.to_list())

In [13]:
print("Feature Extraction for the Training Set...")
batches = training_set['comment_text']
batches = split_to_batches(batches.to_list(), batch_size=32)
print("Batch to process:", len(batches))

dataframes = list()

start = datetime.now()
for batch in batches:
    extraction = extract_features(batch)
    dataframes.append(extraction)
end = datetime.now()

X_train = pd.concat(dataframes, axis=0, ignore_index=True)

print("Extraction completed! Required Time:", str(end-start))
print(X_train)

X_train.to_csv("./X_train.csv", index=False)

Feature Extraction for the Training Set...
Batch to process: 956
Extraction completed! Required Time: 0:30:23.307517
            0         1         2         3         4         5         6    \
0      0.305078  0.355401  0.006583  0.119754  0.148529 -0.030381  0.081814   
1      0.037936  0.068326  0.065783  0.009710 -0.157437 -0.189122  0.209004   
2     -0.054468  0.139920  0.169760  0.011537  0.126337 -0.026550  0.203760   
3      0.220874  0.388016  0.100364 -0.068667 -0.147032 -0.154123  0.287599   
4      0.263740  0.457342 -0.061892  0.166867  0.133237  0.166459  0.158003   
...         ...       ...       ...       ...       ...       ...       ...   
30572  0.135423  0.246227  0.125089  0.121697  0.029626 -0.296775  0.370953   
30573  0.001573 -0.069304 -0.004758  0.061233 -0.004451 -0.428282 -0.041185   
30574  0.097718  0.065837  0.168668 -0.014908 -0.005668 -0.175033  0.109120   
30575  0.098884  0.189233  0.299181 -0.050321 -0.090466 -0.037958  0.289672   
30576 -0.35297

In [9]:
print("Feature Extraction for the Test Set...")

batches = split_to_batches(test_set['comment_text'].to_list(), batch_size=32)
print("Batch to process:", len(batches))

dataframes = list()

start = datetime.now()
for batch in batches:
    extraction = extract_features(batch)
    dataframes.append(extraction)
end = datetime.now()

X_test = pd.concat(dataframes, axis=0, ignore_index=True)

print("Extraction completed! Required Time:", str(end-start))
print("X_test.shape:", X_test.shape)

X_test.to_csv("./X_test.csv", index=False)

Feature Extraction for the Test Set...
Batch to process: 320
Extraction completed! Required Time: 0:12:55.694280
X_test.shape: (63842, 768)
