# Feature Extraction with BERT (Windows)
Il presente Notebook mostra come effettuare l'estrazione delle Feature dal Training Set e dal Test Set mediante DistilBERT (`distilbert-base-uncased`), una versione distillata di BERT. Attenzione! Il codice seguente differisce da quello dei classificatori poiché, per ragioni di efficienza, è stato eseguito su una GPU (Nvidia GeForce RTX 3060) mediante l'ausilio di CUDA.

In [1]:
import os
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from torch import cuda
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the datasets; rows of the test file with any missing value are discarded
training_set = pd.read_csv("./training_set.csv")
test_data = pd.read_csv("./test_set.csv").dropna()

# Rows whose 'toxic' value is -1 go to other_set, every other row to test_set
test_set = test_data.loc[test_data['toxic'].ne(-1)]
other_set = test_data.loc[test_data['toxic'].eq(-1)]

print("training_set.shape:", training_set.shape)
print("test_set.shape:", test_set.shape)
print("other_set.shape:", other_set.shape)

training_set.shape: (30577, 2)
test_set.shape: (63842, 2)
other_set.shape: (89004, 2)


In [3]:
# Report which CUDA device (if any) PyTorch is able to use
availability = cuda.is_available()
if not availability:
    print("CUDA is not available")
else:
    print("Device:", cuda.get_device_name())

Device: NVIDIA GeForce RTX 3060


In [4]:
# Set the PYTORCH_CUDA_ALLOC_CONF environment variable for this process,
# requesting the "expandable_segments" option of PyTorch's CUDA allocator.
# NOTE(review): this runs after `import torch` and after the CUDA queries in
# the previous cell — confirm the allocator still picks the setting up at this
# point; presumably it is safer to set it before CUDA is first used.
# NOTE(review): confirm that expandable_segments is supported on Windows.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [5]:
# Load the pretrained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the pretrained DistilBERT model; calling .to("cuda") places it on the GPU
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = model.to("cuda")

In [6]:
def split_to_batches(dataset, batch_size):
    """Cut a sliceable sequence into consecutive chunks.

    Args:
        dataset: Any object supporting ``len()`` and slicing (list, str, ...).
        batch_size: Maximum number of items per chunk.

    Returns:
        A list of slices of ``dataset`` in their original order. Every chunk
        holds ``batch_size`` items except possibly the last, which holds the
        remainder. An empty ``dataset`` gives an empty list.
    """
    chunks = []
    for offset in range(0, len(dataset), batch_size):
        chunks.append(dataset[offset:offset + batch_size])
    return chunks

In [7]:
def extract_features(strings):
    """Encode a batch of texts into fixed-size feature vectors.

    The texts are tokenized with the notebook-level ``tokenizer``, passed
    through the notebook-level ``model`` without gradient tracking, and the
    token representations of the last hidden layer are averaged per text.

    Args:
        strings: A list of texts (a single string is handled as a batch of one).

    Returns:
        A ``pandas.DataFrame`` with one row per input text and one column per
        hidden dimension of the model, also when the batch holds a single text.
    """
    # return_tensors="pt": the tokenizer returns PyTorch tensors
    # padding=True: shorter texts are padded to the longest text in the batch
    # truncation=True: over-long texts are cut to the maximum accepted length
    # The tensors are moved to the device the model lives on.
    inputs = tokenizer(strings, return_tensors="pt", padding=True, truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state  # (batch, tokens, hidden)

    # Average over real tokens only: padding positions are zeroed through the
    # attention mask, so a text's features do not depend on how much padding
    # the other texts in its batch forced onto it.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_states.dtype)
    token_sum = (last_hidden_states * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero

    # No squeeze(): the result is already (batch, hidden); squeezing a batch of
    # one would produce a 1-D vector that pandas turns into a single column.
    features = token_sum / token_count
    to_return = features.cpu().numpy()

    # Drop every reference to device tensors before releasing cached memory
    del features, token_sum, token_count, mask
    del last_hidden_states, outputs, inputs
    cuda.empty_cache()

    return pd.DataFrame(to_return)

In [8]:
print("Feature Extraction for the Training Set...")

# Chunk the training comments into groups of at most 32 texts
batches = split_to_batches(training_set['comment_text'].tolist(), batch_size=32)
print("Batch to process:", len(batches))

# Only the extraction loop is timed, not the concatenation or the CSV export
dataframes = []
start = datetime.now()
for batch in batches:
    extraction = extract_features(batch)
    dataframes += [extraction]
end = datetime.now()

# Stack the per-batch frames row-wise under a fresh 0..n-1 index
X_train = pd.concat(dataframes, ignore_index=True)

print("Extraction completed! Required Time:", str(end - start))
print("X_train.shape:", X_train.shape)

X_train.to_csv("./X_train_bert.csv", index=False)

Feature Extraction for the Training Set...
Batch to process: 956
Extraction completed! Required Time: 0:04:14.764641
X_train.shape: (30577, 768)


In [9]:
print("Feature Extraction for the Test Set...")

# Chunk the test comments into groups of at most 32 texts
batches = split_to_batches(test_set['comment_text'].tolist(), batch_size=32)
print("Batch to process:", len(batches))

# Only the extraction loop is timed, not the concatenation or the CSV export
dataframes = []
start = datetime.now()
for batch in batches:
    extraction = extract_features(batch)
    dataframes += [extraction]
end = datetime.now()

# Stack the per-batch frames row-wise under a fresh 0..n-1 index
X_test = pd.concat(dataframes, ignore_index=True)

print("Extraction completed! Required Time:", str(end - start))
print("X_test.shape:", X_test.shape)

X_test.to_csv("./X_test_bert.csv", index=False)

Feature Extraction for the Test Set...
Batch to process: 1996
Extraction completed! Required Time: 0:09:32.381707
X_test.shape: (63842, 768)
