# Estrazione delle features con BERT e confronto delle prestazioni con il classificatore SVM con TF-IDF


In [20]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
from datetime import datetime
from sklearn.metrics import accuracy_score,precision_score,recall_score

## Caricamento del Training Set

In [23]:
# Note: the training set stored in this CSV has already been cleaned.
training_set = pd.read_csv(filepath_or_buffer="./../../datasets/training_set.csv")
# Bare expression so the notebook renders the loaded frame.
training_set

Unnamed: 0,comment_text,toxic
0,cocksucker before you piss around on my work,1
1,hey what is it talk what is it an exclusive gr...,1
2,bye dont look come or think of comming back to...,1
3,you are gay or antisemmitian archangel white t...,1
4,fuck your filthy mother in the ass dry,1
...,...,...
30572,chris i dont know who you are talking to but i...,0
30573,operation condor is also named a dirty war can...,0
30574,there is no evidence that this block has anyth...,0
30575,thanks hey utkarshraj thanks for the kindness ...,0


## Estrazione features

In [21]:
# Load the pretrained BERT tokenizer and model (the 'bert-base-uncased'
# checkpoint). Both objects are used as globals by the cells below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [28]:
# Extract a fixed-size feature vector from a text using BERT.
def extract_features(text):
    """Embed text with BERT by mean-pooling the last hidden layer.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A single string, or a list of strings encoded as one batch.

    Returns:
        numpy.ndarray: the token-averaged last hidden state with all
        size-1 dimensions squeezed out, i.e. a 1-D vector for a single
        text and a 2-D array (one row per text) for a batch of several.
    """
    # return_tensors="pt" yields PyTorch tensors; padding=True pads the
    # sequences of a batch to a common length; truncation=True cuts
    # inputs that exceed the maximum length accepted by the model.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

    # Average over real tokens only. A plain mean over dim=1 would also
    # average the padding positions whenever a batch of texts of unequal
    # length is passed; the attention mask (1 = token, 0 = padding)
    # excludes them. With no padding the mask is all ones, so this
    # reduces to the ordinary mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_states.dtype)
    summed = (last_hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)

    # One flat representation of the extracted features.
    features = (summed / token_counts).squeeze()
    return features.numpy()

In [30]:
def extract_features_data_set(dataset):
    """Extract BERT features for every comment of a dataset.

    Args:
        dataset: pandas DataFrame with a 'comment_text' column.

    Returns:
        pandas.DataFrame with one row per comment, in the order the
        comments appear in ``dataset``, and a fresh default integer index.
    """
    # Iterate over the column values instead of dataset['comment_text'][i]:
    # that lookup is label-based, so it raises KeyError (or picks the wrong
    # row) as soon as the index is not exactly 0..n-1, e.g. after dropna()
    # or boolean filtering.
    features = [extract_features(text) for text in dataset['comment_text']]
    return pd.DataFrame(features)

## Addestramento del Modello

In [31]:
# Compute the BERT feature matrix for the whole training set (one model
# forward pass per comment) and print it for inspection.
X_train = extract_features_data_set(training_set)
print(X_train)

[E thread_pool.cpp:130] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:130] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

In [None]:
# Target labels: the 'toxic' column of the training set.
y_train = training_set['toxic']

In [None]:
# Classifier handle; stays None until the SVM is instantiated or reloaded.
cl = None

# SVM kernel type; it is also embedded in the name of the pickled model file.
kernel = 'linear'
model_filename = '{}_svm_classifier.pkl'.format(kernel)

In [None]:
# Instantiate the SVM with the configured kernel. probability=True turns on
# probability estimates; random_state is pinned to a fixed seed.
cl = SVC(kernel=kernel, probability=True, random_state=24)

In [None]:
# Fit the classifier on the extracted features and report how long it took.
print("Training started...")
start = datetime.now()
cl.fit(X_train, y_train)
end = datetime.now()
print("Training completed! Required time: " + str(end - start))

# Persist the fitted classifier so it can be reloaded without retraining.
with open(model_filename, mode='wb') as f:
    pickle.dump(cl, file=f)

In [None]:
# Reload the classifier from the pickle file written by the training cell.
# Only unpickle files produced by this notebook: pickle can execute code.
with open(model_filename, mode='rb') as f:
    cl = pickle.loads(f.read())

## Testing del Sistema

In [None]:
# Load the test set and discard every row that contains a missing value.
test_set = pd.read_csv(filepath_or_buffer="./../../datasets/test_set.csv").dropna()

In [None]:
# Split the test set on the 'toxic' label. The rows labelled -1 must be
# extracted BEFORE test_set is overwritten with the filtered frame;
# otherwise other_set is always empty.
# NOTE(review): -1 presumably marks rows without a usable label - confirm
# against the dataset description.
other_set = test_set[test_set['toxic'] == -1]
test_set = test_set[test_set['toxic'] != -1]

In [None]:
# Build the evaluation inputs from the test set. The names X_test, y_test
# and cl_lem used by the previous version of this cell are not defined
# anywhere in this notebook, so the cell failed with NameError; the
# classifier trained/reloaded above is `cl`.
# reset_index gives the rows 0..n-1 labels again, since rows were dropped
# from test_set in the cells above.
X_test = extract_features_data_set(test_set.reset_index(drop=True))
y_test = test_set['toxic']

y_pred = cl.predict(X_test)
# Metrics: accuracy, precision, recall
print("Accuracy: " + str(accuracy_score(y_test, y_pred)))
print("Precision: " + str(precision_score(y_test, y_pred)))
print("Recall: " + str(recall_score(y_test, y_pred)))

# Brevi esperimenti con BERT

In [11]:
# Experiment: tokenize a short sample sentence and print the resulting
# encoding (input ids, token type ids, attention mask).
text = "Ciao Belli!!"
inputs = tokenizer(text,return_tensors="pt", padding=True)
print(inputs)

{'input_ids': tensor([[ 101, 9915, 2080, 4330, 2072,  999,  999,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [12]:
# Run the tokenized sample through BERT with gradient tracking disabled
# (inference only) and print the raw model output.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.3141,  0.0321,  0.0595,  ..., -0.1836, -0.0218,  0.6473],
         [ 0.2967, -0.0405,  0.5128,  ...,  0.2590,  0.0862,  1.3316],
         [-0.3823,  0.0132,  0.9642,  ..., -0.0876, -0.2632,  1.0097],
         ...,
         [-0.1378, -0.2305,  0.4040,  ...,  0.0642, -0.0660,  0.3863],
         [-0.1831, -0.4964, -0.1496,  ...,  0.5253, -0.4985,  0.2009],
         [ 0.7862,  0.1154, -0.1137,  ...,  0.2464, -0.8151, -0.0474]]]), pooler_output=tensor([[-0.8261, -0.3466, -0.8242,  0.7842,  0.4768, -0.0715,  0.8891,  0.2735,
         -0.7233, -0.9999, -0.4402,  0.8052,  0.9593,  0.5968,  0.9168, -0.7953,
         -0.5079, -0.5590,  0.3431, -0.5569,  0.5800,  0.9999,  0.0611,  0.2156,
          0.3833,  0.9619, -0.8321,  0.8835,  0.9298,  0.6070, -0.7066,  0.0988,
         -0.9773, -0.2711, -0.9180, -0.9770,  0.3035, -0.6644, -0.0661,  0.0302,
         -0.8867,  0.1795,  0.9998,  0.3337,  0.2153, -0.4069, -1.0000,  0.

In [17]:
# Sanity check: embed one short sentence, then print the feature vector
# followed by its shape on the next line.
result = extract_features("Ciao Belli!")
print(result, result.shape, sep="\n")

[-1.17541976e-01 -1.61978051e-01  3.98247629e-01 -1.16937391e-01
  3.44952971e-01  1.13185577e-01  8.11842918e-01  6.78107560e-01
  1.06444485e-01 -3.62467200e-01 -1.99941128e-01 -4.50574607e-01
 -1.64689571e-01  5.88219464e-01 -6.61147773e-01  3.77366096e-01
  7.35610843e-01 -1.30787075e-01 -1.30586803e-01 -1.42679200e-01
  1.95564404e-01 -2.39510741e-02 -2.82505780e-01 -5.05829677e-02
 -5.75412452e-01 -3.14073026e-01  2.09521845e-01  1.43737588e-02
  2.22098544e-01  2.15350702e-01 -2.13940348e-02  1.13563828e-01
 -3.95994753e-01  5.45265853e-01 -6.50065780e-01  3.46671611e-01
  3.95975143e-01  6.89308524e-01 -5.20978570e-01 -5.09267688e-01
 -5.17328791e-02 -8.04509282e-01 -6.88277334e-02  3.56208384e-01
  1.42084792e-01 -4.02675569e-01  2.79372305e-01 -3.74704674e-02
 -1.98110834e-01 -3.54869038e-01 -1.47073880e-01  7.30344132e-02
 -5.57606161e-01 -2.65875794e-02 -1.57929689e-01 -2.40625665e-01
 -2.41708413e-01 -2.64919251e-01 -2.49543525e-02 -2.93899387e-01
 -3.94180954e-01  1.19030