## Caricamento e Analisi del Dataset

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [4]:

df = pd.read_csv("../data/processed/messages_labeled_detoxify.csv")

print(df['label'].value_counts())
df.head()


label
0    194558
1     12833
Name: count, dtype: int64


Unnamed: 0,author,message,badges,emotes,toxicity_score,label
0,alexanderdc95,il re e il principe,['premium'],[],0.002905,0
1,brighi05,ma eri da sara secondi fa,['premium'],[],0.044066,0
2,sono_tuo_nonno22,il re,[],[],0.016997,0
3,michiamanoroberta,bella canzone,[],[],0.012764,0
4,nadestin,manucio bannato,['premium'],[],0.162031,0


## Splitting e Vectorization

In [5]:

X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


## Logistic Regression (senza bilanciamento)

In [None]:
# Logistic Regression (senza bilanciamento)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

print("=== Modello senza bilanciamento ===")
print(classification_report(y_test, y_pred))

# Prova su input singoli
test_texts = [
    "sei uno stupidino",
    "Stronzo di merda"
]

test_tfidf = vectorizer.transform(test_texts)
preds = clf.predict(test_tfidf)
probs = clf.predict_proba(test_tfidf)

for text, pred, prob in zip(test_texts, preds, probs):
    print(f"Testo: {text}")
    print(f"Predizione: {pred}")
    print(f"Probabilità: {prob}")
    print()

=== Modello senza bilanciamento ===
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     38912
           1       0.95      0.59      0.73      2567

    accuracy                           0.97     41479
   macro avg       0.96      0.80      0.86     41479
weighted avg       0.97      0.97      0.97     41479

Testo: sei uno stupidino
Predizione: 0
Probabilità: [0.72511253 0.27488747]

Testo: Stronzo di merda
Predizione: 1
Probabilità: [2.54534622e-04 9.99745465e-01]



: 

## Logistic Regression (con bilanciamento)

In [27]:
# Bilanciamento
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train_tfidf, y_train)
print("Distribuzione dopo il bilanciamento:", Counter(y_train_bal))

# Logistic Regression bilanciato
clf_bal = LogisticRegression(max_iter=1000)
clf_bal.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_bal.predict(X_test_tfidf)
print("=== Modello con bilanciamento ===")
print(classification_report(y_test, y_pred_bal))

# Prova su input singoli
test_texts = [
    "sei uno stupidino",
    "Stronzo di merda"
]

test_tfidf = vectorizer.transform(test_texts)
preds = clf_bal.predict(test_tfidf)
probs = clf_bal.predict_proba(test_tfidf)

for text, pred, prob in zip(test_texts, preds, probs):
    print(f"Testo: {text}")
    print(f"Predizione: {pred}")
    print(f"Probabilità: {prob}")
    print()

Distribuzione dopo il bilanciamento: Counter({0: 75332, 1: 75332})
=== Modello con bilanciamento ===
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     18833
           1       0.48      0.81      0.61      1134

    accuracy                           0.94     19967
   macro avg       0.74      0.88      0.79     19967
weighted avg       0.96      0.94      0.95     19967

Testo: sei uno stupidino
Predizione: 1
Probabilità: [0.19603694 0.80396306]

Testo: Stronzo di merda
Predizione: 1
Probabilità: [2.23912317e-05 9.99977609e-01]



## Fine-Tuning BERT

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

# Prepara dati per HuggingFace
train_df = pd.DataFrame({'message': X_train, 'label': y_train})
test_df = pd.DataFrame({'message': X_test, 'label': y_test})

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_function(example):
    return tokenizer(example['message'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/79866 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

def compute_metrics(eval_pred):
    preds = eval_pred.predictions.argmax(-1)
    labels = eval_pred.label_ids
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"precision": precision, "recall": recall, "f1": f1}

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

: 