## Caricamento e Analisi del Dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [3]:

# Load the labelled chat messages.
# The first CSV column has no header and holds 0, 1, 2, ... (presumably the
# DataFrame index written by an earlier `to_csv`). Reading it as the index
# keeps it from showing up as a meaningless "Unnamed: 0" data column.
df = pd.read_csv("../data/processed/messages_labeled_detoxify.csv", index_col=0)

# Show the class balance before modelling, then preview the first rows.
print(df['label'].value_counts())
df.head()


label
0    94165
1     5668
Name: count, dtype: int64


Unnamed: 0,author,author_id,message,badges,emotes,toxicity_score,label
0,andrewwzaza,1282565000.0,il re,"['subscriber', 'no_audio']",[],0.016997,0
1,domyjj,210372000.0,aodw,"['subscriber', 'premium']",[],0.000899,0
2,lippoth,711694700.0,buonasera,"['subscriber', 'premium']",[],0.017655,0
3,youssef_5_,754968100.0,il re,['raging_wolf_helm'],[],0.016997,0
4,xvalee2810,1316469000.0,hit,"['subscriber', 'premium']",[],0.196588,0


## Splitting e Vectorization

In [4]:

# Features are the raw message text; the target is the binary label column.
X, y = df['message'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF features capped at 5000 terms. The vocabulary and IDF weights are
# learned from the training messages only and then applied to the test messages.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


## Logistic Regression (senza bilanciamento)

In [5]:
# Baseline: logistic regression on the TF-IDF features, trained on the
# original (unbalanced) class distribution.
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Score the held-out split and print the per-class report under its header.
y_pred = clf.predict(X_test_tfidf)
print(
    "=== Modello senza bilanciamento ===",
    classification_report(y_test, y_pred),
    sep="\n",
)

=== Modello senza bilanciamento ===
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18833
           1       0.96      0.49      0.65      1134

    accuracy                           0.97     19967
   macro avg       0.96      0.75      0.82     19967
weighted avg       0.97      0.97      0.97     19967



## Logistic Regression (con bilanciamento)

In [6]:
# Balance the training set by randomly duplicating minority-class rows.
# Only the training features are resampled; the test split is left untouched.
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train_tfidf, y_train)
print("Distribuzione dopo il bilanciamento:", Counter(y_train_bal))

# Same logistic regression settings as the baseline, fitted on the balanced data.
clf_bal = LogisticRegression(max_iter=1000).fit(X_train_bal, y_train_bal)

# Evaluate on the original test split.
y_pred_bal = clf_bal.predict(X_test_tfidf)
print(
    "=== Modello con bilanciamento ===",
    classification_report(y_test, y_pred_bal),
    sep="\n",
)

Distribuzione dopo il bilanciamento: Counter({0: 75332, 1: 75332})
=== Modello con bilanciamento ===
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     18833
           1       0.48      0.81      0.61      1134

    accuracy                           0.94     19967
   macro avg       0.74      0.88      0.79     19967
weighted avg       0.96      0.94      0.95     19967



## Fine-Tuning BERT

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

# Prepare the data for Hugging Face: rebuild two-column frames (message, label)
# from the existing train/test split so BERT sees the same partitions as the
# TF-IDF models.
train_df = X_train.to_frame(name='message').assign(label=y_train)
test_df = X_test.to_frame(name='message').assign(label=y_test)

# Wrap each frame in a `datasets.Dataset`.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer matching the checkpoint that is fine-tuned afterwards.
# NOTE(review): the sample messages look Italian, while 'bert-base-uncased'
# is an English checkpoint — confirm this is the intended model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_function(example, max_length=128):
    """Tokenize a batch of messages into fixed-length model inputs.

    Args:
        example: Batch mapping supplied by ``Dataset.map(..., batched=True)``;
            only its ``'message'`` entry is read.
        max_length: Number of tokens every sequence is padded or truncated to.
            The default of 128 is assumed to be ample for chat messages —
            raise it if longer messages must be kept intact.

    Returns:
        The tokenizer output for the batch.
    """
    # With padding='max_length' and no explicit max_length, every message is
    # padded to the model maximum (512 tokens for bert-base-uncased). For short
    # chat messages that multiplies memory and compute for no benefit. Padding
    # stays fixed-length so batches can still be stacked without a padding collator.
    return tokenizer(
        example['message'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Tokenize both splits in batches; `map` returns new datasets that carry the
# tokenizer outputs alongside the original columns.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Return only the model inputs and the label, as torch tensors.
TORCH_COLUMNS = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=TORCH_COLUMNS)

Map:   0%|          | 0/79866 [00:00<?, ? examples/s]

Map:   0%|          | 0/19967 [00:00<?, ? examples/s]

In [None]:
# Pretrained BERT encoder with a 2-class sequence-classification head
# (num_labels=2 matches the binary 0/1 label).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

def compute_metrics(eval_pred):
    """Compute precision, recall and F1 for the Trainer's evaluation loop.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (true labels).

    Returns:
        dict with the keys "precision", "recall" and "f1".
    """
    y_true = eval_pred.label_ids
    # The predicted class is the index of the largest score on the last axis.
    y_hat = eval_pred.predictions.argmax(-1)

    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true, y_hat) for name, scorer in scorers.items()}

# Fine-tuning configuration: 3 epochs, batch size 16, evaluation after every epoch.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    # The default save strategy writes a full checkpoint every 500 optimizer
    # steps, which fills ./results with dozens of multi-hundred-MB checkpoints
    # over a run of this size. Checkpoint once per epoch (aligned with
    # evaluation) and keep only the most recent two.
    save_strategy='epoch',
    save_total_limit=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Collect the Trainer inputs in one place. The held-out test split is the
# evaluation set, scored with compute_metrics at each evaluation.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Start fine-tuning; as the last expression, the result is shown as cell output.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

: 