In [12]:
import zipfile
import pandas as pd
import re
import nltk
import demoji
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Fetch the NLTK resources and the emoji data used by the preprocessing cells below.
nltk.download('punkt_tab')  # tokenization
nltk.download('stopwords')  # stop words
nltk.download('wordnet')  # lemmatization
# NOTE(review): the recorded cell output echoes this line the way a Python warning does —
# presumably download_codes() is deprecated in the installed demoji version; confirm.
demoji.download_codes()  # emoji dictionary

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\loris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\loris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  demoji.download_codes() # dizionario delle emoji


In [13]:
zip_path = 'redditComments.zip'

# Read the CSV directly from inside the archive, without extracting it to disk.
with zipfile.ZipFile(zip_path) as archive, archive.open('ruddit_comments_score.csv') as csv_file:
    df = pd.read_csv(csv_file)

print("Dataset shape:", df.shape)

Dataset shape: (5966, 3)


In [14]:
# Remove the identifier column, then show which columns remain.
df = df.drop(columns=['comment_id'])
print(df.columns)

Index(['body', 'score'], dtype='object')


In [15]:
# Remove rows without text, then remove Reddit placeholder bodies such as "[deleted]" or
# "[removed]".
# The earlier `startswith('[')` test also threw away genuine comments that merely begin with
# a bracket (for example a Markdown link "[text](url) ..."), so the placeholders are matched
# explicitly instead.
# NOTE(review): the placeholder wording is an assumption about this export — confirm it
# against the raw data before relying on the row counts.
df = df.dropna(subset=['body'])
is_placeholder = df['body'].str.match(r'\s*\[\s*(?:deleted|removed)\b', case=False, na=False)
# .copy() gives later cells an independent frame to add columns to.
df = df[~is_placeholder].copy()


In [16]:
# Lower-case English contractions and their expanded forms.
dict1 = {
    "don't": "do not", "doesn't": "does not", "didn't": "did not",
    "won't": "will not", "can't": "cannot", "i'm": "i am", "you're": "you are",
    "it's": "it is", "they're": "they are", "we're": "we are", "isn't": "is not",
    "aren't": "are not", "haven't": "have not", "hasn't": "has not",
    "wasn't": "was not", "weren't": "were not", "shouldn't": "should not",
    "wouldn't": "would not", "couldn't": "could not", "mustn't": "must not"
}

# Explicit slang and the neutral wording it is rewritten to.
# "horny" was previously misspelled as "hornyk", so that entry could never match.
dict2 = {
    "nude": "naked", "xxx": "porn", "tits": "breasts", "dick": "penis",
    "pussy": "vagina", "bj": "blowjob", "cum": "semen", "horny": "aroused",
    "milf": "attractive older woman", "nsfw": "not safe for work",
    "thirsty": "sexually desperate", "ass": "buttocks", "booty": "buttocks",
    "deepthroat": "oral sex", "suck": "oral action", "sexy": "attractive"
}

# Both dictionaries merged once, plus a single whole-word alternation over every key.
# Keys are escaped so they are always matched literally, and sorted longest-first so a
# shorter key can never pre-empt a longer one that starts with the same characters.
_REPLACEMENTS = {**dict1, **dict2}
_REPLACEMENT_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(re.escape(key) for key in sorted(_REPLACEMENTS, key=len, reverse=True))
    + r")\b"
)


def expand_contractions_and_slang(text):
    """Expand contractions and rewrite explicit slang using ``dict1`` and ``dict2``.

    Matching is whole-word and case-sensitive; the dictionary keys are lower-case, so the
    caller is expected to lower-case the text first. Typographic apostrophes (U+2019) are
    converted to ``'`` beforehand so that contractions written with them are expanded too.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string. All replacements happen in a single pass, so text inserted
        by one replacement is never rewritten by another.
    """
    text = text.replace("\u2019", "'")
    return _REPLACEMENT_PATTERN.sub(lambda match: _REPLACEMENTS[match.group(0)], text)

def correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, converted back to ``str``."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

def segment_sentences(text):
    """Split ``text`` into sentences with ``sent_tokenize``; non-string input yields ``[]``."""
    if not isinstance(text, str):
        return []
    return sent_tokenize(text)

def replace_emojis_and_ascii(text):
    """Replace emojis with their descriptions, then scrub asterisk markup and "xx..." words.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    described = demoji.replace_with_desc(text)

    # Remove "**", "*word*" spans, and any run of non-word characters that is followed by
    # two or more asterisks.
    without_markup = re.sub(r'(\*\*|\*\w+\*|\W+\*{2,})', '', described)
    # Whole words made of two or more "x" characters become the token "explicit".
    return re.sub(r'(\bxx+\b)', 'explicit', without_markup)

# Words that are kept even when they appear in NLTK's English stop-word list.
important_stopwords = {
    # negations
    "not", "no", "nor", "n't", "never", "hardly", "barely", "scarcely",
    # intensifiers and downtoners
    "very", "really", "so", "extremely", "super", "terribly", "horribly",
    "awfully", "slightly", "somewhat",
    # contrastive connectives
    "but", "however", "although", "though", "yet",
    # pronouns
    "i", "we", "you", "he", "she", "they", "my", "your",
    # question words
    "why", "how", "what",
}

# Stop words actually removed during preprocessing: NLTK's list minus the words above.
stop_words = set(stopwords.words("english")).difference(important_stopwords)


# Built once and reused across calls instead of being re-created for every comment.
_TWEET_TOKENIZER = TweetTokenizer()
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Run the full preprocessing pipeline on one comment for adult-content filtering.

    Steps, in order: lower-case; strip URLs, @mentions and #hashtags; expand contractions
    and slang; replace emojis and asterisk markup; drop every character that is not a-z or
    whitespace; tokenize; remove stop words; lemmatize; spell-correct.

    URLs, mentions and hashtags are stripped BEFORE the slang expansion. In the opposite
    order a tag such as "#nsfw" was first rewritten to "#not safe for work", after which
    only "#not" was removed and " safe for work" leaked into the text; slang inside a URL
    was broken apart the same way.

    Args:
        text: Raw comment body.

    Returns:
        The cleaned, space-joined token string, or "" when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # "https..." is already covered by the "http\S+" alternative.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+|#\w+", "", text)
    text = expand_contractions_and_slang(text)
    text = replace_emojis_and_ascii(text)
    text = re.sub(r"[^a-z\s]", "", text)

    tokens = _TWEET_TOKENIZER.tokenize(text)
    tokens = [_LEMMATIZER.lemmatize(word) for word in tokens if word not in stop_words]

    # NOTE(review): the recorded run of this cell ended in KeyboardInterrupt — presumably
    # the spelling correction is the slow step; confirm before running on the full dataset.
    return correct_spelling(" ".join(tokens))
import os

df['Segmented_Comments'] = df['body'].apply(segment_sentences)
df['Processed_Comment'] = df['body'].apply(preprocess_text)

# Save the preprocessed dataset as a pickle.
# The target directory is created first: to_pickle does not create missing folders, and a
# failure here would throw away the preprocessing results computed above.
# NOTE(review): the file name says "YoutubeComments" although the data loaded in this
# notebook is the Reddit archive — confirm the intended name (the next cell reads this path).
os.makedirs("datasets", exist_ok=True)
df.to_pickle("datasets/Preprocessed_YoutubeComments_ACF.pkl")
print("Dataset salvato in formato Pickle!")

KeyboardInterrupt: 

In [None]:
# Load the preprocessed dataset back from the pickle file.
path_pickle = "datasets/Preprocessed_YoutubeComments_ACF.pkl"

df = pd.read_pickle(path_pickle)
df.head()  # not the last expression of the cell, so the notebook does not display it
df.info()

In [None]:
# Concatenate every processed comment into one space-separated corpus string.
all_words = ' '.join(df['Processed_Comment'].map(str))

# Build the word cloud from that corpus.
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud = wordcloud.generate(all_words)

# Render it without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
def get_sentiment(text):
    """Return TextBlob's polarity score for ``text`` (range [-1, 1])."""
    return TextBlob(text).sentiment.polarity

# Score every processed comment.
df['sentiment_score'] = df['Processed_Comment'].apply(get_sentiment)

# Derive the categorical label that the later cells read from df['Sentiment'] (class
# distribution, balancing, the 'negative'/'neutral'/'positive' mapping). No other cell of
# this notebook creates that column, so on a fresh run those cells failed with KeyError.
# NOTE(review): the cut-off is an assumption — scores above +NEUTRAL_BAND are positive,
# below -NEUTRAL_BAND negative, everything else neutral. Confirm the intended thresholds.
NEUTRAL_BAND = 0.0
df['Sentiment'] = 'neutral'
df.loc[df['sentiment_score'] > NEUTRAL_BAND, 'Sentiment'] = 'positive'
df.loc[df['sentiment_score'] < -NEUTRAL_BAND, 'Sentiment'] = 'negative'

# Show the first results.
print(df[['Processed_Comment', 'sentiment_score']].head())

# Distribution of the sentiment scores.
plt.hist(df['sentiment_score'], bins=20, edgecolor='black')
plt.title('Distribuzione dei punteggi di sentimento')
plt.xlabel('Sentiment score')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Save the preprocessed dataset in pickle format.
df.to_pickle("datasets/Preprocessed_RedditComments.pkl")
print("Dataset salvato in formato Pickle!")

In [None]:
# Reload the dataset from the pickle file written by the previous cell.
path_pickle = "datasets/Preprocessed_RedditComments.pkl"
df = pd.read_pickle(path_pickle)
df.head()

In [None]:
# Number of samples in each sentiment class.
class_distribution = df['Sentiment'].value_counts()

# Print the numeric distribution.
print("🔹 Distribuzione delle classi nel dataset:")
print(class_distribution)

# Show the same counts as a bar chart.
plt.figure(figsize=(6, 4))
class_distribution.plot(kind='bar', color=['salmon', 'lightblue', 'lightgreen'])
plt.title("Distribuzione delle Classi di Sentiment")
plt.xlabel("Categoria di Sentiment")
plt.ylabel("Numero di Campioni")

# Write each count just above its bar.
for bar_position, count in enumerate(class_distribution):
    plt.text(bar_position, count + 5, str(count), ha='center', fontsize=10, fontweight='bold')

plt.xticks(rotation=0)
plt.show()

TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

# Size every class should reach in the balanced TRAINING set. It is capped at the size of
# the largest training class, so the samplers are never given an impossible target:
# RandomUnderSampler cannot grow a class and SMOTE cannot shrink one (both raise ValueError).
# The previous hard-coded 5000 per class cannot be met by a dataset of about 6,000 rows.
TARGET_PER_CLASS = 5000

# Class distribution BEFORE balancing.
print("🔹 Distribuzione classi PRIMA del bilanciamento:")
print(df['Sentiment'].value_counts())

# Split BEFORE vectorizing and resampling. Previously TF-IDF was fitted on all rows and
# SMOTE ran before the split, so test-set vocabulary statistics leaked into training and
# the test set contained synthetic samples interpolated from training rows.
train_texts_sent, test_texts_sent, y_train_raw, y_test_sent = train_test_split(
    df['Processed_Comment'], df['Sentiment'],
    test_size=0.2, stratify=df['Sentiment'], random_state=42
)

# TF-IDF over unigrams, bigrams and trigrams, fitted on the training texts only.
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(train_texts_sent)
X_test_sent = tfidf.transform(test_texts_sent)

train_counts = y_train_raw.value_counts()
target = min(TARGET_PER_CLASS, int(train_counts.max()))

X_balanced, y_balanced = X_train_tfidf, y_train_raw

# 1) Down-sample every class that is larger than the target.
under_strategy = {label: target for label, count in train_counts.items() if count > target}
if under_strategy:
    under_sampler = RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)
    X_balanced, y_balanced = under_sampler.fit_resample(X_balanced, y_balanced)

# 2) Up-sample every class that is smaller than the target with SMOTE.
over_strategy = {label: target for label, count in train_counts.items() if count < target}
if over_strategy:
    smote = SMOTE(sampling_strategy=over_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X_balanced, y_balanced)

# Class distribution AFTER balancing (training set only).
print("\n🔹 Distribuzione classi DOPO il bilanciamento:")
print(pd.Series(y_balanced).value_counts())

# Names used by the modelling cells. The test set keeps its natural class distribution.
X_train_sent, y_train_sent = X_balanced, y_balanced

# Side-by-side bar charts of the distribution before and after balancing.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

df['Sentiment'].value_counts().plot(kind='bar', color=['skyblue', 'lightgreen', 'salmon'], ax=axes[0])
axes[0].set_title('Distribuzione PRIMA del bilanciamento')
axes[0].set_xlabel('Sentiment')
axes[0].set_ylabel('Numero di Campioni')

pd.Series(y_balanced).value_counts().plot(kind='bar', color=['skyblue', 'lightgreen', 'salmon'], ax=axes[1])
axes[1].set_title('Distribuzione DOPO il bilanciamento')
axes[1].set_xlabel('Sentiment')
axes[1].set_ylabel('Numero di Campioni')

plt.tight_layout()
plt.show()

# Final dataset sizes.
print(f"\n🔹 Dimensione Training Set: {X_train_sent.shape}")
print(f"🔹 Dimensione Test Set: {X_test_sent.shape}")

Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV

# Row-normalize the features before Naive Bayes.
# Note: this overwrites X_train_sent / X_test_sent, so later cells see the normalized data.
normalizer = Normalizer()
X_train_sent = normalizer.fit_transform(X_train_sent)
X_test_sent = normalizer.transform(X_test_sent)

# Tune `alpha` on the SAME estimator that is evaluated afterwards. Previously the grid
# search ran on MultinomialNB and the chosen alpha was then handed to ComplementNB, a
# different model for which that value was never validated.
param_grid = {'alpha': [0.1, 0.5, 1.0, 5, 10]}
grid_search = GridSearchCV(ComplementNB(), param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train_sent, y_train_sent)

# Best alpha found by the search.
best_alpha = grid_search.best_params_['alpha']
print(f"🔹 Miglior alpha trovato: {best_alpha}")

# Train the final model with the best alpha.
nb_classifier = ComplementNB(alpha=best_alpha)
nb_classifier.fit(X_train_sent, y_train_sent)

# Predictions on the test set.
y_pred_nb = nb_classifier.predict(X_test_sent)

# Evaluation.
print("🔹 Naïve Bayes Performance:")
print(classification_report(y_test_sent, y_pred_nb))



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for the Naive Bayes predictions.
# The tick labels are written by hand and are not passed to confusion_matrix itself.
class_names = ['negative', 'neutral', 'positive']
cm = confusion_matrix(y_test_sent, y_pred_nb)

plt.figure(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predetto")
plt.ylabel("Reale")
plt.title("Matrice di Confusione - Naïve Bayes")
plt.show()

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Standardize the features.
# NOTE(review): if the Naive Bayes cell ran first, X_train_sent / X_test_sent were already
# overwritten there by the Normalizer — confirm that is the intended input here.
scaler = StandardScaler(with_mean=False) # with_mean=False to handle the sparse matrices generated by TF-IDF
X_train_scaled = scaler.fit_transform(X_train_sent)  # Fit the scaler on the training data
X_test_scaled = scaler.transform(X_test_sent)  # Apply the same scaling to the test set

# Train the Logistic Regression model.
log_reg = LogisticRegression(class_weight='balanced', max_iter=500)  # More iterations for stability
log_reg.fit(X_train_scaled, y_train_sent)

# Predictions.
y_pred_log = log_reg.predict(X_test_scaled)

# Evaluation.
print("Logistic Regression Performance:")
print(classification_report(y_test_sent, y_pred_log))

SVM

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Standardize with StandardScaler; with_mean=False keeps the sparse TF-IDF matrices sparse.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_sent)
X_test_scaled = scaler.transform(X_test_sent)

# Parameter grid for the SVM, written as one sub-grid per kernel.
# `gamma` is only used by the RBF kernel. The previous single grid crossed all five gamma
# values with the linear kernel as well, fitting the same linear model five times per C.
param_grid = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],  # Regularization strength
        'gamma': [0.01, 0.1, 1, 'scale', 'auto'],  # RBF kernel coefficient
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
]

# Search for the best hyper-parameters with cross-validation.
grid_search = GridSearchCV(
    SVC(class_weight='balanced'),
    param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1)

grid_search.fit(X_train_scaled, y_train_sent)

# Best parameters found.
print(f"🔹 Migliori parametri trovati: {grid_search.best_params_}")

# Use the best model found by the search.
best_svm = grid_search.best_estimator_

# Predictions. The variable keeps its historical "_rbf" name even though the winning
# kernel may be linear.
svm_predictions_rbf = best_svm.predict(X_test_scaled)

# Evaluation.
print("🔹 Report delle performance (SVM con GridSearch e StandardScaler):")
print(classification_report(y_test_sent, svm_predictions_rbf))

Word2Vec + Padding

In [None]:
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Pre-trained embedding model loaded through gensim's downloader.
word2vec_model = api.load("word2vec-google-news-300")

def get_word2vec_sequence(comment):
    """Turn a comment into a float32 array holding one 300-wide vector per known token.

    Tokens that are not in ``word2vec_model`` are skipped. If no token is known, a single
    all-zero row of shape (1, 300) is returned.
    """
    vectors = []
    for token in word_tokenize(comment):
        if token in word2vec_model:
            vectors.append(word2vec_model[token])
    if not vectors:
        return np.zeros((1, 300), dtype=np.float32)
    return np.array(vectors, dtype=np.float32)

# One variable-length (n_tokens, 300) array per comment.
word2vec_sequences = [get_word2vec_sequence(comment) for comment in df['Processed_Comment']]

# Common sequence length: mean + one standard deviation of the observed lengths.
sequence_lengths = [seq.shape[0] for seq in word2vec_sequences]
optimal_length = int(np.mean(sequence_lengths) + np.std(sequence_lengths))

# Pad with zeros / truncate at the end so every sequence has `optimal_length` timesteps.
word2vec_sequences_padded = pad_sequences(word2vec_sequences, maxlen=optimal_length, dtype='float32', padding='post', truncating='post')

X_word2vec = np.array(word2vec_sequences_padded, dtype=np.float32)

sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment_Numeric'] = df['Sentiment'].map(sentiment_mapping)

print("🔹 Distribuzione classi PRIMA del bilanciamento:")
print(pd.Series(df['Sentiment_Numeric']).value_counts())

# Split BEFORE resampling. Previously SMOTE ran on the full dataset, so the test set
# contained synthetic sequences interpolated from rows that ended up in training.
X_train_seq_raw, X_test_sent_seq, y_train_raw, y_test_sent = train_test_split(
    X_word2vec, df['Sentiment_Numeric'],
    test_size=0.2, stratify=df['Sentiment_Numeric'], random_state=42
)

# The samplers expect 2-D input, so flatten (timesteps, 300) into one row per comment.
X_train_flat = X_train_seq_raw.reshape(X_train_seq_raw.shape[0], -1)

# Per-class target, capped at the largest training class so that neither sampler is given
# an impossible request (the previous hard-coded 5000 cannot be met by a ~6,000-row dataset).
train_counts = y_train_raw.value_counts()
target = min(5000, int(train_counts.max()))

X_balanced, y_balanced = X_train_flat, y_train_raw

under_strategy = {label: target for label, count in train_counts.items() if count > target}
if under_strategy:
    under_sampler = RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)
    X_balanced, y_balanced = under_sampler.fit_resample(X_balanced, y_balanced)

over_strategy = {label: target for label, count in train_counts.items() if count < target}
if over_strategy:
    smote = SMOTE(sampling_strategy=over_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X_balanced, y_balanced)

print("\n🔹 Distribuzione classi DOPO il bilanciamento:")
print(pd.Series(y_balanced).value_counts())

# Restore the (N, timesteps, embedding_dim) shape expected by the LSTM.
X_balanced = X_balanced.reshape(-1, optimal_length, 300)

# Names used by the LSTM cells. The test set keeps its natural class distribution.
X_train_sent_seq, y_train_sent = X_balanced, y_balanced

# Check the resulting shapes and class counts.
print(f"Forma di X_train_sent_seq: {X_train_sent_seq.shape}")
print(f"Forma di X_test_sent_seq: {X_test_sent_seq.shape}")
print(f"Distribuzione classi train:\n{np.bincount(y_train_sent)}")
print(f"Distribuzione classi test:\n{np.bincount(y_test_sent)}")

LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# =========================
# LSTM model with dropout and L2 regularization to limit overfitting
# =========================

lstm_model = Sequential([
    Masking(mask_value=0.0, input_shape=(optimal_length, 300)),  # Skip the zero padding
    LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3),  # Dropout inside the LSTM
    LSTM(32, dropout=0.3, recurrent_dropout=0.3),  # Smaller second LSTM layer
    Dropout(0.3),  # Dropout between the LSTM stack and the dense layer
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),  # L2 regularization
    Dropout(0.3),  # Dropout in the fully connected part as well
    Dense(3, activation='softmax')  # Output over the 3 classes
])

# Compile the model.
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Class weights, keyed by the actual class label rather than by position, so the mapping
# stays correct even if the labels are not exactly 0..n-1.
class_labels = np.unique(y_train_sent)
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train_sent)
class_weights_dict = {int(label): float(weight) for label, weight in zip(class_labels, class_weights)}

# Early stopping on the validation loss, restoring the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

# Train. The class weights were computed before but never passed to fit(), so they had no
# effect; they are now supplied through `class_weight`.
history = lstm_model.fit(X_train_sent_seq, y_train_sent,
                         epochs=20,
                         batch_size=64,
                         validation_data=(X_test_sent_seq, y_test_sent),
                         class_weight=class_weights_dict,
                         callbacks=[early_stopping])

Grafico di Accuracy e Loss durante l’addestramento

In [None]:
# Get the per-epoch metrics recorded during training.
# Note: this rebinds `history` to the plain dict stored on the model.
history = lstm_model.history.history

# Accuracy plot (left panel).
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.plot(history['accuracy'], label='Training Accuracy')
plt.plot(history['val_accuracy'], label='Validation Accuracy')
plt.title('Training & Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Loss plot (right panel).
plt.subplot(1,2,2)
plt.plot(history['loss'], label='Training Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.title('Training & Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
from sklearn.metrics import classification_report

# Predictions on the test set.
y_pred_lstm = np.argmax(lstm_model.predict(X_test_sent_seq), axis=1)  # Turn class probabilities into class indices

# Build the classification report.
report = classification_report(y_test_sent, y_pred_lstm, target_names=['negative', 'neutral', 'positive'])

# Print the report.
print("LSTM Performance:\n", report)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Build the confusion matrix for the LSTM predictions.
cm = confusion_matrix(y_test_sent, y_pred_lstm)

# Plot the confusion matrix as an annotated heatmap.
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['negative', 'neutral', 'positive'], yticklabels=['negative', 'neutral', 'positive'])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Matrice di Confusione - LSTM Sentiment Analysis")
plt.show()

Transformer

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Turn on cuDNN's benchmark mode.
torch.backends.cudnn.benchmark = True

In [None]:
import torch
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import confusion_matrix, classification_report

# RandomOverSampler replaces SMOTE in this cell (see the balancing step below).
from imblearn.over_sampling import RandomOverSampler

# Select the device: CUDA GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔹 Using device: {device}")

# Map the sentiment labels to integers.
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment_Numeric'] = df['Sentiment'].map(sentiment_mapping)

# Class distribution BEFORE balancing.
print("🔹 Distribuzione classi PRIMA del bilanciamento:")
print(df['Sentiment_Numeric'].value_counts())

# Split the (still unbalanced) dataset into training and test sets.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Processed_Comment'].tolist(), df['Sentiment_Numeric'].tolist(),
    test_size=0.2, stratify=df['Sentiment_Numeric'], random_state=42
)

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts for BERT (fixed length of 70 tokens).
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=70, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=70, return_tensors='pt')

# Convert the training token IDs to NumPy so the samplers can work on them.
X_train = train_encodings['input_ids'].numpy()
y_train = np.array(train_labels)

# Balance the TRAINING set only.
# Token IDs are categorical vocabulary indices. SMOTE interpolates between rows, which on
# token IDs produces arbitrary, unrelated tokens, so the minority classes are grown by
# duplicating real samples with RandomOverSampler instead.
# The per-class target is capped at the largest training class, because the previous
# hard-coded 5000 cannot be met by the 80% split of a ~6,000-row dataset.
train_counts = pd.Series(y_train).value_counts()
target = min(5000, int(train_counts.max()))

X_balanced, y_balanced = X_train, y_train

# 1) Down-sample every class that is larger than the target.
under_strategy = {int(label): target for label, count in train_counts.items() if count > target}
if under_strategy:
    under_sampler = RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)
    X_balanced, y_balanced = under_sampler.fit_resample(X_balanced, y_balanced)

# 2) Up-sample every class that is smaller than the target by repeating real samples.
over_strategy = {int(label): target for label, count in train_counts.items() if count < target}
if over_strategy:
    over_sampler = RandomOverSampler(sampling_strategy=over_strategy, random_state=42)
    X_balanced, y_balanced = over_sampler.fit_resample(X_balanced, y_balanced)

# Class distribution AFTER balancing.
print("\n🔹 Distribuzione classi DOPO il bilanciamento:")
print(pd.Series(y_balanced).value_counts())

# Convert the balanced data to PyTorch tensors.
X_balanced_tensor = torch.tensor(X_balanced, dtype=torch.long)
y_balanced_tensor = torch.tensor(y_balanced, dtype=torch.long)

# 📌 **Classe Dataset Corretto**
class SentimentDataset(Dataset):
    """Index-aligned pairs of token-ID sequences and labels for a DataLoader.

    Each item is a dict with the keys "input_ids" and "labels".
    """

    def __init__(self, input_ids, labels):
        self.input_ids, self.labels = input_ids, labels

    def __len__(self):
        # The dataset length is the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])

# Build the datasets: balanced tensors for training, the tokenized test split for validation.
train_dataset = SentimentDataset(X_balanced_tensor, y_balanced_tensor)
test_dataset = SentimentDataset(test_encodings["input_ids"], torch.tensor(test_labels))

# Configure BERT for 3 labels with dropout probabilities set to 0.3.
config = BertConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)

# Load the pre-trained BERT model for sequence classification and move it to the device.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model.to(device)

# AdamW optimizer with weight decay, plus a cosine-annealing learning-rate schedule.
# NOTE(review): AdamW is imported from `transformers` in this cell; newer releases may no
# longer export it — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
lr_scheduler = CosineAnnealingLR(optimizer, T_max=10)  # T_max equals the epochs=10 used when training is started

# Data loaders; only the training loader is shuffled.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 📌 Funzione di Training Personalizzata
def train_model(model, train_loader, val_loader, epochs=10):
    """Fine-tune ``model`` and collect per-epoch metrics.

    Uses the module-level ``optimizer``, ``lr_scheduler`` and ``device``, and calls
    ``evaluate_model`` after every epoch.

    Args:
        model: Sequence-classification model exposing ``.logits`` on its output.
        train_loader: DataLoader yielding dicts with "input_ids" and "labels".
        val_loader: DataLoader in the same format, used for validation.
        epochs: Number of passes over ``train_loader``.

    Returns:
        A DataFrame with one row per epoch: "Epoca", "Training Loss", "Validation Loss",
        "Validation Accuracy" and "Training Time" (HH:MM:SS).
    """
    # All three classes get the same weight.
    class_weights = torch.tensor([1.0 / 0.33, 1.0 / 0.33, 1.0 / 0.33]).to(device)
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    # The batches carry only token IDs, so the attention mask is rebuilt from the padding
    # token; without a mask the model also attends to the padding positions.
    pad_token_id = getattr(model.config, "pad_token_id", None)

    results = []
    for epoch in range(epochs):
        model.train()
        loop = tqdm(train_loader, leave=True)
        total_loss = 0
        start_time = time.time()

        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            attention_mask = None if pad_token_id is None else (input_ids != pad_token_id).long()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            loop.set_description(f'Epoch {epoch + 1}')
            loop.set_postfix(loss=loss.item())

        # Advance the schedule once per EPOCH. It was stepped after every batch, which ran
        # through the whole T_max=10 cosine cycle within the first few batches.
        lr_scheduler.step()

        avg_training_loss = total_loss / len(train_loader)
        val_loss, val_accuracy = evaluate_model(model, val_loader)
        epoch_time = time.time() - start_time

        results.append({
            "Epoca": f"Epoca {epoch + 1}",
            "Training Loss": round(avg_training_loss, 4),
            "Validation Loss": round(val_loss, 4),
            "Validation Accuracy": round(val_accuracy, 4),
            "Training Time": time.strftime("%H:%M:%S", time.gmtime(epoch_time))
        })

    results_df = pd.DataFrame(results)
    print(results_df)
    return results_df

# 📌 Funzione di Valutazione
def evaluate_model(model, val_loader):
    """Compute the mean loss and the accuracy of ``model`` over ``val_loader``.

    Uses the module-level ``device``. Gradients are disabled and the model is put in
    evaluation mode.

    Args:
        model: Sequence-classification model exposing ``.logits`` on its output.
        val_loader: DataLoader yielding dicts with "input_ids" and "labels".

    Returns:
        ``(avg_val_loss, accuracy)``: the unweighted cross-entropy averaged over batches,
        and the fraction of correctly predicted samples.
    """
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    loss_fn = torch.nn.CrossEntropyLoss()
    # The batches carry only token IDs, so the attention mask is rebuilt from the padding
    # token; without a mask the model also attends to the padding positions.
    pad_token_id = getattr(model.config, "pad_token_id", None)

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            attention_mask = None if pad_token_id is None else (input_ids != pad_token_id).long()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    avg_val_loss = total_loss / len(val_loader)
    accuracy = correct / total
    return avg_val_loss, accuracy

# Start training.
results_df = train_model(model, train_loader, val_loader, epochs=10)

# Save the per-epoch results.
results_df.to_csv("training_results.csv", index=False)

# Collect predictions on the test set.
model.eval()
all_preds = []
all_labels = []
# The batches carry only token IDs, so the attention mask is rebuilt from the padding
# token; without a mask the model also attends to the padding positions.
pad_token_id = getattr(model.config, "pad_token_id", None)

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].cpu().numpy()
        attention_mask = None if pad_token_id is None else (input_ids != pad_token_id).long()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

# Save the predictions and the true labels.
np.save("predictions.npy", np.array(all_preds))
np.save("true_labels.npy", np.array(all_labels))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Load the results, predictions and true labels saved by the training cell.
results_df = pd.read_csv("training_results.csv")
all_preds = np.load("predictions.npy")
all_labels = np.load("true_labels.npy")

# Training vs. validation loss per epoch.
plt.figure(figsize=(8, 6))
plt.plot(results_df["Epoca"], results_df["Training Loss"], label="Training Loss", marker="o")
plt.plot(results_df["Epoca"], results_df["Validation Loss"], label="Validation Loss", marker="o")
plt.xlabel("Epoca")
plt.ylabel("Loss")
plt.legend()
plt.title("Training vs Validation Loss")
plt.show()

# Validation accuracy per epoch.
plt.figure(figsize=(8, 6))
plt.plot(results_df["Epoca"], results_df["Validation Accuracy"], label="Validation Accuracy", marker="o", color="green")
plt.xlabel("Epoca")
plt.ylabel("Accuracy")
plt.legend()
plt.title("Validation Accuracy Trend")
plt.show()

# Confusion matrix.
cm = confusion_matrix(all_labels, all_preds)
labels = ["Negative", "Neutral", "Positive"]  # display names for the tick labels and the report

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

# Classification report.
print("\nClassification Report:\n")
print(classification_report(all_labels, all_preds, target_names=labels))