In [None]:
!pip install unidecode
!python -m spacy download pt_core_news_sm

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adagrad
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.metrics import Precision, Recall

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from unidecode import unidecode
import csv
from datetime import datetime
import os
from tensorflow.keras.regularizers import l2


# Fetch the NLTK resources used by the text pre-processing step
# (tokenizer models and the stop-word corpus).
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)


# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# Iterating an empty device list is a no-op, so no explicit emptiness check
# is needed. A RuntimeError raised by TensorFlow is reported, not propagated.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    print(e)

def load_data(filename):
    """Read the dataset CSV and keep only the text and label columns.

    Args:
        filename: Path of a UTF-8 encoded CSV file that contains at least
            the ``comentario`` and ``label_final`` columns.

    Returns:
        A DataFrame holding exactly the ``comentario`` and ``label_final``
        columns, in that order.

    Raises:
        KeyError: If either required column is missing from the file.
    """
    wanted_columns = ['comentario', 'label_final']
    full_frame = pd.read_csv(filename, encoding='utf-8')
    return full_frame[wanted_columns]


_PT_STOP_WORDS = None  # filled on first use by _portuguese_stop_words()


def _portuguese_stop_words():
    """Return the Portuguese stop-word set, accent-stripped and built only once."""
    global _PT_STOP_WORDS
    if _PT_STOP_WORDS is None:
        # The text is passed through unidecode() before filtering, so the
        # stop words must be normalized the same way; otherwise accented
        # entries (e.g. "não", "você") would never match their ASCII tokens.
        _PT_STOP_WORDS = frozenset(
            unidecode(word) for word in stopwords.words('portuguese')
        )
    return _PT_STOP_WORDS


def preprocess_text(text):
    """Normalize a raw comment into a space-joined string of content tokens.

    Steps, in order: lowercase, strip URLs, strip ``@mentions``, strip digits,
    transliterate to ASCII with ``unidecode``, drop punctuation, collapse
    whitespace, tokenize with NLTK and remove Portuguese stop words.

    Args:
        text: The raw comment. Any value is accepted; it is converted with
            ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    text = str(text).lower()
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'\d+', '', text)
    text = unidecode(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # The stop-word set is cached: rebuilding it from the corpus for every
    # comment is loop-invariant work when this runs through DataFrame.apply.
    stop_words = _portuguese_stop_words()
    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_words
    )

def get_optimizer(optimizer_name, learning_rate=0.001):
    """
    Build the Keras optimizer matching ``optimizer_name``.

    Args:
        optimizer_name: Case-insensitive name; one of ``adam``, ``sgd``,
            ``rmsprop`` or ``adagrad``.
        learning_rate: Learning rate handed to the optimizer constructor.

    Returns:
        A new optimizer instance. An unrecognized name prints a warning and
        falls back to Adam.
    """
    optimizer_classes = {
        'adam': Adam,
        'sgd': SGD,
        'rmsprop': RMSprop,
        'adagrad': Adagrad,
    }
    optimizer_class = optimizer_classes.get(optimizer_name.lower())
    if optimizer_class is None:
        print(f"Otimizador {optimizer_name} não reconhecido. Usando Adam como padrão.")
        optimizer_class = Adam
    return optimizer_class(learning_rate=learning_rate)

def create_model(max_words, max_len, optimizer_name, loss_function, embedding_dim=128, learning_rate=0.001):
    """Build and compile the stacked bidirectional-LSTM binary classifier.

    Args:
        max_words: Vocabulary size used as the embedding input dimension.
        max_len: Length of the padded input sequences.
        optimizer_name: Name understood by ``get_optimizer``.
        loss_function: Loss identifier passed straight to ``model.compile``.
        embedding_dim: Width of the embedding vectors. The parameter used to
            be ignored in favour of a hard-coded 128; the default is now 128
            so callers that do not pass it get the same architecture as
            before, while an explicit value is honoured.
        learning_rate: Learning rate for the optimizer.

    Returns:
        The compiled ``Sequential`` model, tracking accuracy, precision and
        recall.
    """
    model = Sequential([
        Embedding(max_words, embedding_dim, input_length=max_len),

        BatchNormalization(),

        # The first recurrent layer returns the full sequence so the second
        # recurrent layer can consume it.
        Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=l2(0.02))),
        Dropout(0.6),

        Bidirectional(LSTM(32, kernel_regularizer=l2(0.02))),
        Dropout(0.6),

        Dense(64, activation='relu', kernel_regularizer=l2(0.02)),
        Dropout(0.6),

        # Single sigmoid unit: the model emits one probability per sample.
        Dense(1, activation='sigmoid')
    ])

    optimizer = get_optimizer(optimizer_name, learning_rate)

    model.compile(
        optimizer=optimizer,
        loss=loss_function,
        # Explicit names keep the history keys stable ('precision',
        # 'val_precision', ...). Without them Keras appends a counter
        # ('precision_1', ...) for every further model built in the same
        # session, and lookups by the plain name then miss.
        metrics=['accuracy', Precision(name='precision'), Recall(name='recall')]
    )
    return model

def _final_metric(history_dict, base_name):
    """Return the last-epoch value of ``base_name`` from a Keras history dict.

    Keras may uniquify default metric names with a numeric suffix
    (``precision_1``, ``val_recall_3``, ...), so when the plain name is absent
    a ``<base_name>_<digits>`` key is accepted instead. Returns 0 when no
    matching, non-empty series exists.
    """
    if base_name in history_dict:
        candidates = [base_name]
    else:
        suffixed = re.compile(re.escape(base_name) + r'_\d+')
        candidates = [key for key in history_dict if suffixed.fullmatch(key)]
    for key in candidates:
        values = history_dict[key]
        if len(values):
            return values[-1]
    return 0


def save_metrics_to_csv(history, metrics, optimizer_name, loss_function, learning_rate, csv_filename='/content/drive/MyDrive/TG/training_metrics2.csv'):
    """Append one experiment's final training, validation and test metrics to a CSV.

    Args:
        history: Object exposing a ``history`` dict of per-epoch metric lists
            (as returned by ``model.fit``). Must contain ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy``.
        metrics: Dict with the test-set results under the keys ``accuracy``,
            ``precision``, ``recall``, ``f1_score`` and ``best_epoch``.
        optimizer_name: Optimizer identifier recorded in the row.
        loss_function: Loss identifier recorded in the row.
        learning_rate: Learning rate recorded in the row.
        csv_filename: Target file. It is created with a header row when it
            does not exist yet; otherwise the row is appended.

    Returns:
        None. The written row is also echoed to stdout.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    epoch_log = history.history

    metrics_dict = {
        'timestamp': timestamp,
        'optimizer': optimizer_name,
        'learning_rate': learning_rate,
        'loss_function': loss_function,
        # Training/validation figures are the values of the LAST epoch run.
        'train_loss': epoch_log['loss'][-1],
        'val_loss': epoch_log['val_loss'][-1],
        'train_accuracy': epoch_log['accuracy'][-1],
        'val_accuracy': epoch_log['val_accuracy'][-1],
        # Precision/recall keys may carry a Keras-generated numeric suffix;
        # an exact-name lookup would silently log 0 for those runs.
        'train_precision': _final_metric(epoch_log, 'precision'),
        'val_precision': _final_metric(epoch_log, 'val_precision'),
        'train_recall': _final_metric(epoch_log, 'recall'),
        'val_recall': _final_metric(epoch_log, 'val_recall'),
        'test_accuracy': metrics['accuracy'],
        'test_precision': metrics['precision'],
        'test_recall': metrics['recall'],
        'test_f1': metrics['f1_score'],
        'best_epoch': metrics['best_epoch'],
        'total_epochs': len(epoch_log['loss'])
    }

    file_exists = os.path.isfile(csv_filename)

    with open(csv_filename, 'a' if file_exists else 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(metrics_dict))

        # The header is written only once, when the file is first created.
        if not file_exists:
            writer.writeheader()

        writer.writerow(metrics_dict)

    print(f"\nMétricas salvas em {csv_filename}:")
    for key, value in metrics_dict.items():
        print(f"{key}: {value}")

def evaluate_model(model, X_test, y_test):
    """Score a binary classifier on the test split and print a short report.

    Predictions are thresholded at 0.5 and compared with ``y_test`` through a
    confusion matrix, from which accuracy, precision, recall and F1 are
    derived.

    Args:
        model: Object exposing ``predict(X)`` that returns one score per sample.
        X_test: Test inputs passed to ``model.predict``.
        y_test: Ground-truth labels, expected to be 0/1.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1_score`` and a
        ``best_epoch`` placeholder set to ``None`` (filled in by the caller).
    """
    y_pred_proba = model.predict(X_test)
    # Flatten to 1-D so column-shaped predictions line up with the label vector.
    y_pred = (np.asarray(y_pred_proba) > 0.5).astype(int).ravel()

    # Pinning the label set guarantees a 2x2 matrix. Without it the matrix
    # shrinks to 1x1 when only one class occurs in both labels and
    # predictions, and the four-way unpacking below would raise.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    # Each ratio falls back to 0 when its denominator is empty.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("\nMatriz de Confusão:")
    print(f"Verdadeiros Negativos: {tn}")
    print(f"Falsos Positivos: {fp}")
    print(f"Falsos Negativos: {fn}")
    print(f"Verdadeiros Positivos: {tp}")

    print(f"\nMétricas de Avaliação:")
    print(f"Acurácia: {accuracy:.4f}")
    print(f"Precisão: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1_score:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'best_epoch': None
    }

def run_experiment(X_train, X_test, y_train, y_test, max_words, max_len, optimizer_name, loss_function, learning_rate=0.001):
    """Train one configuration, evaluate it on the test split and log the outcome.

    Args:
        X_train, y_train: Training inputs and labels; 30% of them are held
            out for validation through ``validation_split``.
        X_test, y_test: Test inputs and labels used for the final evaluation.
        max_words: Vocabulary size forwarded to ``create_model``.
        max_len: Padded sequence length forwarded to ``create_model``.
        optimizer_name: Optimizer identifier for this run.
        loss_function: Loss identifier for this run.
        learning_rate: Learning rate for this run.

    Returns:
        Tuple ``(metrics, history, model)``: the test-metric dict from
        ``evaluate_model`` (with ``best_epoch`` filled in), the object
        returned by ``model.fit``, and the trained model.
    """
    print(f"\nTreinando modelo com optimizer: {optimizer_name} (lr={learning_rate}), loss: {loss_function}")

    model = create_model(max_words, max_len, optimizer_name, loss_function, learning_rate=learning_rate)

    # Stop after 4 epochs without val_loss improvement and roll the weights
    # back to the best val_loss epoch.
    stopper = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True, verbose=1)

    # A per-run timestamp keeps checkpoints of repeated runs from colliding.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    checkpoint_path = f'/content/drive/MyDrive/TG/best_model_{optimizer_name}_{loss_function}_lr{learning_rate}_{run_stamp}.keras'
    saver = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, verbose=1)

    fit_options = dict(
        epochs=30,
        batch_size=64,
        validation_split=0.3,
        callbacks=[stopper, saver],
        verbose=1,
    )
    history = model.fit(X_train, y_train, **fit_options)

    metrics = evaluate_model(model, X_test, y_test)
    # NOTE(review): best_epoch is the 1-based epoch with the highest
    # val_accuracy, while the weights evaluated above are restored by
    # EarlyStopping according to val_loss - the two may refer to different
    # epochs.
    val_accuracy_curve = history.history['val_accuracy']
    metrics['best_epoch'] = np.argmax(val_accuracy_curve) + 1

    save_metrics_to_csv(history, metrics, optimizer_name, loss_function, learning_rate)

    return metrics, history, model

def save_tokenizer(tokenizer, filename='/content/drive/MyDrive/TG/tokenizer.pickle'):
    """
    Serialize the tokenizer to ``filename`` using pickle.

    Args:
        tokenizer: Any picklable object (here, the fitted tokenizer).
        filename: Destination path; an existing file is overwritten.
    """
    import pickle

    with open(filename, 'wb') as out_file:
        # Highest protocol = most compact format this Python version supports.
        out_file.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))
    print(f"Tokenizer salvo em: {filename}")

def save_best_model(model, optimizer_name, loss_function, learning_rate, metrics, filename=None):
    """
    Save the best trained model together with a plain-text metrics summary.

    Args:
        model: Object exposing ``save(path)``.
        optimizer_name: Optimizer identifier of the winning configuration.
        loss_function: Loss identifier of the winning configuration.
        learning_rate: Learning rate of the winning configuration.
        metrics: Dict with ``accuracy``, ``precision``, ``recall``,
            ``f1_score`` and ``best_epoch``.
        filename: Destination of the model file. When omitted, a timestamped
            ``.keras`` path under the project's Drive folder is generated.

    The summary is written next to the model as ``<stem>_summary.txt``, where
    ``<stem>`` is ``filename`` without a trailing ``.keras`` (if present).
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f'/content/drive/MyDrive/TG/best_model_final_{optimizer_name}_{loss_function}_lr{learning_rate}_{timestamp}.keras'

    model.save(filename)
    print(f"Melhor modelo salvo em: {filename}")

    metrics_summary = {
        'optimizer': optimizer_name,
        'loss_function': loss_function,
        'learning_rate': learning_rate,
        'accuracy': metrics['accuracy'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1_score': metrics['f1_score'],
        'best_epoch': metrics['best_epoch']
    }

    # Strip only a trailing '.keras'. str.replace() would leave a name with
    # another extension untouched, so the summary would overwrite the model
    # file that was just saved; it would also rewrite any '.keras' found
    # earlier in the path.
    model_suffix = '.keras'
    stem = filename[:-len(model_suffix)] if filename.endswith(model_suffix) else filename
    summary_filename = f'{stem}_summary.txt'
    with open(summary_filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{key}: {value}\n" for key, value in metrics_summary.items())

    print(f"Resumo das métricas do melhor modelo salvo em: {summary_filename}")

def main():
    """Run the full pipeline: load data, pre-process, grid-search, report, save.

    Every combination of optimizer, loss function and learning rate is trained
    through ``run_experiment``. A combination that raises is reported and
    skipped. Results are printed sorted by test F1-score, and the model with
    the highest F1 (if any scored above 0) is saved with ``save_best_model``.
    """
    print("Carregando dataset HateBR...")

    data = load_data('/content/drive/MyDrive/TG/HateBR.csv')

    print("Verificando valores únicos em label_final:")
    print(data['label_final'].value_counts())

    # Rows with a missing comment or label are discarded before processing.
    data = data.dropna()

    print("\nPré-processando textos...")
    data['processed_comment'] = data['comentario'].apply(preprocess_text)

    max_words = 15000
    max_len = 150

    print("Tokenizando textos...")
    tokenizer = Tokenizer(num_words=max_words)
    # NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before
    # the train/test split below.
    tokenizer.fit_on_texts(data['processed_comment'])

    save_tokenizer(tokenizer)

    encoded_comments = tokenizer.texts_to_sequences(data['processed_comment'])
    X = pad_sequences(encoded_comments, maxlen=max_len)
    y = data['label_final'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Search space of the grid search.
    optimizers = ['adam', 'sgd', 'rmsprop', 'adagrad']
    losses = ['binary_crossentropy', 'mean_squared_error', 'hinge', 'categorical_hinge']
    learning_rates = [0.001, 0.0005, 0.0001]

    all_results = []
    # NOTE(review): the winner is selected by F1 on the TEST split.
    best_f1 = 0
    best_run = None  # (model, optimizer, loss, lr, metrics) of the top F1 so far

    banner = '=' * 50
    for optimizer in optimizers:
        for loss in losses:
            for lr in learning_rates:
                try:
                    print(f"\n{banner}")
                    print(f"TESTANDO: Optimizer={optimizer}, Loss={loss}, Learning Rate={lr}")
                    print(banner)

                    metrics, _history, model = run_experiment(
                        X_train, X_test, y_train, y_test,
                        max_words, max_len,
                        optimizer, loss, lr
                    )

                    row = {'optimizer': optimizer, 'loss': loss, 'learning_rate': lr}
                    for metric_name in ('accuracy', 'f1_score', 'precision', 'recall'):
                        row[metric_name] = metrics[metric_name]
                    all_results.append(row)

                    # Strict comparison: on ties the earliest configuration wins.
                    if metrics['f1_score'] > best_f1:
                        best_f1 = metrics['f1_score']
                        best_run = (model, optimizer, loss, lr, metrics)

                except Exception as e:
                    # Best effort: one failing configuration must not abort the search.
                    print(f"Erro ao treinar com optimizer={optimizer}, loss={loss}, lr={lr}: {str(e)}")

    ranked_results = sorted(all_results, key=lambda entry: entry['f1_score'], reverse=True)

    wide_rule = "=" * 80
    print("\n\n" + wide_rule)
    print("RESULTADO DAS EXPERIMENTAÇÕES (ORDENADO POR F1-SCORE)")
    print(wide_rule)
    print(f"{'Optimizer':<10} | {'Loss':<20} | {'LR':<8} | {'Accuracy':<10} | {'Precision':<10} | {'Recall':<10} | {'F1-Score':<10}")
    print("-" * 90)
    for result in ranked_results:
        print(f"{result['optimizer']:<10} | {result['loss']:<20} | {result['learning_rate']:<8.6f} | {result['accuracy']:<10.4f} | {result['precision']:<10.4f} | {result['recall']:<10.4f} | {result['f1_score']:<10.4f}")

    # Nothing to report or save when no run finished with an F1 above 0.
    if best_run is None:
        return

    best_model, best_optimizer, best_loss, best_lr, best_metrics = best_run

    print("\n\n" + banner)
    print("MELHOR CONFIGURAÇÃO ENCONTRADA")
    print(banner)
    print(f"Optimizer: {best_optimizer}")
    print(f"Loss Function: {best_loss}")
    print(f"Learning Rate: {best_lr}")
    print(f"Test Accuracy: {best_metrics['accuracy']:.4f}")
    print(f"Precision: {best_metrics['precision']:.4f}")
    print(f"Recall: {best_metrics['recall']:.4f}")
    print(f"F1-Score: {best_metrics['f1_score']:.4f}")

    save_best_model(best_model, best_optimizer, best_loss, best_lr, best_metrics)

# Entry point: announce start-up and run the whole experiment pipeline when
# this file is executed directly rather than imported.
if __name__ == "__main__":
    print("Iniciando o programa...")
    main()