In [None]:
# Installation: third-party packages installed with pip for this notebook.
!pip install fasttext
!pip install accelerate -U

# numpy is pinned to an exact version (1.26.4).
# NOTE(review): presumably this keeps numpy on the 1.x series for the packages
# above — confirm. Because the pin is applied after those installs, the runtime
# may need a restart before the pinned version is the one actually imported.
!pip install numpy==1.26.4

In [None]:
# Cell 1: installation and environment setup


# Mount Google Drive at /content/drive; the dataset path and the results
# directory used by the later cells both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import os
import re
import time
import nltk
import spacy

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datetime import datetime



In [None]:
# Cell 2: configuration, data loading and initial analysis

# Location of the input CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/Projeto_Ouvidoria/Data/ouvidoria_sintetico.csv'

# Dataset identifier: the CSV file name without directory and extension.
# It is reused in plot titles and output file names.
csv_file_name = os.path.basename(file_path)
DATASET_ID = os.path.splitext(csv_file_name)[0]

# Read the semicolon-separated file. Rows pandas cannot parse are skipped
# (on_bad_lines='skip'). A missing file is reported and leaves `df` as None.
try:
    df = pd.read_csv(file_path, sep=';', encoding='utf-8', on_bad_lines='skip')
except FileNotFoundError:
    print(f"ERRO: O arquivo não foi encontrado no caminho '{file_path}'.")
    print("Verifique se o caminho no Google Drive está correto.")
    df = None

if df is not None:
    # Label-encode every object-dtype column on a copy, so that all columns
    # are numeric before the correlation matrix is computed.
    df_encoded = df.copy()
    object_columns = [col for col in df_encoded.columns if df_encoded[col].dtype == 'object']
    for col in object_columns:
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

    correlation_matrix = df_encoded.corr()

    # Annotated heatmap of the pairwise correlations.
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title(f'Matriz de Correlação - {DATASET_ID}', fontsize=16)
    plt.show()

In [None]:
# Cell 3: preprocessing and validation strategy

# Load the Portuguese spaCy pipeline with the parser and NER components
# disabled. If the model package is missing, download it and load again.
try:
    nlp = spacy.load('pt_core_news_sm', disable=['parser', 'ner'])
except OSError:
    # Use spaCy's own downloader instead of shelling out to a bare `python`:
    # os.system() runs whichever interpreter is first on PATH (not necessarily
    # the one backing this kernel) and its exit status was being ignored.
    from spacy.cli import download as spacy_download
    spacy_download('pt_core_news_sm')
    nlp = spacy.load('pt_core_news_sm', disable=['parser', 'ner'])

# Portuguese stop-word list from NLTK; the corpus is downloaded on first use.
try:
    stopwords_pt = nltk.corpus.stopwords.words('portuguese')
except LookupError:
    nltk.download('stopwords')
    stopwords_pt = nltk.corpus.stopwords.words('portuguese')

def preprocess_text_advanced(text):
    """Normalise a text into a space-separated string of lemmas.

    URLs (anything starting with ``http``) and digit runs are removed, the
    remainder is lower-cased and run through the module-level ``nlp``
    pipeline. A token's lemma is kept only when the token is alphabetic, the
    lemma is not in ``stopwords_pt`` and the lemma has more than two
    characters.

    Args:
        text: The raw text to clean.

    Returns:
        The kept lemmas joined by single spaces, or an empty string when
        ``text`` is not a ``str`` or ``nlp`` is falsy.
    """
    if not isinstance(text, str):
        return ""
    if not nlp:
        return ""

    without_urls = re.sub(r'http\S+', '', text)
    without_digits = re.sub(r'\d+', '', without_urls)

    kept_lemmas = []
    for tok in nlp(without_digits.lower()):
        if not tok.is_alpha:
            continue
        lemma = tok.lemma_
        if lemma in stopwords_pt or len(lemma) <= 2:
            continue
        kept_lemmas.append(lemma)

    return " ".join(kept_lemmas)


# Fail early with a clear message when the loading cell could not read the
# CSV (it sets `df` to None in that case) instead of an AttributeError below.
if df is None:
    raise RuntimeError(
        "O DataFrame 'df' não foi carregado. Execute a Célula 2 com um caminho "
        "de arquivo válido antes de continuar."
    )

# Keep only rows that have both the complaint text and its category. The
# result is an independent copy, so `df` itself is never modified.
df_model = df.dropna(subset=['Texto da Demanda', 'Categoria']).copy()
print(df_model.shape)
print("\nIniciando pré-processamento avançado do texto (lematização).")
df_model['texto_processado'] = df_model['Texto da Demanda'].apply(preprocess_text_advanced)
print("Pré-processamento concluído.")


# Encode the category names as integer class ids; `le` is reused by the
# evaluation cells to map ids back to names.
le = LabelEncoder()
df_model['categoria_encoded'] = le.fit_transform(df_model['Categoria'])


X = df_model['texto_processado']
y = df_model['categoria_encoded']
print(f"Dataset final com {len(df_model)} amostras preparado para o treinamento.")


# 5 folds x 10 repetitions. A fixed integer seed makes every call to
# `rskf.split(X, y)` yield the same partitions, so all models are evaluated
# on identical folds and a rerun reproduces the same numbers. Without it each
# evaluation cell would draw its own, different, random folds.
RANDOM_STATE = 42
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=RANDOM_STATE)
N_RUNS = rskf.get_n_splits(X, y)
print(f"Estratégia de validação definida: {N_RUNS} execuções por modelo.")


In [None]:
# Cell 4: evaluation of the scikit-learn models

# Each execution writes into its own folder, named after the dataset id and
# the current timestamp. The later model cells reuse `output_dir`.
current_time = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
execution_folder_name = "Execucao_" + DATASET_ID + "_" + current_time
output_dir = os.path.join(
    '/content/drive/MyDrive/Colab Notebooks/Projeto_Ouvidoria/Results/',
    execution_folder_name,
)
os.makedirs(output_dir, exist_ok=True)
print(f"Resultados desta execução serão salvos em: {output_dir}")

# Display name -> estimator, evaluated in this order on TF-IDF features.
models_to_run = dict([
    ("Logistic Regression", LogisticRegression(C=10, max_iter=1500)),
    ("Multinomial Naive Bayes", MultinomialNB(alpha=0.1)),
    ("Linear SVM", LinearSVC(C=10, dual='auto')),
])

# Values that do not change across models or folds: the number of classes and
# the encoded label order used for the confusion-matrix rows and columns.
num_classes = len(le.classes_)
labels_ordered = le.transform(le.classes_)

for name, model in models_to_run.items():
    print(f"\nAvaliando o modelo: {name}")
    fold_results = []
    # Confusion matrices of every fold are accumulated here.
    sum_cm = np.zeros((num_classes, num_classes), dtype=int)

    for i, (train_index, val_index) in enumerate(rskf.split(X, y), 1):
        print(f"   Executando Fold {i}/{N_RUNS}...", end='\r')
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # The TF-IDF vocabulary and weights are fitted on the training part of
        # the fold only and then applied to the validation part.
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 1))
        X_train_fold_tfidf = vectorizer.fit_transform(X_train_fold)
        X_val_fold_tfidf = vectorizer.transform(X_val_fold)

        model.fit(X_train_fold_tfidf, y_train_fold)
        y_pred = model.predict(X_val_fold_tfidf)

        cm = confusion_matrix(y_val_fold, y_pred, labels=labels_ordered)
        sum_cm += cm

        # Macro-averaged metrics for this fold.
        accuracy = accuracy_score(y_val_fold, y_pred)
        precision = precision_score(y_val_fold, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_val_fold, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, average='macro', zero_division=0)

        # Per-class one-vs-rest counts derived from the confusion matrix
        # (column sums / row sums minus the diagonal); they are summed over
        # all classes before being stored.
        FP = cm.sum(axis=0) - np.diag(cm)
        FN = cm.sum(axis=1) - np.diag(cm)
        TP = np.diag(cm)
        TN = cm.sum() - (FP + FN + TP)

        fold_results.append({
            'Run': i, 'Accuracy': accuracy, 'F1_Score_Macro': f1,
            'Precision_Macro': precision, 'Recall_Macro': recall,
            'TP': TP.sum(), 'FP': FP.sum(), 'FN': FN.sum(), 'TN': TN.sum()
        })

    # File-name-safe version of the model's display name.
    file_tag = name.replace(' ', '_')

    # One row per fold, saved as CSV for the final report cell.
    detailed_results_df = pd.DataFrame(fold_results)
    base_filename = f"{DATASET_ID}_detailed_results_{file_tag}.csv"

    full_path_to_save = os.path.join(output_dir, base_filename)
    detailed_results_df.to_csv(full_path_to_save, index=False, float_format='%.5f')
    print(f"\nResultados detalhados para '{name}' salvos.")

    # NOTE(review): the division by N_RUNS is disabled, so this is the SUM of
    # the fold matrices, while the file name, message and plot title all call
    # it an average. Confirm which one is intended and align the labels.
    avg_cm = sum_cm #/ N_RUNS
    cm_df = pd.DataFrame(avg_cm, index=le.classes_, columns=le.classes_)

    cm_filename = f"{DATASET_ID}_avg_cm_{file_tag}.csv"

    cm_full_path = os.path.join(output_dir, cm_filename)
    cm_df.to_csv(cm_full_path, float_format='%.2f')
    print(f"Matriz de confusão média para '{name}' salva.")

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_df, annot=True, fmt='.2f', cmap='Blues')
    plt.title(f'Matriz de Confusão Média ({N_RUNS} execuções) - {name}')
    plt.ylabel('Classe Verdadeira')
    plt.xlabel('Classe Prevista')
    plt.show()

print("\nAvaliação dos modelos TF-IDF concluída!")
print("=====================================================\n")

In [None]:
# Célula 5: Modelo FastText

import fasttext
import time
import numpy as np
import pandas as pd
from IPython.display import display

fold_results_ft = []
num_classes = len(le.classes_)
# Encoded label order for the confusion-matrix axes (same for every fold).
labels_ordered = le.transform(le.classes_)
sum_cm_ft = np.zeros((num_classes, num_classes), dtype=int)

# fastText marks the label of a training line with this prefix.
FT_LABEL_PREFIX = "__label__"
# Training file, rewritten on every fold.
train_file = 'fasttext.train'

for i, (train_index, val_index) in enumerate(rskf.split(X, y), 1):
    print(f"Executando Fold {i}/{N_RUNS}...", end='\r')

    # One example per line: "__label__<class id> <text>".
    # The encoded integer id is used as the label instead of the category
    # name. The previous name-based scheme replaced spaces with underscores
    # when writing and underscores with spaces when reading back, which
    # corrupts any category that already contains an underscore (or is not a
    # string) and makes `le.transform` fail on the prediction.
    with open(train_file, 'w', encoding='utf-8') as f:
        for idx in train_index:
            f.write(f"{FT_LABEL_PREFIX}{y.iloc[idx]} {X.iloc[idx]}\n")

    model_ft = fasttext.train_supervised(input=train_file, epoch=25, lr=1.0, wordNgrams=2, dim=320)

    X_val_fold_list = X.iloc[val_index].tolist()
    y_val_fold_encoded = y.iloc[val_index]

    # Top-1 prediction per validation text; strip the prefix to recover the
    # integer class id directly.
    preds_raw = model_ft.predict(X_val_fold_list, k=1)
    y_pred_encoded = np.array([int(pred[0][len(FT_LABEL_PREFIX):]) for pred in preds_raw[0]])

    cm = confusion_matrix(y_val_fold_encoded, y_pred_encoded, labels=labels_ordered)
    sum_cm_ft += cm

    # Macro-averaged metrics for this fold.
    accuracy = accuracy_score(y_val_fold_encoded, y_pred_encoded)
    precision = precision_score(y_val_fold_encoded, y_pred_encoded, average='macro', zero_division=0)
    recall = recall_score(y_val_fold_encoded, y_pred_encoded, average='macro', zero_division=0)
    f1 = f1_score(y_val_fold_encoded, y_pred_encoded, average='macro', zero_division=0)

    # Per-class one-vs-rest counts derived from the confusion matrix; they
    # are summed over all classes before being stored.
    FP = cm.sum(axis=0) - np.diag(cm)
    FN = cm.sum(axis=1) - np.diag(cm)
    TP = np.diag(cm)
    TN = cm.sum() - (FP + FN + TP)

    fold_results_ft.append({
        'Run': i, 'Accuracy': accuracy, 'F1_Score_Macro': f1,
        'Precision_Macro': precision, 'Recall_Macro': recall,
        'TP': TP.sum(), 'FP': FP.sum(), 'FN': FN.sum(), 'TN': TN.sum()
    })


# One row per fold, saved as CSV for the final report cell.
detailed_results_df_ft = pd.DataFrame(fold_results_ft)
base_filename_ft = f"{DATASET_ID}_detailed_results_FastText.csv"

full_path_to_save_ft = os.path.join(output_dir, base_filename_ft)
detailed_results_df_ft.to_csv(full_path_to_save_ft, index=False, float_format='%.5f')

# NOTE(review): the division by N_RUNS is disabled, so this is the SUM of the
# fold matrices although the file name and plot title call it an average.
avg_cm_ft = sum_cm_ft #/ N_RUNS
cm_df_ft = pd.DataFrame(avg_cm_ft, index=le.classes_, columns=le.classes_)

cm_filename_ft = f"{DATASET_ID}_avg_cm_FastText.csv"

cm_full_path_ft = os.path.join(output_dir, cm_filename_ft)
cm_df_ft.to_csv(cm_full_path_ft, float_format='%.2f')


plt.figure(figsize=(10, 8))
sns.heatmap(cm_df_ft, annot=True, fmt='.2f', cmap='Greens')
plt.title(f'Matriz de Confusão Média ({N_RUNS} execuções) - FastText')
plt.ylabel('Classe Verdadeira')
plt.xlabel('Classe Prevista')
plt.show()


In [None]:
# Célula 6: BERTimbau

import torch
import time
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from IPython.display import display





import gc  # used below to release the previous fold's model before the next one

fold_results_bert = []
num_classes = len(le.classes_)
# Encoded label order for the confusion-matrix axes (same for every fold).
labels_ordered = le.transform(le.classes_)
sum_cm_bert = np.zeros((num_classes, num_classes), dtype=int)

model_name = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the 'text' field of a batch, padded/truncated to 128 tokens."""
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

# NOTE(review): `X` holds the lower-cased, lemmatised, stop-word-free text
# built in Cell 3, while the checkpoint above is a *cased* model. Confirm
# whether the raw 'Texto da Demanda' column should be used here instead.

model_bert = None
trainer = None

for i, (train_index, val_index) in enumerate(rskf.split(X, y), 1):

    # Drop the references to the previous fold's model and trainer BEFORE a
    # new model is created. Otherwise the old model (and its optimizer state)
    # stays alive while the new one is moved to the device, so two models
    # occupy GPU memory at once on every fold. The last fold's objects remain
    # available after the loop.
    model_bert = None
    trainer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    train_df_fold = pd.DataFrame({'text': X.iloc[train_index], 'label': y.iloc[train_index]})
    val_df_fold = pd.DataFrame({'text': X.iloc[val_index], 'label': y.iloc[val_index]})

    train_dataset = Dataset.from_pandas(train_df_fold).map(tokenize, batched=True, load_from_cache_file=False)
    val_dataset = Dataset.from_pandas(val_df_fold).map(tokenize, batched=True, load_from_cache_file=False)
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    # A fresh pretrained model per fold, so no weights carry over between folds.
    model_bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_))

    training_args = TrainingArguments(
        output_dir=f'./bert_results_fold_{i}',
        num_train_epochs=5,
        learning_rate=4.246e-05,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        warmup_ratio=0.054,
        weight_decay=0.023,
        fp16=torch.cuda.is_available(),
        logging_strategy="no",
        save_strategy="no",
        disable_tqdm=False,
        report_to="none"
    )

    trainer = Trainer(model=model_bert, args=training_args, train_dataset=train_dataset)
    trainer.train()

    # Predicted class = index of the largest score in each output row.
    predictions_output = trainer.predict(val_dataset)
    y_true_encoded = predictions_output.label_ids
    preds_encoded = np.argmax(predictions_output.predictions, axis=1)

    cm = confusion_matrix(y_true_encoded, preds_encoded, labels=labels_ordered)
    sum_cm_bert += cm

    # Macro-averaged metrics for this fold.
    accuracy = accuracy_score(y_true_encoded, preds_encoded)
    precision = precision_score(y_true_encoded, preds_encoded, average='macro', zero_division=0)
    recall = recall_score(y_true_encoded, preds_encoded, average='macro', zero_division=0)
    f1 = f1_score(y_true_encoded, preds_encoded, average='macro', zero_division=0)

    # Per-class one-vs-rest counts derived from the confusion matrix; they
    # are summed over all classes before being stored.
    FP = cm.sum(axis=0) - np.diag(cm)
    FN = cm.sum(axis=1) - np.diag(cm)
    TP = np.diag(cm)
    TN = cm.sum() - (FP + FN + TP)

    fold_results_bert.append({
        'Run': i, 'Accuracy': accuracy, 'F1_Score_Macro': f1,
        'Precision_Macro': precision, 'Recall_Macro': recall,
        'TP': TP.sum(), 'FP': FP.sum(), 'FN': FN.sum(), 'TN': TN.sum()
    })


# One row per fold, saved as CSV for the final report cell.
detailed_results_df_bert = pd.DataFrame(fold_results_bert)
base_filename_bert = f"{DATASET_ID}_detailed_results_BERTimbau.csv"

full_path_to_save_bert = os.path.join(output_dir, base_filename_bert)
detailed_results_df_bert.to_csv(full_path_to_save_bert, index=False, float_format='%.5f')

# NOTE(review): the division by N_RUNS is disabled, so this is the SUM of the
# fold matrices although the file name, message and plot title call it an
# average.
avg_cm_bert = sum_cm_bert #/ N_RUNS
cm_df_bert = pd.DataFrame(avg_cm_bert, index=le.classes_, columns=le.classes_)

cm_filename_bert = f"{DATASET_ID}_avg_cm_BERTimbau.csv"

cm_full_path_bert = os.path.join(output_dir, cm_filename_bert)
cm_df_bert.to_csv(cm_full_path_bert, float_format='%.2f')
print(f"Matriz de confusão média para 'BERTimbau' salva.")


plt.figure(figsize=(10, 8))
sns.heatmap(cm_df_bert, annot=True, fmt='.2f', cmap='OrRd')
plt.title(f'Matriz de Confusão Média ({N_RUNS} execuções) - BERTimbau')
plt.ylabel('Classe Verdadeira')
plt.xlabel('Classe Prevista')
plt.show()



In [None]:
# Cell 7: final report — summary table and comparison charts

import os
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# Parent folder holding one sub-folder per execution.
results_parent_dir = '/content/drive/MyDrive/Colab Notebooks/Projeto_Ouvidoria/Results/'
# Stays None when no execution folder can be found.
results_dir = None

try:
    # Names of the entries under the parent folder that are directories.
    all_run_folders = []
    for entry in os.listdir(results_parent_dir):
        if os.path.isdir(os.path.join(results_parent_dir, entry)):
            all_run_folders.append(entry)

    if len(all_run_folders) == 0:
        raise FileNotFoundError("Nenhuma pasta de execução foi encontrada no diretório 'Results'.")

    # The execution folder with the most recent modification time wins.
    latest_run_dir = max(
        (os.path.join(results_parent_dir, folder) for folder in all_run_folders),
        key=os.path.getmtime,
    )
    results_dir = latest_run_dir
    print(f"Usando a pasta de execução mais recente: {results_dir}")

except FileNotFoundError as e:
    print(f"Erro: {e}")

def plot_metric_comparison(dataframe, metric_col, std_col, title_suffix):
    """Draw a horizontal bar chart comparing the models on one metric.

    Args:
        dataframe: Summary table with a 'Modelo' column plus the columns
            named by ``metric_col`` and ``std_col``.
        metric_col: Column whose values give the bar lengths; rows are
            sorted by it in ascending order before plotting.
        std_col: Column whose values are drawn as horizontal error bars.
        title_suffix: Text appended to the chart title.
    """
    plot_df = dataframe.sort_values(by=metric_col, ascending=True)
    fig, ax = plt.subplots(figsize=(12, 7))
    bars = ax.barh(plot_df['Modelo'], plot_df[metric_col], xerr=plot_df[std_col], align='center', alpha=0.85, ecolor='black', capsize=5)
    ax.bar_label(bars, fmt='%.4f', padding=5, fontsize=10)
    ax.set_xlabel(metric_col.replace('_', ' '))
    ax.set_ylabel('Modelo')
    ax.set_title(f"Comparação de Modelos {title_suffix}")
    if not plot_df.empty:
        # Zoom the x axis to the range of the values (5% margin on each side,
        # never below zero) so small differences between models stay visible.
        ax.set_xlim(left=max(0, plot_df[metric_col].min() * 0.95), right=plot_df[metric_col].max() * 1.05)
    ax.grid(axis='x', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()


if results_dir:

    # Per-fold result files written by the evaluation cells. sorted() makes
    # the processing order independent of os.listdir's arbitrary order.
    files_found = sorted(f for f in os.listdir(results_dir) if f.endswith('.csv') and 'detailed_results' in f)

    # Diagnostic: show the columns of the first file so that a mismatch with
    # the column names used below is easy to spot.
    if files_found:
        try:
            sample_df = pd.read_csv(os.path.join(results_dir, files_found[0]))
            print(f"\nColunas encontradas no arquivo de exemplo '{files_found[0]}':")
            print(sample_df.columns.tolist())
        except Exception as e:
            print(f"Não foi possível ler o arquivo de exemplo para verificar as colunas: {e}")

    final_results_list = []
    print(f"\nGerando Relatório Final...\n")

    for file_name in files_found:
        try:
            # Display name recovered from the file name. A dedicated variable
            # is used so the global `model_name` (the BERT checkpoint id set
            # in the BERTimbau cell) is not overwritten by this loop.
            report_model_name = file_name.split('_detailed_results_')[-1].replace('.csv', '').replace('_', ' ')
            full_path = os.path.join(results_dir, file_name)
            df_result = pd.read_csv(full_path)

            # Mean and standard deviation of each metric over all folds.
            final_results_list.append({
                'Modelo': report_model_name,
                'Acurácia Média': df_result['Accuracy'].mean(),
                'Acurácia Desv. Padrão': df_result['Accuracy'].std(),
                'F1-Score Médio': df_result['F1_Score_Macro'].mean(),
                'F1-Score Desv. Padrão': df_result['F1_Score_Macro'].std(),
                'Precisão Média': df_result['Precision_Macro'].mean(),
                'Precisão Desv. Padrão': df_result['Precision_Macro'].std(),
                'Recall Média': df_result['Recall_Macro'].mean(),
                'Recall Desv. Padrão': df_result['Recall_Macro'].std(),
            })
            print(f"-> Resultados de '{report_model_name}' processados com sucesso.")
        except KeyError as e:
            print(f"-> ERRO DE CHAVE (Coluna não encontrada): {e}. Verifique se o nome da coluna está correto no código e no CSV para o arquivo '{file_name}'.")
        except Exception as e:
            print(f"-> ERRO ao processar o arquivo '{file_name}': {e}")

    if not final_results_list:
        print("\nNenhum arquivo de resultado foi processado. A análise não pode continuar.")
    else:
        # Summary table, best macro F1 first, displayed and saved next to the
        # per-fold files.
        final_summary_df = pd.DataFrame(final_results_list)
        final_summary_df_display = final_summary_df.sort_values(by='F1-Score Médio', ascending=False).reset_index(drop=True)

        display(final_summary_df_display)

        summary_base_filename = 'resumo_geral_modelos.csv'
        summary_full_path = os.path.join(results_dir, summary_base_filename)
        final_summary_df_display.to_csv(summary_full_path, index=False, float_format='%.5f')

        plot_metric_comparison(final_summary_df, 'F1-Score Médio', 'F1-Score Desv. Padrão', 'por F1-Score')
        plot_metric_comparison(final_summary_df, 'Acurácia Média', 'Acurácia Desv. Padrão', 'por Acurácia')