# Pipeline Benchmark Zeroshot

- Acurácia
- Precisão
- Recall
- F1 Score

In [None]:
import requests
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import warnings
from pysentimiento import create_analyzer

warnings.filterwarnings('ignore')

In [None]:
def load_prompt(path: str) -> str:
    """Return the entire contents of the prompt file at ``path``, read as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as prompt_file:
        contents = prompt_file.read()
    return contents

In [None]:
# --- SERVER SETTINGS ---
# Ollama text-generation endpoints: a remote deployment and a local instance.
REMOTE_OLLAMA_URL = "https://ollama.fuzzylab.online/api/generate"
LOCAL_OLLAMA_URL = "http://localhost:11434/api/generate"
# Model name used by the connection smoke test.
DEFAULT_MODEL = "gemma3:4b"

# Evaluation dataset (JSON) and the zero-shot prompt text, loaded once at cell execution.
DATASET_FILE = './data/dataset_teste_balanceado.json'
SYSTEM_PROMPT = load_prompt('./data/prompts/zeroshot.md')

In [None]:
# Minimal non-streaming request used to check that the local Ollama endpoint answers.
payload = {
    "model": DEFAULT_MODEL,
    "prompt": "Teste de conex√£o com o modelo. Responda apenas OK.",
    "system": "Voc√™ √© um modelo de teste.",
    "stream": False,
    "options": {"temperature": 0.0}
}

try:
    response = requests.post(LOCAL_OLLAMA_URL, json=payload, timeout=120)
    # Turn 4xx/5xx statuses into HTTPError so they reach the dedicated handler below.
    response.raise_for_status()
    result = response.json()
    print("[SUCESS]")
    print(result.get("response", "Sem resposta"))
except requests.exceptions.Timeout:
    print("[ERROR - Timeout]: a API demorou demais para responder.")
except requests.exceptions.HTTPError as err:
    print(f"[HTTP-ERROR]: {err}")
except Exception as e:
    # Any other failure is reported instead of raised, so the notebook keeps running.
    print(f"[ERROR-Except]: {e}")


✅ Conexão OK. Resposta do modelo:
OK


In [None]:
# Seed value stored in every configuration entry below.
# NOTE(review): confirm that the benchmark runner actually forwards it to the backend.
SEED_VALUE = 42

# One dict per benchmark run. Keys:
#   model_name    - model identifier sent to the backend
#   model_display - short label used in plots
#   config_name   - unique name of the run (also used as the results key)
#   temperature / top_p / top_k - sampling options
#   seed          - sampling seed (SEED_VALUE)
#   type          - backend selector; both entries here use "ollama"
ALL_CONFIGS = [
    {
        "model_name": "gemma3:4b",
        "model_display": "Gemma3:4b",
        "config_name": "Gemma3:4b",
        "temperature": 0.1,
        "top_p": 1.0,
        "top_k": 1,
        "seed": SEED_VALUE,
        "type": "ollama"
    },
    {
        "model_name": "llama3.2:3b",
        "model_display": "Llama3.2:3b",
        "config_name": "Llama3.2:3b",
        "temperature": 0.1,
        "top_p": 1.0,
        "top_k": 1,
        "seed": SEED_VALUE,
        "type": "ollama"
    },
]

In [None]:
# Every reply that could not be mapped to a label is recorded here
# (refusals and unrecognised formats), for later inspection.
erros_formato_log = []

def normalize_response(text, input_text, debug=False):
    """Map a raw model reply onto one of the benchmark labels.

    Matching is performed on an upper-cased copy of the reply with diacritics
    removed, so accented and unaccented spellings of the labels and of the
    refusal phrases are treated alike. Without that folding, an accented
    negation followed by "MISOGINO" was classified as "MISOGINO" because the
    token rules only looked for the unaccented "NAO".

    Args:
        text: Raw reply from the model, or None when no reply was obtained.
        input_text: The text that was classified; only stored in the error log.
        debug: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(label, cleaned, rule)``:
        label is "MISOGINO", "NAO_MISOGINO", "ERRO_FORMATO" or "ERRO_API";
        cleaned is the stripped, upper-cased reply (accents preserved);
        rule names the rule that produced the label.

    Side effects:
        Appends an entry to ``erros_formato_log`` for refusals and for replies
        in an unrecognised format.
    """
    import unicodedata

    if text is None:
        return ("ERRO_API", "", "API_NULL")

    clean = text.strip().upper().replace("\n", " ").strip()

    # Accent-insensitive view used for all matching below: decompose each
    # character and drop the combining marks.
    folded = "".join(
        ch for ch in unicodedata.normalize("NFD", clean)
        if not unicodedata.combining(ch)
    )

    # REFUSAL PATTERNS (unaccented; compared against the folded reply)
    refusals = (
        "NAO POSSO", "NAO CONSIGO", "NAO SOU CAPAZ", "NAO ATENDO",
        "NAO FORNECO", "IMPROPRIO", "INAPROPRIADO",
        "CONTEUDO SENSIVEL", "POLITICAS",
    )

    for r in refusals:
        if r in folded:
            erros_formato_log.append({
                "input": input_text,
                "raw": text,
                "clean": clean,
                "tipo": "RECUSA"
            })
            return ("ERRO_FORMATO", clean, f"RECUSA: {r}")

    # EXPLICIT ANSWERS
    if folded == "MISOGINO":
        return ("MISOGINO", clean, "MATCH_EXPLICITO_MISOGINO")

    if folded in ("NAO_MISOGINO", "NAO MISOGINO"):
        return ("NAO_MISOGINO", clean, "MATCH_EXPLICITO_NAO_MISOGINO")

    # WEAK RULES (tokens): the negation check must run on the folded text,
    # otherwise an accented negation is missed and the label is inverted.
    if "MISOG" in folded and "NAO" not in folded:
        return ("MISOGINO", clean, "MATCH_TOKEN_MISOGINO")

    if "NAO" in folded and "MIS" in folded:
        return ("NAO_MISOGINO", clean, "MATCH_TOKEN_NAO_MISOGINO")

    # INVALID FORMAT
    erros_formato_log.append({
        "input": input_text,
        "raw": text,
        "clean": clean,
        "tipo": "FORMATO_INVALIDO"
    })

    return ("ERRO_FORMATO", clean, "FORMATO_INVALIDO")

def load_dataset():
    """Read the evaluation examples from ``DATASET_FILE``.

    Returns the parsed JSON content. When the file does not exist, a small
    built-in sample is returned instead; when it is not valid JSON, an empty
    list is returned. Each outcome is reported on stdout.
    """
    try:
        with open(DATASET_FILE, 'r', encoding='utf-8') as dataset_file:
            records = json.load(dataset_file)
    except FileNotFoundError:
        print(f"‚ö†Ô∏è Arquivo '{DATASET_FILE}' n√£o encontrado. Usando dados de exemplo.")
        fallback_examples = [
            {"input": "lugar de mulher √© na cozinha", "output": "MISOGINO"},
            {"input": "hoje o dia est√° lindo", "output": "NAO_MISOGINO"},
            {"input": "essa vadia n√£o serve pra nada", "output": "MISOGINO"},
            {"input": "a programa√ß√£o √© divertida", "output": "NAO_MISOGINO"},
            {"input": "mulheres s√£o emocionais demais", "output": "MISOGINO"},
            {"input": "vamos ao cinema hoje", "output": "NAO_MISOGINO"},
            {"input": "ela s√≥ serve pra cuidar da casa", "output": "MISOGINO"},
            {"input": "o livro √© muito interessante", "output": "NAO_MISOGINO"}
        ]
        return fallback_examples
    except json.JSONDecodeError:
        print(f"‚ùå Erro: JSON inv√°lido em '{DATASET_FILE}'.")
        return []

    # Reached only when the file was opened and parsed successfully.
    print(f"‚úÖ Dataset carregado de '{DATASET_FILE}': {len(records)} itens.")
    return records

def calculate_metrics(y_true, y_pred):
    """Compute binary classification metrics with "MISOGINO" as the positive class.

    Any label other than "MISOGINO" is treated as the negative class.

    Args:
        y_true: Sequence of ground-truth labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1_score``, the
        confusion-matrix counts ``tp``/``tn``/``fp``/``fn`` (ints) and
        ``error_rate`` ((fp + fn) / number of samples). Every value is zero
        when the input is empty.
    """
    if len(y_true) == 0:
        return {
            "accuracy": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "f1_score": 0.0,
            "tp": 0,
            "tn": 0,
            "fp": 0,
            "fn": 0,
            "error_rate": 0
        }

    # Convert to binary format
    y_true_bin = [1 if x == "MISOGINO" else 0 for x in y_true]
    y_pred_bin = [1 if x == "MISOGINO" else 0 for x in y_pred]

    accuracy = accuracy_score(y_true_bin, y_pred_bin)
    precision = precision_score(y_true_bin, y_pred_bin, zero_division=0)
    recall = recall_score(y_true_bin, y_pred_bin, zero_division=0)
    f1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)

    # Confusion matrix. labels=[0, 1] forces a 2x2 result even when only one
    # class occurs in the data; without it ravel() yields a single value and
    # the four-way unpacking raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel()

    # Error rate
    error_rate = (fp + fn) / len(y_true)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "tp": int(tp),
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "error_rate": error_rate
    }

## Benchmark

In [None]:
class BenchmarkRunner:
    """Run a labelled dataset through several model configurations and score each one.

    The ``type`` key of a configuration selects the back end: ``'ollama'``
    (HTTP requests to the configured Ollama URL) or ``'pysentimiento'``
    (a local hate-speech analyzer). Per-configuration results accumulate in
    ``self.results`` and per-example predictions in
    ``self.detailed_predictions`` (keyed by ``config_name``).
    """

    # Labels returned by normalize_response when no usable prediction exists.
    ERROR_LABELS = ("ERRO_FORMATO", "ERRO_API")

    def __init__(self, is_remote=True, system_prompt="", verbose=True, ollama_url=None):
        """
          is_remote (bool): If True, use the remote API. If False, use the local Ollama.
          system_prompt (str): Prompt text used for classification.
          verbose (bool): If True, print details for every example.
          ollama_url (str): Custom Ollama URL (takes precedence over is_remote).
        """

        self.verbose = verbose
        self.system_prompt = system_prompt
        self.results = []
        self.detailed_predictions = {}
        # Created lazily the first time a 'pysentimiento' configuration is run.
        self.hate_analyzer = None

        if ollama_url:
            self.ollama_url = ollama_url
            print(f"Usando URL personalizada para Ollama: {self.ollama_url}")
        elif is_remote:
            self.ollama_url = REMOTE_OLLAMA_URL
            print(f"Usando API remota Ollama: {self.ollama_url}")
        else:
            self.ollama_url = LOCAL_OLLAMA_URL
            print(f"Usando API local Ollama: {self.ollama_url}")

    def run_benchmark(self, configs, dataset, max_retries=2):
        """Run the benchmark for every configuration.

        Args:
            configs: List of configuration dicts (see ``_test_single_config``).
            dataset: List of dicts with ``input`` (text) and ``output`` (label).
            max_retries: Extra attempts per request after the first failure.

        Returns:
            A DataFrame with one row per configuration run so far by this
            runner (``self.results`` is never cleared between calls).
        """
        print(f"üöÄ Iniciando benchmark com {len(configs)} configura√ß√µes...")
        print(f"üìä Dataset com {len(dataset)} exemplos\n")

        total_runs = len(configs)

        for idx, config in enumerate(configs, 1):
            print(f"\n{'='*60}")
            print(f"‚ñ∂Ô∏è Executando {idx}/{total_runs}: {config['config_name']}")
            print(f"   Modelo: {config['model_name']}")
            model_type = config.get('type', 'ollama')
            print(f"   Tipo: {model_type}")
            if model_type == 'ollama':
                print(f"   Temperature: {config['temperature']}")
                print(f"   Top-p: {config['top_p']}")
                print(f"   Top-k: {config['top_k']}")
            print(f"{'='*60}")

            result = self._test_single_config(config, dataset, max_retries)
            self.results.append(result)

            # Keep the detailed predictions
            self.detailed_predictions[config['config_name']] = result['predictions']

            # Configuration summary
            print(f"\nüìä Resumo da configura√ß√£o '{config['config_name']}':")
            print(f"   ‚Ä¢ F1-Score: {result['f1_score']:.3f}")
            print(f"   ‚Ä¢ Acur√°cia: {result['accuracy']:.2%}")
            print(f"   ‚Ä¢ Precis√£o: {result['precision']:.2%}")
            print(f"   ‚Ä¢ Recall: {result['recall']:.2%}")
            print(f"   ‚Ä¢ Tempo total: {result['total_time_seconds']:.1f}s")
            print(f"   ‚Ä¢ Erros: {result['errors']}/{len(dataset)}")

        print(f"\n{'='*60}")
        print("‚úÖ Benchmark finalizado!")
        print(f"{'='*60}")

        return pd.DataFrame(self.results)

    def _build_ollama_payload(self, config, text):
        """Build the request body for one example.

        The sampling ``seed`` is included whenever the configuration defines
        one, so runs with a fixed seed are reproducible.
        """
        options = {
            "temperature": config["temperature"],
            "top_p": config["top_p"],
            "top_k": config["top_k"]
        }
        if config.get("seed") is not None:
            options["seed"] = config["seed"]

        # NOTE(review): the same template is sent both as the prompt (with
        # {{INPUT}} filled in) and, unfilled, as the system message. Kept as
        # in the original; confirm this duplication is intended.
        return {
            "model": config["model_name"],
            "prompt": self.system_prompt.replace("{{INPUT}}", text),
            "system": self.system_prompt,
            "stream": False,
            "options": options
        }

    def _query_ollama(self, payload, max_retries):
        """POST ``payload`` to the Ollama URL, retrying on failure.

        Returns ``(response_text, elapsed_seconds)`` on success and
        ``(None, None)`` when every attempt failed.
        """
        for attempt in range(max_retries + 1):
            try:
                req_start = time.time()
                response = requests.post(self.ollama_url, json=payload, timeout=300)
                request_time = time.time() - req_start

                if response.status_code == 200:
                    return response.json().get("response", ""), request_time

                if attempt < max_retries:
                    print(f"   ‚ö†Ô∏è Tentativa {attempt + 1} falhou (HTTP {response.status_code}), tentando novamente...")
                    time.sleep(1)
                else:
                    print(f"   ‚ùå Falha ap√≥s {max_retries} tentativas")

            except requests.exceptions.Timeout:
                if attempt < max_retries:
                    print(f"   ‚è±Ô∏è Timeout na tentativa {attempt + 1}, tentando novamente...")
                    time.sleep(2)
                else:
                    print("   ‚ùå Timeout ap√≥s todas as tentativas")
            except Exception as e:
                if attempt < max_retries:
                    print(f"   ‚ö†Ô∏è Erro: {e}, tentando novamente...")
                    time.sleep(1)
                else:
                    print(f"   ‚ùå Erro persistente: {e}")

        return None, None

    def _test_single_config(self, config, dataset, max_retries):
        """Evaluate one configuration on the whole dataset.

        Args:
            config: Dict with ``model_name``, ``model_display``, ``config_name``
                and optional ``type`` (default ``'ollama'``); Ollama
                configurations also need ``temperature``, ``top_p``, ``top_k``
                and may carry ``seed``.
            dataset: List of dicts with ``input`` and ``output``.
            max_retries: Extra attempts per Ollama request.

        Returns:
            Dict with the configuration fields, the metrics, error counts,
            timing figures, the per-example ``predictions`` list and the raw
            ``metrics`` dict. Examples whose reply could not be interpreted
            are counted in ``errors`` and excluded from the metrics.
        """
        start_time = time.time()

        y_true = []
        y_pred = []
        all_predictions = []
        errors = 0
        request_times = []

        # Initialise PySentimiento if needed
        model_type = config.get('type', 'ollama')
        if model_type == 'pysentimiento' and self.hate_analyzer is None:
            print("   ‚è≥ Inicializando PySentimiento...")
            try:
                # Use Portuguese to obtain the specific categories
                self.hate_analyzer = create_analyzer(task="hate_speech", lang="pt")
                print("   ‚úÖ PySentimiento inicializado (modelo em portugu√™s)")
            except Exception as e:
                print(f"   ‚ùå Erro ao inicializar PySentimiento: {e}")
                self.hate_analyzer = None

        for i, item in enumerate(dataset):
            # Reset per-example state so a failed prediction can never reuse
            # the analysis of the previous example.
            analysis = None
            sexism_prob = None
            raw_output = None

            if model_type == 'pysentimiento':
                # If the analyzer failed to initialise, raw_output stays None
                # and the example is recorded as an API error instead of being
                # sent to Ollama with a non-Ollama configuration.
                if self.hate_analyzer is not None:
                    try:
                        req_start = time.time()
                        analysis = self.hate_analyzer.predict(item['input'])
                        request_time = time.time() - req_start

                        # Detailed debug output for the first examples
                        if i < 3:
                            print(f"   üîç PySentimiento - Exemplo {i+1}:")
                            print(f"      Texto: {item['input'][:50]}...")
                            print(f"      Output: {analysis.output}")
                            print(f"      Probabilidades: {analysis.probas}")

                        sexism_prob = analysis.probas.get('Sexism', 0)

                        if i < 3:
                            print(f"      Sexism probability: {sexism_prob}")

                        # A Sexism probability above 0.5 is classified as MISOGINO
                        if sexism_prob > 0.5:
                            raw_output = "MISOGINO"
                        else:
                            raw_output = "NAO_MISOGINO"

                        request_times.append(request_time)

                    except Exception as e:
                        print(f"   ‚ùå Erro no PySentimiento: {e}")
                        import traceback
                        traceback.print_exc()
                        analysis = None
                        raw_output = None

            else:
                payload = self._build_ollama_payload(config, item['input'])
                raw_output, request_time = self._query_ollama(payload, max_retries)
                if request_time is not None:
                    request_times.append(request_time)

            # Normalise the reply
            norm, clean, debug_rule = normalize_response(raw_output, item["input"])

            prediction = norm
            ground_truth = item["output"].upper()
            # Decide once, before the label is rewritten to "ERRO" below.
            is_error = norm in self.ERROR_LABELS
            error_type = norm if is_error else None

            # Detailed per-example output
            if self.verbose:
                print("\n" + "‚îÄ" * 50)
                print(f"üìù Exemplo {i+1}/{len(dataset)}")
                print("‚îÄ" * 50)
                print(f"üìÑ Texto: \"{item['input'][:100]}{'...' if len(item['input']) > 100 else ''}\"")
                print(f"üéØ Esperado: {ground_truth}")

                if model_type == 'pysentimiento' and analysis is not None:
                    print(f"ü§ñ Probabilidade Sexism: {sexism_prob:.3f}")
                    print(f"ü§ñ Categorias detectadas: {analysis.output}")

                print(f"ü§ñ Resposta bruta: {raw_output[:100] if raw_output else 'Nenhuma resposta'}")
                print(f"üßπ Resposta tratada: {clean}")
                print(f"üìä Normalizado: {norm}")
                print(f"üîç Regra: {debug_rule}")

                if not is_error:
                    is_correct = prediction == ground_truth
                    status = "‚úÖ CORRETO" if is_correct else "‚ùå ERRADO"
                    print(f"üìà Status: {status}")
                else:
                    print(f"‚ö†Ô∏è  Status: ERRO - {prediction}")
                print("‚îÄ" * 50)

            if is_error:
                errors += 1
                # Errors are excluded from the metrics and stored as "ERRO"
                prediction = "ERRO"
            else:
                y_true.append(ground_truth)
                y_pred.append(prediction)

            # Detailed prediction record
            prediction_record = {
                "input": item["input"],
                "true_label": ground_truth,
                "predicted_label": prediction,
                "raw_response": raw_output if raw_output else "",
                "cleaned_response": clean,
                "is_correct": (not is_error) and prediction == ground_truth,
                "error_type": error_type
            }

            # Add the PySentimiento details
            if model_type == 'pysentimiento' and analysis is not None:
                prediction_record["sexism_probability"] = sexism_prob
                prediction_record["detected_categories"] = analysis.output
                prediction_record["all_probabilities"] = analysis.probas

            all_predictions.append(prediction_record)

            # Short pause to avoid overloading the API (Ollama only)
            if model_type != 'pysentimiento':
                time.sleep(0.1)

        # Compute the metrics only when there is at least one valid prediction
        if y_true:
            metrics = calculate_metrics(y_true, y_pred)
        else:
            metrics = {
                "accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0,
                "tp": 0, "tn": 0, "fp": 0, "fn": 0, "error_rate": 1
            }

        total_time = time.time() - start_time
        avg_request_time = np.mean(request_times) if request_times else 0
        total_samples = len(dataset)

        result_data = {
            "model": config["model_name"],
            "model_display": config["model_display"],
            "config": config["config_name"],
            "full_name": config["config_name"],
            "type": model_type,
            "temperature": config.get("temperature"),
            "top_p": config.get("top_p"),
            "top_k": config.get("top_k"),
            "accuracy": metrics["accuracy"],
            "precision": metrics["precision"],
            "recall": metrics["recall"],
            "f1_score": metrics["f1_score"],
            "tp": metrics["tp"],
            "tn": metrics["tn"],
            "fp": metrics["fp"],
            "fn": metrics["fn"],
            "total_samples": total_samples,
            "errors": errors,
            # An empty dataset has no errors; avoid dividing by zero.
            "error_rate": errors / total_samples if total_samples > 0 else 0.0,
            "total_time_seconds": round(total_time, 2),
            "avg_request_time": round(avg_request_time, 2),
            "samples_per_second": round(total_samples / total_time, 2) if total_time > 0 else 0,
            "predictions": all_predictions,
            "metrics": metrics
        }

        # Add PySentimiento probability statistics
        if model_type == 'pysentimiento':
            sexism_probs = [p.get("sexism_probability", 0) for p in all_predictions if "sexism_probability" in p]
            if sexism_probs:
                result_data["avg_sexism_prob"] = np.mean(sexism_probs)
                result_data["std_sexism_prob"] = np.std(sexism_probs)
                result_data["max_sexism_prob"] = np.max(sexism_probs)
                result_data["min_sexism_prob"] = np.min(sexism_probs)

                # Number of texts classified as MISOGINO by the 0.5 threshold
                misogino_count = sum(1 for prob in sexism_probs if prob > 0.5)
                result_data["misogino_by_threshold"] = misogino_count

        return result_data

In [None]:
# Load the evaluation data, run every entry of ALL_CONFIGS against the local
# Ollama endpoint and keep the per-configuration results in `final_results`.
dataset = load_dataset()
runner = BenchmarkRunner(is_remote=False, system_prompt=SYSTEM_PROMPT, verbose=True) # verbose=True to see per-example details
final_results = runner.run_benchmark(ALL_CONFIGS, dataset)

print("‚úÖ Benchmark conclu√≠do! Resultados dispon√≠veis em 'final_results'")

✅ Dataset carregado de 'dataset_teste_balanceado.json': 200 itens.
Usando API local Ollama: http://localhost:11434/api/generate
🚀 Iniciando benchmark com 3 configurações...
📊 Dataset com 200 exemplos


▶️ Executando 1/3: Gemma0.0
   Modelo: gemma3:4b
   Tipo: ollama
   Temperature: 0.0
   Top-p: 0.0
   Top-k: 50

──────────────────────────────────────────────────
📝 Exemplo 1/200
──────────────────────────────────────────────────
📄 Texto: "Se ela quer mudar um pouco a rotina, é só planejar com o marido, mas deixa eu adivinhar: nessa histó..."
🎯 Esperado: MISOGINO
🤖 Resposta bruta: MISOGINO
🧹 Resposta tratada: MISOGINO
📊 Normalizado: MISOGINO
🔍 Regra: MATCH_EXPLICITO_MISOGINO
📈 Status: ✅ CORRETO
──────────────────

### Resultados do Benchmark


In [None]:
def display_summary_table(results_df):
    """Render a styled summary of the benchmark results and print the best run.

    The table shows one row per configuration, sorted by F1-Score (highest
    first), with percentage formatting and a colour gradient on the F1-Score
    column. The configuration with the highest ``f1_score`` is then printed
    in detail.
    """
    # Source column -> header shown in the table (insertion order = column order).
    column_labels = {
        "full_name": "Configura√ß√£o",
        "type": "Tipo",
        "accuracy": "Acur√°cia",
        "precision": "Precis√£o",
        "recall": "Recall",
        "f1_score": "F1-Score",
        "errors": "Erros",
        "total_time_seconds": "Tempo (s)",
    }

    table = (
        results_df[list(column_labels)]
        .rename(columns=column_labels)
        .sort_values("F1-Score", ascending=False)
    )

    # Formatting
    percent = "{:.2%}"
    styler = table.style.format({
        "Acur√°cia": percent,
        "Precis√£o": percent,
        "Recall": percent,
        "F1-Score": "{:.3f}",
    })
    styler = styler.background_gradient(subset=["F1-Score"], cmap="RdYlGn", vmin=0, vmax=1)
    styler = styler.set_properties(**{'text-align': 'center'})

    display(styler)

    # Best configuration (highest F1 in the unsorted input frame)
    winner = results_df.loc[results_df["f1_score"].idxmax()]

    print(f"\nüèÜ MELHOR CONFIGURA√á√ÉO:")
    print(f"   ‚Ä¢ {winner['full_name']} ({winner['type']})")
    print(f"   ‚Ä¢ F1-Score: {winner['f1_score']:.3f}")
    print(f"   ‚Ä¢ Acur√°cia: {winner['accuracy']:.2%}")
    print(f"   ‚Ä¢ Precis√£o: {winner['precision']:.2%}")
    print(f"   ‚Ä¢ Recall: {winner['recall']:.2%}")
    print(f"   ‚Ä¢ TP: {winner['tp']} | TN: {winner['tn']} | FP: {winner['fp']} | FN: {winner['fn']}")
    print(f"   ‚Ä¢ Erros: {winner['errors']}/{winner['total_samples']}")
    print(f"   ‚Ä¢ Tempo: {winner['total_time_seconds']:.1f}s")

# Show the summarised results
print("\nüìä RESUMO DOS RESULTADOS:")
print("="*80)
display_summary_table(final_results)

In [None]:
# Report the replies that normalize_response could not map to a label, if any
if erros_formato_log:
    print(f"\n‚ö†Ô∏è ERROS DE FORMATA√á√ÉO ENCONTRADOS: {len(erros_formato_log)}")
    print("="*50)

    # Convert to a DataFrame for easier inspection
    errors_df = pd.DataFrame(erros_formato_log)

    # Group by error type
    print("\nüìã Distribui√ß√£o por tipo de erro:")
    error_counts = errors_df['tipo'].value_counts()
    for error_type, count in error_counts.items():
        print(f"   ‚Ä¢ {error_type}: {count} ocorr√™ncias")

    # Show a few examples
    print(f"\nüîç Exemplos de erros (mostrando 3 primeiros):")
    for i, error in enumerate(errors_df.head(3).to_dict('records')):
        print(f"\n   {i+1}. Texto: \"{error['input']}\"")
        print(f"      Resposta bruta: {error['raw'][:100] if error['raw'] else 'Nenhuma'}...")
        print(f"      Resposta limpa: {error['clean']}")
        print(f"      Tipo: {error['tipo']}")
else:
    print("\n‚úÖ Nenhum erro de formata√ß√£o encontrado!")

In [None]:
# Plot the benchmark results
if len(final_results) > 0:
    print("üìà VISUALIZA√á√ÉO DOS RESULTADOS")
    print("="*60)

    # Create a figure with a 2x2 grid of subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Chart 1: F1-Score comparison across configurations
    ax1 = axes[0, 0]
    bars1 = ax1.barh(final_results['full_name'], final_results['f1_score'], color='skyblue')
    ax1.set_xlabel('F1-Score')
    ax1.set_title('Compara√ß√£o de F1-Score por Configura√ß√£o')
    ax1.set_xlim(0, 1)

    # Write the value next to each bar
    for bar in bars1:
        width = bar.get_width()
        ax1.text(width + 0.01, bar.get_y() + bar.get_height()/2, f'{width:.3f}',
                ha='left', va='center')

    # Chart 2: metrics of the best configuration (highest F1-Score)
    ax2 = axes[0, 1]
    best_idx = final_results['f1_score'].idxmax()
    best_config = final_results.loc[best_idx]

    metrics = ['Acur√°cia', 'Precis√£o', 'Recall', 'F1-Score']
    values = [best_config['accuracy'], best_config['precision'],
              best_config['recall'], best_config['f1_score']]

    bars2 = ax2.bar(metrics, values, color=['#4CAF50', '#2196F3', '#FF9800', '#E91E63'])
    ax2.set_ylim(0, 1)
    ax2.set_title(f'M√©tricas da Melhor Configura√ß√£o\n{best_config["full_name"]}')
    ax2.set_ylabel('Valor')

    # Write the value above each bar
    for bar in bars2:
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2, height + 0.01, f'{height:.3f}',
                ha='center', va='bottom')

    # Chart 3: confusion matrix of the best configuration
    # (rows = actual class, columns = predicted class)
    ax3 = axes[1, 0]
    conf_matrix = [[best_config['tn'], best_config['fp']],
                   [best_config['fn'], best_config['tp']]]

    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['N√£o Mis√≥gino', 'Mis√≥gino'],
                yticklabels=['N√£o Mis√≥gino', 'Mis√≥gino'],
                ax=ax3)
    ax3.set_title('Matriz de Confus√£o (Melhor Configura√ß√£o)')
    ax3.set_xlabel('Predito')
    ax3.set_ylabel('Real')

    # Chart 4: total time vs F1-Score
    ax4 = axes[1, 1]
    scatter = ax4.scatter(final_results['total_time_seconds'],
                         final_results['f1_score'],
                         s=100, alpha=0.6)
    ax4.set_xlabel('Tempo Total (s)')
    ax4.set_ylabel('F1-Score')
    ax4.set_title('Rela√ß√£o Tempo vs Desempenho')

    # Label each point with the model's display name
    for i, row in final_results.iterrows():
        ax4.annotate(row['model_display'],
                    (row['total_time_seconds'], row['f1_score']),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=9)

    plt.tight_layout()
    plt.show()

    print(f"\nüìä RESUMO ESTAT√çSTICO:")
    print(f"   ‚Ä¢ Melhor F1-Score: {final_results['f1_score'].max():.3f}")
    print(f"   ‚Ä¢ Pior F1-Score: {final_results['f1_score'].min():.3f}")
    print(f"   ‚Ä¢ M√©dia F1-Score: {final_results['f1_score'].mean():.3f}")
    print(f"   ‚Ä¢ Desvio Padr√£o F1-Score: {final_results['f1_score'].std():.3f}")
    print(f"   ‚Ä¢ Tempo m√©dio por configura√ß√£o: {final_results['total_time_seconds'].mean():.1f}s")
else:
    print("‚ùå Nenhum resultado para visualizar!")

In [None]:
# Per-backend-type analysis: best configuration and averaged metrics for each `type`.
print("üîç AN√ÅLISE DETALHADA POR MODELO")
print("="*60)

# Group by model type
model_analysis = []
for model_type in final_results['type'].unique():
    model_results = final_results[final_results['type'] == model_type]

    if len(model_results) > 0:
        # Row with the highest F1-Score within this type
        best_config = model_results.loc[model_results['f1_score'].idxmax()]

        model_analysis.append({
            'Modelo': model_type,
            'Melhor Config': best_config['full_name'],
            'Melhor F1': best_config['f1_score'],
            'Avg Accuracy': model_results['accuracy'].mean(),
            'Avg Precision': model_results['precision'].mean(),
            'Avg Recall': model_results['recall'].mean(),
            'Tempo M√©dio (s)': model_results['total_time_seconds'].mean(),
            'Erros M√©dios': model_results['errors'].mean()
        })

analysis_df = pd.DataFrame(model_analysis)
display(analysis_df.style.format({
    'Melhor F1': '{:.3f}',
    'Avg Accuracy': '{:.2%}',
    'Avg Precision': '{:.2%}',
    'Avg Recall': '{:.2%}',
    'Tempo M√©dio (s)': '{:.1f}',
    'Erros M√©dios': '{:.1f}'
}))

In [None]:
# Plot the benchmark results, colour-coded by backend type
if len(final_results) > 0:
    print("üìà VISUALIZA√á√ÉO DOS RESULTADOS")
    print("="*60)

    # Create a figure with a 2x2 grid of subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Chart 1: F1-Score comparison ('ollama' rows in sky blue, all others in light coral)
    ax1 = axes[0, 0]
    colors = ['skyblue' if t == 'ollama' else 'lightcoral' for t in final_results['type']]
    bars1 = ax1.barh(final_results['full_name'], final_results['f1_score'], color=colors)
    ax1.set_xlabel('F1-Score')
    ax1.set_title('Compara√ß√£o de F1-Score por Configura√ß√£o')
    ax1.set_xlim(0, 1)

    # Write the value next to each bar
    for bar in bars1:
        width = bar.get_width()
        ax1.text(width + 0.01, bar.get_y() + bar.get_height()/2, f'{width:.3f}',
                ha='left', va='center')

    # Chart 2: metrics of the best configuration (highest F1-Score)
    ax2 = axes[0, 1]
    best_idx = final_results['f1_score'].idxmax()
    best_config = final_results.loc[best_idx]

    metrics = ['Acur√°cia', 'Precis√£o', 'Recall', 'F1-Score']
    values = [best_config['accuracy'], best_config['precision'],
              best_config['recall'], best_config['f1_score']]

    bars2 = ax2.bar(metrics, values, color=['#4CAF50', '#2196F3', '#FF9800', '#E91E63'])
    ax2.set_ylim(0, 1)
    ax2.set_title(f'M√©tricas da Melhor Configura√ß√£o\n{best_config["full_name"]}')
    ax2.set_ylabel('Valor')

    # Write the value above each bar
    for bar in bars2:
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2, height + 0.01, f'{height:.3f}',
                ha='center', va='bottom')

    # Chart 3: confusion matrix of the best configuration
    # (rows = actual class, columns = predicted class)
    ax3 = axes[1, 0]
    conf_matrix = [[best_config['tn'], best_config['fp']],
                   [best_config['fn'], best_config['tp']]]

    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['N√£o Mis√≥gino', 'Mis√≥gino'],
                yticklabels=['N√£o Mis√≥gino', 'Mis√≥gino'],
                ax=ax3)
    ax3.set_title('Matriz de Confus√£o (Melhor Configura√ß√£o)')
    ax3.set_xlabel('Predito')
    ax3.set_ylabel('Real')

    # Chart 4: total time vs F1-Score ('ollama' points in blue, all others in red)
    ax4 = axes[1, 1]
    colors_scatter = ['blue' if t == 'ollama' else 'red' for t in final_results['type']]
    scatter = ax4.scatter(final_results['total_time_seconds'],
                         final_results['f1_score'],
                         s=100, alpha=0.6, c=colors_scatter)
    ax4.set_xlabel('Tempo Total (s)')
    ax4.set_ylabel('F1-Score')
    ax4.set_title('Rela√ß√£o Tempo vs Desempenho')

    # Label each point with the model's display name
    for i, row in final_results.iterrows():
        ax4.annotate(row['model_display'],
                    (row['total_time_seconds'], row['f1_score']),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=9)

    # Add a legend for the backend types (both entries are always listed)
    import matplotlib.patches as mpatches
    ollama_patch = mpatches.Patch(color='blue', label='Ollama')
    pysentimiento_patch = mpatches.Patch(color='red', label='PySentimiento')
    ax4.legend(handles=[ollama_patch, pysentimiento_patch])

    plt.tight_layout()
    plt.show()

    print(f"\nüìä RESUMO ESTAT√çSTICO:")
    print(f"   ‚Ä¢ Melhor F1-Score: {final_results['f1_score'].max():.3f}")
    print(f"   ‚Ä¢ Pior F1-Score: {final_results['f1_score'].min():.3f}")
    print(f"   ‚Ä¢ M√©dia F1-Score: {final_results['f1_score'].mean():.3f}")
    print(f"   ‚Ä¢ Desvio Padr√£o F1-Score: {final_results['f1_score'].std():.3f}")
    print(f"   ‚Ä¢ Tempo m√©dio por configura√ß√£o: {final_results['total_time_seconds'].mean():.1f}s")

    # Statistics per backend type
    print(f"\nüìä ESTAT√çSTICAS POR TIPO:")
    for model_type in final_results['type'].unique():
        type_results = final_results[final_results['type'] == model_type]
        print(f"   ‚Ä¢ {model_type}:")
        print(f"     - N√∫mero de configura√ß√µes: {len(type_results)}")
        print(f"     - Melhor F1: {type_results['f1_score'].max():.3f}")
        print(f"     - M√©dia F1: {type_results['f1_score'].mean():.3f}")
        print(f"     - Tempo m√©dio: {type_results['total_time_seconds'].mean():.1f}s")
else:
    print("‚ùå Nenhum resultado para visualizar!")

In [None]:
# Smoke test of PySentimiento on Portuguese examples: prints the structure of
# the analyzer output so the keys consumed by the benchmark can be checked.
from pysentimiento import create_analyzer

print("Inicializando PySentimiento...")
# The examples are Portuguese and BenchmarkRunner creates its analyzer with
# lang="pt" and reads the 'Sexism' probability, so the same language is used
# here (the original cell used lang="es").
hate_analyzer = create_analyzer(task="hate_speech", lang="pt")
print("Pronto.")

test_texts = [
    "lugar de mulher √© na cozinha",
    "essa vadia n√£o serve pra nada",
    "hoje o dia est√° lindo",
    "voc√™ tem que matar todos os malditos negros"  # example in Portuguese
]

for text in test_texts:
    print(f"\nTexto: {text}")
    analysis = hate_analyzer.predict(text)
    print(f"An√°lise: {analysis}")
    print(f"Tipo: {type(analysis)}")
    print(f"Atributos: {dir(analysis)}")
    if hasattr(analysis, 'probas'):
        print(f"Probas: {analysis.probas}")
        print(f"Tipo de probas: {type(analysis.probas)}")
        # Check whether it is a dict with sub-categories
        if isinstance(analysis.probas, dict):
            for key, value in analysis.probas.items():
                print(f"  {key}: {value}")
    if hasattr(analysis, 'output'):
        print(f"Output: {analysis.output}")

| ID | Configuração                | Modelo   | Temp | Top_p | Tipo           | Acurácia | Precisão | Recall | F1-Score | Erros | Tempo (s) |
|----|-----------------------------|----------|------|-------|----------------|----------|-----------|--------|----------|-------|-----------|
| 5  | Gemma3-4B_temp0.5_topp0.9   | Gemma3-4B | 0.5  | 0.9   | ollama         | 74.00%   | 69.67%    | 85.00% | 0.766    | 0     | 1361.57   |
| 7  | Gemma3-4B_temp0.5_topp0.9   | Gemma3-4B | 0.5  | 0.9   | ollama         | 73.50%   | 69.42%    | 84.00% | 0.760    | 0     | 608.16    |
| 1  | Gemma3-4B_temp0.1_topp0.5   | Gemma3-4B | 0.1  | 0.5   | ollama         | 73.50%   | 69.42%    | 84.00% | 0.760    | 0     | 691.34    |
| 3  | Gemma3-4B_temp0.3_topp0.7   | Gemma3-4B | 0.3  | 0.7   | ollama         | 73.00%   | 68.85%    | 84.00% | 0.757    | 0     | 3013.98   |
| 4  | Llama3_temp0.5_topp0.9      | Llama3    | 0.5  | 0.9   | ollama         | 73.00%   | 84.85%    | 56.00% | 0.675    | 0     | 900.42    |
| 0  | Llama3_temp0.1_topp0.5      | Llama3    | 0.1  | 0.5   | ollama         | 72.08%   | 83.33%    | 55.56% | 0.667    | 3     | 1413.71   |
| 2  | Llama3_temp0.3_topp0.7      | Llama3    | 0.3  | 0.7   | ollama         | 72.00%   | 83.33%    | 55.00% | 0.663    | 0     | 1234.54   |
| 6  | Llama3_temp0.5_topp0.9      | Llama3    | 0.5  | 0.9   | ollama         | 72.00%   | 83.33%    | 55.00% | 0.663    | 0     | 951.04    |
| 8  | PySentimiento_sexism        | PySentimiento | —    | —     | pysentimiento | 63.50%   | 76.47%    | 39.00% | 0.517    | 0     | 23.23     |
