# IMPORTS

In [2]:
import os
import pandas as pd
import numpy as np
import re
import unicodedata
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

# CONFIGURA√á√ïES

In [3]:
BASE_FOLDER_TRAIN = "treino"

FILES = [
    "train_literal_dinamico.csv",
    "train_complexo_simples.csv",
    "train_arcaico_moderno.csv",
]

preprocess_params = {
    "lowercase": True,
    "normalize_unicode": False,
    "remove_extra_whitespace": True,
    "remove_punct": False,
}

In [4]:
# Configura√ß√£o dos Pipelines e Grades de Hiperpar√¢metros para Grid Search
param_grids = {
    'Naive Bayes': {
        'pipeline': Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('model', MultinomialNB())
        ]),
        'params': {
            'vectorizer__max_features': [3000, 5000, 10000],
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'vectorizer__min_df': [2, 5],
            'vectorizer__max_df': [0.9, 0.95],
            'model__alpha': [0.1, 0.5, 1.0, 2.0]
        }
    },
    'Logistic Regression': {
        'pipeline': Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('model', LogisticRegression(max_iter=1000, random_state=42))
        ]),
        'params': {
            'vectorizer__max_features': [3000, 5000, 10000],
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'vectorizer__min_df': [2, 5],
            'vectorizer__max_df': [0.9, 0.95],
            'model__C': [0.1, 1.0, 10.0],
            'model__solver': ['lbfgs', 'liblinear'],
            'model__class_weight': ['balanced', None]
        }
    },
    'SVM (LinearSVC)': {
        'pipeline': Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('model', LinearSVC(dual=False, random_state=42))
        ]),
        'params': {
            'vectorizer__max_features': [3000, 5000, 10000],
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'vectorizer__min_df': [2, 5],
            'vectorizer__max_df': [0.9, 0.95],
            'model__C': [0.1, 1.0, 10.0],
            'model__max_iter': [1000, 2000]
        }
    }
}


# AN√ÅLISE DE BALANCEAMENTO DOS DATASETS


In [5]:
def analisar_balanceamento(file_name):
    """Fun√ß√£o para analisar o balanceamento de um dataset"""
    path = os.path.join(BASE_FOLDER_TRAIN, file_name)
    df = pd.read_csv(path, sep=";")
    
    print("="*60)
    print(f"AN√ÅLISE ESTAT√çSTICA - {file_name}")
    print("="*60)
    
    # Informa√ß√µes b√°sicas
    print(f"\nüìä INFORMA√á√ïES GERAIS:")
    print(f"   ‚Ä¢ Total de linhas: {len(df):,}")
    print(f"   ‚Ä¢ Total de colunas: {len(df.columns)}")
    print(f"   ‚Ä¢ Colunas: {list(df.columns)}")
    
    # Verificar valores nulos
    print(f"\nüîç VALORES NULOS:")
    print(f"   ‚Ä¢ Coluna 'text': {df['text'].isna().sum()}")
    print(f"   ‚Ä¢ Coluna 'style': {df['style'].isna().sum()}")
    
    # Distribui√ß√£o das classes
    print(f"\nüìà DISTRIBUI√á√ÉO DAS CLASSES:")
    contagem_classes = df['style'].value_counts()
    print(contagem_classes)
    
    print(f"\nüìä PORCENTAGEM POR CLASSE:")
    porcentagem_classes = df['style'].value_counts(normalize=True) * 100
    for classe, perc in porcentagem_classes.items():
        count = contagem_classes[classe]
        print(f"   ‚Ä¢ {classe}: {count:,} ({perc:.2f}%)")
    
    # Verificar balanceamento
    print(f"\n‚öñÔ∏è BALANCEAMENTO:")
    razao = contagem_classes.max() / contagem_classes.min()
    print(f"   ‚Ä¢ Raz√£o maior/menor classe: {razao:.2f}x")
    if razao < 1.5:
        print(f"   ‚Ä¢ Status: ‚úÖ Dataset bem balanceado")
    elif razao < 3:
        print(f"   ‚Ä¢ Status: ‚ö†Ô∏è Dataset moderadamente desbalanceado")
    else:
        print(f"   ‚Ä¢ Status: ‚ùå Dataset desbalanceado")
    
    print("\n" + "="*60)
    print()
    
    return df, contagem_classes


In [6]:
# Analisar todos os datasets
resultados_analise = {}

for file_name in FILES:
    df, contagem = analisar_balanceamento(file_name)
    resultados_analise[file_name] = {
        'dataframe': df,
        'contagem_classes': contagem
    }


AN√ÅLISE ESTAT√çSTICA - train_literal_dinamico.csv

üìä INFORMA√á√ïES GERAIS:
   ‚Ä¢ Total de linhas: 36,964
   ‚Ä¢ Total de colunas: 2
   ‚Ä¢ Colunas: ['text', 'style']

üîç VALORES NULOS:
   ‚Ä¢ Coluna 'text': 0
   ‚Ä¢ Coluna 'style': 0

üìà DISTRIBUI√á√ÉO DAS CLASSES:
style
literal     18482
dinamico    18482
Name: count, dtype: int64

üìä PORCENTAGEM POR CLASSE:
   ‚Ä¢ literal: 18,482 (50.00%)
   ‚Ä¢ dinamico: 18,482 (50.00%)

‚öñÔ∏è BALANCEAMENTO:
   ‚Ä¢ Raz√£o maior/menor classe: 1.00x
   ‚Ä¢ Status: ‚úÖ Dataset bem balanceado


AN√ÅLISE ESTAT√çSTICA - train_complexo_simples.csv

üìä INFORMA√á√ïES GERAIS:
   ‚Ä¢ Total de linhas: 33,422
   ‚Ä¢ Total de colunas: 2
   ‚Ä¢ Colunas: ['text', 'style']

üîç VALORES NULOS:
   ‚Ä¢ Coluna 'text': 1
   ‚Ä¢ Coluna 'style': 0

üìà DISTRIBUI√á√ÉO DAS CLASSES:
style
complexo    16711
simples     16711
Name: count, dtype: int64

üìä PORCENTAGEM POR CLASSE:
   ‚Ä¢ complexo: 16,711 (50.00%)
   ‚Ä¢ simples: 16,711 (50.00%)

‚öñÔ∏è BALANCEAM

In [7]:
# Criar tabela resumo comparativa
print("="*80)
print("RESUMO COMPARATIVO - TODOS OS DATASETS")
print("="*80)
print()

resumo_data = []
for file_name, resultado in resultados_analise.items():
    df = resultado['dataframe']
    contagem = resultado['contagem_classes']
    razao = contagem.max() / contagem.min()
    
    resumo_data.append({
        'Dataset': file_name.replace('train_', '').replace('.csv', ''),
        'Total Linhas': len(df),
        'Classe 1': contagem.index[0],
        'Count 1': contagem.values[0],
        'Classe 2': contagem.index[1],
        'Count 2': contagem.values[1],
        'Raz√£o': f"{razao:.2f}x",
        'Status': '‚úÖ Balanceado' if razao < 1.5 else '‚ö†Ô∏è Moderado' if razao < 3 else '‚ùå Desbalanceado'
    })

df_resumo = pd.DataFrame(resumo_data)
print(df_resumo.to_string(index=False))
print()
print("="*80)


RESUMO COMPARATIVO - TODOS OS DATASETS

         Dataset  Total Linhas Classe 1  Count 1 Classe 2  Count 2 Raz√£o       Status
literal_dinamico         36964  literal    18482 dinamico    18482 1.00x ‚úÖ Balanceado
complexo_simples         33422 complexo    16711  simples    16711 1.00x ‚úÖ Balanceado
 arcaico_moderno         36884  arcaico    18442  moderno    18442 1.00x ‚úÖ Balanceado



# PR√â-PROCESSAMENTO

In [8]:
def preprocess_operations(text, params):
    if not isinstance(text, str):
        return ""
    if params.get("normalize_unicode", True):
        text = unicodedata.normalize("NFKC", text)
    if params.get("lowercase", True):
        text = text.lower()
    if params.get("remove_punct", False):
        text = re.sub(r"[^\w\s]", " ", text)
    if params.get("remove_extra_whitespace", True):
        text = re.sub(r"\s+", " ", text).strip()
    return text

def preprocess_data(path):
    if not os.path.exists(path):
        print(f"Aviso: {path} n√£o encontrado.")
        return None

    df = pd.read_csv(path, sep=";")
    col_text, col_label = "text", "style"

    df = df[[col_text, col_label]].dropna()
    df = shuffle(df, random_state=10).reset_index(drop=True)

    df["text_preproc"] = df[col_text].apply(lambda x: preprocess_operations(x, preprocess_params))

    le = LabelEncoder()
    y = le.fit_transform(df[col_label])
    X = df["text_preproc"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, stratify=y, random_state=10
    )

    return X_train, X_test, y_train, y_test

In [9]:
datasets = {}

for file_name in FILES:
    path = os.path.join(BASE_FOLDER_TRAIN, file_name)
    print(f"\nProcessando: {file_name}")
    result = preprocess_data(path) 

    if result is not None:
        X_train, X_test, y_train, y_test = result
        datasets[file_name] = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test
        }


Processando: train_literal_dinamico.csv

Processando: train_complexo_simples.csv

Processando: train_arcaico_moderno.csv


# DATASET 1: ARCAICO vs MODERNO

Classifica√ß√£o de textos entre estilo **arcaico** e **moderno**.

In [10]:
# Preparar dados - arcaico_moderno
X_train = datasets["train_arcaico_moderno.csv"]["X_train"]
X_test = datasets["train_arcaico_moderno.csv"]["X_test"]
y_train = datasets["train_arcaico_moderno.csv"]["y_train"]
y_test = datasets["train_arcaico_moderno.csv"]["y_test"]

print(f"Dados carregados - train_arcaico_moderno.csv")
print(f"   Treino: {len(X_train)} textos | Teste: {len(X_test)} textos")


Dados carregados - train_arcaico_moderno.csv
   Treino: 31351 textos | Teste: 5533 textos


In [11]:
print("="*80)
print("GRID SEARCH COM PIPELINE - ARCAICO vs MODERNO (10-FOLD CV)")
print("="*80)
print("Otimizando TF-IDF + Modelos simultaneamente...\n")

# Armazenar melhores pipelines
best_models = {}
cv_results = {}

for name, config in param_grids.items():
    print(f"[{name}] Executando Grid Search...")
    print(f"   Testando {len(config['params']['vectorizer__max_features']) * len(config['params']['vectorizer__ngram_range']) * len(config['params']['vectorizer__min_df']) * len(config['params']['vectorizer__max_df'])} combina√ß√µes de TF-IDF...")
    
    # Grid Search com 10-fold CV
    grid_search = GridSearchCV(
        config['pipeline'],
        config['params'],
        cv=10,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )
    
    grid_search.fit(X_train, y_train)
    
    # Armazenar resultados
    best_models[name] = grid_search.best_estimator_
    
    # Separar par√¢metros de TF-IDF e modelo
    vectorizer_params = {k.replace('vectorizer__', ''): v 
                        for k, v in grid_search.best_params_.items() 
                        if k.startswith('vectorizer__')}
    model_params = {k.replace('model__', ''): v 
                   for k, v in grid_search.best_params_.items() 
                   if k.startswith('model__')}
    
    cv_results[name] = {
        'best_params': grid_search.best_params_,
        'vectorizer_params': vectorizer_params,
        'model_params': model_params,
        'best_score': grid_search.best_score_,
        'mean': grid_search.best_score_,
        'std': grid_search.cv_results_['std_test_score'][grid_search.best_index_]
    }
    
    print(f"  ‚úì Melhores params TF-IDF: {vectorizer_params}")
    print(f"  ‚úì Melhores params Modelo: {model_params}")
    print(f"  ‚úì Acuracia (CV): {grid_search.best_score_:.4f}\n")

print("="*80)


GRID SEARCH COM PIPELINE - ARCAICO vs MODERNO (10-FOLD CV)
Otimizando TF-IDF + Modelos simultaneamente...

[Naive Bayes] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 2, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'alpha': 0.1}
  ‚úì Acuracia (CV): 0.8362

[Logistic Regression] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 5, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'C': 10.0, 'class_weight': 'balanced', 'solver': 'lbfgs'}
  ‚úì Acuracia (CV): 0.8386

[SVM (LinearSVC)] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 5, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'C': 0.1, 'max_iter': 1000}
  ‚úì Acuracia (CV): 0.8357



In [12]:
# Teste Final - arcaico_moderno (com melhores params do Grid Search)
print("\nTeste Final no Hold-Out - ARCAICO vs MODERNO")
print("="*60)

final_results = {}
for name, pipeline in best_models.items():
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    final_results[name] = {'holdout_acc': acc, 'cv_mean': cv_results[name]['mean']}
    print(f"{name:25s}: CV={cv_results[name]['mean']:.4f} | Hold-Out={acc:.4f}")

best_final = max(final_results.items(), key=lambda x: x[1]['holdout_acc'])
print(f"\nMelhor modelo (Hold-Out): {best_final[0]} - {best_final[1]['holdout_acc']*100:.2f}%")



Teste Final no Hold-Out - ARCAICO vs MODERNO
Naive Bayes              : CV=0.8362 | Hold-Out=0.8332
Logistic Regression      : CV=0.8386 | Hold-Out=0.8473
SVM (LinearSVC)          : CV=0.8357 | Hold-Out=0.8417

Melhor modelo (Hold-Out): Logistic Regression - 84.73%


# DATASET 2: COMPLEXO vs SIMPLES

Classifica√ß√£o de textos entre estilo **complexo** e **simples**.


In [13]:
# Preparar dados - complexo_simples
X_train_cs = datasets["train_complexo_simples.csv"]["X_train"]
X_test_cs = datasets["train_complexo_simples.csv"]["X_test"]
y_train_cs = datasets["train_complexo_simples.csv"]["y_train"]
y_test_cs = datasets["train_complexo_simples.csv"]["y_test"]

print(f"Dados carregados - train_complexo_simples.csv")
print(f"   Treino: {len(X_train_cs)} textos | Teste: {len(X_test_cs)} textos")


Dados carregados - train_complexo_simples.csv
   Treino: 28407 textos | Teste: 5014 textos


In [14]:
print("="*80)
print("GRID SEARCH COM PIPELINE - COMPLEXO vs SIMPLES (10-FOLD CV)")
print("="*80)
print("Otimizando TF-IDF + Modelos simultaneamente...\n")

# Armazenar melhores pipelines
best_models_cs = {}
cv_results_cs = {}

for name, config in param_grids.items():
    print(f"[{name}] Executando Grid Search...")
    print(f"   Testando {len(config['params']['vectorizer__max_features']) * len(config['params']['vectorizer__ngram_range']) * len(config['params']['vectorizer__min_df']) * len(config['params']['vectorizer__max_df'])} combina√ß√µes de TF-IDF...")
    
    # Grid Search com 10-fold CV
    grid_search = GridSearchCV(
        config['pipeline'],
        config['params'],
        cv=10,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )
    
    grid_search.fit(X_train_cs, y_train_cs)
    
    # Armazenar resultados
    best_models_cs[name] = grid_search.best_estimator_
    
    # Separar par√¢metros de TF-IDF e modelo
    vectorizer_params = {k.replace('vectorizer__', ''): v 
                        for k, v in grid_search.best_params_.items() 
                        if k.startswith('vectorizer__')}
    model_params = {k.replace('model__', ''): v 
                   for k, v in grid_search.best_params_.items() 
                   if k.startswith('model__')}
    
    cv_results_cs[name] = {
        'best_params': grid_search.best_params_,
        'vectorizer_params': vectorizer_params,
        'model_params': model_params,
        'best_score': grid_search.best_score_,
        'mean': grid_search.best_score_,
        'std': grid_search.cv_results_['std_test_score'][grid_search.best_index_]
    }
    
    print(f"  ‚úì Melhores params TF-IDF: {vectorizer_params}")
    print(f"  ‚úì Melhores params Modelo: {model_params}")
    print(f"  ‚úì Acuracia (CV): {grid_search.best_score_:.4f}\n")

print("="*80)


GRID SEARCH COM PIPELINE - COMPLEXO vs SIMPLES (10-FOLD CV)
Otimizando TF-IDF + Modelos simultaneamente...

[Naive Bayes] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 5, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'alpha': 0.1}
  ‚úì Acuracia (CV): 0.8149

[Logistic Regression] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 2, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'C': 10.0, 'class_weight': 'balanced', 'solver': 'lbfgs'}
  ‚úì Acuracia (CV): 0.8346

[SVM (LinearSVC)] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 5, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'C': 0.1, 'max_iter': 1000}
  ‚úì Acuracia (CV): 0.8301



In [15]:
# Teste Final - complexo_simples (com melhores params do Grid Search)
print("\nTeste Final no Hold-Out - COMPLEXO vs SIMPLES")
print("="*60)

final_results_cs = {}
for name, pipeline in best_models_cs.items():
    y_pred = pipeline.predict(X_test_cs)
    acc = accuracy_score(y_test_cs, y_pred)
    final_results_cs[name] = {'holdout_acc': acc, 'cv_mean': cv_results_cs[name]['mean']}
    print(f"{name:25s}: CV={cv_results_cs[name]['mean']:.4f} | Hold-Out={acc:.4f}")

best_final_cs = max(final_results_cs.items(), key=lambda x: x[1]['holdout_acc'])
print(f"\nMelhor modelo (Hold-Out): {best_final_cs[0]} - {best_final_cs[1]['holdout_acc']*100:.2f}%")



Teste Final no Hold-Out - COMPLEXO vs SIMPLES
Naive Bayes              : CV=0.8149 | Hold-Out=0.8145
Logistic Regression      : CV=0.8346 | Hold-Out=0.8311
SVM (LinearSVC)          : CV=0.8301 | Hold-Out=0.8221

Melhor modelo (Hold-Out): Logistic Regression - 83.11%


# DATASET 3: LITERAL vs DIN√ÇMICO

Classifica√ß√£o de textos entre estilo **literal** e **din√¢mico**.

In [16]:
# Preparar dados - literal_dinamico
X_train_ld = datasets["train_literal_dinamico.csv"]["X_train"]
X_test_ld = datasets["train_literal_dinamico.csv"]["X_test"]
y_train_ld = datasets["train_literal_dinamico.csv"]["y_train"]
y_test_ld = datasets["train_literal_dinamico.csv"]["y_test"]

print(f"Dados carregados - train_literal_dinamico.csv")
print(f"   Treino: {len(X_train_ld)} textos | Teste: {len(X_test_ld)} textos")


Dados carregados - train_literal_dinamico.csv
   Treino: 31419 textos | Teste: 5545 textos


In [17]:
print("="*80)
print("GRID SEARCH COM PIPELINE - LITERAL vs DIN√ÇMICO (10-FOLD CV)")
print("="*80)
print("Otimizando TF-IDF + Modelos simultaneamente...\n")

# Armazenar melhores pipelines
best_models_ld = {}
cv_results_ld = {}

for name, config in param_grids.items():
    print(f"[{name}] Executando Grid Search...")
    print(f"   Testando {len(config['params']['vectorizer__max_features']) * len(config['params']['vectorizer__ngram_range']) * len(config['params']['vectorizer__min_df']) * len(config['params']['vectorizer__max_df'])} combina√ß√µes de TF-IDF...")
    
    # Grid Search com 10-fold CV
    grid_search = GridSearchCV(
        config['pipeline'],
        config['params'],
        cv=10,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )
    
    grid_search.fit(X_train_ld, y_train_ld)
    
    # Armazenar resultados
    best_models_ld[name] = grid_search.best_estimator_
    
    # Separar par√¢metros de TF-IDF e modelo
    vectorizer_params = {k.replace('vectorizer__', ''): v 
                        for k, v in grid_search.best_params_.items() 
                        if k.startswith('vectorizer__')}
    model_params = {k.replace('model__', ''): v 
                   for k, v in grid_search.best_params_.items() 
                   if k.startswith('model__')}
    
    cv_results_ld[name] = {
        'best_params': grid_search.best_params_,
        'vectorizer_params': vectorizer_params,
        'model_params': model_params,
        'best_score': grid_search.best_score_,
        'mean': grid_search.best_score_,
        'std': grid_search.cv_results_['std_test_score'][grid_search.best_index_]
    }
    
    print(f"  ‚úì Melhores params TF-IDF: {vectorizer_params}")
    print(f"  ‚úì Melhores params Modelo: {model_params}")
    print(f"  ‚úì Acuracia (CV): {grid_search.best_score_:.4f}\n")

print("="*80)


GRID SEARCH COM PIPELINE - LITERAL vs DIN√ÇMICO (10-FOLD CV)
Otimizando TF-IDF + Modelos simultaneamente...

[Naive Bayes] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 5, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'alpha': 0.1}
  ‚úì Acuracia (CV): 0.8355

[Logistic Regression] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 2, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'C': 1.0, 'class_weight': 'balanced', 'solver': 'liblinear'}
  ‚úì Acuracia (CV): 0.8347

[SVM (LinearSVC)] Executando Grid Search...
   Testando 24 combina√ß√µes de TF-IDF...
  ‚úì Melhores params TF-IDF: {'max_df': 0.9, 'max_features': 10000, 'min_df': 2, 'ngram_range': (1, 2)}
  ‚úì Melhores params Modelo: {'C': 0.1, 'max_iter': 1000}
  ‚úì Acuracia (CV): 0.8341



In [18]:
# Teste Final - literal_dinamico (com melhores params do Grid Search)
print("\nTeste Final no Hold-Out - LITERAL vs DIN√ÇMICO")
print("="*60)

final_results_ld = {}
for name, pipeline in best_models_ld.items():
    y_pred = pipeline.predict(X_test_ld)
    acc = accuracy_score(y_test_ld, y_pred)
    final_results_ld[name] = {'holdout_acc': acc, 'cv_mean': cv_results_ld[name]['mean']}
    print(f"{name:25s}: CV={cv_results_ld[name]['mean']:.4f} | Hold-Out={acc:.4f}")

best_final_ld = max(final_results_ld.items(), key=lambda x: x[1]['holdout_acc'])
print(f"\nMelhor modelo (Hold-Out): {best_final_ld[0]} - {best_final_ld[1]['holdout_acc']*100:.2f}%")



Teste Final no Hold-Out - LITERAL vs DIN√ÇMICO
Naive Bayes              : CV=0.8355 | Hold-Out=0.8267
Logistic Regression      : CV=0.8347 | Hold-Out=0.8357
SVM (LinearSVC)          : CV=0.8341 | Hold-Out=0.8368

Melhor modelo (Hold-Out): SVM (LinearSVC) - 83.68%


# COMPARA√á√ÉO FINAL - TODOS OS DATASETS

An√°lise comparativa do desempenho dos modelos nos 3 tipos de classifica√ß√£o.


In [19]:
print("="*100)
print("COMPARA√á√ÉO FINAL - TODOS OS DATASETS")
print("="*100)

# Organizar resultados com par√¢metros
datasets_comparison = {
    'ARCAICO vs MODERNO': {
        'results': final_results,
        'cv_results': cv_results
    },
    'COMPLEXO vs SIMPLES': {
        'results': final_results_cs,
        'cv_results': cv_results_cs
    },
    'LITERAL vs DIN√ÇMICO': {
        'results': final_results_ld,
        'cv_results': cv_results_ld
    }
}

# Mostrar resultados por dataset
for dataset_name, data in datasets_comparison.items():
    results = data['results']
    cv_res = data['cv_results']
    
    print(f"\n{'='*100}")
    print(f"DATASET: {dataset_name}")
    print(f"{'='*100}")
    
    # Ordenar por Hold-Out
    sorted_results = sorted(results.items(), key=lambda x: x[1]['holdout_acc'], reverse=True)
    
    for i, (model_name, metrics) in enumerate(sorted_results, 1):
        emoji = "ü•á" if i == 1 else "ü•à" if i == 2 else "ü•â"
        print(f"\n{emoji} {model_name}")
        print(f"   Acur√°cia CV:       {metrics['cv_mean']:.4f} ({metrics['cv_mean']*100:.2f}%)")
        print(f"   Acur√°cia Hold-Out: {metrics['holdout_acc']:.4f} ({metrics['holdout_acc']*100:.2f}%)")
        print(f"   Params TF-IDF:     {cv_res[model_name]['vectorizer_params']}")
        print(f"   Params Modelo:     {cv_res[model_name]['model_params']}")

# Resumo final - Melhor modelo por dataset
print(f"\n\n{'='*100}")
print("RESUMO - MELHOR MODELO POR DATASET")
print(f"{'='*100}\n")

for dataset_name, data in datasets_comparison.items():
    results = data['results']
    cv_res = data['cv_results']
    
    best = max(results.items(), key=lambda x: x[1]['holdout_acc'])
    best_name = best[0]
    best_metrics = best[1]
    
    print(f"üèÜ {dataset_name}")
    print(f"   Melhor Modelo:     {best_name}")
    print(f"   Acur√°cia Hold-Out: {best_metrics['holdout_acc']*100:.2f}%")
    print(f"   Params TF-IDF:     {cv_res[best_name]['vectorizer_params']}")
    print(f"   Params Modelo:     {cv_res[best_name]['model_params']}")
    print()

print("="*100)


COMPARA√á√ÉO FINAL - TODOS OS DATASETS

DATASET: ARCAICO vs MODERNO

ü•á Logistic Regression
   Acur√°cia CV:       0.8386 (83.86%)
   Acur√°cia Hold-Out: 0.8473 (84.73%)
   Params TF-IDF:     {'max_df': 0.9, 'max_features': 10000, 'min_df': 5, 'ngram_range': (1, 2)}
   Params Modelo:     {'C': 10.0, 'class_weight': 'balanced', 'solver': 'lbfgs'}

ü•à SVM (LinearSVC)
   Acur√°cia CV:       0.8357 (83.57%)
   Acur√°cia Hold-Out: 0.8417 (84.17%)
   Params TF-IDF:     {'max_df': 0.9, 'max_features': 10000, 'min_df': 5, 'ngram_range': (1, 2)}
   Params Modelo:     {'C': 0.1, 'max_iter': 1000}

ü•â Naive Bayes
   Acur√°cia CV:       0.8362 (83.62%)
   Acur√°cia Hold-Out: 0.8332 (83.32%)
   Params TF-IDF:     {'max_df': 0.9, 'max_features': 10000, 'min_df': 2, 'ngram_range': (1, 2)}
   Params Modelo:     {'alpha': 0.1}

DATASET: COMPLEXO vs SIMPLES

ü•á Logistic Regression
   Acur√°cia CV:       0.8346 (83.46%)
   Acur√°cia Hold-Out: 0.8311 (83.11%)
   Params TF-IDF:     {'max_df': 0.9, 

# PREDI√á√ïES NOS DADOS DE TESTE

Aplica√ß√£o dos melhores modelos treinados nos dados de teste da pasta `teste/`.


In [20]:
# Configura√ß√£o da pasta de teste
BASE_FOLDER_TEST = "teste"

TEST_FILES = [
    "test_arcaico_moderno.csv",
    "test_complexo_simples.csv",
    "test_literal_dinamico.csv",
]

print("="*80)
print("CARREGANDO DADOS DE TESTE")
print("="*80)
print(f"Pasta: {BASE_FOLDER_TEST}/")
print(f"Arquivos: {len(TEST_FILES)}")
print("="*80)


CARREGANDO DADOS DE TESTE
Pasta: teste/
Arquivos: 3


In [22]:
def load_and_preprocess_test(path):
    """Carrega e pr√©-processa dados de teste (sem labels)"""
    if not os.path.exists(path):
        print(f"Aviso: {path} n√£o encontrado.")
        return None
    
    # Tentar diferentes encodings comuns no Windows
    encodings = ['latin-1', 'cp1252', 'iso-8859-1', 'utf-8']
    df = None
    
    for encoding in encodings:
        try:
            df = pd.read_csv(path, sep=";", encoding=encoding)
            print(f"  ‚úì Arquivo lido com encoding: {encoding}")
            break
        except UnicodeDecodeError:
            continue
    
    if df is None:
        print(f"Erro: n√£o foi poss√≠vel ler {path} com nenhum encoding comum")
        return None
    
    # Verificar se tem a coluna text
    if "text" not in df.columns:
        print(f"Erro: coluna 'text' n√£o encontrada em {path}")
        return None
    
    # Remover linhas com valores nulos
    df = df.dropna(subset=["text"])
    
    # Aplicar pr√©-processamento
    df["text_preproc"] = df["text"].apply(lambda x: preprocess_operations(x, preprocess_params))
    
    return df

# Carregar todos os dados de teste
test_datasets = {}

for file_name in TEST_FILES:
    path = os.path.join(BASE_FOLDER_TEST, file_name)
    print(f"\nCarregando: {file_name}")
    
    df = load_and_preprocess_test(path)
    
    if df is not None:
        test_datasets[file_name] = df
        print(f"  ‚úì {len(df):,} textos carregados e pr√©-processados")
    else:
        print(f"  ‚úó Erro ao carregar {file_name}")



Carregando: test_arcaico_moderno.csv
  ‚úì Arquivo lido com encoding: latin-1
  ‚úì 9,222 textos carregados e pr√©-processados

Carregando: test_complexo_simples.csv
  ‚úì Arquivo lido com encoding: latin-1
  ‚úì 8,356 textos carregados e pr√©-processados

Carregando: test_literal_dinamico.csv
  ‚úì Arquivo lido com encoding: latin-1
  ‚úì 9,242 textos carregados e pr√©-processados


## Predi√ß√µes - ARCAICO vs MODERNO


In [25]:
print("="*80)
print("PREDI√á√ïES - ARCAICO vs MODERNO")
print("="*80)

# Identificar o melhor modelo
best_model_name = max(final_results.items(), key=lambda x: x[1]['holdout_acc'])[0]
best_pipeline = best_models[best_model_name]

print(f"\nMelhor modelo: {best_model_name}")
print(f"Acur√°cia Hold-Out: {final_results[best_model_name]['holdout_acc']*100:.2f}%")

# Carregar dados de teste
df_test = test_datasets["test_arcaico_moderno.csv"]
X_test_texts = df_test["text_preproc"].values

print(f"\nTotal de textos para predi√ß√£o: {len(X_test_texts):,}")

# Fazer predi√ß√µes
y_pred = best_pipeline.predict(X_test_texts)

# Obter os labels originais (arcaico/moderno)
df_train_original = pd.read_csv(os.path.join(BASE_FOLDER_TRAIN, "train_arcaico_moderno.csv"), sep=";", encoding='latin-1')
le = LabelEncoder()
le.fit(df_train_original["style"])

# Converter predi√ß√µes num√©ricas para labels
predicted_labels = le.inverse_transform(y_pred)

# Adicionar predi√ß√µes ao dataframe
df_test["predicted_style"] = predicted_labels

# Mostrar estat√≠sticas
print(f"\nüìä DISTRIBUI√á√ÉO DAS PREDI√á√ïES:")
pred_counts = pd.Series(predicted_labels).value_counts()
for label, count in pred_counts.items():
    pct = (count / len(predicted_labels)) * 100
    print(f"   ‚Ä¢ {label}: {count:,} ({pct:.2f}%)")

# Salvar resultado
output_path = "test_arcaico_moderno_predictions.csv"
df_output = df_test[["text", "predicted_style"]]
df_output.to_csv(output_path, sep=";", index=False, encoding='utf-8-sig')
print(f"\n‚úÖ Predi√ß√µes salvas em: {output_path}")
print("="*80)


PREDI√á√ïES - ARCAICO vs MODERNO

Melhor modelo: Logistic Regression
Acur√°cia Hold-Out: 84.73%

Total de textos para predi√ß√£o: 9,222

üìä DISTRIBUI√á√ÉO DAS PREDI√á√ïES:
   ‚Ä¢ moderno: 4,615 (50.04%)
   ‚Ä¢ arcaico: 4,607 (49.96%)

‚úÖ Predi√ß√µes salvas em: test_arcaico_moderno_predictions.csv


In [26]:
print("="*80)
print("PREDI√á√ïES - COMPLEXO vs SIMPLES")
print("="*80)

# Identificar o melhor modelo
best_model_name_cs = max(final_results_cs.items(), key=lambda x: x[1]['holdout_acc'])[0]
best_pipeline_cs = best_models_cs[best_model_name_cs]

print(f"\nMelhor modelo: {best_model_name_cs}")
print(f"Acur√°cia Hold-Out: {final_results_cs[best_model_name_cs]['holdout_acc']*100:.2f}%")

# Carregar dados de teste
df_test_cs = test_datasets["test_complexo_simples.csv"]
X_test_texts_cs = df_test_cs["text_preproc"].values

print(f"\nTotal de textos para predi√ß√£o: {len(X_test_texts_cs):,}")

# Fazer predi√ß√µes
y_pred_cs = best_pipeline_cs.predict(X_test_texts_cs)

# Obter os labels originais (complexo/simples)
df_train_original_cs = pd.read_csv(os.path.join(BASE_FOLDER_TRAIN, "train_complexo_simples.csv"), sep=";")
le_cs = LabelEncoder()
le_cs.fit(df_train_original_cs["style"])

# Converter predi√ß√µes num√©ricas para labels
predicted_labels_cs = le_cs.inverse_transform(y_pred_cs)

# Adicionar predi√ß√µes ao dataframe
df_test_cs["predicted_style"] = predicted_labels_cs

# Mostrar estat√≠sticas
print(f"\nüìä DISTRIBUI√á√ÉO DAS PREDI√á√ïES:")
pred_counts_cs = pd.Series(predicted_labels_cs).value_counts()
for label, count in pred_counts_cs.items():
    pct = (count / len(predicted_labels_cs)) * 100
    print(f"   ‚Ä¢ {label}: {count:,} ({pct:.2f}%)")

# Salvar resultado
output_path_cs = "test_complexo_simples_predictions.csv"
df_output_cs = df_test_cs[["text", "predicted_style"]]
df_output_cs.to_csv(output_path_cs, sep=";", index=False)
print(f"\n‚úÖ Predi√ß√µes salvas em: {output_path_cs}")
print("="*80)


PREDI√á√ïES - COMPLEXO vs SIMPLES

Melhor modelo: Logistic Regression
Acur√°cia Hold-Out: 83.11%

Total de textos para predi√ß√£o: 8,356

üìä DISTRIBUI√á√ÉO DAS PREDI√á√ïES:
   ‚Ä¢ simples: 4,275 (51.16%)
   ‚Ä¢ complexo: 4,081 (48.84%)

‚úÖ Predi√ß√µes salvas em: test_complexo_simples_predictions.csv


## Predi√ß√µes - LITERAL vs DIN√ÇMICO


In [27]:
print("="*80)
print("PREDI√á√ïES - LITERAL vs DIN√ÇMICO")
print("="*80)

# Identificar o melhor modelo
best_model_name_ld = max(final_results_ld.items(), key=lambda x: x[1]['holdout_acc'])[0]
best_pipeline_ld = best_models_ld[best_model_name_ld]

print(f"\nMelhor modelo: {best_model_name_ld}")
print(f"Acur√°cia Hold-Out: {final_results_ld[best_model_name_ld]['holdout_acc']*100:.2f}%")

# Carregar dados de teste
df_test_ld = test_datasets["test_literal_dinamico.csv"]
X_test_texts_ld = df_test_ld["text_preproc"].values

print(f"\nTotal de textos para predi√ß√£o: {len(X_test_texts_ld):,}")

# Fazer predi√ß√µes
y_pred_ld = best_pipeline_ld.predict(X_test_texts_ld)

# Obter os labels originais (literal/dinamico)
df_train_original_ld = pd.read_csv(os.path.join(BASE_FOLDER_TRAIN, "train_literal_dinamico.csv"), sep=";")
le_ld = LabelEncoder()
le_ld.fit(df_train_original_ld["style"])

# Converter predi√ß√µes num√©ricas para labels
predicted_labels_ld = le_ld.inverse_transform(y_pred_ld)

# Adicionar predi√ß√µes ao dataframe
df_test_ld["predicted_style"] = predicted_labels_ld

# Mostrar estat√≠sticas
print(f"\nüìä DISTRIBUI√á√ÉO DAS PREDI√á√ïES:")
pred_counts_ld = pd.Series(predicted_labels_ld).value_counts()
for label, count in pred_counts_ld.items():
    pct = (count / len(predicted_labels_ld)) * 100
    print(f"   ‚Ä¢ {label}: {count:,} ({pct:.2f}%)")

# Salvar resultado
output_path_ld = "test_literal_dinamico_predictions.csv"
df_output_ld = df_test_ld[["text", "predicted_style"]]
df_output_ld.to_csv(output_path_ld, sep=";", index=False)
print(f"\n‚úÖ Predi√ß√µes salvas em: {output_path_ld}")
print("="*80)


PREDI√á√ïES - LITERAL vs DIN√ÇMICO

Melhor modelo: SVM (LinearSVC)
Acur√°cia Hold-Out: 83.68%

Total de textos para predi√ß√£o: 9,242

üìä DISTRIBUI√á√ÉO DAS PREDI√á√ïES:
   ‚Ä¢ dinamico: 4,745 (51.34%)
   ‚Ä¢ literal: 4,497 (48.66%)

‚úÖ Predi√ß√µes salvas em: test_literal_dinamico_predictions.csv
