In [0]:
# ============================================
# AUTOML MANUAL COM SCIKIT-LEARN
# ============================================

import sys
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
import warnings
warnings.filterwarnings('ignore')

# 80-column rule reused by the banners printed in this stage.
banner_rule = "=" * 80

print(banner_rule)
print("AUTOML COM SCIKIT-LEARN")
print(banner_rule)

# ============================================
# STAGE 1: LOAD AND PREPARE DATA
# ============================================

print("\n" + banner_rule)
print("ETAPA 1: CARREGAMENTO E PREPARACAO DOS DADOS")
print(banner_rule)

SOURCE_TABLE = "finance_silver.transacoes_silver"
print(f"\nCarregando: {SOURCE_TABLE}")

# NOTE(review): `spark` is not defined in this file; presumably it is the
# session object injected by the notebook runtime - confirm.
df_silver = spark.read.table(SOURCE_TABLE)

# Target column first, then the raw input columns used to build the features.
selected_columns = [
    "categoria",
    "valor",
    "tipo",
    "dia_semana",
    "mes",
    "trimestre",
    "alto_valor",
]

# Rows without a target label are filtered out before collecting to pandas.
has_target = F.col("categoria").isNotNull()
df_pandas = df_silver.select(*selected_columns).filter(has_target).toPandas()

row_count = len(df_pandas)
input_column_count = len(df_pandas.columns) - 1  # every column except the target
class_count = df_pandas['categoria'].nunique()

print(f"\nDados carregados: {row_count:,} linhas")
print(f"Features: {input_column_count}")
print(f"Target: categoria ({class_count} classes)")

print("\nDistribuicao de Classes:")
print(df_pandas['categoria'].value_counts())

print("\nAmostra dos dados:")
# NOTE(review): `display` is not imported here; presumably a notebook built-in.
display(df_pandas.head())

# ============================================
# STAGE 2: FEATURES
# ============================================

print("\n" + "=" * 80)
print("ETAPA 2: PREPARACAO DE FEATURES")
print("=" * 80)

# Encode the `tipo` column as integer codes; the fitted encoder stays in `le`.
le = LabelEncoder().fit(df_pandas['tipo'])
df_pandas['tipo_encoded'] = le.transform(df_pandas['tipo'])

# Model inputs: the encoded `tipo` plus the remaining selected columns.
features = ['valor', 'tipo_encoded', 'dia_semana', 'mes', 'trimestre', 'alto_valor']
X = df_pandas[features]
y = df_pandas['categoria']

# 80/20 split, stratified on the target, with a fixed seed for reproducibility.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nDados divididos:")
print(f"Treino: {len(X_train):,} amostras")
print(f"Teste: {len(X_test):,} amostras")

# ============================================
# STAGE 3: COMPARE MULTIPLE MODELS
# ============================================

print("\n" + "=" * 80)
print("ETAPA 3: COMPARANDO MODELOS")
print("=" * 80)

# Candidate estimators, keyed by the display name used in the ranking table.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_jobs=-1),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# The count is derived from the dict so the message cannot drift from the
# actual candidate list.
print(f"\nTestando {len(models)} algoritmos")
print("=" * 80 + "\n")

results = []

for name, model in models.items():
    print(f"Treinando {name}")

    # Fit on the training split; this fitted instance is kept in the results
    # so later stages can reuse it without retraining.
    model.fit(X_train, y_train)

    # Hold-out metrics are reported for information only (see ranking below).
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # 5-fold cross-validation restricted to the training split.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'F1-Score': f1,
        'CV F1 Mean': cv_mean,
        'CV F1 Std': cv_std,
        'model_object': model
    })

    print(f"  Accuracy: {accuracy:.4f} | F1: {f1:.4f} | CV F1: {cv_mean:.4f} (+/- {cv_std:.4f})")

results_df = pd.DataFrame(results)

# Rank by the cross-validated score rather than the test-set score: picking
# the winner on the test set and then reporting that same test set as the
# final evaluation would make the reported metrics optimistically biased.
results_df = results_df.sort_values('CV F1 Mean', ascending=False)

print("\n" + "=" * 80)
print("RANKING DOS MODELOS")
print("=" * 80)
print(results_df[['Model', 'Accuracy', 'F1-Score', 'CV F1 Mean']].to_string(index=False))

# ============================================
# STAGE 4: TUNE THE BEST MODEL
# ============================================

print("\n" + "=" * 80)
print("ETAPA 4: TUNANDO MELHOR MODELO")
print("=" * 80)

# results_df is already sorted, so its first row is the top-ranked candidate.
top_ranked = results_df.iloc[0]
best_model_name = top_ranked['Model']
print(f"\nMelhor modelo: {best_model_name}")

# Search spaces keyed by model name: (factory for a fresh estimator, grid).
# Factories are used so that only the estimator actually needed is built.
tuning_specs = {
    'Random Forest': (
        lambda: RandomForestClassifier(random_state=42, n_jobs=-1),
        {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 15, 20],
            'min_samples_split': [2, 5, 10],
            'class_weight': ['balanced', None]
        },
    ),
    'Gradient Boosting': (
        lambda: GradientBoostingClassifier(random_state=42),
        {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        },
    ),
    'Logistic Regression': (
        lambda: LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1),
        {
            'C': [0.1, 1.0, 10.0],
            'penalty': ['l2'],
            'class_weight': ['balanced', None]
        },
    ),
}

selected_spec = tuning_specs.get(best_model_name)

if selected_spec is None:
    # No grid is defined for this model: reuse the instance that was fitted
    # during the comparison stage.
    print("Usando melhor modelo sem tuning adicional")
    best_model = top_ranked['model_object']
    param_grid = None
else:
    build_estimator, param_grid = selected_spec
    base_model = build_estimator()

    print("\nTunando hiperparametros com GridSearchCV")
    print("Isso pode levar varios minutos")

    # Exhaustive search over the grid, scored by weighted F1 with 5-fold CV.
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    print(f"\nMelhores parametros: {grid_search.best_params_}")
    print(f"Melhor F1-Score (CV): {grid_search.best_score_:.4f}")

# ============================================
# STAGE 5: EVALUATE THE FINAL MODEL
# ============================================

print("\n" + "=" * 80)
print("ETAPA 5: AVALIACAO DO MODELO FINAL")
print("=" * 80)

# Score the selected model on the held-out test split.
y_pred_final = best_model.predict(X_test)

accuracy_final = accuracy_score(y_test, y_pred_final)
f1_final = f1_score(y_test, y_pred_final, average='weighted')

print(f"\nMetricas no conjunto de teste:")
print(f"Accuracy: {accuracy_final:.4f}")
print(f"F1-Score: {f1_final:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_final))

# Fix the label order explicitly. Without `labels=`, confusion_matrix only
# covers the classes that appear in y_test or the predictions, which can be
# fewer than the classes in `y`; the heatmap ticks would then be attached to
# the wrong rows/columns.
class_labels = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred_final, labels=class_labels)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title(f'Matriz de Confusao - {best_model_name}')
plt.ylabel('Real')
plt.xlabel('Previsto')
plt.tight_layout()
# Saved to disk so the MLflow stage can attach it as an artifact.
plt.savefig('/tmp/automl_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# ============================================
# STAGE 6: FEATURE IMPORTANCE
# ============================================

print("\n" + "=" * 80)
print("ETAPA 6: IMPORTANCIA DAS FEATURES")
print("=" * 80)

if not hasattr(best_model, 'feature_importances_'):
    # Only estimators exposing `feature_importances_` can be reported here.
    print("\nFeature importance nao disponivel para este modelo")
else:
    importances = best_model.feature_importances_

    # One row per feature, most important first.
    feature_importance_df = pd.DataFrame({
        'feature': features,
        'importance': importances
    })
    feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

    print("\n")
    print(feature_importance_df.to_string(index=False))

    # Horizontal bar chart; the y axis is inverted so the top-ranked feature
    # is drawn at the top.
    bar_positions = range(len(feature_importance_df))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(bar_positions, feature_importance_df['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(feature_importance_df['feature'])
    ax.set_xlabel('Importancia')
    ax.set_title('Feature Importance')
    ax.invert_yaxis()
    plt.tight_layout()
    plt.savefig('/tmp/automl_feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

# ============================================
# STAGE 7: SAVE THE MODEL TO MLFLOW
# ============================================

print("\n" + "=" * 80)
print("ETAPA 7: SALVANDO MODELO NO MLFLOW")
print("=" * 80)

mlflow.set_experiment("/Users/andre.bomfim99@gmail.com/finance-ml-experiments")

with mlflow.start_run(run_name="AutoML_ScikitLearn") as run:

    # Run-level context: which model won and the size of the dataset.
    mlflow.log_param("best_model", best_model_name)
    mlflow.log_param("n_samples", len(df_pandas))
    mlflow.log_param("n_features", len(features))
    mlflow.log_param("n_classes", y.nunique())

    # Tuned hyper-parameters exist only when a grid search actually ran;
    # `param_grid` is checked first so `grid_search` is never evaluated
    # when it was not created.
    if param_grid and hasattr(grid_search, 'best_params_'):
        for param, value in grid_search.best_params_.items():
            mlflow.log_param(f"tuned_{param}", value)

    mlflow.log_metric("accuracy", accuracy_final)
    mlflow.log_metric("f1_score", f1_final)

    # The model consumes `tipo_encoded`, but the fitted LabelEncoder is not
    # part of the logged model. Persist the label -> code mapping next to it
    # so new raw `tipo` values can be encoded the same way at inference time.
    tipo_encoding = {
        str(label): int(code) for code, label in enumerate(le.classes_)
    }
    mlflow.log_dict(tipo_encoding, "tipo_label_encoding.json")

    mlflow.sklearn.log_model(
        best_model,
        "automl_model",
        input_example=X_test.head(1)
    )

    mlflow.log_artifact('/tmp/automl_confusion_matrix.png')

    # The importance plot is only written for models that expose
    # `feature_importances_`, so only attach it in that case.
    if hasattr(best_model, 'feature_importances_'):
        mlflow.log_artifact('/tmp/automl_feature_importance.png')

    print(f"\nModelo salvo no MLflow")
    print(f"Run ID: {run.info.run_id}")

# ============================================
# STAGE 8: HOW TO USE THE MODEL
# ============================================

print("\n" + "=" * 80)
print("ETAPA 8: COMO USAR O MODELO")
print("=" * 80)

# Template for the usage snippet shown to the user. `{run_id}` is filled in
# below; doubled braces are literal braces in the rendered text.
usage_template = """
import mlflow

# Carregar modelo
model_uri = "runs:/{run_id}/automl_model"
modelo = mlflow.sklearn.load_model(model_uri)

# Preparar novos dados
import pandas as pd

df_new = pd.DataFrame({{
    'valor': [1000, 5000],
    'tipo_encoded': [1, 0],
    'dia_semana': [1, 5],
    'mes': [6, 12],
    'trimestre': [2, 4],
    'alto_valor': [0, 1]
}})

# Fazer predicoes
predicoes = modelo.predict(df_new)
print(predicoes)
"""

print("\nExemplo de uso:")
print(usage_template.format(run_id=run.info.run_id))

# ============================================
# FINAL SUMMARY
# ============================================

closing_rule = "=" * 80

# Every line of the closing report, in print order; emitted with one call.
summary_lines = [
    "\n" + closing_rule,
    "AUTOML CONCLUIDO",
    closing_rule,
    "\nResumo Executivo:",
    f"Dataset: {len(df_pandas):,} transacoes",
    f"Classes: {y.nunique()}",
    f"Modelos testados: {len(models)}",
    f"Melhor modelo: {best_model_name}",
    f"Accuracy: {accuracy_final:.2%}",
    f"F1-Score: {f1_final:.2%}",
    f"MLflow Run ID: {run.info.run_id}",
    "\nProximos passos:",
    "1. Analisar feature importance",
    "2. Testar modelo em novos dados",
    "3. Registrar no Model Registry",
    "4. Monitorar performance em producao",
    "\n" + closing_rule,
]

print("\n".join(summary_lines))