In [0]:
# ============================================
# Pipeline Machine Learning 3 modelos 
# 1. Random Forest - Classificação de Categorias
# 2. Isolation Forest - Detecção de Anomalias
# 3. K-Means - Clustering de Padrões
# ============================================

import mlflow
import mlflow.sklearn
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, silhouette_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Opening banner for the notebook run: blank line, rule, indented title, rule.
print(
    "\n" + "=" * 80,
    " " * 20 + "Pipeline Analise Financeira",
    "=" * 80,
    sep="\n",
)


In [0]:
# ============================================
# Step 1: configure MLflow
# ============================================
# Selects (creating it when needed) the MLflow experiment every run in this
# notebook is recorded under. If the workspace path cannot be used, falls back
# to a plain experiment name so the rest of the pipeline can still run.

print("\n" + "=" * 80)
print("Etapa 1 Mlflow")
print("=" * 80)

experiment_path = "/Users/andre.bomfim99@gmail.com/finance-ml-experiments"

try:
    # Look the experiment up explicitly instead of inferring "already exists"
    # from the wording of an exception message.
    existing_experiment = mlflow.get_experiment_by_name(experiment_path)
    if existing_experiment is not None:
        experiment_id = existing_experiment.experiment_id
        print(f"  Experimento já existe: {experiment_path}")
    else:
        experiment_id = mlflow.create_experiment(experiment_path)
        print(f" Experimento criado: {experiment_path}")
        print(f"   ID: {experiment_id}")
except mlflow.exceptions.MlflowException as e:
    if "already exists" in str(e):
        # Another process created the experiment between the lookup and the
        # create call; it is usable as-is.
        print(f"  Experimento já existe: {experiment_path}")
    else:
        print(f"  Erro ao criar experimento: {e}")
        experiment_path = "finance-ml-project"
        print(f"   Usando nome simples: {experiment_path}")

# set_experiment is called with whichever name survived the block above.
mlflow.set_experiment(experiment_path)
print(" MLflow configurado ok")


In [0]:
# ============================================
# Step 2: load the data
# ============================================
# Reads the source table into the Spark DataFrame `df` and reports its size.

SOURCE_TABLE = "finance_silver.transacoes_silver"

print("\n" + "=" * 80, "Etapa 2: carrega os dados", "=" * 80, sep="\n")
print(f"\n Carregando: {SOURCE_TABLE}")

df = spark.table(SOURCE_TABLE)
total_records = df.count()  # triggers a Spark job over the table

print(f" {total_records:,} transações carregadas")

In [0]:
# ============================================
# Step 3: feature engineering
# ============================================
# Collects the needed columns to pandas and derives the model features:
# description length, keyword flags, numeric transaction type, signed log of
# the amount and an end-of-month flag. The result is exposed as
# `df_classification` for the modelling cells.

print("\n" + "=" * 80)
print(" Etapa 3: Feature eng")
print("=" * 80)

df_pandas = df.select(
    "data", "descricao", "valor", "tipo", "categoria",
    "ano", "mes", "dia_semana", "trimestre"
).toPandas()

print(f"\n Dataframe Pandas criado: {len(df_pandas):,} linhas")

# Missing descriptions are treated as empty text so the length feature is 0
# instead of NaN and the keyword flags are simply 0.
descricao_txt = df_pandas['descricao'].fillna('').astype(str)
df_pandas['descricao_length'] = descricao_txt.str.len()

keywords = {
    'salario': ['SALARIO', 'PAGAMENTO', 'FUNCIONARIO'],
    'fornecedor': ['FORNECEDOR', 'TED', 'COMPRA'],
    'receita': ['RECEBIDO', 'CLIENTE', 'VENDA'],
    'imposto': ['INSS', 'IR', 'ISS', 'IMPOSTO']
}

# Keywords this short are acronyms and must match as whole tokens only.
ACRONYM_MAX_LEN = 4


def _keyword_pattern(words):
    """Build one regex alternation for a keyword group.

    Short acronyms (e.g. 'IR', 'ISS', 'TED') are wrapped in alphanumeric
    lookarounds so they do not fire inside longer words such as
    'FINANCEIRO' or 'COMISSAO'. Longer keywords keep plain substring
    matching so plurals like 'SALARIOS' still match. Keywords are assumed
    to be plain alphanumeric text (no regex metacharacters).
    """
    parts = []
    for word in words:
        if len(word) <= ACRONYM_MAX_LEN:
            parts.append(f"(?<![0-9A-Za-z]){word}(?![0-9A-Za-z])")
        else:
            parts.append(word)
    return "|".join(parts)


for key, words in keywords.items():
    # Vectorised, case-insensitive match; one 0/1 column per keyword group.
    df_pandas[f'keyword_{key}'] = descricao_txt.str.contains(
        _keyword_pattern(words), case=False, regex=True, na=False
    ).astype(int)

# Normalise case/whitespace before mapping so variants such as 'Entrada' or
# 'saída' are not silently turned into NaN.
tipo_normalizado = df_pandas['tipo'].astype(str).str.strip().str.lower()
df_pandas['tipo_num'] = tipo_normalizado.map({'entrada': 1, 'saida': 0, 'saída': 0})
tipo_nao_mapeado = int(df_pandas['tipo_num'].isna().sum())
if tipo_nao_mapeado:
    # Surface the problem here instead of letting NaN reach the models unnoticed.
    print(f"   Aviso: {tipo_nao_mapeado:,} registros com 'tipo' não mapeado (tipo_num = NaN)")

# Signed log1p: identical to log1p for non-negative amounts, but stays finite
# for negative ones (plain log1p gives NaN for values <= -1). to_numeric also
# covers amounts that arrive as non-float objects - presumably possible when
# the Spark column is a decimal type; confirm against the table schema.
valor_num = pd.to_numeric(df_pandas['valor'])
df_pandas['valor_log'] = np.sign(valor_num) * np.log1p(valor_num.abs())

df_pandas['dia'] = pd.to_datetime(df_pandas['data']).dt.day
df_pandas['fim_mes'] = (df_pandas['dia'] > 25).astype(int)  # 1 for days 26-31

print("\n Features criadas:")
features_created = [
    'descricao_length', 'keyword_salario', 'keyword_fornecedor',
    'keyword_receita', 'keyword_imposto', 'tipo_num', 'valor_log', 'fim_mes'
]
for i, feat in enumerate(features_created, 1):
    print(f"   {i}. {feat}")

# Work on a copy so later cells can add columns without touching df_pandas.
df_classification = df_pandas.copy()
print(f"\n Dataset preparado: {len(df_classification):,} registros")

In [0]:
# ============================================
# Step 4 - Model 1: Random Forest (category classification)
# ============================================
# Trains a RandomForestClassifier to predict `categoria`, logs parameters,
# metrics, the model and two diagnostic figures to MLflow.
# Leaves `accuracy`, `y` and `report_dict` defined for the summary cell.

print("\n" + "=" * 80)
print(" Etapa 4 Modelo 1 Random Forest ")
print("=" * 80)

features = [
    'valor', 'valor_log', 'tipo_num', 'dia_semana', 'mes', 'trimestre',
    'descricao_length', 'keyword_salario', 'keyword_fornecedor',
    'keyword_receita', 'keyword_imposto', 'fim_mes'
]

X = df_classification[features]
y = df_classification['categoria']

print("\n Informações do Dataset:")
print(f"   Features: {len(features)}")
print(f"   Samples: {len(X):,}")
print(f"   Classes: {y.nunique()}")

print("\n Distribuição de Classes:")
class_dist = y.value_counts()
for categoria, count in class_dist.items():
    print(f"   {categoria:20s}: {count:6,} ({count/len(y)*100:5.1f}%)")

# Stratified 80/20 split keeps the class proportions in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n  Split de Dados:")
print(f"   Train: {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"   Test:  {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)")

# Single source of truth for the hyperparameters: the same dict builds the
# estimator and is logged to MLflow, so the logged values cannot drift from
# the ones actually used.
rf_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}


def _metric_key(prefix, label):
    """Return an MLflow metric name for a class label.

    Characters other than alphanumerics, '_', '-', '.', ' ' and '/' are
    replaced by '_' (MLflow restricts metric names to roughly this set);
    labels that are already valid are left unchanged.
    """
    allowed = "_-. /"
    safe = "".join(ch if (ch.isalnum() or ch in allowed) else "_" for ch in str(label))
    return f"{prefix}_{safe}"


with mlflow.start_run(run_name="1_RandomForest_Classification") as run:

    print("\n Treinando Random Forest")

    model_rf = RandomForestClassifier(**rf_params)
    model_rf.fit(X_train, y_train)
    y_pred = model_rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("\n modelo treinado")
    print(f"   Acurácia: {accuracy:.2%}")

    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_params(rf_params)
    mlflow.log_param("features_count", len(features))
    mlflow.log_metric("accuracy", accuracy)

    # Per-class precision/recall/F1, logged in a single batch.
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    per_class_metrics = {}
    for categoria in y.unique():
        if categoria in report_dict:
            class_scores = report_dict[categoria]
            per_class_metrics[_metric_key("precision", categoria)] = class_scores['precision']
            per_class_metrics[_metric_key("recall", categoria)] = class_scores['recall']
            per_class_metrics[_metric_key("f1", categoria)] = class_scores['f1-score']
    if per_class_metrics:
        mlflow.log_metrics(per_class_metrics)

    mlflow.sklearn.log_model(model_rf, "random_forest_model", input_example=X_train.head(1))
    print(f"   MLflow Run ID: {run.info.run_id}")
    print("\n Classification Report:")
    print("-" * 80)
    print(classification_report(y_test, y_pred))

    # Confusion matrix; rows/columns follow model_rf.classes_ order.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=model_rf.classes_, yticklabels=model_rf.classes_,
                cbar_kws={'label': 'Quantidade'})
    plt.title('Matriz de Confusão Random Forest', fontsize=16, pad=20)
    plt.ylabel('Categoria Real', fontsize=12)
    plt.xlabel('Categoria Predita', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('/tmp/rf_confusion_matrix.png', dpi=300, bbox_inches='tight')
    mlflow.log_artifact('/tmp/rf_confusion_matrix.png')
    plt.show()
    plt.close()  # release the figure once it has been saved and shown

    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': model_rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n Top 5 Features Mais Importantes:")
    print(feature_importance.head(5).to_string(index=False))

    plt.figure(figsize=(10, 6))
    top_features = feature_importance.head(10)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Importância', fontsize=12)
    plt.title('Top 10 Features - Random Forest', fontsize=14, pad=20)
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.savefig('/tmp/rf_feature_importance.png', dpi=300, bbox_inches='tight')
    mlflow.log_artifact('/tmp/rf_feature_importance.png')
    plt.show()
    plt.close()

print("\n Modelo 1 Random Forest concluído")

In [0]:


# ============================================
# Step 5 - Model 2: Isolation Forest (anomaly detection)
# ============================================
# Fits an IsolationForest on standardised amount/weekday/month features,
# stores the label (-1 = anomaly, 1 = normal) and score on df_classification,
# and logs the model, scaler, metrics and a diagnostic figure to MLflow.
# Leaves `num_anomalias` and `pct_anomalias` defined for the summary cell.

print("\n" + "=" * 80)
print(" Etapa 5 Modelo 2 isolation forest detectando anomalias")
print("=" * 80)

anomaly_features = ['valor', 'dia_semana', 'mes']
X_anomaly = df_classification[anomaly_features].copy()

scaler = StandardScaler()
X_anomaly_scaled = scaler.fit_transform(X_anomaly)

print("\n Informações:")
print(f"   Features: {anomaly_features}")
print(f"   Samples: {len(X_anomaly):,}")

# One dict drives both the estimator and the logged parameters.
iso_params = {
    'contamination': 0.05,
    'random_state': 42,
    'n_estimators': 100,
    'n_jobs': -1,
}

with mlflow.start_run(run_name="2_IsolationForest_Anomaly") as run:

    print("\n Treinando Isolation Forest")

    iso_forest = IsolationForest(**iso_params)

    predictions = iso_forest.fit_predict(X_anomaly_scaled)
    scores = iso_forest.score_samples(X_anomaly_scaled)  # lower = more anomalous

    df_classification['anomalia'] = predictions
    df_classification['anomaly_score'] = scores

    # Cast to plain Python numbers so formatting and logging do not depend on
    # numpy scalar types.
    num_anomalias = int((predictions == -1).sum())
    pct_anomalias = float(num_anomalias / len(predictions) * 100)

    print("\n Modelo treinado")
    print(f"   Anomalias detectadas: {num_anomalias:,} ({pct_anomalias:.1f}%)")

    mlflow.log_param("model_type", "IsolationForest")
    mlflow.log_param("contamination", iso_params['contamination'])
    mlflow.log_param("n_estimators", iso_params['n_estimators'])
    mlflow.log_param("features", str(anomaly_features))
    mlflow.log_metric("anomalias_detectadas", num_anomalias)
    mlflow.log_metric("pct_anomalias", pct_anomalias)
    mlflow.sklearn.log_model(iso_forest, "isolation_forest_model")
    mlflow.sklearn.log_model(scaler, "scaler")

    print(f"   MLflow Run ID: {run.info.run_id}")

    print("\n 10 Transações Mais Anômalas:")
    anomalias_df = df_classification[df_classification['anomalia'] == -1].copy()
    anomalias_df = anomalias_df.sort_values('anomaly_score')

    display(anomalias_df[[
        'data', 'descricao', 'valor', 'categoria', 'anomaly_score'
    ]].head(10))

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: score distribution with the decision threshold.
    axes[0].hist(scores, bins=50, edgecolor='black', alpha=0.7, color='skyblue')
    # offset_ is the model's own boundary in score_samples space (samples
    # scoring below it are labelled -1). Using it avoids taking max() of an
    # empty selection when no sample is flagged.
    axes[0].axvline(x=iso_forest.offset_, color='red',
                    linestyle='--', linewidth=2, label='Threshold Anomalia')
    axes[0].set_xlabel('Anomaly Score', fontsize=11)
    axes[0].set_ylabel('Frequência', fontsize=11)
    axes[0].set_title('Distribuição dos Anomaly Scores', fontsize=13, fontweight='bold')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Right panel: number of flagged transactions per category.
    anomaly_by_category = df_classification.groupby('categoria')['anomalia'].apply(
        lambda x: (x == -1).sum()
    ).sort_values(ascending=False)
    anomaly_by_category.plot(kind='bar', ax=axes[1], color='coral', edgecolor='black')
    axes[1].set_xlabel('Categoria', fontsize=11)
    axes[1].set_ylabel('Número de Anomalias', fontsize=11)
    axes[1].set_title('Anomalias por Categoria', fontsize=13, fontweight='bold')
    axes[1].tick_params(axis='x', rotation=45)
    axes[1].grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig('/tmp/anomaly_analysis.png', dpi=300, bbox_inches='tight')
    mlflow.log_artifact('/tmp/anomaly_analysis.png')
    plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown

print("\n Isolation Forest concluído")

In [0]:


# ============================================
# Step 6 - Model 3: K-Means (clustering of category patterns)
# ============================================
# Aggregates transactions per category, clusters the categories on their
# standardised amount statistics and logs the model, scaler, metrics, a
# four-panel figure and the cluster assignment CSV to MLflow.
# Leaves `kmeans`, `silhouette` and `df_category_agg` defined for the summary.

print("\n" + "=" * 80)
print(" ETAPA 6/6: K-MEANS - CLUSTERING DE PADRÕES")
print("=" * 80)
df_category_agg = df_classification.groupby('categoria').agg({
    'valor': ['mean', 'sum', 'count', 'std'],
    'tipo_num': 'mean'
}).reset_index()

# Flatten the two-level column index: ('valor', 'mean') -> 'valor_mean',
# ('categoria', '') -> 'categoria'.
df_category_agg.columns = ['_'.join(col).strip('_') for col in df_category_agg.columns.values]

n_categorias = len(df_category_agg)
print(f"\n Categorias para clustering: {n_categorias}")
cluster_features = ['valor_mean', 'valor_sum', 'valor_count', 'valor_std']
# std is NaN for categories with a single transaction; treat it as 0 spread.
X_cluster = df_category_agg[cluster_features].fillna(0)
scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)

print(f"   Features: {cluster_features}")
print(f"   Dados normalizados: {X_cluster_scaled.shape}")

# KMeans cannot build more clusters than there are samples (categories).
N_CLUSTERS_ALVO = 3
n_clusters = min(N_CLUSTERS_ALVO, n_categorias)
if n_clusters < N_CLUSTERS_ALVO:
    print(f"   Aviso: apenas {n_categorias} categorias; usando {n_clusters} clusters")


def _plot_cluster_scatter(ax, data, x_col, y_col, xlabel, ylabel, title):
    """Draw one scatter panel of categories coloured by cluster on ``ax``.

    Each point is one row of ``data`` (one category), annotated with its
    category name; a colour bar for the cluster id is attached to the axes.
    """
    scatter = ax.scatter(
        data[x_col],
        data[y_col],
        c=data['cluster'],
        cmap='viridis',
        s=200,
        alpha=0.7,
        edgecolors='black',
        linewidth=2
    )
    for _, row in data.iterrows():
        ax.annotate(row['categoria'], (row[x_col], row[y_col]),
                    fontsize=9, ha='center', fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    plt.colorbar(scatter, ax=ax, label='Cluster')
    ax.grid(True, alpha=0.3)


with mlflow.start_run(run_name="3_KMeans_Clustering") as run:

    print("\n Treinando K-Means...")

    kmeans = KMeans(
        n_clusters=n_clusters,
        random_state=42,
        n_init=10,
        max_iter=300
    )
    clusters = kmeans.fit_predict(X_cluster_scaled)

    df_category_agg['cluster'] = clusters

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1;
    # outside that range it raises, so fall back to NaN instead of aborting.
    n_labels = len(set(clusters))
    if 2 <= n_labels <= n_categorias - 1:
        silhouette = silhouette_score(X_cluster_scaled, clusters)
    else:
        silhouette = float('nan')
        print("   Aviso: silhouette score indefinido para este número de categorias/clusters")

    print("\n Modelo treinado!")
    print(f"   Clusters criados: {n_clusters}")
    print(f"   Inertia: {kmeans.inertia_:.2f}")
    print(f"   Silhouette Score: {silhouette:.3f}")

    mlflow.log_param("model_type", "KMeans")
    mlflow.log_param("n_clusters", n_clusters)
    mlflow.log_param("features", str(cluster_features))
    mlflow.log_metric("inertia", kmeans.inertia_)
    if not np.isnan(silhouette):
        mlflow.log_metric("silhouette_score", silhouette)
    mlflow.sklearn.log_model(kmeans, "kmeans_model")
    mlflow.sklearn.log_model(scaler_cluster, "scaler_cluster")

    print(f"   MLflow Run ID: {run.info.run_id}")
    print("\n Clusters Identificados:")
    for cluster_id in sorted(df_category_agg['cluster'].unique()):
        cluster_data = df_category_agg[df_category_agg['cluster'] == cluster_id]
        categorias = cluster_data['categoria'].tolist()

        print(f"\n   Cluster {cluster_id}: {', '.join(categorias)}")
        print(f"      Valor médio: R$ {cluster_data['valor_mean'].mean():,.2f}")
        print(f"      Transações: {cluster_data['valor_count'].sum():.0f}")

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Top row: the two scatter views of the categories.
    _plot_cluster_scatter(
        axes[0, 0], df_category_agg, 'valor_mean', 'valor_sum',
        'Valor Médio (R$)', 'Volume Total (R$)',
        'Clusters: Valor Médio vs Volume Total'
    )
    _plot_cluster_scatter(
        axes[0, 1], df_category_agg, 'valor_count', 'valor_std',
        'Quantidade de Transações', 'Desvio Padrão (R$)',
        'Clusters: Quantidade vs Variabilidade'
    )

    # Bottom-left: how many categories landed in each cluster.
    ax3 = axes[1, 0]
    cluster_counts = df_category_agg['cluster'].value_counts().sort_index()
    colors = plt.cm.viridis(np.linspace(0, 1, len(cluster_counts)))
    ax3.bar(cluster_counts.index, cluster_counts.values, color=colors, edgecolor='black', linewidth=2)
    ax3.set_xlabel('Cluster', fontsize=11)
    ax3.set_ylabel('Número de Categorias', fontsize=11)
    ax3.set_title('Distribuição por Cluster', fontsize=12, fontweight='bold')
    ax3.set_xticks(cluster_counts.index)
    ax3.grid(True, alpha=0.3, axis='y')

    # Bottom-right: mean of each (unscaled) feature per cluster.
    ax4 = axes[1, 1]
    cluster_profiles = df_category_agg.groupby('cluster')[cluster_features].mean()
    sns.heatmap(cluster_profiles.T, annot=True, fmt='.0f', cmap='YlOrRd',
                cbar_kws={'label': 'Valor'}, ax=ax4, linewidths=1, linecolor='black')
    ax4.set_title('Perfil Médio dos Clusters', fontsize=12, fontweight='bold')
    ax4.set_xlabel('Cluster', fontsize=11)
    ax4.set_ylabel('Features', fontsize=11)

    plt.tight_layout()
    plt.savefig('/tmp/kmeans_clusters.png', dpi=300, bbox_inches='tight')
    mlflow.log_artifact('/tmp/kmeans_clusters.png')
    plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown

    df_category_agg.to_csv('/tmp/clusters_resultado.csv', index=False)
    mlflow.log_artifact('/tmp/clusters_resultado.csv')

print("\n Modelo K-Means concluído")

In [0]:
# ============================================
# Wrap-up: executive summary
# ============================================
# Prints the headline numbers of the three models. Relies on variables left
# by the previous cells (accuracy, y, report_dict, num_anomalias,
# pct_anomalias, kmeans, silhouette, df_category_agg, experiment_path), so
# the notebook must be run top to bottom.

SEPARADOR = f"   {'─' * 76}"

print("\n" + "=" * 80)
print(" " * 25 + " Pipeline concluido e ok ")
print("=" * 80)

print("\n Resumo Executivo:")
print(SEPARADOR)
print(f"   Total de transações processadas: {len(df_classification):,}")
print(SEPARADOR)
print("\n    Modelo 1 random forest classificação")
print(f"      Acurácia: {accuracy:.2%}")
print(f"      Classes: {y.nunique()}")
print(f"      Macro F1 Score: {report_dict['macro avg']['f1-score']:.2%}")
print(f"      Weighted F1 Score: {report_dict['weighted avg']['f1-score']:.2%}")
print(SEPARADOR)
print("\n    Modelo 2 isolation forest anomalias")
print(f"      Anomalias detectadas: {num_anomalias:,} ({pct_anomalias:.1f}%)")
print(f"      Transações normais: {len(df_classification) - num_anomalias:,}")
print(SEPARADOR)
print("\n    Modelo 3 kmeans ou clustering")
# Report the cluster count of the fitted model rather than a literal.
print(f"      Clusters criados: {kmeans.n_clusters}")
print(f"      Silhouette Score: {silhouette:.3f}")
print(f"      Categorias agrupadas: {len(df_category_agg)}")
print(SEPARADOR)
print(f"\n    Experimento mlflow: {experiment_path}")
# Four PNG files are logged (confusion matrix, feature importance, anomaly
# analysis, k-means clusters); together they contain eight charts.
print("    Artefatos salvos: 4 imagens (8 gráficos) + 1 CSV")
print(SEPARADOR)

print("\n" + "=" * 80)
print(" " * 20 + " Todos os modelos ok e treinados")
print("=" * 80 + "\n")