In [None]:
import pandas as pd
import numpy as np
import mlflow
import dagshub
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import mlflow.sklearn
import mlflow.catboost
import mlflow.xgboost
import mlflow.lightgbm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score
)
from mlflow.models import infer_signature
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the training and test splits (train first, then test) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../references/exemplo_train.csv', '../references/exemplo_test.csv')
)

# Report the (rows, columns) shape of each split.
print(f"Dados de treino: {train_df.shape}")
print(f"Dados de teste: {test_df.shape}")

# Preview the first rows; left as the cell's last expression so it is displayed.
train_df.head()


In [None]:
# General overview of the training data: schema/dtypes and target distribution.
print("Informações dos dados de treino:")
# DataFrame.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print() -- doing so appends a stray "None" line to the output.
train_df.info()
print("\nValores únicos da variável alvo:")
print(train_df['Credit_Score'].value_counts())
print("\nDistribuição percentual:")
print(train_df['Credit_Score'].value_counts(normalize=True) * 100)


In [None]:
# Missing values per column: absolute counts first, then as a percentage of all rows.
print("Valores nulos por coluna:")
print(train_df.isna().sum())
print("\nPercentual de valores nulos:")
print(train_df.isna().sum().div(len(train_df)).mul(100))


In [None]:
def preprocess_data(df, is_training=True):
    """
    Clean a raw credit-score DataFrame and return the cleaned copy.

    The input frame is copied first and is never modified. The steps are:

    1. Drop the identifier columns 'ID', 'Customer_ID', 'Name' and 'SSN'
       (only those that are present).
    2. Convert the known numeric columns with ``pd.to_numeric(errors='coerce')``,
       so every value that cannot be parsed as a number becomes NaN.
    3. Set 'Age' values outside the closed range [0, 100] to NaN.
    4. Replace the known placeholder tokens in the categorical columns with NaN.

    Listed columns that do not exist in ``df`` are skipped silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data to clean.
    is_training : bool, default True
        Currently unused by the function body; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df`` without the identifier columns.
    """
    df_processed = df.copy()

    # Identifier columns carry no predictive signal; drop whichever ones exist.
    columns_to_drop = ['ID', 'Customer_ID', 'Name', 'SSN']
    df_processed = df_processed.drop(columns=[col for col in columns_to_drop if col in df_processed.columns])

    # Columns expected to be numeric; unparseable entries are coerced to NaN.
    numeric_columns = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
                      'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
                      'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries',
                      'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Total_EMI_per_month',
                      'Amount_invested_monthly', 'Monthly_Balance']

    for col in numeric_columns:
        if col in df_processed.columns:
            df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

    # Ages below 0 or above 100 are treated as invalid. `between` is inclusive on
    # both ends and False for NaN, so `where` keeps valid ages and yields NaN
    # everywhere else -- a vectorised equivalent of a per-row range check.
    if 'Age' in df_processed.columns:
        age = df_processed['Age']
        df_processed['Age'] = age.where(age.between(0, 100))

    # Placeholder tokens found in the categorical columns are mapped to NaN.
    categorical_columns = ['Month', 'Occupation', 'Type_of_Loan', 'Credit_Mix',
                          'Credit_History_Age', 'Payment_of_Min_Amount', 'Payment_Behaviour']
    placeholder_values = ['_', '!@9#%8', '#F%$D@*&8', '_______', 'NA']

    for col in categorical_columns:
        if col in df_processed.columns:
            df_processed[col] = df_processed[col].replace(placeholder_values, np.nan)

    return df_processed

# Run the cleaning step on both splits.
train_processed = preprocess_data(df=train_df, is_training=True)
test_processed = preprocess_data(df=test_df, is_training=False)

# Report the resulting shapes and the null counts left in the training split.
print("Dados após pré-processamento:")
print(f"Treino: {train_processed.shape}")
print(f"Teste: {test_processed.shape}")
print("\nValores nulos após limpeza:")
print(train_processed.isna().sum())


In [None]:
# Split the cleaned training frame into the feature matrix and the target vector.
features = train_processed.columns.tolist()
features.remove('Credit_Score')  # every column except the target is a feature

X = train_processed.loc[:, features]
y = train_processed.loc[:, 'Credit_Score']

# Summarise the selected features and the class balance of the target.
print(f"Número de features: {len(features)}")
print(f"Features: {features}")
print(f"\nDistribuição da variável alvo:")
print(y.value_counts())


In [None]:
# Build the preprocessing pipeline.
# Columns are routed by dtype: numeric dtypes vs. object (string-like) dtypes.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = list(X.select_dtypes(include=[object]).columns)

print(f"Features numéricas: {numeric_features}")
print(f"Features categóricas: {categorical_features}")

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode
# (dense output; categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Stratified 70/30 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"\nTamanhos após divisão:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")


In [None]:
# MLflow configuration: initialise the DagsHub integration for the repository
# below, point MLflow at that repository's tracking URI, then enable autologging.
# The three calls are order-dependent and are kept exactly as written.
dagshub.init(repo_owner="domires", repo_name="fiap-mlops-score-model", mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/domires/fiap-mlops-score-model.mlflow")
mlflow.autolog()

print("MLflow configurado!")


In [None]:
def evaluate_and_log_classification_model(kind, model_name, pipeline, X_val, y_val):
    """
    Evaluate a fitted classification pipeline on validation data and log it to MLflow.

    Computes accuracy and weighted precision/recall/F1 from ``pipeline.predict``,
    plus AUC-ROC from ``pipeline.predict_proba`` when that method exists. The
    metrics are logged to the active MLflow run, the model is logged under
    ``model_name``, and a short summary is printed.

    Parameters
    ----------
    kind : str
        Selects the MLflow flavor. "catboost", "xgboost" and "lightgbm" log only
        ``pipeline.named_steps['classifier']`` with the matching flavor; any other
        value logs the whole pipeline with the sklearn flavor.
    model_name : str
        Passed to the MLflow ``log_model`` call as the model's artifact name.
    pipeline : fitted estimator
        Must expose ``predict``. For the native flavors it must be a Pipeline
        with a step named 'classifier'.
    X_val, y_val
        Validation features and true labels.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1_score' and 'auc_roc'.
        'auc_roc' is 0.0 when probabilities are unavailable or the score cannot
        be computed.

    Notes
    -----
    For the native flavors the logged artifact is the bare classifier, so its
    signature and input example describe the output of the preceding pipeline
    steps; callers of that artifact must apply the preprocessing themselves.
    """
    predictions = pipeline.predict(X_val)
    prediction_proba = pipeline.predict_proba(X_val) if hasattr(pipeline, "predict_proba") else None

    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions, average='weighted')
    recall = recall_score(y_val, predictions, average='weighted')
    f1 = f1_score(y_val, predictions, average='weighted')

    # AUC-ROC is best-effort: fall back to 0.0 but say why, and do not swallow
    # KeyboardInterrupt/SystemExit the way a bare `except:` would.
    auc_roc = 0.0
    if prediction_proba is not None:
        try:
            if prediction_proba.shape[1] == 2:
                # Binary case: roc_auc_score expects the 1-D scores of the
                # class with the greater label (second predict_proba column).
                auc_roc = roc_auc_score(y_val, prediction_proba[:, 1])
            else:
                auc_roc = roc_auc_score(y_val, prediction_proba, multi_class='ovr', average='weighted')
        except Exception as exc:
            print(f"  Aviso: AUC-ROC não pôde ser calculada ({exc}); registrando 0.0")
            auc_roc = 0.0

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("auc_roc", auc_roc)

    # Pick the flavor-specific logger lazily so only the requested flavor is touched.
    if kind == "catboost":
        native_log_model = mlflow.catboost.log_model
    elif kind == "xgboost":
        native_log_model = mlflow.xgboost.log_model
    elif kind == "lightgbm":
        native_log_model = mlflow.lightgbm.log_model
    else:
        native_log_model = None

    if native_log_model is not None:
        classifier = pipeline.named_steps['classifier']
        # The bare classifier consumes what the earlier pipeline steps emit, not
        # the raw validation frame, so the signature and the input example must
        # be built from the transformed features to match the logged artifact.
        model_input = pipeline[:-1].transform(X_val) if len(pipeline.steps) > 1 else X_val
        signature = infer_signature(model_input, predictions)
        native_log_model(classifier, model_name, signature=signature, input_example=model_input[:5])
    else:
        signature = infer_signature(X_val, predictions)
        mlflow.sklearn.log_model(pipeline, model_name, signature=signature, input_example=X_val[:5])

    print(f"Modelo {model_name} avaliado:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  AUC-ROC: {auc_roc:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc_roc': auc_roc
    }


In [None]:
with mlflow.start_run(run_name="Logistic Regression - Credit Score"):
    # Full pipeline: shared preprocessing followed by the logistic-regression classifier.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
    ])

    # Search space: regularisation strength and solver.
    param_grid = {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__solver': ['liblinear', 'lbfgs'],
    }

    # Exhaustive 5-fold grid search scored on accuracy, using all cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Pipeline refitted with the winning parameter combination.
    best_pipeline = grid_search.best_estimator_

    # Record the winning hyper-parameters on the run.
    best_params = grid_search.best_params_
    mlflow.log_param("best_C", best_params['classifier__C'])
    mlflow.log_param("best_solver", best_params['classifier__solver'])

    # Score on the validation split and log metrics plus the model.
    metrics = evaluate_and_log_classification_model(
        kind="sklearn",
        model_name="logistic_regression",
        pipeline=best_pipeline,
        X_val=X_val,
        y_val=y_val,
    )


In [None]:
with mlflow.start_run(run_name="Random Forest - Credit Score"):
    # Full pipeline: shared preprocessing followed by the random-forest classifier.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])

    # Search space: forest size, tree depth and minimum samples needed to split.
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None],
        'classifier__min_samples_split': [2, 5],
    }

    # Exhaustive 3-fold grid search scored on accuracy, using all cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Pipeline refitted with the winning parameter combination.
    best_pipeline = grid_search.best_estimator_

    # Record the winning hyper-parameters on the run.
    best_params = grid_search.best_params_
    mlflow.log_param("best_n_estimators", best_params['classifier__n_estimators'])
    mlflow.log_param("best_max_depth", best_params['classifier__max_depth'])
    mlflow.log_param("best_min_samples_split", best_params['classifier__min_samples_split'])

    # Score on the validation split and log metrics plus the model.
    metrics = evaluate_and_log_classification_model(
        kind="sklearn",
        model_name="random_forest",
        pipeline=best_pipeline,
        X_val=X_val,
        y_val=y_val,
    )


In [None]:
with mlflow.start_run(run_name="XGBoost - Credit Score"):
    # Recent XGBoost releases require integer class labels 0..n_classes-1 and
    # reject other label values at fit time, so the target is encoded for this
    # run only. The encoder is fitted on the training labels and reused for the
    # validation labels so both share the same mapping.
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)

    # Keep the class order on the run so integer predictions can be decoded later
    # (position i in this list is the original label for encoded class i).
    mlflow.log_param("label_classes", ",".join(str(label) for label in label_encoder.classes_))

    # Full pipeline: shared preprocessing followed by the XGBoost classifier.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(random_state=42, eval_metric='mlogloss'))
    ])

    # Search space: number of trees, tree depth and learning rate.
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 6],
        'classifier__learning_rate': [0.1, 0.2]
    }

    # Exhaustive 3-fold grid search scored on accuracy, using all cores.
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=3,
        scoring='accuracy', n_jobs=-1
    )
    grid_search.fit(X_train, y_train_encoded)

    # Pipeline refitted with the winning parameter combination.
    best_pipeline = grid_search.best_estimator_

    # Record the winning hyper-parameters on the run.
    mlflow.log_param("best_n_estimators", grid_search.best_params_['classifier__n_estimators'])
    mlflow.log_param("best_max_depth", grid_search.best_params_['classifier__max_depth'])
    mlflow.log_param("best_learning_rate", grid_search.best_params_['classifier__learning_rate'])

    # Evaluate against the encoded validation labels: the model predicts integers,
    # so comparing with the raw labels would score every prediction as wrong.
    metrics = evaluate_and_log_classification_model("xgboost", "xgboost_classifier", best_pipeline, X_val, y_val_encoded)


In [None]:
with mlflow.start_run(run_name="LightGBM - Credit Score"):
    # Full pipeline: shared preprocessing followed by the LightGBM classifier.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', lgb.LGBMClassifier(verbose=-1, random_state=42)),
    ])

    # Search space: number of trees, tree depth, learning rate and leaves per tree.
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 6],
        'classifier__learning_rate': [0.1, 0.2],
        'classifier__num_leaves': [31, 50],
    }

    # Exhaustive 3-fold grid search scored on accuracy, using all cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Pipeline refitted with the winning parameter combination.
    best_pipeline = grid_search.best_estimator_

    # Record the winning hyper-parameters on the run.
    best_params = grid_search.best_params_
    mlflow.log_param("best_n_estimators", best_params['classifier__n_estimators'])
    mlflow.log_param("best_max_depth", best_params['classifier__max_depth'])
    mlflow.log_param("best_learning_rate", best_params['classifier__learning_rate'])
    mlflow.log_param("best_num_leaves", best_params['classifier__num_leaves'])

    # Score on the validation split and log metrics plus the model.
    metrics = evaluate_and_log_classification_model(
        kind="lightgbm",
        model_name="lightgbm_classifier",
        pipeline=best_pipeline,
        X_val=X_val,
        y_val=y_val,
    )


In [None]:
# From here the best model can be loaded from MLflow and evaluated on the test set.
# Example of how to prepare the test data once it is available:

# X_test = test_processed[features]
# # If the test set includes the target column:
# # y_test = test_processed['Credit_Score']

print("Desenvolvimento de modelos concluído!")
print("Verifique os resultados no MLflow UI para comparar os modelos.")
