In [None]:
# =============================================================================
# IMPORTAÇÃO DAS BIBLIOTECAS
# =============================================================================

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from scipy.stats import ks_2samp
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
# =============================================================================
# FUNÇÕES AUXILIARES
# =============================================================================

def calcular_ks(y_true, y_pred):
    """
    Compute the two-sample Kolmogorov-Smirnov (KS) statistic between the
    predicted scores of the two classes.

    Args:
        y_true: Array-like with the observed labels (0 or 1).
        y_pred: Array-like with the predicted probabilities (between 0 and 1).

    Returns:
        The KS statistic: the largest gap between the empirical score
        distributions of the rows labelled 0 and the rows labelled 1.
    """
    # Pair each score with its label in one frame so both inputs are aligned.
    scores = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
    label_is_zero = scores['y_true'] == 0
    label_is_one = scores['y_true'] == 1

    scores_zero = scores.loc[label_is_zero, 'y_pred']
    scores_one = scores.loc[label_is_one, 'y_pred']
    return ks_2samp(scores_zero, scores_one).statistic

def calcular_gini(y_true, y_pred):
    """
    Compute the Gini coefficient from the observed labels and predicted scores.

    Args:
        y_true: Array-like with the observed labels (0 or 1).
        y_pred: Array-like with the predicted probabilities (between 0 and 1).

    Returns:
        The Gini coefficient, derived from the ROC AUC as ``2 * AUC - 1``.
    """
    return 2 * roc_auc_score(y_true, y_pred) - 1

def fill_missing_values(df, column, filler_values):
    """
    Fill the missing values (NaN) of one DataFrame column with a list of
    values, spreading those values evenly over the missing entries.

    The missing entries are consumed in row order: every filler except the
    last receives ``nan_count // len(filler_values)`` entries, and the last
    filler receives whatever is still missing. The DataFrame is modified in
    place (the column is reassigned) and the remaining NaN count is printed.

    Args:
        df (pandas.DataFrame): The DataFrame to modify.
        column (str): Name of the column to fill.
        filler_values (list): Values used to replace the NaNs.
    """
    nan_count = int(df[column].isnull().sum())
    n_fillers = len(filler_values)

    if nan_count > 0 and n_fillers > 0:
        share = nan_count // n_fillers
        filled = df[column]

        for i, filler in enumerate(filler_values):
            # The last filler takes every entry that is still missing.
            limit = share if i < n_fillers - 1 else None
            if limit == 0:
                # Fewer NaNs than fillers: fillna() rejects limit=0, so this
                # filler simply gets no entries.
                continue
            filled = filled.fillna(filler, limit=limit)

        # Assign the result back to the frame. Calling fillna(inplace=True) on
        # the `df[column]` selection is chained assignment, which does not
        # update `df` under pandas copy-on-write.
        df[column] = filled

    print(f"Valores NaN restantes na coluna '{column}': {df[column].isnull().sum()}")

def plot_roc_curve(y_true, y_pred_proba):
    """Plot the ROC curve of the predicted scores, with its AUC in the legend."""
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots()
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label='ROC curve (area = %0.2f)' % roc_auc,
    )
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Taxa de Falsos Positivos')
    ax.set_ylabel('Taxa de Verdadeiros Positivos')
    ax.set_title('Curva ROC')
    ax.legend(loc="lower right")
    plt.show()

def evaluate_credit_model(y_true, y_pred, y_pred_proba):
    """Report credit-scoring metrics for a fitted classifier.

    Prints the KS statistic and Gini coefficient computed from the predicted
    probabilities, prints the classification report computed from the hard
    predictions, and finally draws the ROC curve.

    Args:
        y_true: Observed labels.
        y_pred: Predicted class labels.
        y_pred_proba: Predicted probabilities used for KS, Gini and the ROC curve.
    """
    # Both ranking metrics are computed before anything is printed.
    ks_value = calcular_ks(y_true, y_pred_proba)
    gini_value = calcular_gini(y_true, y_pred_proba)

    print(f"KS Score: {ks_value:.4f}")
    print(f"Gini Index: {gini_value:.4f}")

    report = classification_report(y_true, y_pred)
    print("\nClassification Report:")
    print(report)

    plot_roc_curve(y_true, y_pred_proba)

def select_important_features(model, feature_names, threshold=0.1):
    """Select features whose absolute model coefficient exceeds a threshold.

    Args:
        model: Fitted estimator exposing ``coef_``; only its first row is used.
        feature_names: Feature names, in the same order as the coefficients.
        threshold: Features are kept only when their absolute coefficient is
            strictly greater than this value.

    Returns:
        A tuple ``(selected_features, feature_importance)``: the list of kept
        feature names ordered by decreasing importance, and a DataFrame with
        columns ``feature`` and ``importance`` sorted the same way.
    """
    ranking = pd.DataFrame(
        {'feature': feature_names, 'importance': abs(model.coef_[0])}
    ).sort_values('importance', ascending=False)

    above_threshold = ranking['importance'] > threshold
    kept = ranking.loc[above_threshold, 'feature'].tolist()

    return kept, ranking

In [None]:
# =============================================================================
# PREPARAÇÃO E TRATAMENTO DE DADOS
# =============================================================================

# Load the raw data
df = pd.read_csv('/content/drive/MyDrive/DATA_VIKING/credit_risk.csv')
print("Dados carregados. Shape:", df.shape)

# Initial inspection
print("\nPrimeiras linhas:")
print(df.head())

print("\nInformações do DataFrame:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line.
df.info()

# Missing-value treatment
# NaN in 'saving_accounts' is replaced by 'little'. The result is assigned
# back to the column: fillna(inplace=True) on a `df[col]` selection is chained
# assignment and does not update `df` under pandas copy-on-write.
df['saving_accounts'] = df['saving_accounts'].fillna('little')
# NaN in 'checking_account' is split between 'little' and 'moderate'
fill_missing_values(df, 'checking_account', ['little', 'moderate'])

# Encode the binary categorical variables.
# NOTE(review): Series.map yields NaN for any value missing from the mapping;
# confirm 'risk' and 'sex' contain only the keys listed here.
df['risk'] = df['risk'].map({'good': 1, 'bad': 0})
df['sex'] = df['sex'].map({'male': 1, 'female': 0})

# Time features: 'reference' is split on '-' once; the first piece is used as
# the year and the second as the month.
reference_parts = df['reference'].str.split('-')
df['month'] = reference_parts.str[1].astype(int)
df['year'] = reference_parts.str[0].astype(int)

# Drop columns that are not used as model inputs
df = df.drop(columns=['Unnamed: 0', 'cpf', 'income', 'reference'])

# One-hot encode the remaining object-typed columns
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df = pd.get_dummies(df, columns=[column], drop_first=True, dtype=int)

# Split into features and target
df_x = df.drop(columns=['risk'])
df_y = df['risk']

In [None]:
# =============================================================================
# MODELO I - REGRESSÃO LOGÍSTICA BÁSICA
# =============================================================================

print("\n" + "="*50)
print("MODELO I - REGRESSÃO LOGÍSTICA BÁSICA")
print("="*50)

# Hold-out split. Stratifying on the target keeps the class proportions of the
# train and test partitions equal to those of the full dataset, so the test
# metrics are not skewed by an unlucky draw of the rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.2, random_state=42, stratify=df_y
)

# Model training. The features are not scaled in this cell, so the iteration
# budget is raised above the default of 100 to give the solver room to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions: hard labels plus the probability of class 1
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
y_pred_proba_1 = y_pred_proba[:, 1]

# Evaluation
print("Acurácia:", accuracy_score(y_test, y_pred))
print("\nRelatório de Classificação:")
print(classification_report(y_test, y_pred))
print("\nMatriz de Confusão:")
print(confusion_matrix(y_test, y_pred))

# Credit-scoring metrics
ks = calcular_ks(y_test, y_pred_proba_1)
gini = calcular_gini(y_test, y_pred_proba_1)
print(f"\nKS: {ks:.4f}")
print(f"GINI: {gini:.4f}")

# ROC curve
plot_roc_curve(y_test, y_pred_proba_1)

# Model coefficients, largest first
coef_df = pd.DataFrame({
    'Feature': df_x.columns,
    'Coefficient': model.coef_[0]
}).sort_values('Coefficient', ascending=False)
print("\nCoeficientes do modelo:")
print(coef_df)

# Cross-validation over the full dataset (cross_val_score fits clones of
# `model`, so the fitted `model` above is left untouched)
cv_scores = cross_val_score(model, df_x, df_y, cv=5)
print("\nScores da validação cruzada:", cv_scores)
print("Média da validação cruzada:", cv_scores.mean())

# Grid search over regularisation strength, penalty and solver
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("\nMelhores parâmetros:", grid_search.best_params_)
print("Melhor score:", grid_search.best_score_)

In [None]:
# =============================================================================
# TRATAMENTO DE DESBALANCEAMENTO
# =============================================================================

print("\n" + "="*50, "TRATAMENTO DE DESBALANCEAMENTO", "="*50, sep="\n")

# Resampling pipeline: SMOTE over-sampling followed by random under-sampling.
# NOTE(review): both samplers use their default sampling strategy; if SMOTE
# already equalises the classes, the under-sampler has nothing left to remove.
# Confirm this combination is intended.
balancing_pipeline = ImbPipeline(
    steps=[
        ('smote', SMOTE(random_state=42)),
        ('undersampling', RandomUnderSampler(random_state=42)),
    ]
)

# Resample the training partition only; the test partition is not passed in.
X_train_balanced, y_train_balanced = balancing_pipeline.fit_resample(
    X_train, y_train
)

In [None]:
# =============================================================================
# MODELO II - COM BALANCEAMENTO
# =============================================================================

print("\n" + "="*50)
print("MODELO II - COM BALANCEAMENTO")
print("="*50)

# Train on the whole resampled training set and evaluate on the original,
# untouched hold-out set. Splitting the resampled data again would put
# SMOTE-generated synthetic rows (interpolated from training rows) into the
# "test" partition, which leaks training information and inflates every metric.
X_train_bal, y_train_bal = X_train_balanced, y_train_balanced
X_test_bal, y_test_bal = X_test, y_test

# Train the model on the balanced data
model_bal = LogisticRegression(class_weight='balanced')
model_bal.fit(X_train_bal, y_train_bal)

# Predictions: hard labels plus the probability of class 1
y_pred_bal = model_bal.predict(X_test_bal)
y_pred_proba_bal = model_bal.predict_proba(X_test_bal)[:, 1]

# Evaluation
print("accuracy_score:", accuracy_score(y_test_bal, y_pred_bal))
print("precision_score:", precision_score(y_test_bal, y_pred_bal))
print("recall_score:", recall_score(y_test_bal, y_pred_bal))
print("f1_score:", f1_score(y_test_bal, y_pred_bal))
print("roc_auc_score:", roc_auc_score(y_test_bal, y_pred_proba_bal))
print("\nconfusion_matrix:\n", confusion_matrix(y_test_bal, y_pred_bal))

# Credit-scoring metrics
ks_bal = calcular_ks(y_test_bal, y_pred_proba_bal)
gini_bal = calcular_gini(y_test_bal, y_pred_proba_bal)
print(f"\nKS (balanceado): {ks_bal:.4f}")
print(f"GINI (balanceado): {gini_bal:.4f}")

In [None]:
# =============================================================================
# FEATURE ENGINEERING E SELEÇÃO
# =============================================================================

print("\n" + "="*50, "FEATURE ENGINEERING E SELEÇÃO", "="*50, sep="\n")

# Feature names of the balanced training frame, shared by both tables below
feature_names = list(X_train_balanced.columns)

# Signed coefficients of the balanced model, largest first
coef_bal_df = pd.DataFrame(
    {'Feature': feature_names, 'Coefficient': model_bal.coef_[0]}
).sort_values('Coefficient', ascending=False)

print("Coeficientes do modelo balanceado:")
print(coef_bal_df)

# Keep the features whose absolute coefficient is above the threshold.
# NOTE(review): model_bal is fitted on unscaled features in the visible cells,
# so coefficient magnitudes depend on each feature's units — confirm a fixed
# 0.1 cut-off is appropriate.
selected_features, feature_importance = select_important_features(
    model_bal, feature_names, threshold=0.1
)

print(f"\nFeatures selecionadas ({len(selected_features)}):")
print(selected_features)

# Balanced training data restricted to the selected features
X_reduced = X_train_balanced[selected_features]
y_reduced = y_train_balanced

In [None]:
# =============================================================================
# MODELO III - COM FEATURES SELECIONADAS
# =============================================================================

print("\n" + "="*50)
print("MODELO III - COM FEATURES SELECIONADAS")
print("="*50)

# Train on the full resampled, feature-reduced training set and evaluate on
# the original hold-out set restricted to the same columns. Re-splitting the
# resampled data would evaluate the model on SMOTE-generated synthetic rows
# derived from training rows, which inflates the reported metrics.
X_train_red, y_train_red = X_reduced, y_reduced
X_test_red = X_test[selected_features]
y_test_red = y_test

# Train the model on the selected features
model_red = LogisticRegression()
model_red.fit(X_train_red, y_train_red)

# Predictions: hard labels plus the probability of class 1
y_pred_red = model_red.predict(X_test_red)
y_pred_proba_red = model_red.predict_proba(X_test_red)
y_pred_proba_1_red = y_pred_proba_red[:, 1]

# Evaluation
print("Acurácia:", accuracy_score(y_test_red, y_pred_red))
print("\nRelatório de Classificação:")
print(classification_report(y_test_red, y_pred_red))
print("\nMatriz de Confusão:")
print(confusion_matrix(y_test_red, y_pred_red))

# Credit-scoring metrics
ks_red = calcular_ks(y_test_red, y_pred_proba_1_red)
gini_red = calcular_gini(y_test_red, y_pred_proba_1_red)
print(f"\nKS (features selecionadas): {ks_red:.4f}")
print(f"GINI (features selecionadas): {gini_red:.4f}")

In [None]:
# =============================================================================
# PIPELINE COMPLETO COM FEATURE ENGINEERING
# =============================================================================

print("\n" + "="*50, "PIPELINE COMPLETO COM FEATURE ENGINEERING", "="*50, sep="\n")

# Every feature column is treated as numeric and standardised
numeric_features = df_x.columns.tolist()

# Preprocessing step: standard scaling applied to the listed columns
preprocessor = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), numeric_features)]
)

# Full pipeline: preprocessing followed by logistic regression
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

# Hyper-parameter grid; the `classifier__` prefix routes each entry to the
# pipeline step named 'classifier'
param_grid_complete = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__class_weight': ['balanced', None],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__penalty': ['l1', 'l2'],
}

# Cross-validated grid search: three metrics are recorded, and the final
# refit / best_* attributes are chosen by ROC AUC
grid_search_complete = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid_complete,
    cv=5,
    scoring=['accuracy', 'f1', 'roc_auc'],
    refit='roc_auc',
)

# Fit the search on the training partition
grid_search_complete.fit(X_train, y_train)

print("Melhores parâmetros (pipeline completo):", grid_search_complete.best_params_)
print("Melhor score (pipeline completo):", grid_search_complete.best_score_)

In [None]:
# =============================================================================
# VALIDAÇÃO COM K-FOLD
# =============================================================================

print("\n" + "="*50, "VALIDAÇÃO COM K-FOLD", "="*50, sep="\n")

# Shuffled 5-fold split with a fixed seed so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ROC AUC of the pipeline on each fold of the full dataset.
# NOTE(review): this scores `model_pipeline` as defined (default classifier
# settings), not the tuned estimator found by the grid search — confirm intent.
cv_scores_kf = cross_val_score(
    model_pipeline,
    df_x,
    df_y,
    scoring='roc_auc',
    cv=kf,
)

print("Scores K-Fold:", cv_scores_kf)
print("Média K-Fold:", cv_scores_kf.mean())
print("Desvio padrão K-Fold:", cv_scores_kf.std())

In [None]:
# =============================================================================
# RESULTADOS FINAIS
# =============================================================================

print("\n" + "="*50)
print("RESULTADOS FINAIS COMPARATIVOS")
print("="*50)

# Score the tuned pipeline on the hold-out set so its row is comparable with
# the others. grid_search_complete.best_score_ is a mean cross-validation
# score over the training data, not a hold-out metric, and using it left the
# KS and GINI cells of that row empty.
y_pred_proba_pipeline = grid_search_complete.predict_proba(X_test)[:, 1]

resultados = pd.DataFrame({
    'Modelo': ['Básico', 'Balanceado', 'Features Selecionadas', 'Pipeline Completo'],
    'AUC': [
        roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
        roc_auc_score(y_test_bal, model_bal.predict_proba(X_test_bal)[:, 1]),
        roc_auc_score(y_test_red, model_red.predict_proba(X_test_red)[:, 1]),
        roc_auc_score(y_test, y_pred_proba_pipeline)
    ],
    'KS': [ks, ks_bal, ks_red, calcular_ks(y_test, y_pred_proba_pipeline)],
    'GINI': [gini, gini_bal, gini_red, calcular_gini(y_test, y_pred_proba_pipeline)]
})

print(resultados)

print("\nAnálise concluída!")