In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt




In [8]:
import mlflow

# Point MLflow at the local SQLite tracking store and select the experiment
# that runs from this notebook are recorded under.
experiment_name = "model-experiment-v1"
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name)

# Look the experiment up again by name to confirm it is visible in the store.
experiment = mlflow.get_experiment_by_name(experiment_name)

if not experiment:
    print("The experiment doesn't exist")
else:
    print(f"ID: {experiment.experiment_id}, Name: {experiment.name}")

ID: 1, Name: model-experiment-v1


In [8]:
# Load the processed dataset from its parquet file using the fastparquet engine.
processed_data_path = '../../data/processed/df_final.parquet'
df = pd.read_parquet(processed_data_path, engine='fastparquet')

In [10]:
# Show which MLflow release is installed in this kernel.
installed_mlflow_version = mlflow.__version__
print(installed_mlflow_version)


2.14.3


In [7]:
# List every experiment known to the configured tracking store.
# An AttributeError raised while querying or printing is reported instead of
# stopping the notebook.
try:
    experiments = mlflow.search_experiments()
    for exp in experiments:
        exp_id, exp_name = exp.experiment_id, exp.name
        print(f"ID: {exp_id}, Name: {exp_name}")
except AttributeError as e:
    print(f"Error: {e}")

ID: 2, Name: test-experiment
ID: 1, Name: model-experiment-v1
ID: 0, Name: Default


In [None]:
# Separate the target column ('fraud') from the remaining feature columns.
y = df['fraud']
X = df.drop(columns='fraud')

In [None]:
# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize the features. The training and evaluation cells below read
# X_train_scaled / X_test_scaled, so they must be defined here.
# The scaler is fitted on the training split only, so no statistics from the
# test split leak into training; the test split is transformed with the same
# training statistics.
# NOTE(review): assumes every column of X is numeric — confirm against the data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate classifiers to compare, keyed by a display name.
# Tree-based models get a fixed seed so repeated runs give the same result.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
])

In [None]:
# Fit each candidate model on the scaled training data and collect its
# evaluation metrics on the scaled test data, keyed by model name.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)

    # Hard class predictions, plus the predicted probability of the second
    # class (column index 1), which the ROC metrics use.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    results[name] = dict(
        classification_report=classification_report(y_test, y_pred),
        confusion_matrix=confusion_matrix(y_test, y_pred),
        roc_auc_score=roc_auc_score(y_test, y_prob),
        roc_curve=roc_curve(y_test, y_prob),
    )

In [None]:
# Hyperparameter grid for the Random Forest: 3 * 4 * 3 * 3 = 108 combinations.
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 3-fold cross-validation, scored by ROC AUC and run
# on all available cores (n_jobs=-1).
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=3,
    n_jobs=-1,
    scoring='roc_auc',
)
grid_search_rf.fit(X_train_scaled, y_train)

In [None]:
# Take the best Random Forest found by the grid search.
# GridSearchCV was built without overriding `refit`, whose default (True)
# refits the best parameter combination on the whole training set, so
# best_estimator_ is already trained and needs no second fit here.
best_rf_model = grid_search_rf.best_estimator_

# Predict on the held-out test set: hard labels, plus the predicted
# probability of the second class (column index 1) for the ROC AUC.
y_pred_best_rf = best_rf_model.predict(X_test_scaled)
y_prob_best_rf = best_rf_model.predict_proba(X_test_scaled)[:, 1]

print("Random Forest Best Model Classification Report:")
print(classification_report(y_test, y_pred_best_rf))

print("Random Forest Best Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best_rf))

roc_auc_best_rf = roc_auc_score(y_test, y_prob_best_rf)
print(f"Random Forest Best Model ROC AUC Score: {roc_auc_best_rf:.2f}")