In [19]:
import kagglehub
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import RBF
import numpy as np
from sklearn.model_selection import cross_val_score


In [20]:
path = kagglehub.dataset_download("anurag629/credit-card-fraud-transaction-data")
print("Path to dataset files:", path)
os.listdir(path)

Path to dataset files: /Users/sofia/.cache/kagglehub/datasets/anurag629/credit-card-fraud-transaction-data/versions/1


['CreditCardData.csv']

In [None]:
df = pd.read_csv(os.path.join(path, "CreditCardData.csv"))

In [22]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    data = data.dropna().reset_index(drop=True)
    data = data.drop(columns=['Transaction ID',
                              'Date',
                              'Shipping Address',
                              'Country of Residence'])
    data['Amount'] = data['Amount'].replace('£', '', regex=True).astype(float)
    data = pd.get_dummies(data, columns=['Merchant Group',
                                         'Type of Card',
                                         'Bank',
                                         'Gender',
                                         'Country of Transaction',
                                         'Entry Mode',
                                         'Type of Transaction'], drop_first=True)
    data['Day of Week'] = data['Day of Week'].map({
        'Monday': 0,
        'Tuesday': 1,
        'Wednesday': 2,
        'Thursday': 3,
        'Friday': 4,
        'Saturday': 5,
        'Sunday': 6
    })

    return data


def get_target(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    X = data.drop(columns=['Fraud'])
    y = data['Fraud']
    return X, y

def scaler(X: pd.DataFrame) -> pd.DataFrame:
    scaler = StandardScaler()

    columns_to_scale = X.select_dtypes(include=['float64', 'int64']).columns
    X_scaled = X.copy()
    X_scaled[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

    return X_scaled

In [23]:
data = preprocess_data(df)
X, y = get_target(data)
X_scaled = scaler(X)
X_scaled

Unnamed: 0,Day of Week,Time,Amount,Age,Merchant Group_Electronics,Merchant Group_Entertainment,Merchant Group_Fashion,Merchant Group_Food,Merchant Group_Gaming,Merchant Group_Products,...,Bank_RBS,Gender_M,Country of Transaction_India,Country of Transaction_Russia,Country of Transaction_USA,Country of Transaction_United Kingdom,Entry Mode_PIN,Entry Mode_Tap,Type of Transaction_Online,Type of Transaction_POS
0,0.966509,0.824984,-0.816659,-2.056738,False,True,False,False,False,False,...,True,True,False,False,False,True,False,True,False,True
1,0.966509,0.462920,1.505351,0.403693,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,True
2,0.966509,-0.080177,-0.816659,-0.342503,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,True
3,-1.026416,-0.080177,-0.627944,0.544865,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
4,-1.026416,1.549112,-0.111030,-0.766020,True,False,False,False,False,False,...,False,True,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0.966509,0.824984,-0.709994,-1.189537,True,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
994,0.966509,1.187048,0.922798,-1.028197,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
995,-1.026416,-0.623273,-0.792044,0.716289,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
996,0.966509,-1.347401,1.997650,0.141516,False,False,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False


# Regresión Logística

In [24]:
def regresion_logistica(X: pd.DataFrame, y: pd.Series, kfolds: KFold, C: float) -> tuple:
    pipeline = Pipeline(steps=[
        ('model', LogisticRegression(C=C, random_state=42, max_iter=1000))
    ])

    scoring_metrics = {
        'accuracy': 'accuracy',
        'roc_auc': 'roc_auc'
    }

    scores = cross_validate(
        estimator=pipeline,
        X=X,
        y=y,
        cv=kfolds,
        scoring=scoring_metrics,
        return_train_score=False
    )

    results = []
    results.append({
        'model': 'Logistic Regression',
        'accuracy_mean': scores['test_accuracy'].mean(),
        'accuracy_std': scores['test_accuracy'].std(),
        'roc_auc_mean': scores['test_roc_auc'].mean(),
        'roc_auc_std': scores['test_roc_auc'].std()
    })

    roc_auc_mean = scores['test_roc_auc'].mean()
    return roc_auc_mean, results

In [25]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

_, results = regresion_logistica(X_scaled, y, kfolds, C=1.0)
results

[{'model': 'Logistic Regression',
  'accuracy_mean': np.float64(0.9248484848484848),
  'accuracy_std': np.float64(0.020639900911629223),
  'roc_auc_mean': np.float64(0.8687345103615023),
  'roc_auc_std': np.float64(0.05296624532548086)}]

In [26]:
n_iter = 30

# Inicializamos puntos para GPR (log10(C)) y sus scores
gaussian_x = np.random.uniform(-3, 3, size=(5, 1))  # log10(C) entre 1e-3 y 1e3
gaussian_y = []

# Evaluamos puntos iniciales
for c_log in gaussian_x.flatten():
    C = 10 ** c_log
    auc, _ = regresion_logistica(X_scaled, y, kfolds, C)
    gaussian_y.append(auc)

gaussian_y = np.array(gaussian_y)

# Loop de optimización bayesiana
for i in range(n_iter):
    # Ajuste del GPR
    kernel = 1.0 * RBF(length_scale=1.0)
    gp = GPR(kernel=kernel, n_restarts_optimizer=5, random_state=42)
    gp.fit(gaussian_x, gaussian_y)

    # Muestra candidatos para log10(C)
    x_candidates = np.random.uniform(-3, 3, size=(1000, 1))
    y_pred, y_std = gp.predict(x_candidates, return_std=True)

    # Upper Confidence Bound (UCB)
    y_upper = y_pred + 1.96 * y_std
    max_idx = np.argmax(y_upper)
    xn = x_candidates[max_idx, 0]

    # Evaluar Logistic Regression con este C
    C_new = 10 ** xn
    auc_new, _ = regresion_logistica(X_scaled, y, kfolds, C_new)

    # Agregar al dataset
    gaussian_x = np.vstack([gaussian_x, [[xn]]])
    gaussian_y = np.append(gaussian_y, auc_new)

# Mejor C encontrado
best_idx = np.argmax(gaussian_y)
best_log_c = gaussian_x[best_idx, 0]
best_c = 10 ** best_log_c
best_score_lr = gaussian_y[best_idx]

print(f"Best C: {best_c:.5f} with ROC AUC: {best_score_lr:.4f}")



Best C: 1.07511 with ROC AUC: 0.8693


# Máquina de soporte vectorial con kernel RBF

In [27]:
def sup_vector_machine(X: pd.DataFrame, y: pd.Series, kfolds: KFold, C: float) -> dict:
    pipeline = Pipeline(steps=[
    ('model', SVC(C=C, kernel='rbf', random_state=42, probability=True))
    ])

    scoring_metrics = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc' 
    }
    
    scores = cross_validate(
        estimator=pipeline,
        X=X,
        y=y,
        cv=kfolds,
        scoring=scoring_metrics,
        return_train_score=False
        )
    results = []

    results.append({
    'model': 'Support Vector Machine',
    'accuracy_mean': scores['test_accuracy'].mean(),
    'accuracy_std': scores['test_accuracy'].std(),
    'roc_auc_mean': scores['test_roc_auc'].mean(),
    'roc_auc_std': scores['test_roc_auc'].std()
    })
    
    roc_auc_mean = scores['test_roc_auc'].mean()

    return roc_auc_mean, results

In [28]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

_, results_svc = sup_vector_machine(X_scaled, y, kfolds, C=1.0)
results_svc

[{'model': 'Support Vector Machine',
  'accuracy_mean': np.float64(0.9418989898989899),
  'accuracy_std': np.float64(0.022244791110684833),
  'roc_auc_mean': np.float64(0.8326351798282271),
  'roc_auc_std': np.float64(0.0775648334286751)}]

In [None]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

n_iter = 15

gaussian_x = np.random.uniform(-3, 3, size=(5, 1)) 
gaussian_y = []

for c_log in gaussian_x.flatten():
    C = 10 ** c_log
    auc, _ = sup_vector_machine(X_scaled, y, kfolds, C)
    gaussian_y.append(auc)

gaussian_y = np.array(gaussian_y)

for i in range(n_iter):
    kernel = 1.0 * RBF(length_scale=1.0)
    gp = GPR(kernel=kernel, n_restarts_optimizer=5, random_state=42)
    gp.fit(gaussian_x, gaussian_y)

    x_candidates = np.random.uniform(-3, 3, size=(1000, 1))
    y_pred, y_std = gp.predict(x_candidates, return_std=True)

    y_upper = y_pred + 1.96 * y_std
    max_idx = np.argmax(y_upper)
    xn = x_candidates[max_idx, 0]

    C_new = 10 ** xn
    auc_new, _ = sup_vector_machine(X_scaled, y, kfolds, C_new)

    gaussian_x = np.vstack([gaussian_x, [[xn]]])
    gaussian_y = np.append(gaussian_y, auc_new)

best_idx = np.argmax(gaussian_y)
best_log_c = gaussian_x[best_idx, 0]
best_c = 10 ** best_log_c
best_score_svc = gaussian_y[best_idx]


print(f"Best C: {best_c:.5f} with ROC AUC: {best_score_svc:.4f}")

Best C: 4.06828 with ROC AUC: 0.8520


# Multi-layer Perceptron

In [30]:
def mlp_classifier(X: pd.DataFrame, y: pd.Series, kfolds: KFold,
                   hidden_layer_sizes=(25,17), activation='relu', solver='adam',
                   max_iter=200, random_state=42) -> tuple:
    pipeline = Pipeline(steps=[
        ('model', MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            max_iter=max_iter,
            random_state=random_state
        ))
    ])
    
    scoring_metrics = {
        'accuracy': 'accuracy',
        'roc_auc': 'roc_auc'
    }
    
    scores = cross_validate(
        estimator=pipeline,
        X=X,
        y=y,
        cv=kfolds,
        scoring=scoring_metrics,
        return_train_score=False
    )
    
    results = []
    results.append({
        'model': 'MLP Classifier',
        'accuracy_mean': scores['test_accuracy'].mean(),
        'accuracy_std': scores['test_accuracy'].std(),
        'roc_auc_mean': scores['test_roc_auc'].mean(),
        'roc_auc_std': scores['test_roc_auc'].std()
    })
    
    roc_auc_mean = scores['test_roc_auc'].mean()
    
    return roc_auc_mean, results

In [31]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

_, results_mlp = mlp_classifier(X_scaled, y, kfolds)
results_mlp

[{'model': 'MLP Classifier',
  'accuracy_mean': np.float64(0.9248383838383839),
  'accuracy_std': np.float64(0.029467693577874567),
  'roc_auc_mean': np.float64(0.8642479637730197),
  'roc_auc_std': np.float64(0.0509584611106095)}]

In [None]:
n_iter = 15

gaussian_x = np.random.uniform(-5, 0, size=(5, 1)) 
gaussian_y = []

def mlp_evaluate_alpha(X, y, kfolds, alpha):
    pipeline = Pipeline([
        ('model', MLPClassifier(hidden_layer_sizes=(25,17),
                                activation='relu',
                                solver='adam',
                                max_iter=200,
                                alpha=alpha,
                                random_state=42))
    ])
    scoring = {'accuracy':'accuracy','roc_auc':'roc_auc'}

    scores = cross_validate(pipeline, X, y, cv=kfolds, scoring=scoring, return_train_score=False)
    return scores['test_roc_auc'].mean()

for log_alpha in gaussian_x.flatten():
    alpha = 10 ** log_alpha
    auc = mlp_evaluate_alpha(X_scaled, y, kfolds, alpha)
    gaussian_y.append(auc)

gaussian_y = np.array(gaussian_y)

for i in range(n_iter):
    kernel = 1.0 * RBF(length_scale=1.0)
    gp = GPR(kernel=kernel, n_restarts_optimizer=5, random_state=42)
    gp.fit(gaussian_x, gaussian_y)

    x_candidates = np.random.uniform(-5, 0, size=(1000, 1))
    y_pred, y_std = gp.predict(x_candidates, return_std=True)

    y_upper = y_pred + 1.96 * y_std
    max_idx = np.argmax(y_upper)
    xn = x_candidates[max_idx, 0]

    alpha_new = 10 ** xn
    auc_new = mlp_evaluate_alpha(X_scaled, y, kfolds, alpha_new)

    gaussian_x = np.vstack([gaussian_x, [[xn]]])
    gaussian_y = np.append(gaussian_y, auc_new)

best_idx = np.argmax(gaussian_y)
best_log_alpha = gaussian_x[best_idx, 0]
best_alpha = 10 ** best_log_alpha
best_score_mlp = gaussian_y[best_idx]

print(f"Best alpha: {best_alpha:.5f} with ROC AUC: {best_score_mlp:.4f}")

Best alpha: 0.99791 with ROC AUC: 0.9002


In [33]:
comparison_df = pd.DataFrame(results + results_svc + results_mlp)
comparison_df

Unnamed: 0,model,accuracy_mean,accuracy_std,roc_auc_mean,roc_auc_std
0,Logistic Regression,0.924848,0.02064,0.868735,0.052966
1,Support Vector Machine,0.941899,0.022245,0.832635,0.077565
2,MLP Classifier,0.924838,0.029468,0.864248,0.050958


In [35]:
optim_comparison_df = pd.DataFrame({
    "ROC AUC": [best_score_lr, best_score_svc, best_score_mlp]
}, index=["Logistic Regression", "Support Vector Machine", "MLP Classifier"])

optim_comparison_df

Unnamed: 0,ROC AUC
Logistic Regression,0.869258
Support Vector Machine,0.852023
MLP Classifier,0.900168
