In [25]:
import kagglehub
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning emitted while fitting
# the estimators below (e.g. non-convergence) will not be shown — confirm that
# this is intended.
warnings.filterwarnings('ignore')


In [3]:
# Fetch the dataset through kagglehub and list the files it contains.
DATASET_HANDLE = "anurag629/credit-card-fraud-transaction-data"

path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)
os.listdir(path)

Path to dataset files: /Users/sofia/.cache/kagglehub/datasets/anurag629/credit-card-fraud-transaction-data/versions/1


['CreditCardData.csv']

In [13]:
# Read the transactions CSV from the downloaded dataset folder and keep only
# its first 1000 rows for the experiments below.
csv_path = os.path.join(path, "CreditCardData.csv")
df = pd.read_csv(csv_path).head(1000)

In [14]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean and encode the raw transactions frame.

    Steps, in order:
      1. drop every row containing a missing value and renumber the index;
      2. drop the identifier / date / address columns;
      3. strip the '£' symbol from 'Amount' and cast it to float;
      4. one-hot encode the categorical columns (first level dropped);
      5. map 'Day of Week' names to integers, Monday=0 ... Sunday=6
         (names outside that list become NaN).

    The input frame is not modified; a new frame is returned.
    """
    unused_columns = ['Transaction ID',
                      'Date',
                      'Shipping Address',
                      'Country of Residence']
    categorical_columns = ['Merchant Group',
                           'Type of Card',
                           'Bank',
                           'Gender',
                           'Country of Transaction',
                           'Entry Mode',
                           'Type of Transaction']
    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday']
    weekday_to_index = {name: position
                        for position, name in enumerate(weekday_names)}

    cleaned = (
        data.dropna()
        .reset_index(drop=True)
        .drop(columns=unused_columns)
    )
    cleaned['Amount'] = (
        cleaned['Amount'].replace('£', '', regex=True).astype(float)
    )

    encoded = pd.get_dummies(cleaned,
                             columns=categorical_columns,
                             drop_first=True)
    encoded['Day of Week'] = encoded['Day of Week'].map(weekday_to_index)

    return encoded


def get_target(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split ``data`` into the feature frame and the 'Fraud' target column.

    Returns ``(X, y)`` where ``X`` is ``data`` without the 'Fraud' column and
    ``y`` is the 'Fraud' column itself. Raises ``KeyError`` if the column is
    absent.
    """
    target_column = 'Fraud'
    features = data.drop(columns=[target_column])
    target = data[target_column]
    return features, target

def scaler(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with its float64/int64 columns standardized.

    Columns of any other dtype (e.g. the boolean one-hot columns) are copied
    through unchanged. The input frame is never modified.

    If ``X`` has no rows or no float64/int64 columns there is nothing to
    standardize and the plain copy is returned; ``StandardScaler`` would
    otherwise raise on such an input.

    Note: the scaling statistics are fit on every row of ``X``. When the
    result is later split into train/validation folds, the held-out rows have
    therefore contributed to the mean and standard deviation.
    """
    columns_to_scale = X.select_dtypes(include=['float64', 'int64']).columns
    X_scaled = X.copy()

    # Nothing to fit: avoid handing StandardScaler an empty selection.
    if X.empty or len(columns_to_scale) == 0:
        return X_scaled

    # Local name deliberately differs from the function name to avoid
    # shadowing it inside the body.
    standardizer = StandardScaler()
    X_scaled[columns_to_scale] = standardizer.fit_transform(X[columns_to_scale])

    return X_scaled

In [15]:
# Build the modelling inputs from the loaded sample: clean/encode it, split off
# the 'Fraud' target, then standardize the float64/int64 feature columns.
data = preprocess_data(df)
X, y = get_target(data)
X_scaled = scaler(X)
# Last expression of the cell: render the scaled feature frame.
X_scaled

Unnamed: 0,Day of Week,Time,Amount,Age,Merchant Group_Electronics,Merchant Group_Entertainment,Merchant Group_Fashion,Merchant Group_Food,Merchant Group_Gaming,Merchant Group_Products,...,Bank_RBS,Gender_M,Country of Transaction_India,Country of Transaction_Russia,Country of Transaction_USA,Country of Transaction_United Kingdom,Entry Mode_PIN,Entry Mode_Tap,Type of Transaction_Online,Type of Transaction_POS
0,0.966509,0.824984,-0.816659,-2.056738,False,True,False,False,False,False,...,True,True,False,False,False,True,False,True,False,True
1,0.966509,0.462920,1.505351,0.403693,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,True
2,0.966509,-0.080177,-0.816659,-0.342503,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,True
3,-1.026416,-0.080177,-0.627944,0.544865,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,False,True
4,-1.026416,1.549112,-0.111030,-0.766020,True,False,False,False,False,False,...,False,True,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0.966509,0.824984,-0.709994,-1.189537,True,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
994,0.966509,1.187048,0.922798,-1.028197,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
995,-1.026416,-0.623273,-0.792044,0.716289,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
996,0.966509,-1.347401,1.997650,0.141516,False,False,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False


# Regresión Logística

In [None]:
def regresion_logistica(X: pd.DataFrame, y: pd.Series, kfolds: KFold, scoring: dict) -> dict:
    """Cross-validate a default ``LogisticRegression`` on ``(X, y)``.

    The model is wrapped in a single-step ``Pipeline`` and evaluated with
    ``cross_validate`` using the supplied splitter and scoring mapping.

    Returns the dictionary produced by ``cross_validate``.
    """
    estimator = Pipeline(steps=[('model', LogisticRegression())])
    return cross_validate(estimator, X, y, cv=kfolds, scoring=scoring)

In [21]:
# Evaluate a default logistic regression with 10-fold cross-validation and
# store the mean/std of accuracy and ROC AUC in `results`.
pipeline_logistic = Pipeline(steps=[
    ('model', LogisticRegression())
])

# No explicit `.fit` on the full data here: `cross_validate` clones the
# estimator and fits a fresh copy on each training fold, so a prior fit is
# discarded work and does not affect the scores.

# NOTE(review): `X_scaled` was standardized using all rows before the folds are
# split, so each validation fold influenced the scaling statistics. Consider
# moving the scaling step inside the Pipeline.
# NOTE(review): plain KFold does not stratify on `y`; if fraud cases are rare,
# StratifiedKFold may give more stable ROC AUC estimates — confirm.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

results = []
scoring_metrics = {
    'accuracy': 'accuracy',
    'roc_auc_': 'roc_auc'
}

scores = cross_validate(
    estimator=pipeline_logistic,
    X=X_scaled,
    y=y,
    cv=kfolds,
    scoring=scoring_metrics,
    return_train_score=False
)

# cross_validate prefixes every scoring key with 'test_'.
results.append({
    'model': 'Logistic Regression',
    'accuracy_mean': scores['test_accuracy'].mean(),
    'accuracy_std': scores['test_accuracy'].std(),
    'roc_auc_mean': scores['test_roc_auc_'].mean(),
    'roc_auc_std': scores['test_roc_auc_'].std()
})

In [22]:
# Display the logistic-regression summary (a one-element list of dicts).
results

[{'model': 'Logistic Regression',
  'accuracy_mean': np.float64(0.9248484848484848),
  'accuracy_std': np.float64(0.020639900911629223),
  'roc_auc_mean': np.float64(0.8687345103615023),
  'roc_auc_std': np.float64(0.05296624532548086)}]

# Máquina de soporte vectorial con kernel RBF

In [20]:
# Evaluate an RBF-kernel SVC with the same 10-fold split and store the
# mean/std of accuracy and ROC AUC in `results_svc`.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

scoring = {
    'accuracy': 'accuracy',
    'auc_ovr': 'roc_auc_ovr'
}

results_svc = []

kernel = 'rbf'

# Named for what it holds (an SVC pipeline) instead of reusing the
# logistic-regression pipeline's name.
pipeline_svc = Pipeline(steps=[
    ('model', SVC(
        C=1.0,
        kernel=kernel,
        probability=True,
        random_state=42
    ))
])

# No explicit `.fit` on the full data: `cross_validate` clones the estimator
# and fits it per fold, so a prior fit would only add runtime.
# NOTE(review): `X_scaled` was standardized on all rows before splitting, so
# validation folds influenced the scaling statistics.
scores_svc = cross_validate(
    estimator=pipeline_svc,
    X=X_scaled,
    y=y,
    cv=kfolds,
    scoring=scoring,
    return_train_score=False
)

# cross_validate prefixes every scoring key with 'test_'.
results_svc.append({
    'model': f'SVC Kernel={kernel}',
    'accuracy_mean': scores_svc['test_accuracy'].mean(),
    'accuracy_std': scores_svc['test_accuracy'].std(),
    'roc_auc_mean': scores_svc['test_auc_ovr'].mean(),
    'roc_auc_std': scores_svc['test_auc_ovr'].std()
})

results_svc

[{'model': 'SVC Kernel=rbf',
  'accuracy_mean': np.float64(0.9418989898989899),
  'accuracy_std': np.float64(0.022244791110684833),
  'roc_auc_mean': np.float64(0.8324170260684067),
  'roc_auc_std': np.float64(0.07744577194535571)}]

# Multi-layer Perceptron

In [26]:
# Evaluate a two-hidden-layer MLP with the same 10-fold split and store the
# mean/std of accuracy and ROC AUC in `results_mlp`.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

scoring = {
    'accuracy': 'accuracy',
    'auc_ovr': 'roc_auc_ovr'
}

results_mlp = []

pipeline_mlp = Pipeline(steps=[
    ('model', MLPClassifier(
        hidden_layer_sizes=(25, 17),
        activation='relu',
        solver='adam',
        max_iter=200,
        random_state=42
    ))
])

# No explicit `.fit` on the full data: `cross_validate` clones the estimator
# and fits it per fold, so a prior fit would only add runtime.
# NOTE(review): `X_scaled` was standardized on all rows before splitting, so
# validation folds influenced the scaling statistics.
scores_mlp = cross_validate(
    estimator=pipeline_mlp,
    X=X_scaled,
    y=y,
    cv=kfolds,
    scoring=scoring,
    return_train_score=False
)

# cross_validate prefixes every scoring key with 'test_'.
results_mlp.append({
    'model': 'MLP Classifier',
    'accuracy_mean': scores_mlp['test_accuracy'].mean(),
    'accuracy_std': scores_mlp['test_accuracy'].std(),
    'roc_auc_mean': scores_mlp['test_auc_ovr'].mean(),
    'roc_auc_std': scores_mlp['test_auc_ovr'].std()
})


In [27]:
# Display the MLP summary (a one-element list of dicts).
results_mlp

[{'model': 'MLP Classifier',
  'accuracy_mean': np.float64(0.9248383838383839),
  'accuracy_std': np.float64(0.029467693577874567),
  'roc_auc_mean': np.float64(0.8642479637730197),
  'roc_auc_std': np.float64(0.0509584611106095)}]

In [28]:
# Stack the per-model summary records into one table, one row per model.
all_results = [*results, *results_svc, *results_mlp]
comparison_df = pd.DataFrame(all_results)
comparison_df

Unnamed: 0,model,accuracy_mean,accuracy_std,roc_auc_mean,roc_auc_std
0,Logistic Regression,0.924848,0.02064,0.868735,0.052966
1,SVC Kernel=rbf,0.941899,0.022245,0.832417,0.077446
2,MLP Classifier,0.924838,0.029468,0.864248,0.050958
