# Scripts del Proyecto

Script 1: Preparación de Datos para el Entrenamiento

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw credit-card transactions used to build the training set.
df = pd.read_csv(filepath_or_buffer="../data/raw/credit_card_fraud_data.csv")

In [None]:
# Remove exact duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Scale the Time and Amount columns with RobustScaler; the remaining columns
# are left untouched. The scaler is fitted on this (training) data.
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
robust_scaler.fit(df[['Time', 'Amount']])
df[['Time', 'Amount']] = robust_scaler.transform(df[['Time', 'Amount']])

In [None]:
# Separate the target column (Class) from the feature columns.
y = df['Class']
X = df.drop(columns=['Class'])

In [None]:
# Balance the classes by randomly under-sampling; the fixed seed keeps the
# selected rows reproducible between runs.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = rus.fit_resample(X, y)

In [None]:
df_under = pd.concat([X_resampled, y_resampled], axis=0)

In [None]:
#Guardar la data transformada
df.to_csv("../data/processed/credit_card_fraud_train.csv")

Script 2: Código de Entrenamiento

In [None]:
# Load the processed training data written by the preparation script.
df = pd.read_csv(filepath_or_buffer="../data/processed/credit_card_fraud_train.csv")

In [None]:
from sklearn.model_selection import train_test_split
# The whole processed file is used for training (no hold-out split is made
# here): every column except Class is a feature, Class is the target.
# NOTE(review): the CSV was written with its index, so the features include
# the resulting 'Unnamed: 0' column.
y_train = df['Class']
X_train = df.drop(columns=['Class'])

In [None]:
import os
import pickle
from xgboost import XGBClassifier
# Make sure the directory that will hold the trained models exists;
# exist_ok=True makes this a no-op when it is already there.
model_dir = "../models"
os.makedirs(name=model_dir, exist_ok=True)

In [None]:
# Train an XGBoost classifier on the full training set, using log-loss as the
# evaluation metric.
# NOTE(review): `use_label_encoder` looks deprecated in recent XGBoost
# releases - confirm against the installed version.
xgb_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)
xgb_model.fit(X_train, y_train)

In [None]:
# Persist the trained model. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises.
filename = '../models/best_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

Script 3: Preparación de Datos de Validación

In [None]:
# Load the raw data set used to evaluate the trained model.
df = pd.read_csv(filepath_or_buffer="../data/raw/testing_data.csv")

In [None]:
# Remove exact duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Scale Time and Amount with the statistics of the TRAINING data. Fitting a
# fresh scaler on the evaluation set would scale these features differently
# from what the model saw during training.
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
# Reproduce the training-time fit: same raw file, same de-duplication step.
# Persisting the fitted scaler next to the model would avoid re-reading it.
train_reference = pd.read_csv("../data/raw/credit_card_fraud_data.csv").drop_duplicates()
robust_scaler.fit(train_reference[['Time', 'Amount']])
df[['Time', 'Amount']] = robust_scaler.transform(df[['Time', 'Amount']])

In [None]:
# Separate the target column (Class) from the feature columns.
y = df['Class']
X = df.drop(columns=['Class'])

In [None]:
# Randomly under-sample the evaluation data with a fixed seed.
# NOTE(review): balancing the evaluation set changes the class ratio the
# metrics are computed on - confirm this is intended.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = rus.fit_resample(X, y)

In [None]:
# Re-attach the target as a column. Features and labels must be joined side
# by side (axis=1); axis=0 would stack the labels underneath the feature rows
# and leave NaNs in Class, which breaks the downstream metrics.
df_under = pd.concat([X_resampled, y_resampled], axis=1)

In [None]:
# Save the transformed evaluation data (the index is written as well).
df_under.to_csv(path_or_buf="../data/processed/credit_card_fraud_testing.csv")

Script 4: Código de Validación

In [None]:
import pandas as pd
import xgboost as xgb
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import *

In [None]:
# Load the processed evaluation data and split it into features and target.
# y_test is kept as a one-column DataFrame (double brackets).
df = pd.read_csv(filepath_or_buffer="../data/processed/credit_card_fraud_testing.csv")
y_test = df[['Class']]
X_test = df.drop(columns=['Class'])

In [None]:
# Load the trained model. The context manager closes the file handle as soon
# as unpickling finishes.
# NOTE: pickle executes arbitrary code on load - only open model files that
# were produced by this project.
filename = '../models/best_model.pkl'
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_and_save_confusion_matrix(model, X_test, y_test, save_path="../models/confusion_matrix_bestmodel.png"):
    """Plot the confusion matrix of ``model`` on the test data and save it.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        save_path: Path of the image file to write.

    Returns:
        None. The figure is written to ``save_path`` and left open.
    """
    predictions = model.predict(X_test)
    matrix = confusion_matrix(y_test, predictions)

    figure, axes = plt.subplots(figsize=(6, 5))
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(cmap="Blues", ax=axes)
    axes.set_title("Matriz de Confusión", fontsize=14)

    plt.savefig(save_path, dpi=300, bbox_inches="tight")



In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_and_save_roc_auc(model, X_test, y_test, save_path="../models/roc_curve_bestmodel.png"):
    """Plot the ROC curve of ``model`` on the test data and save it.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Features passed to ``model.predict_proba``.
        y_test: True binary labels.
        save_path: Path of the image file to write.

    Returns:
        None. The figure is written to ``save_path`` and left open.
    """
    # Column 1 of predict_proba is used as the score of the positive class.
    positive_scores = model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    _, axes = plt.subplots(figsize=(6, 5))
    axes.plot(false_pos_rate, true_pos_rate, color="blue", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
    # Diagonal reference line.
    axes.plot([0, 1], [0, 1], color="gray", linestyle="--")

    axes.set_xlabel("False Positive Rate (FPR)")
    axes.set_ylabel("True Positive Rate (TPR)")
    axes.set_title("Curva ROC-AUC")
    axes.legend(loc="lower right")

    plt.savefig(save_path, dpi=300, bbox_inches="tight")


In [None]:
# Produce both evaluation artefacts using each function's default save path.
plot_and_save_confusion_matrix(model=model, X_test=X_test, y_test=y_test)
plot_and_save_roc_auc(model=model, X_test=X_test, y_test=y_test)

Script 5: Preparación de Datos de Score (Automatización)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw data set that is going to be scored.
df = pd.read_csv(filepath_or_buffer="../data/raw/validation_data.csv")

In [None]:
# Remove exact duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Scale Time and Amount with the statistics of the TRAINING data. Fitting a
# fresh scaler on each scoring batch would scale these features differently
# from what the model saw during training, and differently per batch.
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
# Reproduce the training-time fit: same raw file, same de-duplication step.
# Persisting the fitted scaler next to the model would avoid re-reading it.
train_reference = pd.read_csv("../data/raw/credit_card_fraud_data.csv").drop_duplicates()
robust_scaler.fit(train_reference[['Time', 'Amount']])
df[['Time', 'Amount']] = robust_scaler.transform(df[['Time', 'Amount']])

In [None]:
# Save the prepared scoring data (the index is written as well).
df.to_csv(path_or_buf="../data/processed/credit_card_fraud_score.csv")

Script 6: Código de Scoring (Automatización)

In [None]:
import pandas as pd
import xgboost as xgb
import pickle

In [None]:
# Load the prepared scoring data.
df = pd.read_csv(filepath_or_buffer="../data/processed/credit_card_fraud_score.csv")

In [None]:
# Load the trained model. The context manager closes the file handle as soon
# as unpickling finishes.
# NOTE: pickle executes arbitrary code on load - only open model files that
# were produced by this project.
filename = '../models/best_model.pkl'
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict with the loaded model; every column of df is passed as a feature.
# NOTE(review): the columns must match those used at training time - confirm
# the scoring file carries no extra columns (e.g. Class).
scores=model.predict(df)

In [None]:
import os

# Wrap the predictions in a single-column frame. `columns` must be list-like;
# passing the bare string raises a TypeError.
df_score = pd.DataFrame(scores, columns=['Prediction_Fraud'])

# Create the output directory if it is missing so the save cannot fail on a
# fresh checkout.
score_path = '../data/scores/final_score.csv'
os.makedirs(os.path.dirname(score_path), exist_ok=True)
df_score.to_csv(score_path)