# Proyecto 2

## Diego Franco - 20240

In [3]:
import pandas as pd

#Lee el archivo CSV
data = pd.read_csv('BalanceData.csv')

In [4]:
from sklearn.preprocessing import LabelEncoder

# Eliminar la columna 'trans_date_trans_time'
data.drop(columns=['trans_date_trans_time', 'transaction_time'], inplace=True)

# Codificar variables categóricas con label encoding
label_encoder = LabelEncoder()
categorical_cols = ['merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job']
for col in categorical_cols:
    data[col + '_encoded'] = label_encoder.fit_transform(data[col])

# Eliminar columnas originales no numéricas y otras columnas irrelevantes
data.drop(columns=['merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num'], inplace=True)



In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 

# Dividir el dataset en conjunto de entrenamiento, conjunto de validación (dev) y conjunto de prueba
X_train, X_test, y_train, y_test = train_test_split(data.drop('is_fraud', axis=1), data['is_fraud'], test_size=0.2, random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Imputar valores faltantes con la estrategia de relleno con el valor medio
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_dev_imputed = imputer.transform(X_dev)


## Creacion de los modelos

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer 
import joblib

from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def evaluate_model(model, X, y):
    # Predecir probabilidades
    y_pred_proba = model.predict_proba(X)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X)
    
    # Calcular métricas
    roc_auc = roc_auc_score(y, y_pred_proba)
    precision = precision_score(y, model.predict(X))
    recall = recall_score(y, model.predict(X))
    f1 = f1_score(y, model.predict(X))
    
    return roc_auc, precision, recall, f1

# Inicializar los modelos
ann_model = MLPClassifier()
lgb_model = lgb.LGBMClassifier()
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(probability=False)  # Habilitar la predicción de probabilidades para SVM

# Diccionario de modelos
models = {
    #"ANN": ann_model,
    #"LightGBM": lgb_model,
    #"XGBoost": xgb_model,
    #"Random Forest": rf_model,
    "SVM": svm_model
}

# Imputar valores faltantes con la estrategia de relleno con el valor medio
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_dev_imputed = imputer.transform(X_dev)

# Diccionario para almacenar métricas antes y después del entrenamiento incremental
metrics_before_incremental = {}
metrics_after_incremental = {}

# Iterar sobre cada modelo
for model_name, model in models.items():

    print(f"Entrenando modelo {model_name}...")
    # Entrenamiento inicial
    model.fit(X_train_imputed, y_train)  # Utilizar X_train_imputed

    # Guardar el modelo antes del entrenamiento incremental
    initial_model_filename = f"{model_name}_initial_model.pkl"
    joblib.dump(model, initial_model_filename)
    print(f"Modelo {model_name} guardado exitosamente.")
    
    # Evaluación inicial
    roc_auc_before, precision_before, recall_before, f1_before = evaluate_model(model, X_dev_imputed, y_dev)  # Utilizar X_dev_imputed
    metrics_before_incremental[model_name] = (roc_auc_before, precision_before, recall_before, f1_before)
    
    # Entrenamiento incremental con nuevos datos nunca antes utilizados por el modelo
    X_new_batch, _, y_new_batch, _ = train_test_split(data.drop('is_fraud', axis=1), data['is_fraud'], test_size=0.1, random_state=42)
    X_new_batch_imputed = imputer.transform(X_new_batch)
    model.partial_fit(X_new_batch_imputed, y_new_batch)

    # Guardar el modelo después del entrenamiento incremental
    incremental_model_filename = f"{model_name}_incremental_model.pkl"
    joblib.dump(model, incremental_model_filename)
    print(f"Modelo {model_name} guardado exitosamente.")
    
    # Evaluación después del entrenamiento incremental
    roc_auc_after, precision_after, recall_after, f1_after = evaluate_model(model, X_dev_imputed, y_dev)  # Utilizar X_dev_imputed
    metrics_after_incremental[model_name] = (roc_auc_after, precision_after, recall_after, f1_after)

    print(f"Modelo {model_name} entrenado y evaluado exitosamente.")
    print()
    print("Antes del entrenamiento incremental:")
    print(f"ROC-AUC: {roc_auc_before}, Precisión: {precision_before}, Recall: {recall_before}, F1-score: {f1_before}")
    print("Después del entrenamiento incremental:")
    print(f"ROC-AUC: {roc_auc_after}, Precisión: {precision_after}, Recall: {recall_after}, F1-score: {f1_after}")

# Imprimir las métricas antes y después del entrenamiento incremental para cada modelo
for model_name, (roc_auc_before, precision_before, recall_before, f1_before), (roc_auc_after, precision_after, recall_after, f1_after) in zip(metrics_before_incremental.keys(), metrics_before_incremental.values(), metrics_after_incremental.values()):
    print(f"Modelo: {model_name}")
    print("Antes del entrenamiento incremental:")
    print(f"ROC-AUC: {roc_auc_before}, Precisión: {precision_before}, Recall: {recall_before}, F1-score: {f1_before}")
    print("Después del entrenamiento incremental:")
    print(f"ROC-AUC: {roc_auc_after}, Precisión: {precision_after}, Recall: {recall_after}, F1-score: {f1_after}")
    print()


Entrenando modelo SVM...


In [17]:
import joblib
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np


def evaluate_model(model, X, y):
    # Predecir probabilidades
    y_pred_proba = model.predict_proba(X)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X)
    
    # Calcular métricas
    roc_auc = roc_auc_score(y, y_pred_proba)
    precision = precision_score(y, model.predict(X))
    recall = recall_score(y, model.predict(X))
    f1 = f1_score(y, model.predict(X))
    
    return roc_auc, precision, recall, f1

# Cargar el modelo
model = joblib.load('XGBoost_initial_model.pkl')

roc_auc_before, precision_before, recall_before, f1_before = evaluate_model(model, X_dev_imputed, y_dev)  # Utilizar X_dev_imputed

print(f"ROC-AUC: {roc_auc_before}, Precisión: {precision_before}, Recall: {recall_before}, F1-score: {f1_before}")

# Entrenamiento incremental con nuevos datos nunca antes utilizados por el modelo
X_new_batch, _, y_new_batch, _ = train_test_split(data.drop('is_fraud', axis=1), data['is_fraud'], test_size=0.1, random_state=42)
X_new_batch_imputed = imputer.transform(X_new_batch)

# Combinar datos originales con nuevos datos
X_combined = np.concatenate((X_dev_imputed, X_new_batch_imputed), axis=0)
y_combined = np.concatenate((y_dev, y_new_batch), axis=0)

# Reentrenar el modelo con los datos combinados
model.fit(X_combined, y_combined)

# Evaluar el modelo actualizado
roc_auc_after, precision_after, recall_after, f1_after = evaluate_model(model, X_dev_imputed, y_dev)
print(f"Después del reentrenamiento incremental:")
print(f"ROC-AUC: {roc_auc_after}, Precisión: {precision_after}, Recall: {recall_after}, F1-score: {f1_after}")




ROC-AUC: 0.999940938211968, Precisión: 0.9970716435232042, Recall: 1.0, F1-score: 0.9985336748001541
Después del reentrenamiento incremental:
ROC-AUC: 0.9999737131241117, Precisión: 0.9978930877069772, Recall: 1.0, F1-score: 0.9989454329133092


In [12]:
import numpy as np
import joblib
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer

def evaluate_model(model, X, y, threshold=0.5):
    # Predecir probabilidades
    y_pred_proba = model.predict(X)

    # Convertir probabilidades a etiquetas binarias
    y_pred = (y_pred_proba >= threshold).astype(int)

    # Calcular métricas
    roc_auc = roc_auc_score(y, y_pred_proba)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    return roc_auc, precision, recall, f1

# Definir hiperparámetros
learning_rate = 0.01
epochs = 100

# Definir el modelo SVM lineal en TensorFlow
class LinearSVM(tf.keras.Model):
    def __init__(self):
        super(LinearSVM, self).__init__()
        self.dense = tf.keras.layers.Dense(1, activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.01))

    def call(self, inputs):
        return self.dense(inputs)

# Convertir los datos a tensores de TensorFlow
X_train_tensor = tf.constant(X_train_imputed, dtype=tf.float32)
y_train_tensor = tf.constant(y_train.values.reshape(-1, 1), dtype=tf.float32)

# Crear el modelo SVM lineal
model = LinearSVM()

# Definir la función de pérdida (hinge loss)
def hinge_loss(y_true, y_pred):
    return tf.reduce_mean(tf.maximum(0., 1. - y_true * y_pred))

# Definir el optimizador
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

# Función de entrenamiento
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        loss = hinge_loss(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Entrenamiento del modelo
for epoch in range(epochs):
    loss = train_step(X_train_tensor, y_train_tensor)
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.numpy()}')


roc_auc_before, precision_before, recall_before, f1_before = evaluate_model(model, X_dev_imputed, y_dev)  # Utilizar X_dev_imputed

print(f"ROC-AUC: {roc_auc_before}, Precisión: {precision_before}, Recall: {recall_before}, F1-score: {f1_before}")

initial_model_filename = "SVM_initial_model.pkl"
joblib.dump(model, initial_model_filename)

# Entrenamiento incremental con nuevos datos nunca antes utilizados por el modelo
X_new_batch, _, y_new_batch, _ = train_test_split(data.drop('is_fraud', axis=1), data['is_fraud'], test_size=0.1, random_state=42)
X_new_batch_imputed = imputer.transform(X_new_batch)

# Combinar datos originales con nuevos datos
X_combined = np.concatenate((X_dev_imputed, X_new_batch_imputed), axis=0)
y_combined = np.concatenate((y_dev, y_new_batch), axis=0)

# Compilar el modelo con un optimizador y una función de pérdida
model.compile(optimizer='sgd', loss='hinge')

# Reentrenar el modelo con los datos combinados
model.fit(X_combined, y_combined)

# Evaluar el modelo actualizado
roc_auc_after, precision_after, recall_after, f1_after = evaluate_model(model, X_dev_imputed, y_dev)
print("Después del reentrenamiento incremental:")
print(f"ROC-AUC: {roc_auc_after}, Precisión: {precision_after}, Recall: {recall_after}, F1-score: {f1_after}")

initial_model_filename = "SVM_incremental_model.pkl"
joblib.dump(model, initial_model_filename)

Epoch 0, Loss: 0.5002959966659546
Epoch 10, Loss: 0.5002959966659546
Epoch 20, Loss: 0.5002959966659546
Epoch 30, Loss: 0.5002959966659546
Epoch 40, Loss: 0.5002959966659546
Epoch 50, Loss: 0.5002959966659546
Epoch 60, Loss: 0.5002959966659546
Epoch 70, Loss: 0.5002959966659546
Epoch 80, Loss: 0.5002959966659546
Epoch 90, Loss: 0.5002959966659546
ROC-AUC: 0.49299079804473644, Precisión: 0.5011955677505351, Recall: 1.0, F1-score: 0.6677285471892926
Después del reentrenamiento incremental:
ROC-AUC: 0.5069611277327026, Precisión: 0.0, Recall: 0.0, F1-score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


['SVM_incremental_model.pkl']