In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [4]:
# Import data
from ucimlrepo import fetch_ucirepo

# Fetch the dataset with id 94 and keep the returned object in `spambase`
# (presumably the UCI Spambase dataset, going by the variable name -- confirm)
spambase = fetch_ucirepo(id=94)

# # metadata
# print(spambase.metadata)

# # variable information
# print(spambase.variables)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature table and target table as exposed by the fetched dataset object
X = spambase.data.features
y = spambase.data.targets

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Una vez que los datos estén preparados, a continuación se definen los modelos de clasificación y las métricas de evaluación.


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
import xgboost as xgb

# Candidate classifiers, keyed by the label used when reporting results.
# Every estimator shares the same seed so runs are repeatable.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["XGBoost"] = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

# Scoring functions, keyed by the label used when reporting results
metrics = dict(
    zip(
        ("Accuracy", "Precision", "Recall"),
        (accuracy_score, precision_score, recall_score),
    )
)


Ahora, entrenamos y evaluamos los modelos utilizando un loop:

In [9]:
# Scores per model: {model name: {metric name: score}}
results = {}

# Flatten the training targets once; the same 1-D array is reused for every model
y_train_flat = y_train.values.ravel()

for model_name, model in models.items():
    # Train the model and predict on the held-out split
    model.fit(X_train, y_train_flat)
    y_pred = model.predict(X_test)

    # precision_score and recall_score already default to average='binary',
    # so every metric can be called the same way without special-casing
    results[model_name] = {
        metric_name: metric(y_test, y_pred)
        for metric_name, metric in metrics.items()
    }

    # Report this model's scores
    print(f"{model_name} Results:")
    for metric_name, score in results[model_name].items():
        print(f"  {metric_name}: {score:.4f}")
    print("\n")

Logistic Regression Results:
  Accuracy: 0.9225
  Precision: 0.9336
  Recall: 0.8769


Random Forest Results:
  Accuracy: 0.9566
  Precision: 0.9675
  Recall: 0.9272


XGBoost Results:
  Accuracy: 0.9587
  Precision: 0.9610
  Recall: 0.9393




Guardar el modelo para ser utilizado en producción.

In [11]:
import joblib

# Train the XGBoost model that will be shipped
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train.values.ravel())

# The model was trained on standardized features, so the fitted scaler must be
# persisted as well: production inputs have to go through the same transform
# before being passed to the model.
joblib.dump(scaler, 'xgb_spam_scaler.pkl')

# Save the model last so the cell's displayed value remains the model path
joblib.dump(xgb_model, 'xgb_spam_model.pkl')


['xgb_spam_model.pkl']