In [3]:
from sklearn.model_selection import train_test_split

# Name of the binary label column that the models predict.
TARGET_COL = "adopted"

# 1) Drop non-predictive or string-valued columns
drop_cols = [
    "Animal ID", "Name_intake", "DateTime_intake", "DateTime_outcome",
    "MonthYear_intake", "MonthYear_outcome", "Found Location",
    "Intake Type", "Intake Condition", "Age upon Intake", "Date of Birth",
    "Name_outcome", "Outcome Subtype", "Sex upon Intake", "Sex upon Outcome",
    "Age upon Outcome", "Breed_intake", "Color_intake", "Breed_outcome", "Color_outcome",
    "stay_weekday_intake", "stay_weekday_outcome", "intake_season"
]

# errors="ignore" skips any listed name that is absent from df.
df_model = df.drop(columns=drop_cols, errors="ignore")

# 2) Keep numeric columns only.
# The target is removed explicitly: if `adopted` is stored as a number it would
# otherwise pass the dtype filter and end up among the features (label leakage).
# NOTE(review): other outcome-derived numeric columns may leak the label as
# well -- confirm against the full column list.
X = (
    df_model
    .select_dtypes(include=["number"])
    .drop(columns=[TARGET_COL], errors="ignore")
)
y = df[TARGET_COL]

print("üìå Features finales:", X.shape)
print("üìå Target:", y.name)

# 3) Train/test split (70/30), stratified on the target and seeded so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


üìå Features finales: (3261, 18)
üìå Target: adopted


In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier fitted on the training split.
# max_iter is set to 1000 explicitly; `fit` returns the estimator itself,
# so construction and training can be chained.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
logreg



ValueError: could not convert string to float: '*Ximena'

In [6]:
# Check whether any non-numeric (e.g. string) columns are still present in X.
non_numeric_cols = X.select_dtypes(exclude=["number"]).columns
print("‚ùå Columnas no num√©ricas detectadas:", list(non_numeric_cols))

# For each offending column, show up to ten distinct values as a sample.
for col in non_numeric_cols:
    sample_values = X[col].unique()[:10]
    print(f"\nColumna {col}:")
    print(sample_values)


‚ùå Columnas no num√©ricas detectadas: ['Name_intake', 'MonthYear_intake', 'Found Location', 'Intake Type', 'Intake Condition', 'Animal Type_intake', 'Sex upon Intake', 'Age upon Intake', 'Breed_intake', 'Color_intake', 'sex_intake', 'status_intake', 'Date of Birth', 'Name_outcome', 'MonthYear_outcome', 'Outcome Subtype', 'Animal Type_outcome', 'Sex upon Outcome', 'Age upon Outcome', 'Breed_outcome', 'Color_outcome', 'sex_outcome', 'status_outcome', 'stay_weekday_intake', 'stay_weekday_outcome', 'age_category_intake', 'stay_category', 'intake_season']

Columna Name_intake:
['Scamp' 'Evette' 'Cinnamon' 'Dugan' '*Commander' 'Izzie' 'Johnnie'
 'Lilah' 'Whiskey' 'Fozzy Bear']

Columna MonthYear_intake:
['March 2014' 'October 2015' 'May 2017' 'April 2015' 'June 2017'
 'March 2017' 'November 2015' 'July 2014' 'August 2016' 'January 2017']

Columna Found Location:
['8700 Research in Austin (TX)' 'Austin (TX)'
 '9401 S 1St St in Austin (TX)' 'W Parmer Ln & Mcneil Dr in Austin (TX)'
 '7201 Leva

In [7]:
from sklearn.preprocessing import LabelEncoder

# 1) Columns to drop before encoding
drop_cols = [
    "Name_intake", "Name_outcome",
    "MonthYear_intake", "MonthYear_outcome",
    "Date of Birth", "Found Location",
    "Age upon Intake", "Age upon Outcome"
]

# errors="ignore" skips names that are not in X, which also keeps the cell
# safe to re-run after the columns are already gone.
X = X.drop(columns=drop_cols, errors="ignore")

# 2) Label-encode the remaining categorical (object-dtype) columns
categorical_cols = X.select_dtypes(include=["object"]).columns

# One fitted encoder per column, so every code -> category mapping stays
# available afterwards (e.g. label_encoders[col].inverse_transform(...)).
# Refitting a single shared instance would keep only the last column's mapping.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # astype(str) gives the encoder uniform string input; missing values
    # therefore become an ordinary string category.
    X[col] = encoder.fit_transform(X[col].astype(str))
    label_encoders[col] = encoder

# NOTE(review): this cell only rebinds X. X_train / X_test created by an earlier
# split are not refreshed here -- re-run the train/test split before fitting.
# NOTE(review): several encoded columns describe the outcome (e.g.
# "Outcome Subtype", "status_outcome") and presumably leak the target -- confirm
# before using them as features.

print("‚úÖ Columnas codificadas:", categorical_cols.tolist())
print("üìÇ Shape final de X:", X.shape)


‚úÖ Columnas codificadas: ['Intake Type', 'Intake Condition', 'Animal Type_intake', 'Sex upon Intake', 'Breed_intake', 'Color_intake', 'sex_intake', 'status_intake', 'Outcome Subtype', 'Animal Type_outcome', 'Sex upon Outcome', 'Breed_outcome', 'Color_outcome', 'sex_outcome', 'status_outcome', 'stay_weekday_intake', 'stay_weekday_outcome', 'age_category_intake', 'stay_category', 'intake_season']
üìÇ Shape final de X: (3261, 37)


In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    RocCurveDisplay
)

# -----------------------------
# 1) Compute metrics for both models
# -----------------------------
# `logreg` and `rf` must already be fitted by earlier cells; on a fresh kernel
# this cell raises NameError until those cells have been run.
models = {
    "Logistic Regression": logreg,
    "Random Forest": rf
}

results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the score of the second class in
    # model.classes_, used here as the positive-class score for ROC-AUC.
    y_prob = model.predict_proba(X_test)[:, 1]

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_prob)
    })

df_results = pd.DataFrame(results)
print("üìä Comparaci√≥n de m√©tricas:\n", df_results)

# -----------------------------
# 2) Comparison chart (grouped bars, one group per model)
# -----------------------------
df_results.set_index("Model")[["Accuracy","Precision","Recall","F1-score","ROC-AUC"]].plot(
    kind="bar", figsize=(10,6), colormap="tab20c", rot=0
)
plt.title("üìä Comparaci√≥n de modelos")
plt.ylabel("Score")
plt.ylim(0,1)
plt.legend(loc="lower right")
plt.show()

# -----------------------------
# 3) Comparative ROC curves
# -----------------------------
# All curves share one Axes. Without `ax=`, from_estimator creates a new figure
# per call, so the models would not be overlaid and the canvas created up front
# would stay empty.
fig, ax = plt.subplots(figsize=(6, 6))

for name, model in models.items():
    RocCurveDisplay.from_estimator(model, X_test, y_test, name=name, ax=ax)

ax.plot([0, 1], [0, 1], "k--")  # chance-level baseline
ax.set_title("Curvas ROC comparativas")
plt.show()


NameError: name 'logreg' is not defined