# Evaluación de Modelos de Clasificación - Protocolo


In [None]:
import pandas as pd

# Read the training and test splits from the CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Preview the first rows of the training data.
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_summary = train.describe()
numeric_summary


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


## 2. Preparación de los datos

In [None]:
# Data preparation: drop unused columns, impute missing values, one-hot encode,
# and give `test` the same column layout as `train`.

# Columns removed before modelling. They are dropped from BOTH frames: if they
# stay in `test`, get_dummies below expands each of them into one dummy column
# per distinct value, and the final alignment then throws all of those away.
# errors="ignore" keeps this cell re-runnable once the columns are already gone.
# (Re-read test.csv if PassengerId is needed later, e.g. for a submission file.)
cols_to_drop = ["Name", "PassengerId", "Cabin"]
train = train.drop(columns=cols_to_drop, errors="ignore")
test = test.drop(columns=cols_to_drop, errors="ignore")

# Column groups, selected by dtype.
numeric_cols_train = train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols_train = train.select_dtypes(include=['object']).columns
numeric_cols_test = test.select_dtypes(include=['float64', 'int64']).columns
categorical_cols_test = test.select_dtypes(include=['object']).columns

# 1. Numeric columns are imputed with the TRAINING median.
#    (dropna() skips columns whose median is undefined because they are all-null.)
fill_values = train[numeric_cols_train].median().dropna().to_dict()

# 2. Object columns are imputed with the most frequent TRAINING value, or with
#    "Unknown" when no mode exists (the column is entirely null).
for col in categorical_cols_train:
    modes = train[col].mode()
    fill_values[col] = 'Unknown' if modes.empty else modes.iloc[0]

# The same fill values are applied to both frames, so `test` is imputed with
# statistics learned from `train` only.
# The result is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary object under pandas Copy-on-Write and leaves the frame unchanged.
# infer_objects() converts object columns that now hold only booleans
# (presumably CryoSleep and VIP -- confirm against the raw dtypes) to bool, so
# get_dummies keeps them as single columns.
train = train.fillna(value=fill_values).infer_objects()
test = test.fillna(value=fill_values).infer_objects()

# One-hot encode the remaining text columns; drop_first removes one level per
# encoded column.
train = pd.get_dummies(train, drop_first=True)
test = pd.get_dummies(test, drop_first=True)

# Give `test` exactly the columns of `train`. Because join='left' keeps every
# train column, `test` also receives a 'Transported' column filled with NaN;
# drop that placeholder before predicting on `test`.
train, test = train.align(test, join='left', axis=1)

train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,True,False,False,True
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,False,False,False,True
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,True,False,False,True
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,True,False,False,True
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,False,False,False,True


## 3. Definir variables predictoras y objetivo

In [None]:

from sklearn.model_selection import train_test_split

# Predictors are every column except the target; the target is 'Transported'.
X = train.drop('Transported', axis=1)
y = train['Transported']

# Hold out 20% of the labelled rows for evaluation, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Shapes of the four resulting pieces, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((6954, 12), (1739, 12), (6954,), (1739,))

## 4. Entrenamiento de los modelos de clasificación

In [None]:
!pip install catboost
from catboost import CatBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor  # Necesitarás instalar xgboost si no lo tienes
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Crear los modelos de regresión
models = {
    'CatBoostClassifier': CatBoostClassifier(verbose=0),
    'SGDClassifier': SGDClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'XGBClassifier':xgb.XGBClassifier(),
    'SVC':SVC()
}

# Entrenar y evaluar los modelos
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1}
    print(f"{name}: Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, F1={f1:.4f}")

# Mostrar los resultados en un DataFrame
results_df = pd.DataFrame(results).T
results_df

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
CatBoostClassifier: Accuracy=0.7861, Precision=0.7663, Recall=0.8292, F1=0.7965
SGDClassifier: Accuracy=0.7700, Precision=0.7626, Recall=0.7904, F1=0.7763


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression: Accuracy=0.7769, Precision=0.7681, Recall=0.7995, F1=0.7835
DecisionTreeClassifier: Accuracy=0.7326, Precision=0.7162, Recall=0.7790, F1=0.7463
RandomForestClassifier: Accuracy=0.7838, Precision=0.7777, Recall=0.8007, F1=0.7890
GradientBoostingClassifier: Accuracy=0.7838, Precision=0.7525, Recall=0.8519, F1=0.7991
KNeighborsClassifier: Accuracy=0.7625, Precision=0.7541, Recall=0.7859, F1=0.7697
XGBClassifier: Accuracy=0.7809, Precision=0.7652, Recall=0.8166, F1=0.7901
SVC: Accuracy=0.7740, Precision=0.7299, Recall=0.8770, F1=0.7967


Unnamed: 0,Accuracy,Precision,Recall,F1
CatBoostClassifier,0.786084,0.766316,0.829157,0.796499
SGDClassifier,0.769983,0.762637,0.790433,0.776286
LogisticRegression,0.776883,0.768053,0.799544,0.783482
DecisionTreeClassifier,0.732605,0.71623,0.779043,0.746318
RandomForestClassifier,0.783784,0.777655,0.800683,0.789001
GradientBoostingClassifier,0.783784,0.752515,0.851936,0.799145
KNeighborsClassifier,0.762507,0.754098,0.785877,0.76966
XGBClassifier,0.780909,0.765208,0.816629,0.790083
SVC,0.774008,0.729858,0.876993,0.796689
