In [1]:
# Cargamos las librerías

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

In [2]:
# Cargamos el dataset train.csv

df = pd.read_csv("src\\datasets\\train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# Busca valores nulos
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
# Busca valores extraños
columns = df.columns
n_values = [df[a].unique() for a in df.columns]

count = pd.DataFrame()
count['features'] = columns
count['n_values'] = n_values
count

Unnamed: 0,features,n_values
0,PassengerId,"[0001_01, 0002_01, 0003_01, 0003_02, 0004_01, ..."
1,HomePlanet,"[Europa, Earth, Mars, nan]"
2,CryoSleep,"[False, True, nan]"
3,Cabin,"[B/0/P, F/0/S, A/0/S, F/1/S, F/0/P, F/2/S, G/0..."
4,Destination,"[TRAPPIST-1e, PSO J318.5-22, 55 Cancri e, nan]"
5,Age,"[39.0, 24.0, 58.0, 33.0, 16.0, 44.0, 26.0, 28...."
6,VIP,"[False, True, nan]"
7,RoomService,"[0.0, 109.0, 43.0, 303.0, 42.0, 39.0, 73.0, 71..."
8,FoodCourt,"[0.0, 9.0, 3576.0, 1283.0, 70.0, 483.0, 1539.0..."
9,ShoppingMall,"[0.0, 25.0, 371.0, 151.0, 3.0, 17.0, nan, 589...."


Rellenamos con 0 los valores nulos para las columnas de gastos y las las bool, entendiendo que la ausencia de dato quiere decir el no uso de este.

In [7]:
# Lista de columnas donde los nulos serán reemplazados con 0
columns_fill_zero = ['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Rellenar esos valores nulos con 0
df[columns_fill_zero] = df[columns_fill_zero].fillna(0)

Convertimos los booleanos a int, porque facilita el trabajo para algunas algoritmos de ML

In [8]:
# Convertir 'VIP' y 'CryoSleep' a enteros (0 o 1)
df['VIP'] = df['VIP'].astype(int)
df['CryoSleep'] = df['CryoSleep'].astype(int)

In [9]:
# Divido en numéricas y categóricas, para imputar los valores nulos por separado
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Excluir las columnas que ya hemos rellenado con 0
numerical_cols = numerical_cols.difference(columns_fill_zero)

# Rellenar con la media en variables numéricas restantes
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())

# Rellenar con la moda (el valor más frecuente) en variables categóricas
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [10]:
# Dividimos la columna Cabin en 3 columnas, ya que representa deck/num/side y será más fácil
# al entrenar el modelo y al hacer la conversion a numerico
df[["Deck", "Cabin_num", "Side"]] = df["Cabin"].str.split("/", expand=True)
try:
    df = df.drop('Cabin', axis=1)
except KeyError:
    print("Field does not exist")

# Convertir 'Cabin_num' a numérico
df["Cabin_num"] = pd.to_numeric(df["Cabin_num"], errors='coerce')

In [11]:
# Convertir la variable objetivo 'Transported' a enteros
df['Transported'] = df['Transported'].astype(int)

In [12]:
# Borramos las columnas PassengerId y Name ya que no dan información relevante para el modelo
df = df.drop(['PassengerId', 'Name'], axis=1)

In [13]:
# Realizamos comprobacion para ver el Dataset
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Cabin_num,Side
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,1,S


In [14]:
# Separar características (X) y etiquetas (y). (y) es la variable a predecir y (X) las predictoras
X = df.drop(columns=['Transported'])
y = df['Transported']

In [15]:
# Convertir variables categóricas a variables dummies (One-Hot Encoding)
X = pd.get_dummies(X)

In [16]:
from sklearn.preprocessing import StandardScaler

# Escalar las variables numéricas
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num']
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [17]:
# Realizamos comprobacion para ver el Dataset
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Cabin_num,Side
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,1,S


In [18]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Dividir los datos en conjuntos de entrenamiento (70%) y prueba (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [22]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

# Definir los hiperparámetros para XGBoost
param_dist_xgb = {
    'n_estimators': [100, 200, 300, 500],        # Número de árboles
    'learning_rate': [0.01, 0.1, 0.2, 0.3],     # Tasa de aprendizaje
    'max_depth': [3, 5, 7, 10],                 # Profundidad máxima del árbol
    'min_child_weight': [1, 3, 5],              # Peso mínimo de una hoja
    'gamma': [0, 0.1, 0.2],                     # Reducción mínima en la métrica de pérdida
    'subsample': [0.7, 0.8, 1],                 # Fracción de muestras usadas para construir cada árbol
    'colsample_bytree': [0.7, 0.8, 1],          # Fracción de características usadas para construir cada árbol
    'reg_alpha': [0, 0.1, 0.5],                 # Regularización L1
    'reg_lambda': [1, 1.5, 2]                   # Regularización L2
}

# Crear el modelo base
model_xgb = xgb.XGBClassifier(random_state=42)

# Implementar RandomizedSearchCV para ajustar los hiperparámetros
random_search_xgb = RandomizedSearchCV(estimator=model_xgb, param_distributions=param_dist_xgb, 
                                       n_iter=50, cv=5, random_state=42, n_jobs=-1, verbose=1)

# Entrenar el modelo con los mejores hiperparámetros encontrados
random_search_xgb.fit(X_train, y_train)

# Ver los mejores parámetros encontrados
print(f"Best hyperparameters for XGBoost: {random_search_xgb.best_params_}")

# Predecir en el conjunto de prueba con el mejor modelo
y_pred_model_xgb = random_search_xgb.best_estimator_.predict(X_test)

# Evaluar el modelo
accuracy_model_xgb = accuracy_score(y_test, y_pred_model_xgb)
print(f"XGBoost Accuracy (after hyperparameter tuning): {accuracy_model_xgb:.4f}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best hyperparameters for XGBoost: {'subsample': 1, 'reg_lambda': 1.5, 'reg_alpha': 0.5, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
XGBoost Accuracy (after hyperparameter tuning): 0.8048


In [23]:
# Validación cruzada con 5 particiones (folds) usando el mejor modelo
cv_scores_xgb = cross_val_score(random_search_xgb.best_estimator_, X, y, cv=5)

# Imprimir los resultados promedio
print(f"Cross-validation scores (XGBoost): {cv_scores_xgb}")
print(f"Mean cross-validation accuracy (XGBoost): {cv_scores_xgb.mean():.4f}")

Cross-validation scores (XGBoost): [0.75388154 0.74353076 0.80448534 0.83371692 0.80034522]
Mean cross-validation accuracy (XGBoost): 0.7872
