CARGA Y LIMPIEZA DE TRAIN.CSV

In [103]:
# Cargamos las librerías

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

In [104]:
# Cargamos el dataset train.csv

df = pd.read_csv("src\\datasets\\train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [105]:
# Eliminar filas duplicadas
df.drop_duplicates(keep='first', inplace=True)

In [106]:
# Verificar si hay valores negativos en columnas donde no deberían haber
columns_to_check = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in columns_to_check:
    if (df[col] < 0).any():
        print(f"Warning: {col} has negative values.")
        # Si encuentras valores negativos, podrías corregirlos
        df[col] = df[col].apply(lambda x: 0 if x < 0 else x)

In [107]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [109]:
# Busca valores nulos
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [110]:
# Busca valores extraños
columns = df.columns
n_values = [df[a].unique() for a in df.columns]

count = pd.DataFrame()
count['features'] = columns
count['n_values'] = n_values
count

Unnamed: 0,features,n_values
0,PassengerId,"[0001_01, 0002_01, 0003_01, 0003_02, 0004_01, ..."
1,HomePlanet,"[Europa, Earth, Mars, nan]"
2,CryoSleep,"[False, True, nan]"
3,Cabin,"[B/0/P, F/0/S, A/0/S, F/1/S, F/0/P, F/2/S, G/0..."
4,Destination,"[TRAPPIST-1e, PSO J318.5-22, 55 Cancri e, nan]"
5,Age,"[39.0, 24.0, 58.0, 33.0, 16.0, 44.0, 26.0, 28...."
6,VIP,"[False, True, nan]"
7,RoomService,"[0.0, 109.0, 43.0, 303.0, 42.0, 39.0, 73.0, 71..."
8,FoodCourt,"[0.0, 9.0, 3576.0, 1283.0, 70.0, 483.0, 1539.0..."
9,ShoppingMall,"[0.0, 25.0, 371.0, 151.0, 3.0, 17.0, nan, 589...."


In [111]:
# Dividimos la columna Cabin en 3 columnas, ya que representa deck/num/side y será más fácil
# al entrenar el modelo y al hacer la conversion a numerico
df[["Deck", "Cabin_num", "Side"]] = df["Cabin"].str.split("/", expand=True)
try:
    df = df.drop('Cabin', axis=1)
except KeyError:
    print("Field does not exist")

# Convertir 'Cabin_num' a numérico
df["Cabin_num"] = pd.to_numeric(df["Cabin_num"], errors='coerce')

In [112]:
# Borramos las columnas PassengerId y Name ya que no dan información relevante para el modelo
df = df.drop(['PassengerId', 'Name'], axis=1)

In [113]:
# Realizamos comprobacion para ver el Dataset
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Cabin_num,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0.0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0.0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0.0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0.0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1.0,S


In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Destination   8511 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   bool   
 11  Deck          8494 non-null   object 
 12  Cabin_num     8494 non-null   float64
 13  Side          8494 non-null   object 
dtypes: bool(1), float64(7), object(6)
memory usage: 891.5+ KB


In [115]:
# Convertir variables categóricas a variables dummies (One-Hot Encoding)
df = pd.get_dummies(df)

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8514 non-null   float64
 1   RoomService                8512 non-null   float64
 2   FoodCourt                  8510 non-null   float64
 3   ShoppingMall               8485 non-null   float64
 4   Spa                        8510 non-null   float64
 5   VRDeck                     8505 non-null   float64
 6   Transported                8693 non-null   bool   
 7   Cabin_num                  8494 non-null   float64
 8   HomePlanet_Earth           8693 non-null   bool   
 9   HomePlanet_Europa          8693 non-null   bool   
 10  HomePlanet_Mars            8693 non-null   bool   
 11  CryoSleep_False            8693 non-null   bool   
 12  CryoSleep_True             8693 non-null   bool   
 13  Destination_55 Cancri e    8693 non-null   bool 

In [117]:
from sklearn.preprocessing import StandardScaler

# Escalar las variables numéricas
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [118]:
# Asegurar que todos los valores nulos sean representados uniformemente como np.nan
df = df.fillna(np.nan)

In [119]:
from sklearn.impute import KNNImputer
# Aplicar KNNImputer
imputer = KNNImputer(n_neighbors=5)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [120]:
# Escalar las variables numéricas
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num']
scaler = StandardScaler()
df_imputed[numeric_cols] = scaler.fit_transform(df_imputed[numeric_cols])

In [121]:
# Convertir 'Transported' a enteros si es necesario
df_imputed['Transported'] = df_imputed['Transported'].astype(int)

In [122]:
df_imputed.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_num,HomePlanet_Earth,HomePlanet_Europa,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,0.707369,-0.339376,-0.284736,-0.289195,-0.275779,-0.267528,0,-1.179264,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.334932,-0.174869,-0.279115,-0.247498,0.210934,-0.228943,1,-1.179264,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2.027618,-0.274479,1.948932,-0.289195,5.677364,-0.224558,0,-1.179264,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.290449,-0.339376,0.516661,0.329598,2.675526,-0.09828,0,-1.179264,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.890826,0.117924,-0.241012,-0.037341,0.225119,-0.265774,1,-1.177298,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [123]:
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   RoomService                8693 non-null   float64
 2   FoodCourt                  8693 non-null   float64
 3   ShoppingMall               8693 non-null   float64
 4   Spa                        8693 non-null   float64
 5   VRDeck                     8693 non-null   float64
 6   Transported                8693 non-null   int64  
 7   Cabin_num                  8693 non-null   float64
 8   HomePlanet_Earth           8693 non-null   float64
 9   HomePlanet_Europa          8693 non-null   float64
 10  HomePlanet_Mars            8693 non-null   float64
 11  CryoSleep_False            8693 non-null   float64
 12  CryoSleep_True             8693 non-null   float64
 13  Destination_55 Cancri e    8693 non-null   float

In [124]:
df_imputed.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_num,HomePlanet_Earth,HomePlanet_Europa,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,...,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,-1.0625850000000001e-17,-3.678181e-18,6.947674e-18,4.0459990000000004e-17,-3.269494e-18,4.0868669999999994e-19,0.503624,0.0,0.529391,0.24514,...,0.029449,0.089612,0.085931,0.054987,0.100771,0.321408,0.294375,0.000575,0.483838,0.49327
std,1.000058,1.000058,1.000058,1.000058,1.000058,1.000058,0.500016,1.000058,0.499164,0.430195,...,0.169071,0.285642,0.280279,0.227968,0.301042,0.467044,0.455787,0.023977,0.499767,0.499983
min,-2.002615,-0.3393759,-0.2847364,-0.2891952,-0.2757785,-0.2675281,0.0,-1.179264,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6823661,-0.3393759,-0.2847364,-0.2891952,-0.2757785,-0.2675281,0.0,-0.847015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.1264719,-0.3393759,-0.2847364,-0.2891952,-0.2757785,-0.2675281,1.0,-0.329966,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.6378827,-0.2578769,-0.2335169,-0.2374901,-0.2172666,-0.2219276,1.0,0.76901,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
max,3.486841,21.28353,18.33729,38.89326,19.5899,20.89548,1.0,2.544278,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [125]:
# Separar características (X) y etiquetas (y)
X = df_imputed.drop(columns=['Transported'])
y = df_imputed['Transported']

SEPARACIÓN DE DATOS Y ENTRENAMIENTO DEL MODELO

In [126]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Dividir los datos en conjuntos de entrenamiento (70%) y prueba (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [127]:
# Verificar el balance de las clases en y_train
print('Proporción de la clase 1 (Transported = 1):', y_train.mean())
print('Proporción de la clase 0 (Transported = 0):', 1 - y_train.mean())

Proporción de la clase 1 (Transported = 1): 0.5036976170912079
Proporción de la clase 0 (Transported = 0): 0.49630238290879214


In [128]:
# Calcular la correlación de cada característica con la variable objetivo
correlation = pd.concat([X_train, y_train], axis=1).corr().abs().Transported.values[:-1]

# Crear un dataframe para visualización
associations = pd.DataFrame({
    'Correlation': correlation
}, index=X_train.columns).sort_values(by='Correlation', ascending=False)

In [129]:
from sklearn.feature_selection import mutual_info_regression

# Calcular la información mutua entre las características y la variable objetivo
mutual_info = mutual_info_regression(X_train, y_train, random_state=42)

# Añadir la información mutua al dataframe
associations['Mutual_Info'] = mutual_info

# Mostrar la tabla con ambas métricas
print(associations)

                           Correlation  Mutual_Info
CryoSleep_True                0.466738     0.000000
CryoSleep_False               0.457900     0.067668
RoomService                   0.246576     0.048082
Spa                           0.219484     0.058537
VRDeck                        0.207366     0.077866
HomePlanet_Europa             0.178483     0.060971
HomePlanet_Earth              0.168371     0.008576
Deck_B                        0.145052     0.023147
Destination_55 Cancri e       0.107697     0.018889
Side_P                        0.098309     0.000246
Destination_TRAPPIST-1e       0.097661     0.116813
Side_S                        0.097337     0.115947
Deck_C                        0.096033     0.003658
Deck_E                        0.094848     0.000000
Deck_F                        0.085735     0.000000
Age                           0.085593     0.003053
FoodCourt                     0.056644     0.021182
Cabin_num                     0.045402     0.001113
VIP_True    

Tras analizar la correlacion y la mutual info:

Estas características con baja correlación y baja mutual information:

ShoppingMall, Deck_A, Deck_T, Deck_G: Estas características tienen tanto baja correlación como baja MI, lo que indica que probablemente no son muy relevantes. Podría considerar eliminarlas del modelo, ya que su contribución parece mínima.
Cabin_num: También muestra baja correlación y MI, por lo que podría considerar eliminarla, salvo que creas que pueda haber relaciones más complejas que no están siendo capturadas en estos cálculos.

Por lo tanto voy a comprobar el rendimiento del modelo eliminando las características y sin hacerlo, para poder comparar.

In [130]:
# Lista de columnas irrelevantes (baja correlación y baja MI)
columns_to_drop = ['ShoppingMall', 'Cabin_num']

# Eliminar las columnas irrelevantes
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)

MODELOS SIN ELIMINAR CARACTERÍSTICAS

In [131]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# ** Modelo Random Forest con ajuste de hiperparámetros usando RandomizedSearchCV **

# Definir más hiperparámetros para Random Forest
param_dist_rf = {
    'n_estimators': [100, 200, 300, 500, 1000],           # Añadir más opciones de número de árboles
    'max_depth': [None, 10, 20, 30, 40, 50, 60],          # Añadir más valores de profundidad máxima
    'min_samples_split': [2, 5, 10, 15],                  # Más valores para el mínimo número de muestras para dividir
    'min_samples_leaf': [1, 2, 4, 6],                     # Añadir opciones para el tamaño mínimo de las hojas
    'max_features': ['sqrt', 'log2', 0.8, 0.6, 0.4],      # Probar diferentes proporciones de características
    'bootstrap': [True, False],                           # Usar muestreo con o sin reemplazo
    'criterion': ['gini', 'entropy'],                     # Probar distintos criterios para dividir
    'class_weight': [None, 'balanced', 'balanced_subsample'] # Ajuste de pesos para clases desbalanceadas
}

# Crear el modelo base
model_rf = RandomForestClassifier(random_state=42)

# Implementar RandomizedSearchCV para ajustar los hiperparámetros
random_search_rf = RandomizedSearchCV(estimator=model_rf, param_distributions=param_dist_rf, 
                                      n_iter=100, cv=5, random_state=42, n_jobs=-1, verbose=1)

# Entrenar el modelo con los mejores hiperparámetros encontrados
random_search_rf.fit(X_train, y_train)

# Ver los mejores parámetros encontrados
print(f"Best hyperparameters for Random Forest: {random_search_rf.best_params_}")

# Predecir en el conjunto de prueba con el mejor modelo
y_pred_model_rf = random_search_rf.best_estimator_.predict(X_test)

# Evaluar el modelo
accuracy_model_rf = accuracy_score(y_test, y_pred_model_rf)
print(f"Random Forest Accuracy (after hyperparameter tuning): {accuracy_model_rf:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameters for Random Forest: {'n_estimators': 200, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 0.4, 'max_depth': 10, 'criterion': 'gini', 'class_weight': 'balanced', 'bootstrap': True}
Random Forest Accuracy (after hyperparameter tuning): 0.8102


In [132]:
# Validación cruzada con 5 particiones (folds) usando el mejor modelo
cv_scores_rf = cross_val_score(random_search_rf.best_estimator_, X, y, cv=5)

# Imprimir los resultados promedio
print(f"Cross-validation scores (Random Forest): {cv_scores_rf}")
print(f"Mean cross-validation accuracy (Random Forest): {cv_scores_rf.mean():.4f}")

Cross-validation scores (Random Forest): [0.75905693 0.76883266 0.80448534 0.82911392 0.80552359]
Mean cross-validation accuracy (Random Forest): 0.7934


LIMPIEZA DE TEST.CSV

In [133]:
# Carga del dataset de Test
test_df = pd.read_csv(".\\src\\datasets\\test.csv")

In [134]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [135]:
test_df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,4186.0,4195.0,4171.0,4179.0,4176.0,4197.0
mean,28.658146,219.266269,439.484296,177.295525,303.052443,310.710031
std,14.179072,607.011289,1527.663045,560.821123,1117.186015,1246.994742
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,53.0,78.0,33.0,50.0,36.0
max,79.0,11567.0,25273.0,8292.0,19844.0,22272.0


In [136]:
test_df.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [137]:
# Busca valores extraños
columns = test_df.columns
n_values = [test_df[a].unique() for a in test_df.columns]

count = pd.DataFrame()
count['features'] = columns
count['n_values'] = n_values
count

Unnamed: 0,features,n_values
0,PassengerId,"[0013_01, 0018_01, 0019_01, 0021_01, 0023_01, ..."
1,HomePlanet,"[Earth, Europa, Mars, nan]"
2,CryoSleep,"[True, False, nan]"
3,Cabin,"[G/3/S, F/4/S, C/0/S, C/1/S, F/5/S, F/7/P, B/2..."
4,Destination,"[TRAPPIST-1e, 55 Cancri e, PSO J318.5-22, nan]"
5,Age,"[27.0, 19.0, 31.0, 38.0, 20.0, 21.0, 23.0, 24...."
6,VIP,"[False, nan, True]"
7,RoomService,"[0.0, 10.0, 339.0, 932.0, 2.0, 26.0, nan, 39.0..."
8,FoodCourt,"[0.0, 9.0, 6652.0, 1615.0, nan, 639.0, 3.0, 74..."
9,ShoppingMall,"[0.0, 635.0, 263.0, 136.0, nan, 253.0, 243.0, ..."


In [138]:
# Lista de columnas donde los nulos serán reemplazados con 0
test_columns_fill_zero = ['VIP', 'CryoSleep', 'RoomService' ,'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Rellenar esos valores nulos con 0
test_df[test_columns_fill_zero] = test_df[test_columns_fill_zero].fillna(0)


# Convertir 'VIP' y 'CryoSleep' a enteros (0 o 1)
test_df['VIP'] = test_df['VIP'].astype(int)
test_df['CryoSleep'] = test_df['CryoSleep'].astype(int)

In [139]:
# Dividimos la columna Cabin en 3 columnas, ya que representa deck/num/side y será más fácil
# al entrenar el modelo y al hacer la conversion a numerico
test_df[["Deck", "Cabin_num", "Side"]] = test_df["Cabin"].str.split("/", expand=True)
try:
    test_df = test_df.drop('Cabin', axis=1)
except KeyError:
    print("Field does not exist")

# Convertir 'Cabin_num' a numérico
test_df["Cabin_num"] = pd.to_numeric(test_df["Cabin_num"], errors='coerce')

In [140]:
# Divido en numéricas y categóricas, para imputar los valores nulos por separado
test_categorical_cols = test_df.select_dtypes(include=['object']).columns
test_numerical_cols = test_df.select_dtypes(include=['float64', 'int64']).columns

# Excluir las columnas que ya hemos rellenado con 0
test_numerical_cols = test_numerical_cols.difference(test_columns_fill_zero)

# Rellenar con la media en variables numéricas restantes
for col in test_numerical_cols:
    test_df[col] = test_df[col].fillna(test_df[col].mean())

# Rellenar con la moda (el valor más frecuente) en variables categóricas
for col in test_categorical_cols:
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

In [141]:
# Borramos las columnas PassengerId y Name ya que no dan información relevante para el modelo
test_df = test_df.drop(['PassengerId', 'Name'], axis=1)

In [142]:
test_df.isnull().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Deck            0
Cabin_num       0
Side            0
dtype: int64

In [143]:
test_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Cabin_num,Side
0,Earth,1,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,G,3.0,S
1,Earth,0,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,0.0,F,4.0,S
2,Europa,1,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,C,0.0,S
3,Europa,0,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,585.0,C,1.0,S
4,Earth,0,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,0.0,F,5.0,S


In [144]:
# Convertir variables categóricas a variables dummies (One-Hot Encoding)
test_df = pd.get_dummies(test_df)
# Convertir solo las columnas booleanas a 0 y 1
for col in test_df.select_dtypes(include=['bool']).columns:
    test_df[col] = test_df[col].astype(int)

In [145]:
# Escalar las variables numéricas
test_numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num']
test_df[numeric_cols] = scaler.transform(test_df[test_numeric_cols])

In [146]:
# Busca valores extraños
columns = test_df.columns
n_values = [test_df[a].unique() for a in test_df.columns]

count = pd.DataFrame()
count['features'] = columns
count['n_values'] = n_values
count

Unnamed: 0,features,n_values
0,CryoSleep,"[1, 0]"
1,Age,"[27.182424888912166, 19.12853459516126, 31.209..."
2,VIP,"[0, 1]"
3,RoomService,"[-0.00026801778752820655, 10.061521797262278, ..."
4,FoodCourt,"[0.0013912911567672476, 9.060084600738127, 669..."
5,ShoppingMall,"[0.0005687856743460865, 640.409293471622, 265...."
6,Spa,"[5.970086453937663e-05, 2844.6813664351107, 18..."
7,VRDeck,"[-0.0001910527334258212, 587.7243533783087, 60..."
8,Cabin_num,"[3.0198039075259295, 4.0260587277425, 0.001039..."
9,HomePlanet_Earth,"[1, 0]"


In [147]:
# Eliminar la columna 'VIP' si fue eliminada en el modelo de entrenamiento
test_df = test_df.drop(columns=['VIP'], errors='ignore')

In [148]:
test_df.describe()

Unnamed: 0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_num,HomePlanet_Earth,HomePlanet_Europa,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
count,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,...,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0
mean,0.361001,28.851741,216.391026,431.388684,174.70922,298.168566,306.318117,613.996435,0.549451,0.234276,...,0.022913,0.084639,0.083002,0.056582,0.104513,0.361235,0.285714,0.001403,0.487257,0.512743
std,0.480347,14.121876,605.633722,1520.004424,559.719017,1113.357696,1241.748246,512.094043,0.497607,0.423595,...,0.149644,0.278376,0.275918,0.231069,0.30596,0.480415,0.451807,0.037433,0.499896,0.499896
min,0.0,0.000545,-0.000268,0.001391,0.000569,6e-05,-0.000191,0.001039,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,20.135271,-0.000268,0.001391,0.000569,6e-05,-0.000191,180.120652,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,27.182425,-0.000268,0.001391,0.000569,6e-05,-0.000191,465.897021,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,37.249788,48.296323,66.431809,27.230546,43.33031,31.144187,1018.330918,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
max,1.0,79.532712,11638.472011,25437.818726,8362.629144,19996.406666,22375.728106,1901.82265,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [149]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  4277 non-null   int64  
 1   Age                        4277 non-null   float64
 2   RoomService                4277 non-null   float64
 3   FoodCourt                  4277 non-null   float64
 4   ShoppingMall               4277 non-null   float64
 5   Spa                        4277 non-null   float64
 6   VRDeck                     4277 non-null   float64
 7   Cabin_num                  4277 non-null   float64
 8   HomePlanet_Earth           4277 non-null   int64  
 9   HomePlanet_Europa          4277 non-null   int64  
 10  HomePlanet_Mars            4277 non-null   int64  
 11  Destination_55 Cancri e    4277 non-null   int64  
 12  Destination_PSO J318.5-22  4277 non-null   int64  
 13  Destination_TRAPPIST-1e    4277 non-null   int64

In [150]:
test_df.head()

Unnamed: 0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_num,HomePlanet_Earth,HomePlanet_Europa,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,1,27.182425,-0.000268,0.001391,0.000569,6e-05,-0.000191,3.019804,1,0,...,0,0,0,0,0,0,1,0,0,1
1,0,19.128535,-0.000268,9.060085,0.000569,2844.681366,-0.000191,4.026059,1,0,...,0,0,0,0,0,1,0,0,0,1
2,1,31.20937,-0.000268,0.001391,0.000569,6e-05,-0.000191,0.001039,0,1,...,0,0,1,0,0,0,0,0,0,1
3,0,38.256524,-0.000268,6695.382269,0.000569,182.390182,587.724353,1.007294,0,1,...,0,0,1,0,0,0,0,0,0,1
4,0,20.135271,10.061522,0.001391,640.409293,6e-05,-0.000191,5.032314,1,0,...,0,0,0,0,0,1,0,0,0,1


REALIZAMOS PREDICCIONES CON TEST.CSV Y LOS GUARDAMOS COMO SUBMISSION

In [151]:
# Alinear las columnas de test_df con las de X_train
# Si faltan columnas en test_df, las agregamos y las rellenamos con 0
for col in X_train.columns:
    if col not in test_df.columns:
        test_df[col] = 0

# Si test_df tiene columnas adicionales que no están en X_train, las eliminamos
test_df = test_df[X_train.columns]

In [152]:
# Realizar las predicciones con el modelo entrenado
test_predictions = best_rf_model.predict(test_df)

NameError: name 'best_rf_model' is not defined

In [None]:
# Convertir las predicciones de 0/1 a True/False
test_predictions_bool = test_predictions.astype(bool)

In [None]:
# Cargar nuevamente PassengerId desde el archivo original de test.csv
original_test_df = pd.read_csv(".\\src\\datasets\\test.csv")

# Crear el dataframe con PassengerId y las predicciones (Transported)
submission_df = pd.DataFrame({
    'PassengerId': original_test_df['PassengerId'],
    'Transported': test_predictions_bool
})

In [None]:
# Guardar el dataframe de submission en un archivo CSV
submission_df.to_csv('.\\src\\datasets\\submission.csv', index=False)

In [None]:
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  4277 non-null   object
 1   Transported  4277 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 37.7+ KB
