In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the training CSV into a DataFrame.
# NOTE(review): the path is absolute at the filesystem root; consider reading it
# from a configurable data directory so the notebook runs on other machines.
df_train = pd.read_csv("/spaceship_titanic_train.csv")

In [4]:
# Convert CryoSleep and VIP to pandas' nullable boolean dtype so that missing
# entries survive the cast and can be imputed afterwards.
df_train = df_train.astype({'CryoSleep': 'boolean', 'VIP': 'boolean'})

In [5]:
# Impute missing values in the numeric columns with each column's mean.
# `fillna` is the dedicated, NA-aware API for this and matches how the test
# frame is imputed later in the notebook.
# NOTE(review): the means are computed on the whole training frame, before the
# train/validation split, so validation rows contribute to the imputed values.
numerical = df_train.select_dtypes(include=['int64', 'float64']).columns
for col in numerical:
    df_train[col] = df_train[col].fillna(df_train[col].mean())

In [6]:
# Impute missing values in the text (object) columns with each column's mode.
# `fillna` replaces the `replace(np.nan, ...)` spelling: it targets missing
# values directly instead of matching on a NaN sentinel.
# NOTE(review): every object column is filled, including identifier-like ones
# (PassengerId, Name), which are dropped before modelling anyway.
categorical = df_train.select_dtypes(include=['object']).columns
for col in categorical:
    # mode() returns a Series; [0] takes its first entry as the fill value
    df_train[col] = df_train[col].fillna(df_train[col].mode()[0])


In [7]:
# Impute missing values in the nullable-boolean columns with each column's mode.
# These columns use the nullable 'boolean' dtype, so `fillna` is used rather
# than `replace(np.nan, ...)`, which depends on np.nan being matched against
# the column's missing-value marker.
categorical = df_train.select_dtypes(include=['boolean']).columns
for col in categorical:
    df_train[col] = df_train[col].fillna(df_train[col].mode()[0])

In [8]:
# Show column dtypes and non-null counts to confirm the imputation left no gaps.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   boolean
 3   Cabin         8693 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   boolean
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8693 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), boolean(2), float64(6), object(5)
memory usage: 789.6+ KB


In [9]:
# Percentage of missing values per column.
# Textual placeholders are first mapped to NaN so they count as missing.
# NOTE(review): this runs after the imputation cells, so any placeholder found
# here would become a NaN that is never imputed.
missing_markers = ['N/A', '?', '']
df_train = df_train.replace(missing_markers, np.nan)
missing_counts = df_train.isna().sum()
print(missing_counts/len(df_train)*100)

PassengerId     0.0
HomePlanet      0.0
CryoSleep       0.0
Cabin           0.0
Destination     0.0
Age             0.0
VIP             0.0
RoomService     0.0
FoodCourt       0.0
ShoppingMall    0.0
Spa             0.0
VRDeck          0.0
Name            0.0
Transported     0.0
dtype: float64


In [10]:
# Preview the first rows of the training frame after cleaning.
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [11]:
# Separate the label from the predictors.
# PassengerId and Name are left out of the feature matrix along with the target.
target_column = 'Transported'
columns_to_drop = [target_column, 'PassengerId', 'Name']

# Work on a frame with a fresh 0..n-1 index
df_train = df_train.reset_index(drop=True)

y = df_train[target_column]
X = df_train.drop(columns=columns_to_drop)

In [12]:
# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [13]:
# Split the training columns into two groups by dtype:
#   - text columns, which will be one-hot encoded
#   - numeric / nullable-boolean columns, which are passed to the model as-is
passthrough_dtypes = ['int64', 'float64', 'boolean']
other_features = X_train.select_dtypes(include=passthrough_dtypes).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

In [14]:
# One fitted OneHotEncoder per categorical column, keyed by column name.
# Each encoder is fitted ONLY on the training split; categories unseen at fit
# time are ignored at transform time (handle_unknown='ignore'), and the output
# is a dense array (sparse_output=False).
encoders = {
    feature: OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(X_train[[feature]])
    for feature in categorical_features
}

In [15]:
# --- Step 3: apply the encoding to the training and validation data ---
def _one_hot_frames(frame):
    """Return one one-hot DataFrame per categorical feature, indexed like `frame`.

    Each column is transformed with the encoder previously fitted for it, so
    training and validation data share the same output columns.
    """
    return [
        pd.DataFrame(
            encoders[name].transform(frame[[name]]),
            columns=encoders[name].get_feature_names_out([name]),
            index=frame.index,
        )
        for name in categorical_features
    ]


encoded_dfs_train = _one_hot_frames(X_train)
encoded_dfs_val = _one_hot_frames(X_val)

# Put the untouched numeric/boolean columns side by side with the one-hot columns
X_train_processed = pd.concat([X_train[other_features], *encoded_dfs_train], axis=1)
X_val_processed = pd.concat([X_val[other_features], *encoded_dfs_val], axis=1)


In [16]:
# Random forest settings, collected in one place so they are easy to tune.
rf_settings = {
    'criterion': 'gini',
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 200,
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train_processed, y_train)

In [17]:
# --- Step 5: evaluate the model on the validation split ---
y_pred_val = rf.predict(X_val_processed)

# Label -> scoring function, reported in this order
scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-score": f1_score,
}

print("--- Métricas de Clasificación en el conjunto de validación ---")
for label, scorer in scorers.items():
    print(f"{label}: {scorer(y_val, y_pred_val):.4f}")
print("Matriz de Confusión:")
print(confusion_matrix(y_val, y_pred_val))

--- Métricas de Clasificación en el conjunto de validación ---
Accuracy: 0.7839
Precision: 0.7698
Recall: 0.7986
F1-score: 0.7839
Matriz de Confusión:
[[341 102]
 [ 86 341]]


In [45]:
# Load the test CSV into a DataFrame.
# NOTE(review): the path is absolute at the filesystem root; consider reading it
# from a configurable data directory so the notebook runs on other machines.
test_df = pd.read_csv('/spaceship_titanic_test.csv')

In [46]:
# Preview the test frame.
# The column drop below is intentionally left disabled: 'PassengerId' is read
# from test_df later, when the submission file is built.
#test_df = test_df.drop(columns=['PassengerId', 'Name'])
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [30]:
# Show column dtypes and non-null counts of the test frame.
# NOTE(review): the saved output of this cell lists 11 columns without
# PassengerId/Name and with boolean flags, which does not match the preview
# cell above; it appears to come from an earlier execution order. Re-run the
# notebook top to bottom to refresh it.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   object 
 1   CryoSleep     4277 non-null   boolean
 2   Cabin         4277 non-null   object 
 3   Destination   4277 non-null   object 
 4   Age           4277 non-null   float64
 5   VIP           4277 non-null   boolean
 6   RoomService   4277 non-null   float64
 7   FoodCourt     4277 non-null   float64
 8   ShoppingMall  4277 non-null   float64
 9   Spa           4277 non-null   float64
 10  VRDeck        4277 non-null   float64
dtypes: boolean(2), float64(6), object(3)
memory usage: 317.6+ KB


In [31]:
# Preprocess the test data: fill every missing value using the test frame's
# own statistics.

# Cast the flag columns to the nullable boolean dtype, as is done for the
# training frame, and fill their gaps with the column's most frequent value.
for col in ['CryoSleep', 'VIP']:
    test_df[col] = test_df[col].astype('boolean')
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

categorical_features_test = test_df.select_dtypes(include=['object']).columns
numerical_features = test_df.select_dtypes(include=['int64', 'float64']).columns

# Text columns: fill with the column's mode.
# The source must be test_df[col]. Assigning a df_train column here would
# index-align training rows onto the test frame and overwrite the real test
# values (including PassengerId, when that column is present).
for col in categorical_features_test:
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

# Numeric columns: fill with the column's mean.
for col in numerical_features:
    test_df[col] = test_df[col].fillna(test_df[col].mean())

In [32]:
# Percentage of missing values per column in the test frame.
# Textual placeholders are first mapped to NaN so they count as missing.
missing_markers = ['N/A', '?', '']
test_df = test_df.replace(missing_markers, np.nan)
missing_counts = test_df.isna().sum()
print(missing_counts/len(test_df)*100)

HomePlanet      0.0
CryoSleep       0.0
Cabin           0.0
Destination     0.0
Age             0.0
VIP             0.0
RoomService     0.0
FoodCourt       0.0
ShoppingMall    0.0
Spa             0.0
VRDeck          0.0
dtype: float64


In [33]:
# Give the test frame a fresh 0..n-1 index, discarding the previous one.
test_df = test_df.reset_index(drop=True)

In [34]:
# Encode the test data with the encoders fitted on the training split.
# One DataFrame per categorical feature, each indexed like test_df.
encoded_dfs_test = [
    pd.DataFrame(
        encoders[feature].transform(test_df[[feature]]),
        columns=encoders[feature].get_feature_names_out([feature]),
        index=test_df.index,
    )
    for feature in categorical_features
]

In [36]:
# Reassemble the test feature matrix: untouched numeric/boolean columns first,
# then the one-hot columns.
test_parts = [test_df[other_features], *encoded_dfs_test]
X_test_processed = pd.concat(test_parts, axis=1)

# Put the columns in exactly the order the model was trained on; a column
# absent from the test matrix is created and filled with 0.
X_test_final_processed = X_test_processed.reindex(
    columns=X_train_processed.columns,
    fill_value=0,
)



In [39]:
# Predict the target for the test set from the column-aligned feature matrix.
# (Author's note, translated: "this is what I need to learn".)


prediccion = rf.predict(X_test_final_processed)

In [47]:
# Build the submission table; this requires test_df to still carry 'PassengerId'.
# The prediction column is cast to plain bool before writing.
submission = (
    test_df[['PassengerId']]
    .assign(Transported=prediccion)
    .astype({'Transported': bool})
)

# Export to CSV without the index column
submission.to_csv('predictions_spaceship_titanic.csv', index=False)

