In [1111]:
# Functin for normalizing data transformation and format
def normalizeDatasetFormat(dataset):
    # Divide the passenger id, into group and member number
    passengerIdSplit = dataset['PassengerId'].str.split("_", expand=True)

    # Create new columns for group number and group member within that group
    dataset['GroupNumber'] = pd.to_numeric(passengerIdSplit[0], errors='coerce')
    dataset['GroupMember'] = pd.to_numeric(passengerIdSplit[1], errors='coerce')

    # Split the cabin variable in 3 sub sets
    cabin_split = dataset['Cabin'].str.split("/", expand=True)

    # Create new columns for each sub set of cabin
    dataset['Deck'] = cabin_split[0]
    dataset['CabinNum'] = pd.to_numeric(cabin_split[1], errors='coerce')
    dataset['Side'] = cabin_split[2]

    # Divide name in first name and last name to see if there are any insights from there.
    dataset['FirstName'], dataset['LastName'] = dataset['Name'].str.split(' ').str[0], dataset['Name'].str.split(' ').str[1]

    # Get frequency for first names and last names
    name_freq = dataset['FirstName'].value_counts()
    dataset['FirstNameFreq'] = dataset['FirstName'].map(name_freq)

    name_freq = dataset['LastName'].value_counts()
    dataset['LastNameFreq'] = dataset['LastName'].map(name_freq)

    return dataset

In [1112]:
# Functin for normalizing boolean data transformation and format
def normalizeDatasetBoolFormat(dataset, bool_columns):
    print(dataset.head())
    
    # For dataset, map string-boolean columns to 1 for true and 0 for false, since one-hot encoding doesn't work well with 2 cardinality columns.
    for col in bool_columns:
        dataset[col] = dataset[col].map({True: 1, False: 0, 'S': 1, 'P': 0})

    print(dataset.head())

    return dataset

In [1129]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score
from xgboost import XGBClassifier

# Importing the train and test csv files for model training and model testing.
X = pd.read_csv("D:/Archivos Personales/Courses/Data Science/Projects and Competitions/Spaceship Titanic/train.csv")
X_test_full = pd.read_csv("D:/Archivos Personales/Courses/Data Science/Projects and Competitions/Spaceship Titanic/test.csv")

# Remove rows that have Nan values in the target column.
X.dropna(axis=0, subset=['Transported'], inplace=True)

# rowsWithoutTargetValue = X[X['Transported'].isnull()] Can work as well to get empty rows for target column.

# Assign target column to Y from X dataset
Y = X.Transported
X.drop(['Transported'], axis=1, inplace=True)

# Create split dataset for training and validation
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, Y, train_size= 0.8, test_size= 0.2, random_state=0)

In [1131]:
# Check how many Nan values per column
nanValuesPerColumn = X_train_full.isnull().sum()
nanValuesPerColumn

PassengerId       0
HomePlanet      156
CryoSleep       170
Cabin           151
Destination     149
Age             146
VIP             176
RoomService     151
FoodCourt       148
ShoppingMall    172
Spa             152
VRDeck          146
Name            156
dtype: int64

In [1115]:
# Normalize format and columns for all datasets
X_train_full = normalizeDatasetFormat(X_train_full)
X_valid_full = normalizeDatasetFormat(X_valid_full)
X_test_full = normalizeDatasetFormat(X_test_full)
X = normalizeDatasetFormat(X)

In [1143]:
# Get categorical columns from the dataset
cat_cols = list(X_train_full.select_dtypes(include=['object']).columns)
X_train_full[cat_cols].nunique()

PassengerId    6954
HomePlanet        3
CryoSleep         2
Cabin          5449
Destination       3
VIP               2
Name           6787
dtype: int64

In [1117]:
# Remove high cardinality categorical columns
final_categorical_cols = [col for col in cat_cols if X_train_full[col].dtype == 'object' and X_train_full[col].nunique() < 16]

# Columns that can be safely ordinal encoded, verify to only use categories that were used during training.
# Needed for ordinal encoding.
#final_categorical_cols = [col for col in cat_cols if
#                    set(X_valid_full[col]).issubset(set(X_train_full[col]))]

print(final_categorical_cols)

['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']


In [1118]:
# Get boolean columns that appear as string/object type from
bool_columns = [col for col in X_train_full.columns if X_train_full[col].dtype == 'object' and set(X_train_full[col].unique()).issubset([True, False, np.nan, 'S', 'P'])]
print(bool_columns)

['CryoSleep', 'VIP', 'Side']


In [1119]:
# Get numeric values from the dataset
final_numerical_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]
print(final_numerical_cols)

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupNumber', 'GroupMember', 'CabinNum', 'FirstNameFreq', 'LastNameFreq']


In [1120]:
# Normalize boolean format for all datasets
X_train_full = normalizeDatasetBoolFormat(X_train_full, bool_columns)
X_valid_full = normalizeDatasetBoolFormat(X_valid_full, bool_columns)
X_test_full = normalizeDatasetBoolFormat(X_test_full, bool_columns)
X = normalizeDatasetBoolFormat(X, bool_columns)

     PassengerId HomePlanet CryoSleep     Cabin  Destination   Age    VIP  \
4278     4558_01     Europa     False   C/167/S  55 Cancri e  54.0  False   
5971     6326_01      Earth     False  F/1307/P  TRAPPIST-1e  20.0  False   
464      0503_02       Mars     False    F/90/S  TRAPPIST-1e  43.0  False   
4475     4757_01      Earth     False   F/896/S  TRAPPIST-1e  24.0  False   
8469     9046_01     Europa      True   C/335/S  55 Cancri e  25.0  False   

      RoomService  FoodCourt  ShoppingMall  ...               Name  \
4278          0.0      559.0           0.0  ...      Wezna Baleful   
5971          0.0       20.0           1.0  ...  Therek Hinetthews   
464        1821.0        0.0          47.0  ...         Torms Fone   
4475        185.0        0.0         476.0  ...    Tanley Mirandry   
8469          0.0        0.0           0.0  ...    Alphah Cratrave   

      GroupNumber GroupMember  Deck  CabinNum Side  FirstName    LastName  \
4278         4558           1     C    

In [1121]:
# Normalize all dataset to have the same columns and format

final_categorical_cols = [col for col in final_categorical_cols if col not in bool_columns] # Since we know boolean is recognized as string and to avoid duplicates.
my_columns = final_categorical_cols + bool_columns + final_numerical_cols

X_train = X_train_full[my_columns].copy()
X_valid = X_valid_full[my_columns].copy()
X_test = X_test_full[my_columns].copy()
X = X[my_columns].copy()

In [1122]:
#------------------------------------------------Pipeline creation----------------------------------------------------

# Preprocessing for numerical columns
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Preprocessing for boolean columns
boolean_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('OneHotEnc', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    #('ordEnc', OrdinalEncoder())
])

# Bundles numerical and categorical preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, final_numerical_cols),
    ('boolean', boolean_transformer, bool_columns),
    ('categorical', categorical_transformer, final_categorical_cols)
])

# Apply preprocessing pipeline to the datasets
X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)

#print(pd.DataFrame(X_train_processed, columns=preprocessor.get_feature_names_out())) Useful for visualizing post tranformation

In [1123]:
# Model creation and training for training X_train data
validationModel = XGBClassifier(n_estimators = 200, early_stopping_rounds = 20, learning_rate=0.1, n_jobs=4, verbosity=1, eval_metric='logloss')

validationModel.fit(X_train_processed, y_train,
          eval_set=[(X_valid_processed, y_valid)])

# Preprocessing of validation data to get predictions
predictions = validationModel.predict(X_valid_processed)

[0]	validation_0-logloss:0.65391
[1]	validation_0-logloss:0.62110
[2]	validation_0-logloss:0.59333
[3]	validation_0-logloss:0.57068
[4]	validation_0-logloss:0.55113
[5]	validation_0-logloss:0.53437
[6]	validation_0-logloss:0.51986
[7]	validation_0-logloss:0.50756
[8]	validation_0-logloss:0.49682
[9]	validation_0-logloss:0.48758
[10]	validation_0-logloss:0.47911
[11]	validation_0-logloss:0.47246
[12]	validation_0-logloss:0.46567
[13]	validation_0-logloss:0.45977
[14]	validation_0-logloss:0.45453
[15]	validation_0-logloss:0.44946
[16]	validation_0-logloss:0.44499
[17]	validation_0-logloss:0.44072
[18]	validation_0-logloss:0.43786
[19]	validation_0-logloss:0.43208
[20]	validation_0-logloss:0.42858
[21]	validation_0-logloss:0.42641
[22]	validation_0-logloss:0.42386
[23]	validation_0-logloss:0.42170
[24]	validation_0-logloss:0.41943
[25]	validation_0-logloss:0.41687
[26]	validation_0-logloss:0.41545
[27]	validation_0-logloss:0.41424
[28]	validation_0-logloss:0.41320
[29]	validation_0-loglos

In [1124]:
# Get accuracy and precision values for the model
accuracy = accuracy_score(y_valid, predictions)
precision = precision_score(y_valid, predictions)

# Print accuracy and precision values for the model
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")

# Print MAE value for the model
print("MAE: ", mean_absolute_error(y_valid, predictions))

Accuracy: 81.02%
Precision: 79.42%
MAE:  0.18976423231742381


In [1125]:
# Final training with whole X and Y datasets
#------------------------------------------------Pipeline creation----------------------------------------------------

# For the final model training and prediction after validation and tweaking
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# Model creation model.best_iteration + 1
finalModel = XGBClassifier(n_estimators = validationModel.best_iteration + 1, learning_rate=0.1, n_jobs=4, verbosity=1, eval_metric='logloss')

# Final model fitting and training with whole dataset
finalModel.fit(X_processed, Y)

# Preprocessing of validation data to get predictions
predictions = finalModel.predict(X_test_processed)

# Change predictions to boolean format
boolPredictions = predictions.astype(bool)

In [1126]:
# Save predictions in the format used for competition scoring
output = pd.DataFrame({'PassengerId': X_test_full.PassengerId,
                       'Transported': boolPredictions})
output.to_csv('submission.csv', index=False)