In [3]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Input location. The directory is spelled out once so that relocating the
# project only requires editing DATA_DIR.
DATA_DIR = "C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Process"

train = pd.read_csv(DATA_DIR + "\\CombinedTrain.csv")
test = pd.read_csv(DATA_DIR + "\\CombinedTest.csv")

# Separate ID before training
train_passenger_ids = train['PassengerId']
test_passenger_ids = test['PassengerId']

# Prepare data: features are every column except the identifier (and, for
# the training set, the target).
X_train = train.drop(columns=['Transported', 'PassengerId'])
y_train = train['Transported'].astype('int')  # Convert True/False to 1/0
X_test = test.drop(columns='PassengerId')

# Fail fast if the two files do not expose the same feature columns, so a
# schema mismatch surfaces here rather than after a long model fit.
_missing_in_test = X_train.columns.difference(X_test.columns)
_unexpected_in_test = X_test.columns.difference(X_train.columns)
if len(_missing_in_test) or len(_unexpected_in_test):
    raise ValueError(
        "Train/test feature mismatch: "
        f"missing from test={list(_missing_in_test)}, "
        f"unexpected in test={list(_unexpected_in_test)}"
    )

# Same columns on both sides: put the test features in the training order.
X_test = X_test[X_train.columns]


In [4]:

# Base estimator whose hyperparameters are tuned below.
xgb_model = xgb.XGBClassifier(random_state=1)

# Candidate values per hyperparameter; the search samples combinations
# from these lists.
param_grid = dict(
    # Number of gradient boosted trees (i.e. boosting rounds).
    n_estimators=[100, 200, 300],
    # Step size shrinkage used to prevent overfitting. Range is [0,1].
    learning_rate=[0.01, 0.05, 0.1],
    # Maximum tree depth; larger values make the model more complex and
    # more likely to overfit.
    max_depth=[3, 6, 9],
    # Minimum sum of instance weight (hessian) needed in a child.
    min_child_weight=[1, 3, 5],
    # Minimum loss reduction required to make a further partition on a
    # leaf node of the tree.
    gamma=[0, 0.1, 0.2],
    # Subsample ratio of the training instances.
    subsample=[0.7, 0.8, 0.9],
    # Subsample ratio of columns when constructing each tree.
    colsample_bytree=[0.7, 0.8, 0.9],
    # L1 regularization term on weights; larger is more conservative.
    reg_alpha=[0, 0.01, 0.1],
    # L2 regularization term on weights; larger is more conservative.
    reg_lambda=[1, 1.1, 1.2],
)

# Randomized search: 200 sampled candidates, 5-fold CV, scored by accuracy.
random_search_cv = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=200,
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=1,
)
random_search_cv.fit(X_train, y_train)

# Keep the winning combination for the final model and report it.
best_params_cv = random_search_cv.best_params_
print("Best Parameters CV: ", best_params_cv)


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best Parameters CV:  {'subsample': 0.7, 'reg_lambda': 1.1, 'reg_alpha': 0.1, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}


In [5]:
# Train the final classifier on the full training set using the
# hyperparameters selected by the search.
model = xgb.XGBClassifier(random_state=1, **best_params_cv)
model.fit(X_train, y_train)

# Predicted class for every test row.
y_test_pred = model.predict(X_test)

# Assemble the submission: one row per passenger, with the prediction
# converted to boolean.
test_passenger_ids = test['PassengerId']
test_predictions = pd.DataFrame(
    {
        'PassengerId': test_passenger_ids,
        'Transported': y_test_pred,
    }
).astype({'Transported': bool})

# Write the submission file (no index column).
test_predictions.to_csv(
    "C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Test\\Submission - XGBoost.csv",
    index=False,
)

In [6]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Standalone cell: rebuilds the submission from the CSV files and a fixed set
# of hyperparameters, without relying on variables from earlier cells.

# File locations, defined once so that relocating the project only requires
# editing STACKING_DIR.
STACKING_DIR = "C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking"
DATA_DIR = STACKING_DIR + "\\Process"
SUBMISSION_CSV = STACKING_DIR + "\\Test\\Submission - XGBoost.csv"

train = pd.read_csv(DATA_DIR + "\\CombinedTrain.csv")
test = pd.read_csv(DATA_DIR + "\\CombinedTest.csv")

# Separate ID before training
train_passenger_ids = train['PassengerId']
test_passenger_ids = test['PassengerId']

# Prepare data: features are every column except the identifier (and, for
# the training set, the target).
X_train = train.drop(columns=['Transported', 'PassengerId'])
y_train = train['Transported'].astype('int')  # Convert True/False to 1/0
X_test = test.drop(columns='PassengerId')

# Fail fast with a readable message if the two files do not expose the same
# feature columns, then put the test features in the training order.
_missing_in_test = X_train.columns.difference(X_test.columns)
_unexpected_in_test = X_test.columns.difference(X_train.columns)
if len(_missing_in_test) or len(_unexpected_in_test):
    raise ValueError(
        "Train/test feature mismatch: "
        f"missing from test={list(_missing_in_test)}, "
        f"unexpected in test={list(_unexpected_in_test)}"
    )
X_test = X_test[X_train.columns]

# Hyperparameters copied from the "Best Parameters CV" output printed by the
# randomized-search cell of this notebook.
best_params_cv = {'subsample': 0.7, 'reg_lambda': 1.1, 'reg_alpha': 0.1, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}

# Create the XGBoost model with best parameters
model = xgb.XGBClassifier(**best_params_cv, random_state=1)
model.fit(X_train, y_train)

# Make predictions
y_test_pred = model.predict(X_test)

# Prepare submission: one row per passenger, prediction stored as boolean.
test_predictions = pd.DataFrame({
    'PassengerId': test_passenger_ids,
    'Transported': y_test_pred.astype(bool),
})

# Export to CSV
test_predictions.to_csv(SUBMISSION_CSV, index=False)