In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
import optuna
import numpy as np

# Read the training and test tables from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# Feature engineering
def feature_engineering(data):
    """Return a zero-filled copy of *data* with an added ``TotalSpent`` column.

    Args:
        data: DataFrame containing the spending columns ``RoomService``,
            ``FoodCourt``, ``ShoppingMall``, ``Spa`` and ``VRDeck``.

    Returns:
        A new DataFrame in which every missing value (in any column) is
        replaced by 0 and ``TotalSpent`` holds the row-wise sum of the five
        spending columns. The input frame is left unmodified.

    Raises:
        KeyError: If any of the spending columns is absent.
    """
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # fillna without inplace=True returns a new frame, so the caller's
    # DataFrame is not mutated as a side effect of calling this function.
    engineered = data.fillna(0)
    engineered['TotalSpent'] = engineered[spending_cols].sum(axis=1)
    return engineered

# Column groups consumed by the preprocessing transformers further down.
categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin', 'Name']
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpent']

# Apply the shared feature engineering to both tables, then cast every
# categorical column to str. NOTE(review): presumably the cast is there so
# zero-filled gaps and the original labels share one type for the encoder.
str_dtypes = dict.fromkeys(categorical_cols, str)
train_data = feature_engineering(train_data).astype(str_dtypes)
test_data = feature_engineering(test_data).astype(str_dtypes)

# Separate the predictors from the target; the target is cast to int.
X = train_data.drop(columns='Transported')
y = train_data['Transported'].astype(int)

# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Objective minimised by the Optuna study: validation log loss of a random forest.
def objective(trial):
    """Score one hyperparameter configuration proposed by ``trial``.

    Four random-forest hyperparameters are drawn with ``trial.suggest_int``.
    A preprocessing + classifier pipeline is then fitted on a fixed half of
    ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.

    Args:
        trial: Object exposing ``suggest_int(name, low, high)``.

    Returns:
        The validation log loss, or ``float('inf')`` when fitting or scoring
        raised an exception (the error message is printed).
    """
    # (name, low, high) per hyperparameter; sampled in this order.
    search_space = (
        ('n_estimators', 10, 100),
        ('max_depth', 1, 32),
        ('min_samples_split', 2, 10),
        ('min_samples_leaf', 1, 4),
    )
    params = {
        name: trial.suggest_int(name, low, high)
        for name, low, high in search_space
    }

    # Scale the numeric columns, one-hot encode the categorical ones, then
    # classify with a forest configured from the sampled parameters.
    pipeline = Pipeline([
        ('preprocessor', ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ])),
        ('classifier', RandomForestClassifier(random_state=42, **params)),
    ])

    # Fit on the same 50% of the training rows in every trial (fixed seed).
    # NOTE(review): no intermediate values are reported and nothing is pruned
    # here, so this only shrinks the fit set - presumably to keep trials cheap.
    halves = train_test_split(X_train, y_train, test_size=0.5, random_state=42)
    X_fit, y_fit = halves[0], halves[2]

    try:
        pipeline.fit(X_fit, y_fit)
        return log_loss(y_val, pipeline.predict_proba(X_val))
    except Exception as e:
        # Best-effort: a failing configuration is reported and scored as the
        # worst possible value instead of aborting the whole study.
        print(f"Error in trial: {e}")
        return float('inf')

# Search the hyperparameters with Optuna. The sampler is seeded so that, like
# the other random_state=42 components of this script, repeated runs select
# the same parameters. No pruner is passed and the objective reports no
# intermediate values, so every trial runs to completion.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# n_jobs=1 keeps the trials sequential, which is also what makes the seeded
# sampling order deterministic.
study.optimize(objective, n_trials=100, n_jobs=1)

# Get best parameters
best_params = study.best_params
print("Best parameters:", best_params)

# Rebuild the preprocessing + classifier pipeline with the best parameters
final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

final_classifier = RandomForestClassifier(**best_params, random_state=42)

final_pipeline = Pipeline([
    ('preprocessor', final_preprocessor),
    ('classifier', final_classifier)
])

# Fit the final model on all labelled rows (training and validation parts)
final_pipeline.fit(X, y)

# Predict the class label for every test row
test_predictions = final_pipeline.predict(test_data)

# Create submission DataFrame and convert predictions to boolean
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions.astype(bool)  # Convert 0/1 to False/True
})

# Save submission
submission.to_csv('sample_submission.csv', index=False)

[I 2024-11-11 15:58:29,694] A new study created in memory with name: no-name-a3d776c8-82d0-4e2e-8d37-0c56c4f40c3c
[I 2024-11-11 15:58:29,827] Trial 0 finished with value: 0.6249544895416176 and parameters: {'n_estimators': 23, 'max_depth': 12, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.6249544895416176.
[I 2024-11-11 15:58:29,943] Trial 1 finished with value: 0.6829597288763883 and parameters: {'n_estimators': 36, 'max_depth': 1, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.6249544895416176.
[I 2024-11-11 15:58:30,095] Trial 2 finished with value: 0.6790202406626689 and parameters: {'n_estimators': 62, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.6249544895416176.
[I 2024-11-11 15:58:30,216] Trial 3 finished with value: 0.6193225863821219 and parameters: {'n_estimators': 24, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.61