# In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Read the Spaceship Titanic CSV straight from the course data repository.
DATA_URL = (
    "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv"
)
spaceship = pd.read_csv(DATA_URL)
# Preview of the first rows (only rendered when run as a notebook cell's last expression).
spaceship.head()

# Step 2: Handle missing data
# Each listed column is filled with its own most frequent value. The imputer is
# refit one column at a time so that numeric columns keep a numeric dtype
# instead of being coerced to object alongside the text columns.
imputer = SimpleImputer(strategy='most_frequent')
imputed_columns = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Age',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
]
for column in imputed_columns:
    # fit_transform returns an (n_samples, 1) array; flatten it for the
    # single-column assignment.
    spaceship[column] = imputer.fit_transform(spaceship[[column]]).ravel()

# Step 3: Convert categorical columns to numerical (encoding)
# Remove the identifier columns first: get_dummies expands every object/string
# column into indicator columns, so text identifiers would otherwise become one
# column per unique value and would no longer exist under their original names
# when the feature matrix is built below (the later drop would raise KeyError).
spaceship = spaceship.drop(columns=['PassengerId', 'Name'])
# NOTE(review): any other text column still present (e.g. a cabin identifier,
# if this dataset has one) is one-hot encoded in full here - confirm its
# cardinality is acceptable or engineer/drop it before encoding.
spaceship = pd.get_dummies(spaceship, drop_first=True)

# Step 4: Feature scaling
# NOTE(review): the imputer above and this scaler are fit on the full dataset
# before the train/test split, so test-set statistics influence preprocessing.
# Consider fitting them on the training split only.
scaler = StandardScaler()
scaled_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
spaceship[scaled_columns] = scaler.fit_transform(spaceship[scaled_columns])

# Step 5: Train-test split
# The identifiers were already removed above, so only the target is dropped.
X = spaceship.drop(columns=['Transported'])  # Features
y = spaceship['Transported']  # Target variable

# Split the data into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: an untuned 100-tree random forest, seeded for reproducibility.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the baseline on the held-out 20% split.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Model Accuracy (Base): {accuracy_rf:.4f}")

# Step 6: Hyperparameter Tuning using GridSearchCV
# Define the hyperparameters to fine-tune
param_grid = {
    'n_estimators': [50, 100, 150, 200],  # Number of trees
    'max_depth': [10, 20, 30, None],  # Maximum depth of trees (None = unlimited)
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    # The number of features to consider for the best split.
    # 'auto' is intentionally not listed: for classifiers it was an alias of
    # 'sqrt', was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # makes every candidate using it fail to fit.
    'max_features': ['sqrt', 'log2'],
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy',  # Evaluate based on accuracy
                           n_jobs=-1,  # Use all processors for parallel computation
                           verbose=1)  # Show progress

# Cross-validate every parameter combination on the training split.
grid_search.fit(X_train, y_train)

# Pull out the winning estimator and the parameter combination behind it.
best_rf_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Step 7: Evaluate the best model on the held-out split
y_pred_best_rf = best_rf_model.predict(X_test)
accuracy_best_rf = accuracy_score(y_true=y_test, y_pred=y_pred_best_rf)

# Report the chosen hyperparameters and the tuned accuracy
print(f"\nBest Hyperparameters: {best_params}")
print(f"Random Forest Model Accuracy (Tuned): {accuracy_best_rf:.4f}")
