# In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Load the data
# Read the spaceship_titanic CSV straight from its raw GitHub URL into a DataFrame.
spaceship = pd.read_csv(
    filepath_or_buffer=(
        "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/"
        "spaceship_titanic.csv"
    )
)
# Peek at the first rows (a bare expression: only shown in an interactive session).
spaceship.head()

# Step 2: Handle missing data
# A single SimpleImputer is reused for every column: each fit_transform call
# refits it on that one column and fills the gaps with the column's most
# frequent value.
imputer = SimpleImputer(strategy='most_frequent')

# Columns to impute, processed one at a time in this order.
columns_to_impute = (
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
)
for column in columns_to_impute:
    spaceship[column] = imputer.fit_transform(spaceship[[column]])

# Step 3: Convert categorical columns to numerical (encoding)
# get_dummies without `columns=` one-hot encodes EVERY object column. That
# would expand free-text columns such as 'PassengerId' and 'Name' into one
# indicator per distinct value and remove the originals, so the later
# drop(['PassengerId', 'Name', 'Transported']) would raise a KeyError.
# Only the genuinely categorical features are encoded here; the identifier
# columns are left untouched for the train/test step to drop.
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

if 'Cabin' in spaceship.columns:
    # NOTE(review): assumes Cabin values look like "deck/num/side" — confirm
    # against the dataset description. Only the first and last parts are kept;
    # the raw string is far too high-cardinality to one-hot encode.
    cabin_parts = spaceship['Cabin'].str.split('/')
    # Missing cabins get an explicit 'Unknown' level so they are not silently
    # merged into the category removed by drop_first.
    spaceship['Deck'] = cabin_parts.str.get(0).fillna('Unknown')
    spaceship['Side'] = cabin_parts.str.get(-1).fillna('Unknown')
    spaceship = spaceship.drop(columns='Cabin')
    categorical_columns += ['Deck', 'Side']

# drop_first=True removes one level per feature to avoid redundant indicators.
spaceship = pd.get_dummies(spaceship, columns=categorical_columns, drop_first=True)

# Step 4: Feature scaling
# Standardise the listed numeric columns in place with StandardScaler
# (learn the per-column statistics first, then apply them to the same data).
scaled_columns = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
scaler = StandardScaler()
scaler.fit(spaceship[scaled_columns])
spaceship[scaled_columns] = scaler.transform(spaceship[scaled_columns])

# Check the data after preprocessing
spaceship.head()

# Step 5: Train-test split
# Features: every column except the two identifier-like columns and the target.
X = spaceship.drop(columns=['PassengerId', 'Name', 'Transported'])
# Target variable
y = spaceship['Transported']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Check the shape of the resulting datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Bagging (using DecisionTreeClassifier as base estimator)
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so either keyword raises a TypeError on some versions. The
# first positional argument is the base learner on all of them.
bagging_model = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(X_train, y_train)

# Predict on the held-out split and report the fraction of correct predictions
y_pred_bagging = bagging_model.predict(X_test)
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
print(f"Bagging Model Accuracy: {accuracy_bagging:.4f}")

# Random Forest Classifier
# 100 trees with a fixed seed; fit() returns the estimator itself, so
# construction and training are chained into one statement.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out split: share of test rows predicted correctly
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf:.4f}")

# Gradient Boosting Classifier
# 100 boosting stages, seeded so repeated runs give the same model.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
)
gb_model.fit(X=X_train, y=y_train)

# Predict on the test features, then compare against the true labels
y_pred_gb = gb_model.predict(X_test)
accuracy_gb = accuracy_score(y_true=y_test, y_pred=y_pred_gb)
print(f"Gradient Boosting Model Accuracy: {accuracy_gb:.4f}")

# AdaBoost Classifier
# Up to 100 boosted estimators with a fixed seed; trained in the same
# statement because fit() hands back the fitted estimator.
ab_model = AdaBoostClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Evaluate on the held-out rows and report accuracy to four decimals
y_pred_ab = ab_model.predict(X_test)
accuracy_ab = accuracy_score(y_true=y_test, y_pred=y_pred_ab)
print(f"AdaBoost Model Accuracy: {accuracy_ab:.4f}")

# Comparing the results
# Map each model's display name to its test-set accuracy (insertion order is
# the order in which the models were trained above).
results = dict(
    zip(
        ('Bagging', 'Random Forest', 'Gradient Boosting', 'AdaBoost'),
        (accuracy_bagging, accuracy_rf, accuracy_gb, accuracy_ab),
    )
)

# Identify the best performing model: the first entry with the highest accuracy
best_model, best_accuracy = max(results.items(), key=lambda entry: entry[1])

# Display the comparison
print("\nModel Comparison:")
for model, accuracy in results.items():
    print(f"{model}: {accuracy:.4f}")

print(f"\nThe best model is: {best_model} with accuracy {best_accuracy:.4f}")
