1. Data Exploration and Preprocessing
2. Feature Engineering
3. Model Selection and Training
4. Model Evaluation
5. Prediction on Test Data
6. Submission Preparation

### 1. Data Exploration and Preprocessing

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

# Load the training and test splits from the current working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Summarise the training frame.  DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
train_data.info()
print(train_data.describe())

# Per-column count of missing values.
print(train_data.isnull().sum())

# Bar chart of how many rows fall into each value of the target column.
plt.figure(figsize=(8, 6))
sns.countplot(x='Transported', data=train_data)
plt.title('Distribution of Transported')
plt.show()

# Pull out the target column, then build the feature frame by dropping the
# PassengerId and Name columns together with the target itself.
y = train_data['Transported']
X = train_data.drop(columns=['PassengerId', 'Name', 'Transported'])

# Column groups handled by the preprocessor.  Columns of X that appear in
# neither list are discarded, because ColumnTransformer drops the remainder
# by default.
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']


def _as_strings(values):
    """Return *values* as an object array whose entries are all ``str``."""
    return np.asarray(values, dtype=object).astype(str).astype(object)


# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: replace gaps with the literal 'missing', cast every
# value to str, then one-hot encode.
# The cast matters when a flag column (e.g. CryoSleep or VIP) was read as
# True/False with gaps: after imputation it would hold both bool and str
# values, and OneHotEncoder rejects a column of mixed types with a TypeError.
# Doing the cast inside the pipeline means the same normalisation is applied
# to any data later passed through `preprocessor.transform`.
# NOTE: feature_names_out='one-to-one' keeps get_feature_names_out() usable;
# it requires scikit-learn >= 1.1.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('to_str', FunctionTransformer(_as_strings, feature_names_out='one-to-one')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Learn the imputation, scaling and encoding parameters from the training
# features and produce the transformed training matrix in one pass.
X_processed = preprocessor.fit_transform(X)

print("Preprocessing completed.")