In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the dataset: `train` is the working frame for feature engineering,
# `test` is the frame that is scored for the submission file.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Keep an untouched copy of the training data. Later cells read the imputation
# values (mode / median) for the test set from it. Copying the freshly loaded
# frame avoids parsing train.csv a second time.
train_raw = train.copy()

# View the first few rows
print(train.head())

# Check missing values and data types.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
train.info()

# Summary statistics of the numeric columns
print(train.describe())


  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
<c

In [3]:
# Build the model-ready training frame.
# Start from the untouched `train_raw` copy instead of from `train` itself so the
# cell is re-runnable: when it consumed its own output, a second run failed with a
# KeyError because 'Cabin' and the categorical columns had already been replaced.
# DataFrame.drop returns a new frame, so `train_raw` is never modified here.
train = train_raw.drop(columns=['Name'], errors='ignore')

# Fill missing categorical values with the most frequent value of each column
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
    train[col] = train[col].fillna(train[col].mode()[0])

# Fill missing numerical values: median for Age, 0 for the spending columns
train['Age'] = train['Age'].fillna(train['Age'].median())
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train[spending_cols] = train[spending_cols].fillna(0)

# Handle Cabin column: split "deck/num/side" into three features.
# Missing cabins get the placeholder 'Unknown/0/U' so the split always yields three parts.
train['Cabin'] = train['Cabin'].fillna('Unknown/0/U')
train[['Deck', 'CabinNum', 'Side']] = train['Cabin'].str.split('/', expand=True)
train['CabinNum'] = pd.to_numeric(train['CabinNum'], errors='coerce')
train = train.drop(columns=['Cabin'], errors='ignore')

# Total spending feature (row-wise sum of the five spending columns)
train['TotalSpending'] = train[spending_cols].sum(axis=1)

# Convert booleans: normalise via their lowercase string form so that both
# bool values and 'True'/'False' strings map to real booleans
for col in ['CryoSleep', 'VIP']:
    train[col] = train[col].astype(str).str.lower().map({'true': True, 'false': False})

# One-hot encoding for categorical features (first level of each column is the dropped baseline)
train = pd.get_dummies(train, columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side'], drop_first=True)

# Split features and label; PassengerId is an identifier, not a feature
y = train['Transported']
X = train.drop(columns=['Transported', 'PassengerId'])


  train[col] = train[col].fillna(train[col].mode()[0])


In [4]:
# Hold out 20% of the rows for validation; random_state pins the shuffle
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Random forest with 100 trees and a fixed random_state, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_model


In [6]:
# Predict on the held-out validation split and report accuracy plus per-class metrics
y_pred = rf_model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)


Accuracy: 0.7964347326049454

Classification Report:
               precision    recall  f1-score   support

       False       0.78      0.82      0.80       861
        True       0.81      0.77      0.79       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



In [7]:
# Prepare the test set with the same steps that were applied to the training set.
# Work on a copy so that `test` keeps its raw columns (PassengerId is read from it
# when the submission is built) and so this cell can be re-run without KeyErrors.
test_prepared = test.copy()

# Fill categorical using train_raw (imputation values come from the training data only)
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
    test_prepared[col] = test_prepared[col].fillna(train_raw[col].mode()[0])

# Fill numeric values: training median for Age, 0 for the spending columns
test_prepared['Age'] = test_prepared['Age'].fillna(train_raw['Age'].median())
test_prepared[spending_cols] = test_prepared[spending_cols].fillna(0)

# Handle Cabin: same placeholder and "deck/num/side" split as for the training set
test_prepared['Cabin'] = test_prepared['Cabin'].fillna('Unknown/0/U')
test_prepared[['Deck', 'CabinNum', 'Side']] = test_prepared['Cabin'].str.split('/', expand=True)
test_prepared['CabinNum'] = pd.to_numeric(test_prepared['CabinNum'], errors='coerce')
test_prepared = test_prepared.drop(columns=['Cabin', 'Name'], errors='ignore')

# TotalSpending
test_prepared['TotalSpending'] = test_prepared[spending_cols].sum(axis=1)

# Convert booleans
for col in ['CryoSleep', 'VIP']:
    test_prepared[col] = test_prepared[col].astype(str).str.lower().map({'true': True, 'false': False})

# One-hot encode WITHOUT drop_first. drop_first removes the first level present in
# *this* frame; if the test data lacked the level that was the baseline in training,
# a different level would be dropped and its rows would be encoded as the training
# baseline after the reindex below. Keeping every level and letting the reindex
# select the training columns yields the training encoding regardless of which
# levels occur in the test data.
test_prepared = pd.get_dummies(
    test_prepared,
    columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side'],
)

# Align columns with training set: keep exactly X's columns in X's order,
# discard anything else (baseline / unseen levels) and zero-fill columns absent here
X_test = test_prepared.drop(columns=['PassengerId'])
X_test = X_test.reindex(columns=X.columns, fill_value=0)


  test[col] = test[col].fillna(train_raw[col].mode()[0])


In [8]:
# Score the aligned test features with the fitted forest
test_predictions = rf_model.predict(X_test)

# Assemble the submission (passenger id + boolean prediction) and write it without the index
passenger_ids = test['PassengerId']
transported = test_predictions.astype(bool)
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': transported})
submission.to_csv("submission.csv", index=False)


In [9]:
import pickle

# Serialize the fitted random forest to disk
with open("spaceship_model.pkl", mode="wb") as model_file:
    pickle.dump(rf_model, model_file)


In [10]:
# Persist the feature column index as it looks after one-hot encoding
columns = X.columns
with open("spaceship_columns.pkl", mode="wb") as columns_file:
    pickle.dump(columns, columns_file)


In [11]:
# One-hot encoding in this notebook is done with pd.get_dummies, so there is no
# fitted encoder object to save; referencing an undefined `encoder` raised a
# NameError and stopped a Restart & Run All at this cell.
# Persist the encoding layout instead: for every one-hot encoded source column,
# the dummy columns that are present in the model's feature matrix X.
encoded_source_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
encoder = {
    col: [name for name in X.columns if name.startswith(f"{col}_")]
    for col in encoded_source_cols
}

with open("spaceship_encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)


In [14]:
from sklearn.preprocessing import StandardScaler
import pickle

# Numeric columns that the scaler is fitted on
numeric_features = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'TotalSpending',
]


In [15]:
# Standardize the numeric columns on a copy of the encoded training frame,
# leaving `train` itself unchanged.
# NOTE(review): rf_model was fitted on the unscaled X; confirm whether this
# scaler is meant to be applied before that model at inference time.
train_scaled = train.copy()
scaler = StandardScaler().fit(train_scaled[numeric_features])
train_scaled[numeric_features] = scaler.transform(train_scaled[numeric_features])


In [16]:
# Serialize the fitted StandardScaler to disk
with open("spaceship_scaler.pkl", mode="wb") as scaler_file:
    pickle.dump(scaler, scaler_file)


In [17]:
# Write the feature columns of the scaled frame (label and identifier removed).
# This targets the same file name, spaceship_columns.pkl, that an earlier cell
# writes from X.columns, so the earlier file content is replaced.
scaled_feature_columns = train_scaled.drop(columns=['Transported', 'PassengerId']).columns
with open("spaceship_columns.pkl", mode="wb") as columns_file:
    pickle.dump(scaled_feature_columns, columns_file)
