âœ… 1. Set Up & Load Dataset

In [14]:
import pandas as pd

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)

train.head()

Train shape: (8693, 14)
Test shape: (4277, 13)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [27]:
df.colums

AttributeError: 'DataFrame' object has no attribute 'colums'

âœ… STEP 2 â€” Handle Missing Values

In [16]:
# Numeric columns
num_cols = ["Age","RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns
cat_cols = ["HomePlanet","CryoSleep","Cabin","Destination","VIP"]

for col in cat_cols:
    df[col] = df[col].fillna("Unknown")

### âœ… STEP 3 â€” Feature Engineering 

ðŸ”¹ 1. Total Spending Feature

In [17]:
df["TotalSpending"] = (
    df["RoomService"] +
    df["FoodCourt"] +
    df["ShoppingMall"] +
    df["Spa"] +
    df["VRDeck"]
)

ðŸ”¹ 2. Cabin Split

In [18]:
df[["Deck","CabinNum","Side"]] = df["Cabin"].str.split("/", expand=True)

df.drop("Cabin", axis=1, inplace=True)

ðŸ”¹ 3. Passenger Group Feature

In [19]:
df["Group"] = df["PassengerId"].str.split("_").str[0]

ðŸ”¹ 4. Drop Unnecessary Columns

In [20]:
df.drop(["PassengerId","Name"], axis=1, inplace=True)

### âœ… STEP 4 â€” Encode Categorical Variables

In [21]:
df = pd.get_dummies(df, drop_first=True)

### âœ… STEP 5 â€” Split Back Train & Test

In [23]:
# Combine train and test safely
full = pd.concat([train, test], axis=0, ignore_index=True)

In [24]:
# Separate target before encoding
y = full["Transported"]

# Drop target temporarily
full_features = full.drop("Transported", axis=1)

# Encode only features
full_features = pd.get_dummies(full_features, drop_first=True)

# Add target back
full_features["Transported"] = y

In [25]:
train_df = full_features[full_features["Transported"].notnull()]
test_df = full_features[full_features["Transported"].isnull()].drop("Transported", axis=1)

X = train_df.drop("Transported", axis=1)
y = train_df["Transported"].astype(int)

### âœ… STEP 6 â€” Train Model

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

pred = model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, pred))

Validation Accuracy: 0.7855089131684876


### âœ… STEP 7 â€” Predict Test Set

In [28]:
test_pred = model.predict(test_df)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": test_pred.astype(bool)
})

submission.to_csv("submission.csv", index=False)