# *importing important libraries*

In [140]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# *reading the training and testing dataset*

In [141]:
train_df = pd.read_csv("/kaggle/input/jc-dev-comm-recruitment-task/train.csv")
test_df = pd.read_csv("/kaggle/input/jc-dev-comm-recruitment-task/test.csv")

In [142]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [143]:
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


# *applying cleaning and feature engineering to the data*

In [144]:
# adding placeholder Transported column in test for mergin
test_df['Transported'] = None
train_df['source'] = 'train'
test_df['source'] = 'test'

# combining both train and test datasets
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# extracting Deck / Cabin_num / Side from Cabin
combined_df[['Deck', 'Cabin_num', 'Side']] = combined_df['Cabin'].str.split('/', expand=True)

# dropping the Cabin, Name, PassengerId columns
combined_df.drop(['Cabin', 'Name', 'PassengerId'], axis=1, inplace=True)

# *filling missing values*

In [145]:
# Fill missing values in extracted columns
combined_df['Deck'] = combined_df['Deck'].fillna('Unknown')
combined_df['Side'] = combined_df['Side'].fillna('Unknown')
combined_df['Cabin_num'] = pd.to_numeric(combined_df['Cabin_num'], errors='coerce').fillna(-1)

# Add Smart Features
combined_df['IsTopDeck'] = combined_df['Deck'].apply(lambda x: 1 if x in ['A', 'B'] else 0)
combined_df['IsBackCabin'] = combined_df['Cabin_num'].apply(lambda x: 1 if x > 1500 else 0)

# Deck frequency
deck_counts = combined_df['Deck'].value_counts().to_dict()
combined_df['DeckFrequency'] = combined_df['Deck'].map(deck_counts)

In [146]:
for col in combined_df.select_dtypes(include='object').columns:
    combined_df[col] = combined_df[col].fillna('Unknown')

for col in combined_df.select_dtypes(include='number').columns:
    combined_df[col] = combined_df[col].fillna(0)

# Convert all object and boolean columns to string
for col in combined_df.columns:
    if combined_df[col].dtype == 'object' or combined_df[col].dtype == 'bool':
        combined_df[col] = combined_df[col].astype(str)

# Label Encoding for all categorical features
for col in combined_df.columns:
    if col != 'source':
        le = LabelEncoder()
        combined_df[col] = le.fit_transform(combined_df[col])

In [147]:
# Split back to train and test
train_processed = combined_df[combined_df['source'] == 'train'].drop('source', axis=1)
test_processed = combined_df[combined_df['source'] == 'test'].drop(['source', 'Transported'], axis=1)

# *assigning features & label and splitting data into training & testing*

In [148]:
# Features and label
X = train_processed.drop('Transported', axis=1)
y = train_processed['Transported'].astype(int)

# Split into training and validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# *fitting the model (XGBoost Classifier)*

In [149]:
# XGBoost model
xgb_model = XGBClassifier(n_estimators=50, random_state=1)
xgb_model.fit(X_train, y_train)

# Predict and evaluate
xgb_pred = xgb_model.predict(X_val)
print("XGBoost Validation Accuracy:", accuracy_score(y_val, xgb_pred))

XGBoost Validation Accuracy: 0.828735632183908


# *final prediction on test set*

In [150]:
# Final prediction on test set
final_preds = xgb_model.predict(test_processed)

In [151]:
submission = pd.read_csv("/kaggle/input/jc-dev-comm-recruitment-task/sample_submission.csv")
submission['Transported'] = final_preds.astype(bool)
submission.to_csv("submission.csv", index=False)
print("XGBoost submission.csv generated successfully.")

XGBoost submission.csv generated successfully.
