In [30]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [31]:
file_path = 'C:\\Users\\Asus\\Downloads\\Datasets\\SpaceshipTitanic\\train.csv'
train_data = pd.read_csv(file_path)

# Display the first few rows of the dataset
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [32]:
# Basic information about the dataset
info = train_data.info()

# Summary statistics for numerical columns
summary_statistics = train_data.describe()

# Checking for missing values in each column
missing_values = train_data.isnull().sum()

info, summary_statistics, missing_values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


(None,
                Age   RoomService     FoodCourt  ShoppingMall           Spa  \
 count  8514.000000   8512.000000   8510.000000   8485.000000   8510.000000   
 mean     28.827930    224.687617    458.077203    173.729169    311.138778   
 std      14.489021    666.717663   1611.489240    604.696458   1136.705535   
 min       0.000000      0.000000      0.000000      0.000000      0.000000   
 25%      19.000000      0.000000      0.000000      0.000000      0.000000   
 50%      27.000000      0.000000      0.000000      0.000000      0.000000   
 75%      38.000000     47.000000     76.000000     27.000000     59.000000   
 max      79.000000  14327.000000  29813.000000  23492.000000  22408.000000   
 
              VRDeck  
 count   8505.000000  
 mean     304.854791  
 std     1145.717189  
 min        0.000000  
 25%        0.000000  
 50%        0.000000  
 75%       46.000000  
 max    24133.000000  ,
 PassengerId       0
 HomePlanet      201
 CryoSleep       217
 Cabin   

In [33]:
# Handling missing values
# Imputers for categorical and numerical data
cat_imputer = SimpleImputer(strategy="most_frequent")
num_imputer = SimpleImputer(strategy="median")

# Columns to impute
cat_columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
num_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Applying imputation
train_data[cat_columns] = cat_imputer.fit_transform(train_data[cat_columns])
train_data[num_columns] = num_imputer.fit_transform(train_data[num_columns])


In [34]:
# Feature Engineering
# Splitting the 'Cabin' into 'Deck', 'Num', and 'Side'
train_data[['Deck', 'Num', 'Side']] = train_data['Cabin'].str.split('/', expand=True)

# Total Expenditure
train_data['TotalExpenditure'] = train_data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

# Group Size from 'PassengerId'
train_data['GroupSize'] = train_data['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)

# Age Categories
train_data['AgeCategory'] = pd.cut(train_data['Age'], bins=[0, 12, 60, 100], labels=['Child', 'Adult', 'Senior'])

In [35]:
# Preprocessing
# Convert boolean columns to numerical
bool_columns = ['CryoSleep', 'VIP', 'Transported']
train_data[bool_columns] = train_data[bool_columns].astype(int)

# One-hot encoding categorical variables
categorical_features = ['HomePlanet', 'Destination', 'Deck', 'Side', 'AgeCategory']
train_data = pd.get_dummies(train_data, columns=categorical_features)

# Checking the transformed dataset
train_data.head()

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S,AgeCategory_Child,AgeCategory_Adult,AgeCategory_Senior
0,0001_01,0,B/0/P,39.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,1,0
1,0002_01,0,F/0/S,24.0,0,109.0,9.0,25.0,549.0,44.0,...,0,0,1,0,0,0,1,0,1,0
2,0003_01,0,A/0/S,58.0,1,43.0,3576.0,0.0,6715.0,49.0,...,0,0,0,0,0,0,1,0,1,0
3,0003_02,0,A/0/S,33.0,0,0.0,1283.0,371.0,3329.0,193.0,...,0,0,0,0,0,0,1,0,1,0
4,0004_01,0,F/1/S,16.0,0,303.0,70.0,151.0,565.0,2.0,...,0,0,1,0,0,0,1,0,1,0


In [36]:
# Defining the feature matrix (X) and the target variable (y)
X = train_data.drop(['PassengerId', 'Cabin', 'Name', 'Num', 'Transported'], axis=1)
y = train_data['Transported']

# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
# Initialize the Random Forest Classifier
param_grid = {
   'n_estimators': 87,
    'max_depth': 7,
    'min_samples_split': 3,
    'min_samples_leaf': 6,
    'max_features': 'sqrt'
}

rf_classifier = RandomForestClassifier(**param_grid, random_state=42)


In [42]:
# Training the model
rf_classifier.fit(X_train, y_train)

# Predictions on the validation set
y_pred = rf_classifier.predict(X_val)

In [43]:
# Evaluating the model
accuracy = accuracy_score(y_val, y_pred)
classification_rep = classification_report(y_val, y_pred)

print(accuracy)
print(classification_rep)

0.7855089131684876
              precision    recall  f1-score   support

           0       0.79      0.77      0.78       861
           1       0.78      0.80      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [44]:
# Loading the test dataset
test_file_path = 'C:\\Users\\Asus\\Downloads\\Datasets\\SpaceshipTitanic\\test.csv'
test_data = pd.read_csv(test_file_path)

In [45]:
# Handling missing values
test_data[cat_columns] = cat_imputer.transform(test_data[cat_columns])
test_data[num_columns] = num_imputer.transform(test_data[num_columns])

In [46]:
# Feature Engineering
# Splitting the 'Cabin' into 'Deck', 'Num', and 'Side'
test_data[['Deck', 'Num', 'Side']] = test_data['Cabin'].str.split('/', expand=True)

# Total Expenditure
test_data['TotalExpenditure'] = test_data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

# Group Size from 'PassengerId'
test_data['GroupSize'] = test_data['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)

# Age Categories
test_data['AgeCategory'] = pd.cut(test_data['Age'], bins=[0, 12, 60, 100], labels=['Child', 'Adult', 'Senior'])


In [47]:
# Preprocessing

# Convert boolean columns to numerical (excluding 'Transported')
bool_columns_test = ['CryoSleep', 'VIP']
test_data[bool_columns_test] = test_data[bool_columns_test].astype(int)

# One-hot encoding categorical variables
test_data = pd.get_dummies(test_data, columns=categorical_features)

test_features = [col for col in X.columns if col != 'Transported']
test_data = test_data.reindex(columns=test_features, fill_value=0)

# Checking the transformed test dataset
test_data.head()


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalExpenditure,GroupSize,...,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S,AgeCategory_Child,AgeCategory_Adult,AgeCategory_Senior
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,0.0,13,...,0,0,0,1,0,0,1,0,1,0
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,2832.0,18,...,0,0,1,0,0,0,1,0,1,0
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,0.0,19,...,0,0,0,0,0,0,1,0,1,0
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,7418.0,21,...,0,0,0,0,0,0,1,0,1,0
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,645.0,23,...,0,0,1,0,0,0,1,0,1,0


In [48]:
# Making predictions on the test dataset
test_predictions = rf_classifier.predict(test_data)


In [49]:
# Preparing the submission file
submission = pd.DataFrame({
    'PassengerId': test_file_path,
    'Transported': test_predictions
})

# Converting 'Transported' back to boolean for submission
submission['Transported'] = submission['Transported'].astype(bool)
# Correcting the PassengerId in the submission file
submission['PassengerId'] = pd.read_csv(test_file_path)['PassengerId']


# Displaying the first few rows of the submission file
submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
