In [1104]:
    # Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [1105]:
# Read the training and test CSV files from the working directory
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Show the training frame as this cell's output
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [1106]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [1107]:
# Data exploration: gather the values later used to fill missing entries

# Most frequent destination in the training data
destination_common_value = train_data['Destination'].mode().iloc[0]


def random_true_false():
    """Return True or False chosen by NumPy's global random state.

    No seed is set in the visible cells, so the result can differ between runs.
    """
    return np.random.choice([True, False])


# 75th percentile of each spending column, computed on the training data
columns_for_percentile = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
spending_columns = train_data[columns_for_percentile]
percentile = spending_columns.quantile(q=0.75)
print(percentile)


RoomService     47.0
FoodCourt       76.0
ShoppingMall    27.0
Spa             59.0
VRDeck          46.0
Name: 0.75, dtype: float64


In [1108]:
# Drop unnecessary columns.
# This cell rebinds train_data/test_data, so on a second run 'Name' is already
# gone; errors='ignore' keeps the cell re-runnable instead of raising KeyError.
train_data = train_data.drop(columns=['Name'], errors='ignore')
test_data = test_data.drop(columns=['Name'], errors='ignore')


In [1109]:
# Replacement value for each column's missing entries. Every value is derived
# from the training data (or is a fixed constant), so the same mapping can be
# applied to both the train and test frames and gives the same result on every run.
fill_values = {
    'HomePlanet' : 'Unknown',
    # Most frequent training value. A single random draw here would assign one
    # arbitrary value to every missing row and change between runs.
    'CryoSleep' : train_data['CryoSleep'].mode()[0],
    'Cabin' : 'Unknown',
    'Destination' : destination_common_value,
    'Age' : train_data['Age'].mean(),
    'VIP' : False,
    # Spending columns use the 75th percentile computed from the training data
    'RoomService' : percentile['RoomService'],
    'FoodCourt' : percentile['FoodCourt'],
    'ShoppingMall' : percentile['ShoppingMall'],
    'Spa' : percentile['Spa'],
    'VRDeck' : percentile['VRDeck'],
}

In [1110]:
# Fill missing values using the per-column replacements in fill_values.
# pandas is deprecating the silent downcast of object columns inside fillna;
# calling infer_objects() makes that conversion explicit, so the boolean
# columns get the same dtype on current and future pandas versions.
# (On versions that still downcast silently, the deprecation warning is still shown.)
train_data_filled = train_data.fillna(value=fill_values).infer_objects()
test_data_filled = test_data.fillna(value=fill_values).infer_objects()

  train_data_filled = train_data.fillna(value=fill_values)
  test_data_filled = test_data.fillna(value=fill_values)


In [1111]:
# One-hot encode the categorical columns as 0/1 integers, then discard the
# PassengerId column from both frames
categorical_columns = ['HomePlanet', 'Destination']

train_data_filled_encoded = (
    pd.get_dummies(train_data_filled, columns=categorical_columns, dtype=int)
    .drop(columns=['PassengerId'])
)
test_data_filled_encoded = (
    pd.get_dummies(test_data_filled, columns=categorical_columns, dtype=int)
    .drop(columns=['PassengerId'])
)

In [1112]:
# Boolean columns that are converted to integers
columns_for_boolean = ['CryoSleep', 'VIP']

# Build one column -> dtype mapping and apply it to both frames
boolean_to_int = {name: int for name in columns_for_boolean}
train_data_filled_encoded = train_data_filled_encoded.astype(boolean_to_int)
test_data_filled_encoded = test_data_filled_encoded.astype(boolean_to_int)


In [1113]:
def _cabin_to_group(frame):
    """Add a CabinGroup column and return the frame without Cabin.

    CabinGroup holds the first character of each Cabin string. The column is
    added to the frame that is passed in; the returned frame is the result of
    dropping the original Cabin column.
    """
    frame['CabinGroup'] = frame['Cabin'].str[0]
    return frame.drop(columns=['Cabin'])


# Apply the same Cabin -> CabinGroup conversion to the train and test frames
train_data_filled_encoded = _cabin_to_group(train_data_filled_encoded)
test_data_filled_encoded = _cabin_to_group(test_data_filled_encoded)

In [1114]:
# Replace CabinGroup with 0/1 integer indicator columns in both frames
cabin_dummy_options = {'columns': ['CabinGroup'], 'dtype': int}
train_data_filled_encoded = pd.get_dummies(train_data_filled_encoded, **cabin_dummy_options)
test_data_filled_encoded = pd.get_dummies(test_data_filled_encoded, **cabin_dummy_options)

In [1115]:
# Standardize the features
scaler = StandardScaler()

# Separate features and target for training data
train_features = train_data_filled_encoded.drop(columns=['Transported'])
train_target = train_data_filled_encoded['Transported']

# Fit the scaler on the training features and scale them in the same step
train_features_scaled = scaler.fit_transform(train_features)

# Convert the scaled features back to a DataFrame and add the target variable.
# The scaled frame gets a fresh 0..n-1 index, so the target index is reset to match.
train_data_filled_encoded_scaled = pd.DataFrame(train_features_scaled, columns=train_features.columns)
train_data_filled_encoded_scaled['Transported'] = train_target.reset_index(drop=True)


# The train and test frames were one-hot encoded separately, so a category that
# appears in only one of them yields a different set (or order) of columns.
# Align the test frame to the training feature columns before scaling:
# indicator columns missing from test are filled with 0, and columns the scaler
# was not fitted on are dropped.
test_features = test_data_filled_encoded.reindex(columns=train_features.columns, fill_value=0)

# Transform the test features using the same scaler fitted on the training data
test_features_scaled = scaler.transform(test_features)
test_data_filled_encoded_scaled = pd.DataFrame(test_features_scaled, columns=test_features.columns)


In [1116]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [1117]:
# Split the scaled training frame into the target column and the remaining features
target_column = 'Transported'
y = train_data_filled_encoded_scaled[target_column]  # Target
X = train_data_filled_encoded_scaled.drop(columns=[target_column])  # Features

In [1118]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
split_options = {'test_size': 0.2, 'random_state': 2}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [1119]:
# Configure the gradient boosting classifier, then fit it on the training split
boosting_params = {
    'n_estimators': 400,
    'learning_rate': 0.1,
    'random_state': 2,
}
model = GradientBoostingClassifier(**boosting_params)
model.fit(X_train, y_train)

In [1120]:
# Make predictions on test set
# (X_test is the hold-out split produced by train_test_split, not the rows loaded from test.csv)
y_pred = model.predict(X_test)

In [1121]:
# Score the hold-out predictions: overall accuracy first, then the per-class report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'accuracy {accuracy:.2f}')
print(report)

accuracy 0.80
              precision    recall  f1-score   support

       False       0.80      0.79      0.79       828
        True       0.81      0.82      0.81       911

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739

