In [187]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Load the dataset

In [156]:
# Read the training and test splits (in that order) from the working directory.
train_data, test_data = map(pd.read_csv, ('train.csv', 'test.csv'))

# Show the training frame as the cell output.
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [157]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training split. The `count` row differs from column to column
# in the output below, which shows that these columns contain missing values.
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [158]:
# Most frequent destination in the training split; used further down as the
# replacement for missing 'Destination' values.
destination_common_value = train_data['Destination'].mode()[0]

# Dedicated, seeded generator so that any random imputation gives the same
# result on every Restart & Run All instead of depending on global RNG state.
_imputation_rng = np.random.default_rng(42)


def random_true_false():
    """Return True or False with equal probability, drawn from the seeded generator."""
    return _imputation_rng.choice([True, False])


# 75th percentile of each spending column, computed on the training split only.
columns_for_percentile = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
percentile = train_data[columns_for_percentile].quantile(0.75)
print(percentile)


RoomService     47.0
FoodCourt       76.0
ShoppingMall    27.0
Spa             59.0
VRDeck          46.0
Name: 0.75, dtype: float64


In [159]:
# 'Name' is not referenced by any of the visible preprocessing steps, so remove
# it from both splits.
# errors='ignore' keeps this cell re-runnable: it rebinds its own inputs, so a
# second execution would otherwise raise KeyError because 'Name' is already gone.
train_data = train_data.drop(columns=['Name'], errors='ignore')
test_data = test_data.drop(columns=['Name'], errors='ignore')


In [160]:
# Column -> replacement value for missing entries. Every statistic is computed
# from the training split, and the same mapping is applied to both splits.
fill_values = {
    'HomePlanet': 'Unknown',
    # Most frequent training value. A single random draw here would assign the
    # same arbitrary value to every missing row and change from run to run.
    'CryoSleep': train_data['CryoSleep'].mode()[0],
    'Cabin': 'Unknown',
    'Destination': destination_common_value,
    'Age': train_data['Age'].mean(),
    'VIP': False,
    # Spending columns are filled with their 75th-percentile value.
    'RoomService': percentile['RoomService'],
    'FoodCourt': percentile['FoodCourt'],
    'ShoppingMall': percentile['ShoppingMall'],
    'Spa': percentile['Spa'],
    'VRDeck': percentile['VRDeck'],
}

In [161]:
# Impute missing entries in both splits with the same per-column mapping.
train_data_filled, test_data_filled = (
    split.fillna(value=fill_values) for split in (train_data, test_data)
)

  train_data_filled = train_data.fillna(value=fill_values)
  test_data_filled = test_data.fillna(value=fill_values)


In [162]:
# Display the training frame after missing values have been filled.
train_data_filled

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [163]:
# One-hot encode the two nominal columns as 0/1 integers, then discard the
# PassengerId column. Each split is handled in a single chained expression.
train_data_filled_encoded = (
    pd.get_dummies(train_data_filled, columns=['HomePlanet', 'Destination'], dtype=int)
    .drop(columns=['PassengerId'])
)
test_data_filled_encoded = (
    pd.get_dummies(test_data_filled, columns=['HomePlanet', 'Destination'], dtype=int)
    .drop(columns=['PassengerId'])
)

In [164]:
# Boolean-valued columns to convert to 0/1 integers. The test list omits
# 'Transported'.
columns_for_boolean = ['CryoSleep', 'VIP', 'Transported']
columns_for_boolean_test = ['CryoSleep', 'VIP']

# Cast all listed columns in one call per split via a column -> dtype mapping.
train_data_filled_encoded = train_data_filled_encoded.astype(
    dict.fromkeys(columns_for_boolean, int)
)
test_data_filled_encoded = test_data_filled_encoded.astype(
    dict.fromkeys(columns_for_boolean_test, int)
)

In [170]:
# Keep only the first character of each 'Cabin' string as 'CabinGroup', then
# drop the original 'Cabin' column.
train_data_filled_encoded = (
    train_data_filled_encoded
    .assign(CabinGroup=train_data_filled_encoded['Cabin'].str.get(0))
    .drop(columns=['Cabin'])
)
test_data_filled_encoded = (
    test_data_filled_encoded
    .assign(CabinGroup=test_data_filled_encoded['Cabin'].str.get(0))
    .drop(columns=['Cabin'])
)

In [173]:
# Expand 'CabinGroup' into one 0/1 integer indicator column per distinct value.
train_data_filled_encoded = train_data_filled_encoded.pipe(
    pd.get_dummies, columns=['CabinGroup'], dtype=int
)
test_data_filled_encoded = test_data_filled_encoded.pipe(
    pd.get_dummies, columns=['CabinGroup'], dtype=int
)

# Standardization

In [186]:
# The label must not be standardised together with the features, so split it
# off before fitting. The test split is processed without a 'Transported'
# column (see columns_for_boolean_test), so fitting on a frame that still
# contains it would make the two feature sets disagree.
train_target = train_data_filled_encoded['Transported']
train_features = train_data_filled_encoded.drop(columns=['Transported'])

# One-hot encoding was done separately per split, so the dummy columns can
# differ. Align the test frame to the training features: dummy columns missing
# from the test split become 0, and columns unknown to training are dropped.
test_features = test_data_filled_encoded.reindex(
    columns=train_features.columns, fill_value=0
)

scaler = StandardScaler()

# Fit on the training features only, then reuse those statistics for the test split.
train_data_filled_encoded_scaled = scaler.fit_transform(train_features)
test_data_filled_encoded_scaled = scaler.transform(test_features)