This notebook will be used as a base model for further models.

Contents

- [Load Data](#load-data)
    - [Confirm Concatenate Shape](#confirm-concatenated-shape)
- [Check Nulls](#check-null-value)
- 
- 
- 
- 
- 
-




# Imports

In [227]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load Data

- Load Train
- Load Test
- Combine Train and Test

In [228]:
train = pd.read_csv('../../data/train.csv')
test = pd.read_csv('../../data/test.csv')

len_train = len(train)
len_test = len(test)

train.shape, test.shape

((8693, 14), (4277, 13))

In [229]:
df = pd.concat([train, test])

## Confirm Concatenated Shape

In [230]:
df.shape[0] == test.shape[0] + train.shape[0]

True

# Check Null Values

In [231]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [232]:
df.isnull().sum()

PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name             294
Transported     4277
dtype: int64

# Data Preparation

## Drop Columns

In [233]:
df.drop(columns=['Name'], inplace=True)

## Cabin

This is taken from the challenge page:

*Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.*

Therefore, Cabin will be separated into three columns (Deck, Num, Side)

In [234]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
df.drop(columns=['Cabin'], inplace=True)

## Fill Unknown Deck, Num and Side columns with 'U'

In [235]:
df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')

## Label Encode Deck, Side

In [236]:
df.Side.value_counts()

Side
S    6381
P    6290
U     299
Name: count, dtype: int64

In [237]:
df['Deck'] = df['Deck'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7, 'U': -1})
df['Side'] = df['Side'].map({'S': 0, 'P': 1, 'U': -1})

# Encode Categorical Features

In [238]:
df.isnull().sum()

PassengerId        0
HomePlanet       288
CryoSleep        310
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Transported     4277
Deck               0
Num                0
Side               0
dtype: int64

In [239]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,0,1
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,5,0,0
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0,0,0
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0,0,0
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,5,1,0


In [240]:
impute_list = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Num', 'Side']
rest_columns = list(set(df.columns) - set(impute_list))

df_rest = df[rest_columns]

# Impute Numerical Figures

In [241]:
imp = KNNImputer()

df_imputed = imp.fit_transform(df[impute_list])
df_imputed = pd.DataFrame(df_imputed, columns=impute_list)
df = pd.concat([df_rest.reset_index(drop=True), df_imputed.reset_index(drop=True)], axis=1)

# Impute Categorical Columns

In [242]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')

# Create Bills Column from RoomService, FoodCourt, ShoppingMall, Spa, VRDeck

In [243]:
bills_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['AmountSpent'] = df[bills_columns].sum(axis=1)
df.drop(columns=bills_columns, inplace=True)
df.head()

Unnamed: 0,Transported,Destination,PassengerId,HomePlanet,CryoSleep,Age,VIP,Deck,Num,Side,AmountSpent
0,False,TRAPPIST-1e,0001_01,Europa,0.0,39.0,0.0,1.0,0.0,1.0,0.0
1,True,TRAPPIST-1e,0002_01,Earth,0.0,24.0,0.0,5.0,0.0,0.0,736.0
2,False,TRAPPIST-1e,0003_01,Europa,0.0,58.0,1.0,0.0,0.0,0.0,10383.0
3,False,TRAPPIST-1e,0003_02,Europa,0.0,33.0,0.0,0.0,0.0,0.0,5176.0
4,True,TRAPPIST-1e,0004_01,Earth,0.0,16.0,0.0,5.0,1.0,0.0,1091.0


# Label Encode Destination, HomePlanet

In [244]:
category_columns = ['HomePlanet', 'Destination']

for col in category_columns:
    df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)
    
df.drop(columns=category_columns, inplace=True)

In [245]:
df.head()

Unnamed: 0,Transported,PassengerId,CryoSleep,Age,VIP,Deck,Num,Side,AmountSpent,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U
0,False,0001_01,0.0,39.0,0.0,1.0,0.0,1.0,0.0,False,True,False,False,False,False,True,False
1,True,0002_01,0.0,24.0,0.0,5.0,0.0,0.0,736.0,True,False,False,False,False,False,True,False
2,False,0003_01,0.0,58.0,1.0,0.0,0.0,0.0,10383.0,False,True,False,False,False,False,True,False
3,False,0003_02,0.0,33.0,0.0,0.0,0.0,0.0,5176.0,False,True,False,False,False,False,True,False
4,True,0004_01,0.0,16.0,0.0,5.0,1.0,0.0,1091.0,True,False,False,False,False,False,True,False


In [246]:
train_preprocessed = df[:len_train].copy()
test_preprocessed = df[len_train:].copy()

train_preprocessed.reset_index(drop=True, inplace=True)
test_preprocessed.reset_index(drop=True, inplace=True)

test_preprocessed.drop(columns=['Transported'], inplace=True)

len(test) == len(test_preprocessed)
test_preprocessed.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,Deck,Num,Side,AmountSpent,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U
0,0013_01,1.0,27.0,0.0,6.0,3.0,0.0,0.0,True,False,False,False,False,False,True,False
1,0018_01,0.0,19.0,0.0,5.0,4.0,0.0,2832.0,True,False,False,False,False,False,True,False
2,0019_01,1.0,31.0,0.0,2.0,0.0,0.0,0.0,False,True,False,False,True,False,False,False
3,0021_01,0.0,38.0,0.0,2.0,1.0,0.0,7418.0,False,True,False,False,False,False,True,False
4,0023_01,0.0,20.0,0.0,5.0,5.0,0.0,645.0,True,False,False,False,False,False,True,False


# Save New DFs

In [247]:
train_preprocessed.to_csv('../../data/preproc_data/train_1_0.csv', index=False)
test_preprocessed.to_csv('../../data/preproc_data/test_1_0.csv', index=False)