In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [4]:
# Labelled competition data: used below for both fitting and local validation.
data = pd.read_csv(filepath_or_buffer='data/train.csv')
# Competition test file; kept separate from the local hold-out split.
kaggle_test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [7]:
# Hold out 30% of the labelled rows for local validation; the fixed seed keeps
# the split reproducible between runs.
train, test = train_test_split(data, test_size=0.3, random_state=42)
# Work on explicit copies: the cells below assign columns on both frames, and
# doing that on a row selection derived from `data` is what pandas reports as
# SettingWithCopyWarning on some pandas / scikit-learn version combinations.
train, test = train.copy(), test.copy()

In [8]:
# Count missing entries per column of the training split (before any imputation).
train.isnull().sum(axis=0)

PassengerId       0
HomePlanet      140
CryoSleep       154
Cabin           142
Destination     122
Age             129
VIP             153
RoomService     114
FoodCourt       122
ShoppingMall    146
Spa             122
VRDeck          129
Name            138
Transported       0
dtype: int64

## Fill gaps

In [9]:
# Columns imputed with the median and later rescaled.
numeric_fields = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
# Columns imputed with their most frequent value.
categ_fields = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
]

Fill numeric features with median values

In [10]:
# Learn the per-column medians from the training split only, then apply those
# same medians to every frame so the held-out data never influences the fit.
num_imputer = SimpleImputer(strategy='median')
num_imputer.fit(train[numeric_fields])
for frame in (train, test, kaggle_test):
    frame[numeric_fields] = num_imputer.transform(frame[numeric_fields])

Fill categorical features with their most frequent values

In [11]:
# The most frequent value of each categorical column is learned on the training
# split and reused unchanged for the other two frames.
cat_imputer = SimpleImputer(strategy='most_frequent')
train[categ_fields] = cat_imputer.fit_transform(train[categ_fields])
for held_out in (test, kaggle_test):
    held_out[categ_fields] = cat_imputer.transform(held_out[categ_fields])

In [19]:
# Re-check missing entries per column after imputation.
# NOTE(review): on a top-to-bottom run, Name and Cabin are only filled in the
# cells that follow, so they can still be non-zero at this point.
train.isnull().sum(axis=0)

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

Fill all null names with the default name "John Doe"

In [16]:
# Missing passenger names get the same placeholder in every frame.
train["Name"] = train["Name"].fillna(value="John Doe")
test["Name"] = test["Name"].fillna(value="John Doe")
kaggle_test["Name"] = kaggle_test["Name"].fillna(value="John Doe")

Fill all null cabin values with the placeholder `X/0/X` (deck X, room 0, side X)

In [18]:
# The placeholder keeps the "deck/room/side" shape so that a later split on "/"
# still yields three parts for rows that had no cabin.
for frame in (train, test, kaggle_test):
    frame["Cabin"] = frame["Cabin"].fillna("X/0/X")

## Normalization

But first, we need to split `Cabin` into three columns (deck, room number, and side), which are more useful individually. Otherwise, the cabin number can't be normalized easily.

In [20]:
def _split_cabin(frame):
    """Return a copy of `frame` with "Cabin" replaced by "Deck", "Room" and "Side".

    "Cabin" is split on "/" into exactly three parts. The new columns are
    appended in the order Deck, Room, Side and the original "Cabin" column is
    dropped. "Room" is converted to a numeric dtype so that it can be scaled
    like any other number instead of remaining a string.

    Raises:
        KeyError: if "Cabin" is missing or a split yields fewer than three parts.
        ValueError: if a room part cannot be parsed as a number.
    """
    parts = frame["Cabin"].str.split("/", expand=True)
    result = frame.drop("Cabin", axis=1)
    result["Deck"] = parts[0]
    # Assumes the middle part is always a numeric string (the "X/0/X"
    # placeholder used for missing cabins satisfies this) - TODO confirm on raw data.
    result["Room"] = pd.to_numeric(parts[1])
    result["Side"] = parts[2]
    return result


# Apply the identical transformation to all three frames.
train = _split_cabin(train)
test = _split_cabin(test)
kaggle_test = _split_cabin(kaggle_test)

Now normalize the numeric features with a scaler.

Fit the normalizer on the training split.

In [None]:
# The scaler learns each numeric column's range from the training split only.
scaler = MinMaxScaler()
scaler.fit(X=train[numeric_fields])

Now transform the data using the fitted scaler.

In [None]:
# Rescale the numeric columns of every frame with the already-fitted scaler;
# nothing is re-fitted here.
for frame in (train, test, kaggle_test):
    frame[numeric_fields] = scaler.transform(frame[numeric_fields])