In [36]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [37]:
# Load the two input CSVs: `data` comes from train.csv (it is split into
# train/test further down) and `kaggle_test` comes from test.csv.
data, kaggle_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [38]:
# Hold out 30% of the rows as a local test set; the fixed random_state keeps
# the split reproducible across runs.
train, test = train_test_split(
    data,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Number of missing values in each column of the training split.
train.isna().sum()

## Fill gaps

In [40]:
# Columns imputed with the median.
numeric_fields = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
# Columns imputed with the most frequent value.
categ_fields = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
]

Fill numeric features with median values

In [41]:
# Learn the per-column medians from the training split only, then apply those
# same medians to all three datasets.
num_imputer = SimpleImputer(strategy='median').fit(train[numeric_fields])
train[numeric_fields] = num_imputer.transform(train[numeric_fields])
test[numeric_fields] = num_imputer.transform(test[numeric_fields])
kaggle_test[numeric_fields] = num_imputer.transform(kaggle_test[numeric_fields])

Fill categorical features with the most frequent values

In [42]:
# Learn the most frequent value of each categorical column from the training
# split only, then fill the gaps in all three datasets with those values.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train[categ_fields])
train[categ_fields] = cat_imputer.transform(train[categ_fields])
test[categ_fields] = cat_imputer.transform(test[categ_fields])
kaggle_test[categ_fields] = cat_imputer.transform(kaggle_test[categ_fields])

In [None]:
# Re-check the per-column missing-value counts of the training split.
train.isna().sum()

Fill all null names with default name John Doe

In [44]:
# Replace missing passenger names with the same placeholder in every dataset.
for _frame in (train, test, kaggle_test):
    _frame["Name"] = _frame["Name"].fillna("John Doe")

Fill all null cabin values with the placeholder `X/0/X` (Deck X, Room 0, Side X)

In [45]:
# Missing cabins get the placeholder "X/0/X" so that the later split on "/"
# still yields three parts for every row.
for _frame in (train, test, kaggle_test):
    _frame["Cabin"] = _frame["Cabin"].fillna("X/0/X")

## Normalization

First, we need to split Cabin into three columns (Deck, Room, Side), since the separate parts are more useful than the combined string. Otherwise the cabin number can't be normalized easily.

In [46]:
# Split each "Deck/Room/Side" cabin string into three separate columns and
# drop the original Cabin column.
#
# str.split yields strings, so Room is converted to a numeric dtype right
# away. Without this the column stays a string column and the scaler further
# down only works through scikit-learn's implicit object-to-float coercion.
train[["Deck", "Room", "Side"]] = train["Cabin"].str.split("/", expand=True)
train["Room"] = pd.to_numeric(train["Room"])
train = train.drop("Cabin", axis=1)

test[["Deck", "Room", "Side"]] = test["Cabin"].str.split("/", expand=True)
test["Room"] = pd.to_numeric(test["Room"])
test = test.drop("Cabin", axis=1)

kaggle_test[["Deck", "Room", "Side"]] = kaggle_test["Cabin"].str.split("/", expand=True)
kaggle_test["Room"] = pd.to_numeric(kaggle_test["Room"])
kaggle_test = kaggle_test.drop("Cabin", axis=1)

In [47]:
# Columns to scale: the original numeric columns plus the Room number that
# was extracted from Cabin.
numeric_fields = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Room',
]

Now normalization via some scaler

Fit the normalizer on the training split

In [None]:
# Fit the min-max scaler on the training split only. fit() returns the scaler
# itself, so ending the cell with `scaler` shows the same output as before.
scaler = MinMaxScaler().fit(train[numeric_fields])
scaler

Now transform the data using the fitted scaler

In [49]:
# Apply the scaling learned from the training split to every dataset.
for _frame in (train, test, kaggle_test):
    _frame[numeric_fields] = scaler.transform(_frame[numeric_fields])

## Replace categories with OHE

In [50]:
# Columns to one-hot encode (includes Deck and Side extracted from Cabin).
categ_fields = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Deck',
    'Side',
]

In [51]:
# One-hot encode the categorical columns of each dataset independently.
# Each frame only gets dummy columns for the categories it actually contains.
train_encoded, test_encoded, kaggle_test_encoded = (
    pd.get_dummies(frame, columns=categ_fields)
    for frame in (train, test, kaggle_test)
)

I checked and found that there is no Deck_T column in the test set, so to get uniform data we need to make sure the columns match

In [52]:
# Make the encoded frames share one column layout.
#
# Appending missing columns one at a time leaves each frame with the same
# columns in a different order (a dummy missing from one frame ends up last
# there but mid-frame elsewhere), which breaks any positional use of the data.
# Reindexing against a single reference list gives every frame the same order
# and fills the columns that were absent with 0.

# Reference layout: train's columns first, then any column seen only in test.
feature_columns = list(train_encoded.columns) + [
    col for col in test_encoded.columns if col not in train_encoded.columns
]

train_encoded = train_encoded.reindex(columns=feature_columns, fill_value=0)
test_encoded = test_encoded.reindex(columns=feature_columns, fill_value=0)

# The kaggle frame follows the same layout; columns that exist only in the
# kaggle frame are kept, placed after the shared ones.
# NOTE(review): every train/test column absent from the kaggle frame is added
# as 0, including any non-feature column it lacks — confirm that is intended.
kaggle_columns = feature_columns + [
    col for col in kaggle_test_encoded.columns if col not in feature_columns
]
kaggle_test_encoded = kaggle_test_encoded.reindex(columns=kaggle_columns, fill_value=0)

In [None]:
# Preview the first five rows of the encoded local test set.
test_encoded.head(n=5)

In [54]:
# Write each preprocessed frame to its CSV file, without the index column.
for _frame, _csv_path in (
    (train_encoded, 'data/train_preprocessed.csv'),
    (test_encoded, 'data/test_preprocessed.csv'),
    (kaggle_test_encoded, 'data/kaggle_test_preprocessed.csv'),
):
    _frame.to_csv(_csv_path, index=False)